/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.  *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS    *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE*
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.      *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006               *
 * by the Xiph.Org Foundation http://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************/
/*SSE2 fDCT implementation for x86_64.*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include <stddef.h>
#include "x86enc.h"
#include "x86zigzag.h"
#include "sse2trans.h"
  18. #if defined(OC_X86_64_ASM)
  19. # define OC_FDCT_8x8 \
  20. /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
  21. "#OC_FDCT_8x8\n\t" \
  22. /*Stage 1:*/ \
  23. "movdqa %%xmm0,%%xmm11\n\t" \
  24. "movdqa %%xmm1,%%xmm10\n\t" \
  25. "movdqa %%xmm2,%%xmm9\n\t" \
  26. "movdqa %%xmm3,%%xmm8\n\t" \
  27. /*xmm11=t7'=t0-t7*/ \
  28. "psubw %%xmm7,%%xmm11\n\t" \
  29. /*xmm10=t6'=t1-t6*/ \
  30. "psubw %%xmm6,%%xmm10\n\t" \
  31. /*xmm9=t5'=t2-t5*/ \
  32. "psubw %%xmm5,%%xmm9\n\t" \
  33. /*xmm8=t4'=t3-t4*/ \
  34. "psubw %%xmm4,%%xmm8\n\t" \
  35. /*xmm0=t0'=t0+t7*/ \
  36. "paddw %%xmm7,%%xmm0\n\t" \
  37. /*xmm1=t1'=t1+t6*/ \
  38. "paddw %%xmm6,%%xmm1\n\t" \
  39. /*xmm5=t2'=t2+t5*/ \
  40. "paddw %%xmm2,%%xmm5\n\t" \
  41. /*xmm4=t3'=t3+t4*/ \
  42. "paddw %%xmm3,%%xmm4\n\t" \
  43. /*xmm2,3,6,7 are now free.*/ \
  44. /*Stage 2:*/ \
  45. "movdqa %%xmm0,%%xmm3\n\t" \
  46. "mov $0x5A806A0A,%[a]\n\t" \
  47. "movdqa %%xmm1,%%xmm2\n\t" \
  48. "movd %[a],%%xmm13\n\t" \
  49. "movdqa %%xmm10,%%xmm6\n\t" \
  50. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  51. /*xmm2=t2''=t1'-t2'*/ \
  52. "psubw %%xmm5,%%xmm2\n\t" \
  53. "pxor %%xmm12,%%xmm12\n\t" \
  54. /*xmm3=t3''=t0'-t3'*/ \
  55. "psubw %%xmm4,%%xmm3\n\t" \
  56. "psubw %%xmm14,%%xmm12\n\t" \
  57. /*xmm10=t5''=t6'-t5'*/ \
  58. "psubw %%xmm9,%%xmm10\n\t" \
  59. "paddw %%xmm12,%%xmm12\n\t" \
  60. /*xmm4=t0''=t0'+t3'*/ \
  61. "paddw %%xmm0,%%xmm4\n\t" \
  62. /*xmm1=t1''=t1'+t2'*/ \
  63. "paddw %%xmm5,%%xmm1\n\t" \
  64. /*xmm6=t6''=t6'+t5'*/ \
  65. "paddw %%xmm9,%%xmm6\n\t" \
  66. /*xmm0,xmm5,xmm9 are now free.*/ \
  67. /*Stage 3:*/ \
  68. /*xmm10:xmm5=t5''*27146+0xB500 \
  69. xmm0=t5''*/ \
  70. "movdqa %%xmm10,%%xmm5\n\t" \
  71. "movdqa %%xmm10,%%xmm0\n\t" \
  72. "punpckhwd %%xmm12,%%xmm10\n\t" \
  73. "pmaddwd %%xmm13,%%xmm10\n\t" \
  74. "punpcklwd %%xmm12,%%xmm5\n\t" \
  75. "pmaddwd %%xmm13,%%xmm5\n\t" \
  76. /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
  77. "psrad $16,%%xmm10\n\t" \
  78. "psrad $16,%%xmm5\n\t" \
  79. "packssdw %%xmm10,%%xmm5\n\t" \
  80. "paddw %%xmm0,%%xmm5\n\t" \
  81. /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
  82. "pcmpeqw %%xmm15,%%xmm0\n\t" \
  83. "psubw %%xmm14,%%xmm0\n\t" \
  84. "paddw %%xmm5,%%xmm0\n\t" \
  85. "movdqa %%xmm8,%%xmm5\n\t" \
  86. "psraw $1,%%xmm0\n\t" \
  87. /*xmm5=t5'''=t4'-s*/ \
  88. "psubw %%xmm0,%%xmm5\n\t" \
  89. /*xmm8=t4''=t4'+s*/ \
  90. "paddw %%xmm0,%%xmm8\n\t" \
  91. /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
  92. /*xmm7:xmm9=t6''*27146+0xB500*/ \
  93. "movdqa %%xmm6,%%xmm7\n\t" \
  94. "movdqa %%xmm6,%%xmm9\n\t" \
  95. "punpckhwd %%xmm12,%%xmm7\n\t" \
  96. "pmaddwd %%xmm13,%%xmm7\n\t" \
  97. "punpcklwd %%xmm12,%%xmm9\n\t" \
  98. "pmaddwd %%xmm13,%%xmm9\n\t" \
  99. /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
  100. "psrad $16,%%xmm7\n\t" \
  101. "psrad $16,%%xmm9\n\t" \
  102. "packssdw %%xmm7,%%xmm9\n\t" \
  103. "paddw %%xmm6,%%xmm9\n\t" \
  104. /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
  105. "pcmpeqw %%xmm15,%%xmm6\n\t" \
  106. "psubw %%xmm14,%%xmm6\n\t" \
  107. "paddw %%xmm6,%%xmm9\n\t" \
  108. "movdqa %%xmm11,%%xmm7\n\t" \
  109. "psraw $1,%%xmm9\n\t" \
  110. /*xmm7=t6'''=t7'-s*/ \
  111. "psubw %%xmm9,%%xmm7\n\t" \
  112. /*xmm9=t7''=t7'+s*/ \
  113. "paddw %%xmm11,%%xmm9\n\t" \
  114. /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
  115. /*Stage 4:*/ \
  116. /*xmm10:xmm0=t1''*27146+0xB500*/ \
  117. "movdqa %%xmm1,%%xmm0\n\t" \
  118. "movdqa %%xmm1,%%xmm10\n\t" \
  119. "punpcklwd %%xmm12,%%xmm0\n\t" \
  120. "pmaddwd %%xmm13,%%xmm0\n\t" \
  121. "punpckhwd %%xmm12,%%xmm10\n\t" \
  122. "pmaddwd %%xmm13,%%xmm10\n\t" \
  123. /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
  124. "psrad $16,%%xmm0\n\t" \
  125. "psrad $16,%%xmm10\n\t" \
  126. "mov $0x20006A0A,%[a]\n\t" \
  127. "packssdw %%xmm10,%%xmm0\n\t" \
  128. "movd %[a],%%xmm13\n\t" \
  129. "paddw %%xmm1,%%xmm0\n\t" \
  130. /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
  131. "pcmpeqw %%xmm15,%%xmm1\n\t" \
  132. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  133. "psubw %%xmm14,%%xmm1\n\t" \
  134. "paddw %%xmm1,%%xmm0\n\t" \
  135. /*xmm10:xmm4=t0''*27146+0x4000*/ \
  136. "movdqa %%xmm4,%%xmm1\n\t" \
  137. "movdqa %%xmm4,%%xmm10\n\t" \
  138. "punpcklwd %%xmm12,%%xmm4\n\t" \
  139. "pmaddwd %%xmm13,%%xmm4\n\t" \
  140. "punpckhwd %%xmm12,%%xmm10\n\t" \
  141. "pmaddwd %%xmm13,%%xmm10\n\t" \
  142. /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
  143. "psrad $16,%%xmm4\n\t" \
  144. "psrad $16,%%xmm10\n\t" \
  145. "mov $0x6CB7,%[a]\n\t" \
  146. "packssdw %%xmm10,%%xmm4\n\t" \
  147. "movd %[a],%%xmm12\n\t" \
  148. "paddw %%xmm1,%%xmm4\n\t" \
  149. /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
  150. "pcmpeqw %%xmm15,%%xmm1\n\t" \
  151. "pshufd $00,%%xmm12,%%xmm12\n\t" \
  152. "psubw %%xmm14,%%xmm1\n\t" \
  153. "mov $0x7FFF6C84,%[a]\n\t" \
  154. "paddw %%xmm1,%%xmm4\n\t" \
  155. /*xmm0=_y[0]=u=r+s>>1 \
  156. The naive implementation could cause overflow, so we use \
  157. u=(r&s)+((r^s)>>1).*/ \
  158. "movdqa %%xmm0,%%xmm6\n\t" \
  159. "pxor %%xmm4,%%xmm0\n\t" \
  160. "pand %%xmm4,%%xmm6\n\t" \
  161. "psraw $1,%%xmm0\n\t" \
  162. "movd %[a],%%xmm13\n\t" \
  163. "paddw %%xmm6,%%xmm0\n\t" \
  164. /*xmm4=_y[4]=v=r-u*/ \
  165. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  166. "psubw %%xmm0,%%xmm4\n\t" \
  167. /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
  168. /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
  169. "movdqa %%xmm3,%%xmm10\n\t" \
  170. "movdqa %%xmm3,%%xmm6\n\t" \
  171. "punpcklwd %%xmm3,%%xmm10\n\t" \
  172. "pmaddwd %%xmm13,%%xmm10\n\t" \
  173. "mov $0x61F861F8,%[a]\n\t" \
  174. "punpckhwd %%xmm3,%%xmm6\n\t" \
  175. "pmaddwd %%xmm13,%%xmm6\n\t" \
  176. "movd %[a],%%xmm13\n\t" \
  177. "paddd %%xmm12,%%xmm10\n\t" \
  178. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  179. "paddd %%xmm12,%%xmm6\n\t" \
  180. /*xmm1:xmm2=25080*t2'' \
  181. xmm12=t2''*/ \
  182. "movdqa %%xmm2,%%xmm11\n\t" \
  183. "movdqa %%xmm2,%%xmm12\n\t" \
  184. "pmullw %%xmm13,%%xmm2\n\t" \
  185. "pmulhw %%xmm13,%%xmm11\n\t" \
  186. "movdqa %%xmm2,%%xmm1\n\t" \
  187. "punpcklwd %%xmm11,%%xmm2\n\t" \
  188. "punpckhwd %%xmm11,%%xmm1\n\t" \
  189. /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
  190. "paddd %%xmm2,%%xmm10\n\t" \
  191. "paddd %%xmm1,%%xmm6\n\t" \
  192. "psrad $16,%%xmm10\n\t" \
  193. "pcmpeqw %%xmm15,%%xmm3\n\t" \
  194. "psrad $16,%%xmm6\n\t" \
  195. "psubw %%xmm14,%%xmm3\n\t" \
  196. "packssdw %%xmm6,%%xmm10\n\t" \
  197. "paddw %%xmm3,%%xmm10\n\t" \
  198. /*xmm2=_y[2]=u \
  199. xmm10=s=(25080*u>>16)-t2''*/ \
  200. "movdqa %%xmm10,%%xmm2\n\t" \
  201. "pmulhw %%xmm13,%%xmm10\n\t" \
  202. "psubw %%xmm12,%%xmm10\n\t" \
  203. /*xmm1:xmm6=s*21600+0x2800*/ \
  204. "pxor %%xmm12,%%xmm12\n\t" \
  205. "psubw %%xmm14,%%xmm12\n\t" \
  206. "mov $0x28005460,%[a]\n\t" \
  207. "movd %[a],%%xmm13\n\t" \
  208. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  209. "movdqa %%xmm10,%%xmm6\n\t" \
  210. "movdqa %%xmm10,%%xmm1\n\t" \
  211. "punpcklwd %%xmm12,%%xmm6\n\t" \
  212. "pmaddwd %%xmm13,%%xmm6\n\t" \
  213. "mov $0x0E3D,%[a]\n\t" \
  214. "punpckhwd %%xmm12,%%xmm1\n\t" \
  215. "pmaddwd %%xmm13,%%xmm1\n\t" \
  216. /*xmm6=(s*21600+0x2800>>18)+s*/ \
  217. "psrad $18,%%xmm6\n\t" \
  218. "psrad $18,%%xmm1\n\t" \
  219. "movd %[a],%%xmm12\n\t" \
  220. "packssdw %%xmm1,%%xmm6\n\t" \
  221. "pshufd $00,%%xmm12,%%xmm12\n\t" \
  222. "paddw %%xmm10,%%xmm6\n\t" \
  223. /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
  224. "mov $0x7FFF54DC,%[a]\n\t" \
  225. "pcmpeqw %%xmm15,%%xmm10\n\t" \
  226. "movd %[a],%%xmm13\n\t" \
  227. "psubw %%xmm14,%%xmm10\n\t" \
  228. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  229. "paddw %%xmm10,%%xmm6\n\t " \
  230. /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
  231. /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
  232. "movdqa %%xmm5,%%xmm10\n\t" \
  233. "movdqa %%xmm5,%%xmm11\n\t" \
  234. "punpcklwd %%xmm5,%%xmm10\n\t" \
  235. "pmaddwd %%xmm13,%%xmm10\n\t" \
  236. "mov $0x8E3A8E3A,%[a]\n\t" \
  237. "punpckhwd %%xmm5,%%xmm11\n\t" \
  238. "pmaddwd %%xmm13,%%xmm11\n\t" \
  239. "movd %[a],%%xmm13\n\t" \
  240. "paddd %%xmm12,%%xmm10\n\t" \
  241. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  242. "paddd %%xmm12,%%xmm11\n\t" \
  243. /*xmm7:xmm12=36410*t6''' \
  244. xmm1=t6'''*/ \
  245. "movdqa %%xmm7,%%xmm3\n\t" \
  246. "movdqa %%xmm7,%%xmm1\n\t" \
  247. "pmulhw %%xmm13,%%xmm3\n\t" \
  248. "pmullw %%xmm13,%%xmm7\n\t" \
  249. "paddw %%xmm1,%%xmm3\n\t" \
  250. "movdqa %%xmm7,%%xmm12\n\t" \
  251. "punpckhwd %%xmm3,%%xmm7\n\t" \
  252. "punpcklwd %%xmm3,%%xmm12\n\t" \
  253. /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
  254. "paddd %%xmm12,%%xmm10\n\t" \
  255. "paddd %%xmm7,%%xmm11\n\t" \
  256. "psrad $16,%%xmm10\n\t" \
  257. "pcmpeqw %%xmm15,%%xmm5\n\t" \
  258. "psrad $16,%%xmm11\n\t" \
  259. "psubw %%xmm14,%%xmm5\n\t" \
  260. "packssdw %%xmm11,%%xmm10\n\t" \
  261. "pxor %%xmm12,%%xmm12\n\t" \
  262. "paddw %%xmm5,%%xmm10\n\t" \
  263. /*xmm5=_y[5]=u \
  264. xmm1=s=t6'''-(36410*u>>16)*/ \
  265. "psubw %%xmm14,%%xmm12\n\t" \
  266. "movdqa %%xmm10,%%xmm5\n\t" \
  267. "mov $0x340067C8,%[a]\n\t" \
  268. "pmulhw %%xmm13,%%xmm10\n\t" \
  269. "movd %[a],%%xmm13\n\t" \
  270. "paddw %%xmm5,%%xmm10\n\t" \
  271. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  272. "psubw %%xmm10,%%xmm1\n\t" \
  273. /*xmm11:xmm3=s*26568+0x3400*/ \
  274. "movdqa %%xmm1,%%xmm3\n\t" \
  275. "movdqa %%xmm1,%%xmm11\n\t" \
  276. "punpcklwd %%xmm12,%%xmm3\n\t" \
  277. "pmaddwd %%xmm13,%%xmm3\n\t" \
  278. "mov $0x7B1B,%[a]\n\t" \
  279. "punpckhwd %%xmm12,%%xmm11\n\t" \
  280. "pmaddwd %%xmm13,%%xmm11\n\t" \
  281. /*xmm3=(s*26568+0x3400>>17)+s*/ \
  282. "psrad $17,%%xmm3\n\t" \
  283. "psrad $17,%%xmm11\n\t" \
  284. "movd %[a],%%xmm12\n\t" \
  285. "packssdw %%xmm11,%%xmm3\n\t" \
  286. "pshufd $00,%%xmm12,%%xmm12\n\t" \
  287. "paddw %%xmm1,%%xmm3\n\t" \
  288. /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
  289. "mov $0x7FFF7B16,%[a]\n\t" \
  290. "pcmpeqw %%xmm15,%%xmm1\n\t" \
  291. "movd %[a],%%xmm13\n\t" \
  292. "psubw %%xmm14,%%xmm1\n\t" \
  293. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  294. "paddw %%xmm1,%%xmm3\n\t " \
  295. /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
  296. /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
  297. "movdqa %%xmm9,%%xmm10\n\t" \
  298. "movdqa %%xmm9,%%xmm11\n\t" \
  299. "punpcklwd %%xmm9,%%xmm10\n\t" \
  300. "pmaddwd %%xmm13,%%xmm10\n\t" \
  301. "mov $0x31F131F1,%[a]\n\t" \
  302. "punpckhwd %%xmm9,%%xmm11\n\t" \
  303. "pmaddwd %%xmm13,%%xmm11\n\t" \
  304. "movd %[a],%%xmm13\n\t" \
  305. "paddd %%xmm12,%%xmm10\n\t" \
  306. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  307. "paddd %%xmm12,%%xmm11\n\t" \
  308. /*xmm12:xmm7=12785*t4''*/ \
  309. "movdqa %%xmm8,%%xmm7\n\t" \
  310. "movdqa %%xmm8,%%xmm1\n\t" \
  311. "pmullw %%xmm13,%%xmm7\n\t" \
  312. "pmulhw %%xmm13,%%xmm1\n\t" \
  313. "movdqa %%xmm7,%%xmm12\n\t" \
  314. "punpcklwd %%xmm1,%%xmm7\n\t" \
  315. "punpckhwd %%xmm1,%%xmm12\n\t" \
  316. /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
  317. "paddd %%xmm7,%%xmm10\n\t" \
  318. "paddd %%xmm12,%%xmm11\n\t" \
  319. "psrad $16,%%xmm10\n\t" \
  320. "pcmpeqw %%xmm15,%%xmm9\n\t" \
  321. "psrad $16,%%xmm11\n\t" \
  322. "psubw %%xmm14,%%xmm9\n\t" \
  323. "packssdw %%xmm11,%%xmm10\n\t" \
  324. "pxor %%xmm12,%%xmm12\n\t" \
  325. "paddw %%xmm9,%%xmm10\n\t" \
  326. /*xmm1=_y[1]=u \
  327. xmm10=s=(12785*u>>16)-t4''*/ \
  328. "psubw %%xmm14,%%xmm12\n\t" \
  329. "movdqa %%xmm10,%%xmm1\n\t" \
  330. "mov $0x3000503B,%[a]\n\t" \
  331. "pmulhw %%xmm13,%%xmm10\n\t" \
  332. "movd %[a],%%xmm13\n\t" \
  333. "psubw %%xmm8,%%xmm10\n\t" \
  334. "pshufd $00,%%xmm13,%%xmm13\n\t" \
  335. /*xmm8:xmm7=s*20539+0x3000*/ \
  336. "movdqa %%xmm10,%%xmm7\n\t" \
  337. "movdqa %%xmm10,%%xmm8\n\t" \
  338. "punpcklwd %%xmm12,%%xmm7\n\t" \
  339. "pmaddwd %%xmm13,%%xmm7\n\t" \
  340. "punpckhwd %%xmm12,%%xmm8\n\t" \
  341. "pmaddwd %%xmm13,%%xmm8\n\t" \
  342. /*xmm7=(s*20539+0x3000>>20)+s*/ \
  343. "psrad $20,%%xmm7\n\t" \
  344. "psrad $20,%%xmm8\n\t" \
  345. "packssdw %%xmm8,%%xmm7\n\t" \
  346. "paddw %%xmm10,%%xmm7\n\t" \
  347. /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
  348. "pcmpeqw %%xmm15,%%xmm10\n\t" \
  349. "psubw %%xmm14,%%xmm10\n\t" \
  350. "paddw %%xmm10,%%xmm7\n\t " \
  351. /*SSE2 implementation of the fDCT for x86-64 only.
  352. Because of the 8 extra XMM registers on x86-64, this version can operate
  353. without any temporary stack access at all.*/
  354. void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  355. ptrdiff_t a;
  356. __asm__ __volatile__(
  357. /*Load the input.*/
  358. "movdqa 0x00(%[x]),%%xmm0\n\t"
  359. "movdqa 0x10(%[x]),%%xmm1\n\t"
  360. "movdqa 0x20(%[x]),%%xmm2\n\t"
  361. "movdqa 0x30(%[x]),%%xmm3\n\t"
  362. "movdqa 0x40(%[x]),%%xmm4\n\t"
  363. "movdqa 0x50(%[x]),%%xmm5\n\t"
  364. "movdqa 0x60(%[x]),%%xmm6\n\t"
  365. "movdqa 0x70(%[x]),%%xmm7\n\t"
  366. /*Add two extra bits of working precision to improve accuracy; any more and
  367. we could overflow.*/
  368. /*We also add a few biases to correct for some systematic error that
  369. remains in the full fDCT->iDCT round trip.*/
  370. /*xmm15={0}x8*/
  371. "pxor %%xmm15,%%xmm15\n\t"
  372. /*xmm14={-1}x8*/
  373. "pcmpeqb %%xmm14,%%xmm14\n\t"
  374. "psllw $2,%%xmm0\n\t"
  375. /*xmm8=xmm0*/
  376. "movdqa %%xmm0,%%xmm8\n\t"
  377. "psllw $2,%%xmm1\n\t"
  378. /*xmm8={_x[7...0]==0}*/
  379. "pcmpeqw %%xmm15,%%xmm8\n\t"
  380. "psllw $2,%%xmm2\n\t"
  381. /*xmm8={_x[7...0]!=0}*/
  382. "psubw %%xmm14,%%xmm8\n\t"
  383. "psllw $2,%%xmm3\n\t"
  384. /*%[a]=1*/
  385. "mov $1,%[a]\n\t"
  386. /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
  387. "pslld $16,%%xmm8\n\t"
  388. "psllw $2,%%xmm4\n\t"
  389. /*xmm9={0,0,0,0,0,0,0,1}*/
  390. "movd %[a],%%xmm9\n\t"
  391. /*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/
  392. "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
  393. "psllw $2,%%xmm5\n\t"
  394. /*%[a]={1}x2*/
  395. "mov $0x10001,%[a]\n\t"
  396. /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
  397. "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
  398. "psllw $2,%%xmm6\n\t"
  399. /*xmm10={0,0,0,0,0,0,1,1}*/
  400. "movd %[a],%%xmm10\n\t"
  401. /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
  402. "paddw %%xmm8,%%xmm0\n\t"
  403. "psllw $2,%%xmm7\n\t"
  404. /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
  405. "paddw %%xmm10,%%xmm0\n\t"
  406. /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
  407. "psubw %%xmm9,%%xmm1\n\t"
  408. /*Transform columns.*/
  409. OC_FDCT_8x8
  410. /*Transform rows.*/
  411. OC_TRANSPOSE_8x8
  412. OC_FDCT_8x8
  413. /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
  414. "paddw %%xmm14,%%xmm14\n\t"
  415. "psubw %%xmm14,%%xmm0\n\t"
  416. "psubw %%xmm14,%%xmm1\n\t"
  417. "psraw $2,%%xmm0\n\t"
  418. "psubw %%xmm14,%%xmm2\n\t"
  419. "psraw $2,%%xmm1\n\t"
  420. "psubw %%xmm14,%%xmm3\n\t"
  421. "psraw $2,%%xmm2\n\t"
  422. "psubw %%xmm14,%%xmm4\n\t"
  423. "psraw $2,%%xmm3\n\t"
  424. "psubw %%xmm14,%%xmm5\n\t"
  425. "psraw $2,%%xmm4\n\t"
  426. "psubw %%xmm14,%%xmm6\n\t"
  427. "psraw $2,%%xmm5\n\t"
  428. "psubw %%xmm14,%%xmm7\n\t"
  429. "psraw $2,%%xmm6\n\t"
  430. "psraw $2,%%xmm7\n\t"
  431. /*Transpose, zig-zag, and store the result.*/
  432. /*We could probably do better using SSSE3's palignr, but re-using MMXEXT
  433. version will do for now.*/
  434. #define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
  435. "movdq2q %%xmm"#_row","_reg"\n\t" \
  436. #define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
  437. "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
  438. "movdq2q %%xmm"#_row","_reg"\n\t" \
  439. OC_TRANSPOSE_ZIG_ZAG_MMXEXT
  440. #undef OC_ZZ_LOAD_ROW_LO
  441. #undef OC_ZZ_LOAD_ROW_HI
  442. :[a]"=&r"(a)
  443. :[y]"r"(_y),[x]"r"(_x)
  444. :"memory"
  445. );
  446. }
  447. #endif