sse2encfrag.c

/********************************************************************
 *                                                                    *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.     *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS       *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE   *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.         *
 *                                                                    *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                  *
 * by the Xiph.Org Foundation http://www.xiph.org/                    *
 *                                                                    *
 ********************************************************************

  function:
  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#include "sse2trans.h"

#if defined(OC_X86_ASM)

/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences.
  On output, these are stored in _m0, %%xmm1, %%xmm2, and %%xmm3.
  %%xmm4 and %%xmm5 are clobbered.*/
#define OC_LOAD_SUB_4x8(_m0) \
  "#OC_LOAD_SUB_4x8\n\t" \
  /*Load the first three rows.*/ \
  "movq (%[src]),"_m0"\n\t" \
  "movq (%[ref]),%%xmm4\n\t" \
  "movq (%[src],%[ystride]),%%xmm1\n\t" \
  "movq (%[ref],%[ystride]),%%xmm3\n\t" \
  "movq (%[src],%[ystride],2),%%xmm2\n\t" \
  "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
  /*Unpack and subtract.*/ \
  "punpcklbw %%xmm4,"_m0"\n\t" \
  "punpcklbw %%xmm4,%%xmm4\n\t" \
  "punpcklbw %%xmm3,%%xmm1\n\t" \
  "punpcklbw %%xmm3,%%xmm3\n\t" \
  "psubw %%xmm4,"_m0"\n\t" \
  "psubw %%xmm3,%%xmm1\n\t" \
  /*Load the last row.*/ \
  "movq (%[src],%[ystride3]),%%xmm3\n\t" \
  "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
  /*Unpack, subtract, and advance the pointers.*/ \
  "punpcklbw %%xmm5,%%xmm2\n\t" \
  "punpcklbw %%xmm5,%%xmm5\n\t" \
  "lea (%[src],%[ystride],4),%[src]\n\t" \
  "psubw %%xmm5,%%xmm2\n\t" \
  "punpcklbw %%xmm4,%%xmm3\n\t" \
  "punpcklbw %%xmm4,%%xmm4\n\t" \
  "lea (%[ref],%[ystride],4),%[ref]\n\t" \
  "psubw %%xmm4,%%xmm3\n\t"

/*Square and accumulate four rows of differences in _m0, %%xmm1, %%xmm2, and
   %%xmm3.
  On output, _m0 contains the sum of two of the rows, and the other two are
   added to %%xmm7.*/
#define OC_SSD_4x8(_m0) \
  "pmaddwd "_m0","_m0"\n\t" \
  "pmaddwd %%xmm1,%%xmm1\n\t" \
  "pmaddwd %%xmm2,%%xmm2\n\t" \
  "pmaddwd %%xmm3,%%xmm3\n\t" \
  "paddd %%xmm1,"_m0"\n\t" \
  "paddd %%xmm3,%%xmm2\n\t" \
  "paddd %%xmm2,%%xmm7\n\t"

unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  __asm__ __volatile__(
    OC_LOAD_SUB_4x8("%%xmm7")
    OC_SSD_4x8("%%xmm7")
    OC_LOAD_SUB_4x8("%%xmm0")
    OC_SSD_4x8("%%xmm0")
    "paddd %%xmm0,%%xmm7\n\t"
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
    :[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
  );
  return ret;
}
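
/*Added commentary (not part of the original file): a minimal plain-C sketch
   of the quantity the SIMD routine above computes, assuming the full 8x8
   block implied by the two 4x8 passes; the function name is hypothetical and
   the code is only a readable reference for the sum of squared differences.*/
static unsigned oc_enc_frag_ssd_c_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  int      i;
  int      j;
  ret=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      ret+=d*d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return ret;
}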
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
  0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
};

/*Load a 2x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences, subject to a mask.
  %%xmm6 must contain OC_MASK_CONSTS[0...7].
  %%xmm4 is clobbered.*/
#define OC_LOAD_SUB_MASK_2x8 \
  "#OC_LOAD_SUB_MASK_2x8\n\t" \
  /*Start the loads and expand the next 8 bits of the mask.*/ \
  "shl $8,%[m]\n\t" \
  "movq (%[src]),%%xmm0\n\t" \
  "mov %h[m],%b[m]\n\t" \
  "movq (%[ref]),%%xmm2\n\t" \
  "movd %[m],%%xmm4\n\t" \
  "shr $8,%[m]\n\t" \
  "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
  "mov %h[m],%b[m]\n\t" \
  "pand %%xmm6,%%xmm4\n\t" \
  "pcmpeqb %%xmm6,%%xmm4\n\t" \
  /*Perform the masking.*/ \
  "pand %%xmm4,%%xmm0\n\t" \
  "pand %%xmm4,%%xmm2\n\t" \
  /*Finish the loads while unpacking the first set of rows, and expand the
     next 8 bits of the mask.*/ \
  "movd %[m],%%xmm4\n\t" \
  "movq (%[src],%[ystride]),%%xmm1\n\t" \
  "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
  "movq (%[ref],%[ystride]),%%xmm3\n\t" \
  "pand %%xmm6,%%xmm4\n\t" \
  "punpcklbw %%xmm2,%%xmm0\n\t" \
  "pcmpeqb %%xmm6,%%xmm4\n\t" \
  "punpcklbw %%xmm2,%%xmm2\n\t" \
  /*Mask and unpack the second set of rows.*/ \
  "pand %%xmm4,%%xmm1\n\t" \
  "pand %%xmm4,%%xmm3\n\t" \
  "punpcklbw %%xmm3,%%xmm1\n\t" \
  "punpcklbw %%xmm3,%%xmm3\n\t" \
  "psubw %%xmm2,%%xmm0\n\t" \
  "psubw %%xmm3,%%xmm1\n\t"

unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  ptrdiff_t ystride;
  unsigned  ret;
  int       i;
  ystride=_ystride;
  __asm__ __volatile__(
    "pxor %%xmm7,%%xmm7\n\t"
    "movq %[c],%%xmm6\n\t"
    :
    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
  );
  for(i=0;i<4;i++){
    unsigned m;
    m=_mask&0xFFFF;
    _mask>>=16;
    if(m){
      __asm__ __volatile__(
        OC_LOAD_SUB_MASK_2x8
        "pmaddwd %%xmm0,%%xmm0\n\t"
        "pmaddwd %%xmm1,%%xmm1\n\t"
        "paddd %%xmm0,%%xmm7\n\t"
        "paddd %%xmm1,%%xmm7\n\t"
        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
      );
    }
    _src+=2*ystride;
    _ref+=2*ystride;
  }
  __asm__ __volatile__(
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
  );
  return ret;
}
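
/*Added commentary (not part of the original file): a minimal plain-C sketch
   of the masked SSD computed above, assuming bit 0 of _mask selects the first
   pixel of the first row and each row consumes 8 consecutive bits, as the
   16-bits-per-two-rows loop above suggests; the function name is
   hypothetical and the code is only a readable reference.*/
static unsigned oc_enc_frag_border_ssd_c_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  unsigned ret;
  int      i;
  int      j;
  ret=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      if(_mask&1){
        int d;
        d=_src[j]-_ref[j];
        ret+=d*d;
      }
      _mask>>=1;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return ret;
}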
/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences in %%xmm0...%%xmm7.*/
#define OC_LOAD_SUB_8x8 \
  "#OC_LOAD_SUB_8x8\n\t" \
  "movq (%[src]),%%xmm0\n\t" \
  "movq (%[ref]),%%xmm4\n\t" \
  "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "movq (%[src]),%%xmm2\n\t" \
  "movq (%[ref]),%%xmm7\n\t" \
  "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
  "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
  "punpcklbw %%xmm4,%%xmm0\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "punpcklbw %%xmm4,%%xmm4\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "psubw %%xmm4,%%xmm0\n\t" \
  "movq (%[src]),%%xmm4\n\t" \
  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  "movq (%[ref]),%%xmm0\n\t" \
  "punpcklbw %%xmm5,%%xmm1\n\t" \
  "punpcklbw %%xmm5,%%xmm5\n\t" \
  "psubw %%xmm5,%%xmm1\n\t" \
  "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
  "punpcklbw %%xmm7,%%xmm2\n\t" \
  "punpcklbw %%xmm7,%%xmm7\n\t" \
  "psubw %%xmm7,%%xmm2\n\t" \
  "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
  "punpcklbw %%xmm6,%%xmm3\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "punpcklbw %%xmm6,%%xmm6\n\t" \
  "psubw %%xmm6,%%xmm3\n\t" \
  "movq (%[src]),%%xmm6\n\t" \
  "punpcklbw %%xmm0,%%xmm4\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "punpcklbw %%xmm0,%%xmm0\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "psubw %%xmm0,%%xmm4\n\t" \
  "movq (%[ref]),%%xmm0\n\t" \
  "punpcklbw %%xmm7,%%xmm5\n\t" \
  "neg %[src_ystride]\n\t" \
  "punpcklbw %%xmm7,%%xmm7\n\t" \
  "psubw %%xmm7,%%xmm5\n\t" \
  "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
  "punpcklbw %%xmm0,%%xmm6\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "punpcklbw %%xmm0,%%xmm0\n\t" \
  "neg %[ref_ystride]\n\t" \
  "psubw %%xmm0,%%xmm6\n\t" \
  "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
  "punpcklbw %%xmm0,%%xmm7\n\t" \
  "punpcklbw %%xmm0,%%xmm0\n\t" \
  "psubw %%xmm0,%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t"

/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
#define OC_LOAD_8x8 \
  "#OC_LOAD_8x8\n\t" \
  "movq (%[src]),%%xmm0\n\t" \
  "movq (%[src],%[ystride]),%%xmm1\n\t" \
  "movq (%[src],%[ystride],2),%%xmm2\n\t" \
  "pxor %%xmm7,%%xmm7\n\t" \
  "movq (%[src],%[ystride3]),%%xmm3\n\t" \
  "punpcklbw %%xmm7,%%xmm0\n\t" \
  "movq (%[src4]),%%xmm4\n\t" \
  "punpcklbw %%xmm7,%%xmm1\n\t" \
  "movq (%[src4],%[ystride]),%%xmm5\n\t" \
  "punpcklbw %%xmm7,%%xmm2\n\t" \
  "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
  "punpcklbw %%xmm7,%%xmm3\n\t" \
  "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
  "punpcklbw %%xmm4,%%xmm4\n\t" \
  "punpcklbw %%xmm5,%%xmm5\n\t" \
  "psrlw $8,%%xmm4\n\t" \
  "psrlw $8,%%xmm5\n\t" \
  "punpcklbw %%xmm6,%%xmm6\n\t" \
  "punpcklbw %%xmm7,%%xmm7\n\t" \
  "psrlw $8,%%xmm6\n\t" \
  "psrlw $8,%%xmm7\n\t"

/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x8 \
  "#OC_HADAMARD_AB_8x8\n\t" \
  /*Stage A:*/ \
  "paddw %%xmm5,%%xmm1\n\t" \
  "paddw %%xmm6,%%xmm2\n\t" \
  "paddw %%xmm5,%%xmm5\n\t" \
  "paddw %%xmm6,%%xmm6\n\t" \
  "psubw %%xmm1,%%xmm5\n\t" \
  "psubw %%xmm2,%%xmm6\n\t" \
  "paddw %%xmm7,%%xmm3\n\t" \
  "paddw %%xmm4,%%xmm0\n\t" \
  "paddw %%xmm7,%%xmm7\n\t" \
  "paddw %%xmm4,%%xmm4\n\t" \
  "psubw %%xmm3,%%xmm7\n\t" \
  "psubw %%xmm0,%%xmm4\n\t" \
  /*Stage B:*/ \
  "paddw %%xmm2,%%xmm0\n\t" \
  "paddw %%xmm3,%%xmm1\n\t" \
  "paddw %%xmm6,%%xmm4\n\t" \
  "paddw %%xmm7,%%xmm5\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm3\n\t" \
  "paddw %%xmm6,%%xmm6\n\t" \
  "paddw %%xmm7,%%xmm7\n\t" \
  "psubw %%xmm0,%%xmm2\n\t" \
  "psubw %%xmm1,%%xmm3\n\t" \
  "psubw %%xmm4,%%xmm6\n\t" \
  "psubw %%xmm5,%%xmm7\n\t"

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x8 \
  "#OC_HADAMARD_C_8x8\n\t" \
  /*Stage C:*/ \
  "paddw %%xmm1,%%xmm0\n\t" \
  "paddw %%xmm3,%%xmm2\n\t" \
  "paddw %%xmm5,%%xmm4\n\t" \
  "paddw %%xmm7,%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm1\n\t" \
  "paddw %%xmm3,%%xmm3\n\t" \
  "paddw %%xmm5,%%xmm5\n\t" \
  "paddw %%xmm7,%%xmm7\n\t" \
  "psubw %%xmm0,%%xmm1\n\t" \
  "psubw %%xmm2,%%xmm3\n\t" \
  "psubw %%xmm4,%%xmm5\n\t" \
  "psubw %%xmm6,%%xmm7\n\t"

/*Performs an 8-point 1-D Hadamard transform in place.
  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x8 \
  OC_HADAMARD_AB_8x8 \
  OC_HADAMARD_C_8x8

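/*Added commentary (not part of the original file): a plain-C sketch of the
   8-point transform the macros above implement, one stride-4, one stride-2,
   and one stride-1 butterfly pass, written with the conventional signs (the
   SIMD version negates some outputs so it can work in place); the function
   name is hypothetical, and the inputs are assumed small enough not to
   overflow 16 bits, as the bit-count comments further below explain.*/
static void oc_hadamard8_1d_c_sketch(ogg_int16_t _x[8]){
  int step;
  int i;
  for(step=4;step>0;step>>=1){
    for(i=0;i<8;i++)if(!(i&step)){
      ogg_int16_t a;
      ogg_int16_t b;
      a=_x[i];
      b=_x[i+step];
      _x[i]=(ogg_int16_t)(a+b);
      _x[i+step]=(ogg_int16_t)(a-b);
    }
  }
}
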
/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, %%xmm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
  /*We use the fact that \
      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
     to merge the final butterfly with the abs and the first stage of \
     accumulation. \
    Thus we can avoid using pabsw, which is not available until SSSE3. \
    Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
     registers). \
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers). \
    A worked numeric example of the identity is given after \
     OC_HADAMARD_ABS_ACCUM_8x8 below.*/ \
  "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*xmm7={0x7FFF}x8 \
    xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
  "pcmpeqb %%xmm7,%%xmm7\n\t" \
  "movdqa %%xmm4,%%xmm6\n\t" \
  "psrlw $1,%%xmm7\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "pmaxsw %%xmm5,%%xmm4\n\t" \
  "paddsw %%xmm7,%%xmm6\n\t" \
  "psubw %%xmm6,%%xmm4\n\t" \
  /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
    xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
  "movdqa %%xmm2,%%xmm6\n\t" \
  "movdqa %%xmm0,%%xmm5\n\t" \
  "pmaxsw %%xmm3,%%xmm2\n\t" \
  "pmaxsw %%xmm1,%%xmm0\n\t" \
  "paddw %%xmm3,%%xmm6\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
  "paddw %%xmm5,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t"

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
  "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
  "paddsw %%xmm7,%%xmm6\n\t" \
  "paddsw %%xmm7,%%xmm1\n\t" \
  "psubw %%xmm6,%%xmm2\n\t" \
  "psubw %%xmm1,%%xmm0\n\t" \
  /*xmm7={1}x8 (needed for the horizontal add that follows) \
    xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
  "movdqa %%xmm3,%%xmm6\n\t" \
  "pmaxsw %%xmm5,%%xmm3\n\t" \
  "paddw %%xmm2,%%xmm0\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm0\n\t" \
  "paddsw %%xmm7,%%xmm6\n\t" \
  "paddw %%xmm3,%%xmm0\n\t" \
  "psrlw $14,%%xmm7\n\t" \
  "psubw %%xmm6,%%xmm0\n\t"

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into xmm0.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
  OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
  OC_HADAMARD_C_ABS_ACCUM_B_8x8

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into xmm0.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
  OC_HADAMARD_AB_8x8 \
  OC_HADAMARD_C_ABS_ACCUM_8x8
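
/*Added commentary (not part of the original file): a worked instance of the
   identity used by OC_HADAMARD_C_ABS_ACCUM_A_8x8 above,
     (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)).
  For example, a=3 and b=-5 give (abs(-2)+abs(8))/2==(2+8)/2==5==max(3,5).
  This is why replacing the final butterfly outputs a+b and a-b with a single
   max drops a factor of two from the accumulated absolute values, as noted in
   the description of OC_HADAMARD_ABS_ACCUM_8x8 above.*/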
static unsigned oc_int_frag_satd_sse2(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  unsigned ret2;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_SUB_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
       latency of pmaddwd by starting to compute abs(dc) here.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    "movsx %w[dc],%[dc]\n\t"
    "cdq\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.*/
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
    "xor %[dc],%[ret2]\n\t"
    "sub %[ret2],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
       them to the same register (which is bad); _src and _ref face a similar
       problem.
      All four are destructively modified, but if we list them as output
       constraints, gcc can't alias them with other outputs.*/
    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
    /*We have to use neg, so we actually clobber the condition codes for once
       (not to mention sub, and add).*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}
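
/*Added commentary (not part of the original file): a plain-C sketch of the
   value oc_int_frag_satd_sse2() appears to return, as read from the scaling
   and correction code above: the sum of absolute values of the unnormalized
   8x8 2-D Hadamard coefficients of the difference block, with the DC
   magnitude excluded and the (signed) DC coefficient stored through *_dc.
  The function name and the exact interpretation of the scaling are
   assumptions made for illustration only.*/
static unsigned oc_int_frag_satd_c_sketch(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  int      t[8][8];
  unsigned satd;
  int      i;
  int      j;
  int      step;
  /*Compute the difference block.*/
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)t[i][j]=_src[j]-_ref[j];
    _src+=_src_ystride;
    _ref+=_ref_ystride;
  }
  /*Unnormalized 8-point Hadamard along each row, then each column.*/
  for(i=0;i<8;i++)for(step=4;step>0;step>>=1)for(j=0;j<8;j++)if(!(j&step)){
    int a;
    int b;
    a=t[i][j];
    b=t[i][j+step];
    t[i][j]=a+b;
    t[i][j+step]=a-b;
  }
  for(j=0;j<8;j++)for(step=4;step>0;step>>=1)for(i=0;i<8;i++)if(!(i&step)){
    int a;
    int b;
    a=t[i][j];
    b=t[i+step][j];
    t[i][j]=a+b;
    t[i+step][j]=a-b;
  }
  /*Sum the absolute values, then remove the DC magnitude.*/
  satd=0;
  for(i=0;i<8;i++)for(j=0;j<8;j++)satd+=t[i][j]<0?-t[i][j]:t[i][j];
  satd-=t[0][0]<0?-t[0][0]:t[0][0];
  *_dc=t[0][0];
  return satd;
}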
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
}

unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
}

unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
 const unsigned char *_src,int _ystride){
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    "movzx %w[dc],%[dc]\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    "lea -64(%[ret],%[ret]),%[ret]\n\t"
    "sub %[dc],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    :[ret]"=a"(ret),[dc]"=r"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
    /*We have to use sub, so we actually clobber the condition codes for once.*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}
#endif