; jidctflt-sse2-64.asm (20 KB)
  1. ;
  2. ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2009, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler); it
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; This file contains a floating-point implementation of the inverse DCT
  18. ; (Discrete Cosine Transform). The following code is based directly on
  19. ; the IJG's original jidctflt.c; see jidctflt.c for more details.
  20. ;
  21. ; [TAB8]
  22. %include "jsimdext.inc"
  23. %include "jdct.inc"
  24. ; --------------------------------------------------------------------------
  ; unpcklps2 dst, src
  ;   Combine the low 64 bits (two packed singles) of both operands:
  ;   dst = (dst[0], dst[1], src[0], src[1]).
  ;   Despite the name this is NOT the element-wise interleave done by
  ;   unpcklps; it moves float *pairs*, which is what the second phase of
  ;   the 4x4 transpose needs.
  ;   imm8 0x44 = 01_00_01_00b: result lanes 0,1 select dst lanes 0,1 and
  ;   result lanes 2,3 select src lanes 0,1.
  25. %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
  26. shufps %1,%2,0x44
  27. %endmacro
  ; unpckhps2 dst, src
  ;   Combine the high 64 bits (two packed singles) of both operands:
  ;   dst = (dst[2], dst[3], src[2], src[3]).
  ;   Like its low-half counterpart it moves float *pairs* rather than
  ;   interleaving single elements the way unpckhps does.
  ;   imm8 0xEE = 11_10_11_10b: result lanes 0,1 select dst lanes 2,3 and
  ;   result lanes 2,3 select src lanes 2,3.
  28. %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
  29. shufps %1,%2,0xEE
  30. %endmacro
  31. ; --------------------------------------------------------------------------
  ; Read-only constants used by the float IDCT routine in this file.
  ; Every PD_* entry is one value replicated into 4 packed singles
  ; ("times 4 dd"), so it can be used directly as a 128-bit memory operand
  ; of mulps/addps/movaps.  With c_k = cos(k*pi/16):
  ;   PD_1_414  =  2*c4        (= sqrt(2))
  ;   PD_1_847  =  2*c2
  ;   PD_1_082  =  2*(c2-c6)
  ;   PD_M2_613 = -2*(c2+c6)
  ; NOTE(review): the exported label marks the start of the table; callers
  ; outside this file presumably rely on its address/alignment, so the
  ; order and size of the entries should not be changed -- confirm against
  ; the C side before editing.
  32. SECTION SEG_CONST
  33. alignz 16
  34. global EXTN(jconst_idct_float_sse2)
  35. EXTN(jconst_idct_float_sse2):
  36. PD_1_414 times 4 dd 1.414213562373095048801689
  37. PD_1_847 times 4 dd 1.847759065022573512256366
  38. PD_1_082 times 4 dd 1.082392200292393968799446
  39. PD_M2_613 times 4 dd -2.613125929752753055713286
  ; 100663296.0 = 1.5 * 2^26.  Single-precision values in [2^26, 2^27) have
  ; a spacing of 8, so adding this constant to x rounds x to a multiple of 8
  ; and leaves round(x/8) in the low bits of the float's bit pattern; pass 2
  ; then extracts the low 16 bits with integer mask/shift instructions.
  40. PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
  ; 16 copies of CENTERJSAMPLE (defined outside this file), added with paddb
  ; to re-centre the signed, saturated bytes into the output sample range.
  41. PB_CENTERJSAMP times 16 db CENTERJSAMPLE
  42. alignz 16
  43. ; --------------------------------------------------------------------------
  44. SECTION SEG_TEXT
  45. BITS 64
  46. ;
  47. ; Perform dequantization and inverse DCT on one block of coefficients.
  48. ;
  49. ; GLOBAL(void)
  50. ; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
  51. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  52. ;
  53. ; r10 = void *dct_table
  54. ; r11 = JCOEFPTR coef_block
  55. ; r12 = JSAMPARRAY output_buf
  56. ; r13 = JDIMENSION output_col
  ;
  ; The r10-r13 assignments above are established by the collect_args macro
  ; (from jsimdext.inc, not shown here); uncollect_args undoes it.
  ;
  ; Overview:
  ;   Pass 1 walks the coefficient block 4 columns at a time, converts the
  ;          16-bit coefficients to float, multiplies them by dct_table,
  ;          runs the 1-D IDCT and stores the result transposed into the
  ;          on-stack workspace.
  ;   Pass 2 walks the workspace 4 rows at a time, runs the 1-D IDCT again,
  ;          rounds (with the /8 descale folded into PD_RNDINT_MAGIC),
  ;          saturates to bytes, adds PB_CENTERJSAMP and writes 8 samples
  ;          to each of 4 output rows at column output_col.
  ;
  ; Register roles:
  ;   Pass 1: rdx = quantptr, rsi = inptr, rdi = wsptr, rcx = loop counter,
  ;           eax = scratch for the zero-column test
  ;   Pass 2: rsi = wsptr, rdi = pointer into output_buf[], rax = output_col,
  ;           rcx = loop counter, rdx/rbx = current output row pointers
  ;   xmm0-xmm7 are used as scratch throughout.
  ;   rbx is saved and restored by this routine.
  ;
  ; Stack frame (rbp = 16-byte aligned frame base set up in the prologue):
  ;   [rbp+0]   value of rsp right after "push rbp" (used to unwind)
  ;   wk(0..1)  WK_NUM xmmword spill slots directly below rbp
  ;   workspace DCTSIZE2 FAST_FLOATs directly below wk(0)
  57. %define original_rbp rbp+0
  58. %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
  59. %define WK_NUM 2
  60. %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
  61. ; FAST_FLOAT workspace[DCTSIZE2]
  62. align 16
  63. global EXTN(jsimd_idct_float_sse2)
  64. EXTN(jsimd_idct_float_sse2):
  65. push rbp
  66. mov rax,rsp ; rax = rsp after the push (address of the saved rbp)
  ; NOTE(review): only 4 bytes are stepped over although the store below is
  ; 8 bytes wide.  This is safe as long as rsp is 8-byte aligned here (true
  ; under the x86-64 ABIs): the AND then always lands at least 8 bytes below
  ; the saved rbp.
  67. sub rsp, byte 4
  68. and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
  69. mov [rsp],rax ; [original_rbp] = pre-alignment rsp, reloaded by "pop rsp"
  70. mov rbp,rsp ; rbp = aligned rbp
  71. lea rsp, [workspace] ; reserve wk[] and workspace[] below rbp
  72. collect_args
  73. push rbx ; callee-saved; used as a row pointer in pass 2
  74. ; ---- Pass 1: process columns from input, store into work array.
  75. mov rdx, r10 ; quantptr
  76. mov rsi, r11 ; inptr
  77. lea rdi, [workspace] ; FAST_FLOAT *wsptr
  78. mov rcx, DCTSIZE/4 ; ctr
  79. .columnloop:
  80. %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
  ; Shortcut: if rows 1..7 of these 4 columns are all zero, the IDCT output
  ; of each column is just its dequantized DC term repeated.
  ; Cheap scalar pre-check on the first dword of rows 1 and 2 first.
  81. mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
  82. or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  83. jnz near .columnDCT
  ; Full check: OR together the 4 coefficients of each of rows 1..7.
  84. movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
  85. movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  86. movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
  87. movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
  88. movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
  89. movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
  90. movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
  91. por xmm1,xmm2
  92. por xmm3,xmm4
  93. por xmm5,xmm6
  94. por xmm1,xmm3
  95. por xmm5,xmm7
  96. por xmm1,xmm5
  ; Saturating pack keeps every non-zero word non-zero, so the 4 words fit
  ; into the 32 bits that movd extracts (movd also zero-extends into rax).
  97. packsswb xmm1,xmm1
  98. movd eax,xmm1
  99. test rax,rax
  100. jnz short .columnDCT
  101. ; -- AC terms all zero
  102. movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
  ; Sign-extend 4 words to dwords: duplicate each word, then arithmetic
  ; shift right by 16.
  103. punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
  104. psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
  105. cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
  106. mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  107. movaps xmm1,xmm0
  108. movaps xmm2,xmm0
  109. movaps xmm3,xmm0
  ; Broadcast each column's DC value across a whole xmm register.
  110. shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
  111. shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
  112. shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
  113. shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
  ; Each broadcast is stored to both xmmwords of its (transposed)
  ; workspace row.
  114. movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
  115. movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
  116. movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
  117. movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
  118. movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
  119. movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
  120. movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
  121. movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
  122. jmp near .nextcolumn
  123. %endif
  124. .columnDCT:
  125. ; -- Even part
  ; Load rows 0,2,4,6 (4 coefficients each), sign-extend to dwords,
  ; convert to float and dequantize with the matching dct_table rows.
  126. movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
  127. movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  128. movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
  129. movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
  130. punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
  131. punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
  132. psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
  133. psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
  134. cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
  135. cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
  136. punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
  137. punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
  138. psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
  139. psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
  140. cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
  141. cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
  142. mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  143. mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  144. mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  145. mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  146. movaps xmm4,xmm0
  147. movaps xmm5,xmm1
  148. subps xmm0,xmm2 ; xmm0=tmp11
  149. subps xmm1,xmm3
  150. addps xmm4,xmm2 ; xmm4=tmp10
  151. addps xmm5,xmm3 ; xmm5=tmp13
  152. mulps xmm1,[rel PD_1_414]
  153. subps xmm1,xmm5 ; xmm1=tmp12
  154. movaps xmm6,xmm4
  155. movaps xmm7,xmm0
  156. subps xmm4,xmm5 ; xmm4=tmp3
  157. subps xmm0,xmm1 ; xmm0=tmp2
  158. addps xmm6,xmm5 ; xmm6=tmp0
  159. addps xmm7,xmm1 ; xmm7=tmp1
  ; Spill tmp2/tmp3: all eight xmm registers are needed for the odd part.
  160. movaps XMMWORD [wk(1)], xmm4 ; tmp3
  161. movaps XMMWORD [wk(0)], xmm0 ; tmp2
  162. ; -- Odd part
  163. movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
  164. movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
  165. movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
  166. movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
  167. punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
  168. punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
  169. psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
  170. psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
  171. cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
  172. cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
  173. punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
  174. punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
  175. psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
  176. psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
  177. cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
  178. cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
  179. mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  180. mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  181. mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  182. mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
  183. movaps xmm4,xmm2
  184. movaps xmm0,xmm5
  185. addps xmm2,xmm1 ; xmm2=z11
  186. addps xmm5,xmm3 ; xmm5=z13
  187. subps xmm4,xmm1 ; xmm4=z12
  188. subps xmm0,xmm3 ; xmm0=z10
  189. movaps xmm1,xmm2
  190. subps xmm2,xmm5
  191. addps xmm1,xmm5 ; xmm1=tmp7
  192. mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
  193. movaps xmm3,xmm0
  194. addps xmm0,xmm4
  195. mulps xmm0,[rel PD_1_847] ; xmm0=z5
  196. mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
  197. mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
  198. addps xmm3,xmm0 ; xmm3=tmp12
  199. subps xmm4,xmm0 ; xmm4=tmp10
  200. ; -- Final output stage
  201. subps xmm3,xmm1 ; xmm3=tmp6
  202. movaps xmm5,xmm6
  203. movaps xmm0,xmm7
  204. addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
  205. addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
  206. subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
  207. subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
  208. subps xmm2,xmm3 ; xmm2=tmp5
  209. movaps xmm1,xmm6 ; transpose coefficients(phase 1)
  210. unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
  211. unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
  212. movaps xmm3,xmm0 ; transpose coefficients(phase 1)
  213. unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
  214. unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
  ; Swap spills: reload tmp2/tmp3 and park the half-transposed rows 6/7
  ; in the same two slots.
  215. movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
  216. movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
  217. movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
  218. movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
  219. addps xmm4,xmm2 ; xmm4=tmp4
  220. movaps xmm0,xmm7
  221. movaps xmm3,xmm5
  222. addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
  223. addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
  224. subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
  225. subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
  226. movaps xmm2,xmm7 ; transpose coefficients(phase 1)
  227. unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
  228. unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
  229. movaps xmm4,xmm5 ; transpose coefficients(phase 1)
  230. unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
  231. unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
  232. movaps xmm3,xmm6 ; transpose coefficients(phase 2)
  233. unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
  234. unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
  235. movaps xmm0,xmm1 ; transpose coefficients(phase 2)
  236. unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
  237. unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
  238. movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
  239. movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
  240. movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
  241. movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
  242. movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
  243. movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
  244. movaps xmm6,xmm5 ; transpose coefficients(phase 2)
  245. unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
  246. unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
  247. movaps xmm3,xmm4 ; transpose coefficients(phase 2)
  248. unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
  249. unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
  250. movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
  251. movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
  252. movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
  253. movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
  254. .nextcolumn:
  ; Advance to the next group of 4 columns: 4 coefficients and 4 multipliers
  ; forward in the inputs, 4 (transposed) rows forward in the workspace.
  255. add rsi, byte 4*SIZEOF_JCOEF ; coef_block
  256. add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
  257. add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
  258. dec rcx ; ctr
  259. jnz near .columnloop
  260. ; -- Prefetch the next coefficient block
  ; rsi has advanced by DCTSIZE coefficients in total (DCTSIZE/4 iterations
  ; of 4), so the offsets below start DCTSIZE2 coefficients past coef_block
  ; when DCTSIZE is 8 (presumably the case; DCTSIZE is defined elsewhere).
  261. prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
  262. prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
  263. prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
  264. prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
  265. ; ---- Pass 2: process rows from work array, store into output array.
  267. lea rsi, [workspace] ; FAST_FLOAT *wsptr
  268. mov rdi, r12 ; (JSAMPROW *)
  269. mov eax, r13d ; rax = output_col (32-bit write zero-extends)
  270. mov rcx, DCTSIZE/4 ; ctr
  271. .rowloop:
  272. ; -- Even part
  273. movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
  274. movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
  275. movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
  276. movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
  277. movaps xmm4,xmm0
  278. movaps xmm5,xmm1
  279. subps xmm0,xmm2 ; xmm0=tmp11
  280. subps xmm1,xmm3
  281. addps xmm4,xmm2 ; xmm4=tmp10
  282. addps xmm5,xmm3 ; xmm5=tmp13
  283. mulps xmm1,[rel PD_1_414]
  284. subps xmm1,xmm5 ; xmm1=tmp12
  285. movaps xmm6,xmm4
  286. movaps xmm7,xmm0
  287. subps xmm4,xmm5 ; xmm4=tmp3
  288. subps xmm0,xmm1 ; xmm0=tmp2
  289. addps xmm6,xmm5 ; xmm6=tmp0
  290. addps xmm7,xmm1 ; xmm7=tmp1
  291. movaps XMMWORD [wk(1)], xmm4 ; tmp3
  292. movaps XMMWORD [wk(0)], xmm0 ; tmp2
  293. ; -- Odd part
  294. movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
  295. movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
  296. movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
  297. movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
  298. movaps xmm4,xmm2
  299. movaps xmm0,xmm5
  300. addps xmm2,xmm1 ; xmm2=z11
  301. addps xmm5,xmm3 ; xmm5=z13
  302. subps xmm4,xmm1 ; xmm4=z12
  303. subps xmm0,xmm3 ; xmm0=z10
  304. movaps xmm1,xmm2
  305. subps xmm2,xmm5
  306. addps xmm1,xmm5 ; xmm1=tmp7
  307. mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
  308. movaps xmm3,xmm0
  309. addps xmm0,xmm4
  310. mulps xmm0,[rel PD_1_847] ; xmm0=z5
  311. mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
  312. mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
  313. addps xmm3,xmm0 ; xmm3=tmp12
  314. subps xmm4,xmm0 ; xmm4=tmp10
  315. ; -- Final output stage
  316. subps xmm3,xmm1 ; xmm3=tmp6
  317. movaps xmm5,xmm6
  318. movaps xmm0,xmm7
  319. addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
  320. addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
  321. subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
  322. subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
  323. subps xmm2,xmm3 ; xmm2=tmp5
  ; Round-to-int trick: after adding PD_RNDINT_MAGIC the low 16 bits of each
  ; float's bit pattern hold round(value/8).  Even-numbered outputs keep
  ; that word in place (pand), odd-numbered ones are shifted into the high
  ; word (pslld), and the pair is merged with por.
  324. movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
  325. pcmpeqd xmm3,xmm3
  326. psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
  327. addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
  328. addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
  329. addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
  330. addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
  331. pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
  332. pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
  333. pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
  334. pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
  335. por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
  336. por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
  337. movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
  338. movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
  339. addps xmm4,xmm2 ; xmm4=tmp4
  340. movaps xmm7,xmm1
  341. movaps xmm5,xmm3
  342. addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
  343. addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
  344. subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
  345. subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
  346. movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
  347. pcmpeqd xmm4,xmm4
  348. psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
  349. addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
  350. addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
  351. addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
  352. addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
  353. pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
  354. pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
  355. pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
  356. pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
  357. por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
  358. por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
  ; Saturate the signed words to signed bytes, then add CENTERJSAMPLE to
  ; every byte to shift them into the output sample range.
  359. movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
  360. packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
  361. packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
  362. paddb xmm6,xmm2
  363. paddb xmm1,xmm2
  364. movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
  365. punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
  366. punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
  367. movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
  368. punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
  369. punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
  ; pshufd 0x4E swaps the two 64-bit halves so that rows 1 and 3 end up in
  ; the low quadword, where movq can store them.
  370. pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
  371. pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
  ; Store 8 samples to each of the 4 output rows at column output_col.
  372. mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
  373. mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
  374. movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
  375. movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
  376. mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
  377. mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
  378. movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
  379. movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
  380. add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
  381. add rdi, byte 4*SIZEOF_JSAMPROW
  382. dec rcx ; ctr
  383. jnz near .rowloop
  384. pop rbx
  385. uncollect_args
  386. mov rsp,rbp ; rsp <- aligned rbp
  387. pop rsp ; rsp <- value saved at [original_rbp] (rsp after "push rbp")
  388. pop rbp
  389. ret
  390. ; For some reason, the OS X linker does not honor the request to align the
  391. ; segment unless we do this.
  392. align 16