jidctfst-mmx.asm 20 KB


  1. ;
  2. ; jidctfst.asm - fast integer IDCT (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on the x86 SIMD extension for IJG JPEG library
  7. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  8. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  9. ;
  10. ; This file should be assembled with NASM (Netwide Assembler),
  11. ; and can *not* be assembled with Microsoft's MASM or any compatible
  12. ; assembler (including Borland's Turbo Assembler).
  13. ; NASM is available from http://nasm.sourceforge.net/ or
  14. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  15. ;
  16. ; This file contains a fast, not so accurate integer implementation of
  17. ; the inverse DCT (Discrete Cosine Transform). The following code is
  18. ; based directly on the IJG's original jidctfst.c; see the jidctfst.c
  19. ; for more details.
  20. ;
  21. ; [TAB8]
  22. %include "jsimdext.inc"
  23. %include "jdct.inc"
  24. ; --------------------------------------------------------------------------
  ; Fixed-point parameters and multiplier constants for the fast IDCT.
  ;
  ; CONST_BITS is the number of fractional bits in the F_x_yyy multipliers;
  ; PASS1_BITS must match IFAST_SCALE_BITS (enforced by the %error below).
  ; Each F_x_yyy is the constant x.yyy scaled by 2^CONST_BITS -- the FIX()
  ; value named in its trailing comment.
  25. %define CONST_BITS 8 ; 14 is also OK.
  26. %define PASS1_BITS 2
  27. %if IFAST_SCALE_BITS != PASS1_BITS
  28. %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
  29. %endif
  ; With CONST_BITS == 8 the scaled values are written out literally.
  30. %if CONST_BITS == 8
  31. F_1_082 equ 277 ; FIX(1.082392200)
  32. F_1_414 equ 362 ; FIX(1.414213562)
  33. F_1_847 equ 473 ; FIX(1.847759065)
  34. F_2_613 equ 669 ; FIX(2.613125930)
  ; 256 == 1 << CONST_BITS here, i.e. FIX(1).
  35. F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
  36. %else
  ; For any other CONST_BITS the values are derived from integer literals
  ; that hold the same constants scaled by 2^30.  DESCALE(x,n) adds half of
  ; 2^n and shifts right by n, i.e. it drops n fractional bits with rounding.
  37. ; NASM cannot do compile-time arithmetic on floating-point constants.
  38. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
  39. F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
  40. F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
  41. F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
  42. F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
  43. F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
  44. %endif
  45. ; --------------------------------------------------------------------------
  ; Read-only constant table used by jsimd_idct_ifast_mmx.
  ;
  ; The PW_* entries are pmulhw multipliers.  The IDCT code shifts its
  ; operand left by PRE_MULTIPLY_SCALE_BITS before each pmulhw, and pmulhw
  ; keeps only the high 16 bits of the 32-bit product, so with
  ;   CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16
  ; the result is (operand * F_x_yyy) >> CONST_BITS.
  46. SECTION SEG_CONST
  47. ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
  48. ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
  49. %define PRE_MULTIPLY_SCALE_BITS 2
  50. %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
  51. alignz 16
  52. global EXTN(jconst_idct_ifast_mmx)
  53. EXTN(jconst_idct_ifast_mmx):
  ; Each multiplier is replicated into four 16-bit words (one MMX qword).
  54. PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
  55. PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
  ; 2.613 is stored as -(2.613 - 1): with CONST_BITS == 8, F_2_613 << CONST_SHIFT
  ; would be 669 << 6 = 42816, which does not fit in a signed 16-bit word.
  56. PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
  57. PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
  ; Eight bytes of CENTERJSAMPLE, added to the packed output samples with paddb.
  58. PB_CENTERJSAMP times 8 db CENTERJSAMPLE
  59. alignz 16
  60. ; --------------------------------------------------------------------------
  61. SECTION SEG_TEXT
  62. BITS 32
  63. ;
  64. ; Perform dequantization and inverse DCT on one block of coefficients.
  65. ;
  66. ; GLOBAL(void)
  67. ; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
  68. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  69. ;
  ; 32-bit code (BITS 32) with stack-passed arguments: the four arguments are
  ; read at offsets +8..+20 from the frame address captured right after
  ; "push ebp".
  ;
  ; Two passes, each looping DCTSIZE/4 times over groups of four 16-bit lanes:
  ;   Pass 1: dequantize (pmullw with dct_table) and transform the columns of
  ;           coef_block, storing the transposed result in a stack workspace.
  ;   Pass 2: transform the workspace, descale by PASS1_BITS+3, pack to bytes,
  ;           add CENTERJSAMPLE and store 8 samples to each of 4 output rows.
  ;
  ; Registers: ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and
  ; mm0-mm7 are overwritten.  emms is executed before returning.
  ;
  ; Argument accessors; (b) is the frame address described above.
  70. %define dct_table(b) (b)+8 ; void *dct_table
  71. %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
  72. %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
  73. %define output_col(b) (b)+20 ; JDIMENSION output_col
  ; Local frame, addressed from the SIZEOF_MMWORD-aligned ebp set up in the
  ; prologue:
  ;   [ebp+0]    frame address saved by the prologue (original_ebp)
  ;   wk(0..1)   WK_NUM MMWORD spill slots immediately below ebp
  ;   workspace  DCTSIZE2 JCOEFs immediately below wk(0)
  ; (%define bodies are expanded at the point of use, so wk(i) may refer to
  ; WK_NUM before WK_NUM is defined.)
  74. %define original_ebp ebp+0
  75. %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
  76. %define WK_NUM 2
  77. %define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
  78. ; JCOEF workspace[DCTSIZE2]
  79. align 16
  80. global EXTN(jsimd_idct_ifast_mmx)
  81. EXTN(jsimd_idct_ifast_mmx):
  ; ---- Prologue: build an aligned frame and remember the unaligned one.
  82. push ebp
  ; eax = esp right after the push, i.e. the address of the saved-ebp slot;
  ; the arguments are at eax+8 and up.
  83. mov eax,esp ; eax = original ebp
  ; Reserve one dword for that address, then round esp down to an MMWORD
  ; boundary so that wk() and workspace are aligned.
  84. sub esp, byte 4
  85. and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
  86. mov [esp],eax
  87. mov ebp,esp ; ebp = aligned ebp
  ; Allocate wk[] and workspace[] in one step.
  88. lea esp, [workspace]
  89. push ebx
  90. ; push ecx ; need not be preserved
  91. ; push edx ; need not be preserved
  92. push esi
  93. push edi
  94. get_GOT ebx ; get GOT address
  95. ; ---- Pass 1: process columns from input, store into work array.
  ; eax still holds the frame address from the prologue, so the reload below
  ; is left commented out.
  96. ; mov eax, [original_ebp]
  97. mov edx, POINTER [dct_table(eax)] ; quantptr
  98. mov esi, JCOEFPTR [coef_block(eax)] ; inptr
  99. lea edi, [workspace] ; JCOEF *wsptr
  100. mov ecx, DCTSIZE/4 ; ctr
  101. alignx 16,7
  ; Each iteration handles four adjacent columns (one MMWORD per input row).
  102. .columnloop:
  103. %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
  ; Shortcut: if every AC term (rows 1..7) of these four columns is zero, the
  ; IDCT output of each column is just its dequantized DC term.
  ; First a cheap test on the leading dword of rows 1 and 2 (eax is free to
  ; clobber here: both arguments that pass 1 needs have already been loaded).
  104. mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
  105. or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
  106. jnz short .columnDCT
  ; Full test: OR rows 1..7 together for all four columns.
  107. movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  108. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  109. por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  110. por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  111. por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  112. por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  113. por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  114. por mm1,mm0
  ; packsswb saturates, so a non-zero word can never pack to a zero byte;
  ; the four words therefore fit in one dword for a single test.
  115. packsswb mm1,mm1
  116. movd eax,mm1
  117. test eax,eax
  118. jnz short .columnDCT
  119. ; -- AC terms all zero
  ; Dequantize the four DC terms and broadcast each one across a full
  ; workspace row (two MMWORD stores per row).
  120. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  121. pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  122. movq mm2,mm0 ; mm0=in0=(00 01 02 03)
  123. punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
  124. punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
  125. movq mm1,mm0
  126. punpckldq mm0,mm0 ; mm0=(00 00 00 00)
  127. punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
  128. movq mm3,mm2
  129. punpckldq mm2,mm2 ; mm2=(02 02 02 02)
  130. punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
  131. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
  132. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
  133. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
  134. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
  135. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
  136. movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
  137. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
  138. movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
  139. jmp near .nextcolumn
  140. alignx 16,7
  141. %endif
  142. .columnDCT:
  143. ; -- Even part
  ; Load rows 0, 2, 4, 6 and dequantize them with the matching table rows.
  144. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  145. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  146. pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  147. pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  148. movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  149. movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  150. pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  151. pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  152. movq mm4,mm0
  153. movq mm5,mm1
  154. psubw mm0,mm2 ; mm0=tmp11
  155. psubw mm1,mm3
  156. paddw mm4,mm2 ; mm4=tmp10
  157. paddw mm5,mm3 ; mm5=tmp13
  ; tmp12 = (in2 - in6) * 1.414 - tmp13; the pre-shift plus pmulhw form the
  ; fixed-point multiply.
  158. psllw mm1,PRE_MULTIPLY_SCALE_BITS
  159. pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
  160. psubw mm1,mm5 ; mm1=tmp12
  161. movq mm6,mm4
  162. movq mm7,mm0
  163. psubw mm4,mm5 ; mm4=tmp3
  164. psubw mm0,mm1 ; mm0=tmp2
  165. paddw mm6,mm5 ; mm6=tmp0
  166. paddw mm7,mm1 ; mm7=tmp1
  ; Spill tmp2/tmp3: all eight MMX registers are needed for the odd part.
  167. movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
  168. movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
  169. ; -- Odd part
  ; Load rows 1, 3, 5, 7 and dequantize them.
  170. movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  171. movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  172. pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  173. pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  174. movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  175. movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  176. pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  177. pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  178. movq mm4,mm2
  179. movq mm0,mm5
  180. psubw mm2,mm1 ; mm2=z12
  181. psubw mm5,mm3 ; mm5=z10
  182. paddw mm4,mm1 ; mm4=z11
  183. paddw mm0,mm3 ; mm0=z13
  ; Keep an unshifted copy of z10 for the "- z10" term of tmp12 below.
  184. movq mm1,mm5 ; mm1=z10(unscaled)
  185. psllw mm2,PRE_MULTIPLY_SCALE_BITS
  186. psllw mm5,PRE_MULTIPLY_SCALE_BITS
  187. movq mm3,mm4
  188. psubw mm4,mm0
  189. paddw mm3,mm0 ; mm3=tmp7
  190. psllw mm4,PRE_MULTIPLY_SCALE_BITS
  191. pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
  192. ; To avoid overflow...
  193. ;
  194. ; (Original)
  195. ; tmp12 = -2.613125930 * z10 + z5;
  196. ;
  197. ; (This implementation)
  198. ; tmp12 = (-1.613125930 - 1) * z10 + z5;
  199. ; = -1.613125930 * z10 - z10 + z5;
  ; (The overflow in question is in the constant: with CONST_BITS == 8,
  ; F_2_613 << CONST_SHIFT = 42816 exceeds the signed 16-bit range that
  ; pmulhw operates on, whereas -F_1_613 << CONST_SHIFT fits.)
  200. movq mm0,mm5
  201. paddw mm5,mm2
  202. pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
  203. pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
  204. pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
  205. psubw mm0,mm1
  206. psubw mm2,mm5 ; mm2=tmp10
  207. paddw mm0,mm5 ; mm0=tmp12
  208. ; -- Final output stage
  ; Combine even and odd parts and transpose the 4x8 result with punpck* so
  ; that each input column becomes a workspace row.
  209. psubw mm0,mm3 ; mm0=tmp6
  210. movq mm1,mm6
  211. movq mm5,mm7
  212. paddw mm6,mm3 ; mm6=data0=(00 01 02 03)
  213. paddw mm7,mm0 ; mm7=data1=(10 11 12 13)
  214. psubw mm1,mm3 ; mm1=data7=(70 71 72 73)
  215. psubw mm5,mm0 ; mm5=data6=(60 61 62 63)
  216. psubw mm4,mm0 ; mm4=tmp5
  217. movq mm3,mm6 ; transpose coefficients(phase 1)
  218. punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
  219. punpckhwd mm3,mm7 ; mm3=(02 12 03 13)
  220. movq mm0,mm5 ; transpose coefficients(phase 1)
  221. punpcklwd mm5,mm1 ; mm5=(60 70 61 71)
  222. punpckhwd mm0,mm1 ; mm0=(62 72 63 73)
  ; Reload tmp2/tmp3 and reuse the two spill slots for the half-transposed
  ; rows 6/7.
  223. movq mm7, MMWORD [wk(0)] ; mm7=tmp2
  224. movq mm1, MMWORD [wk(1)] ; mm1=tmp3
  225. movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
  226. movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
  227. paddw mm2,mm4 ; mm2=tmp4
  228. movq mm5,mm7
  229. movq mm0,mm1
  230. paddw mm7,mm4 ; mm7=data2=(20 21 22 23)
  231. paddw mm1,mm2 ; mm1=data4=(40 41 42 43)
  232. psubw mm5,mm4 ; mm5=data5=(50 51 52 53)
  233. psubw mm0,mm2 ; mm0=data3=(30 31 32 33)
  234. movq mm4,mm7 ; transpose coefficients(phase 1)
  235. punpcklwd mm7,mm0 ; mm7=(20 30 21 31)
  236. punpckhwd mm4,mm0 ; mm4=(22 32 23 33)
  237. movq mm2,mm1 ; transpose coefficients(phase 1)
  238. punpcklwd mm1,mm5 ; mm1=(40 50 41 51)
  239. punpckhwd mm2,mm5 ; mm2=(42 52 43 53)
  240. movq mm0,mm6 ; transpose coefficients(phase 2)
  241. punpckldq mm6,mm7 ; mm6=(00 10 20 30)
  242. punpckhdq mm0,mm7 ; mm0=(01 11 21 31)
  243. movq mm5,mm3 ; transpose coefficients(phase 2)
  244. punpckldq mm3,mm4 ; mm3=(02 12 22 32)
  245. punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
  246. movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
  247. movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
  248. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
  249. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
  250. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
  251. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
  252. movq mm6,mm1 ; transpose coefficients(phase 2)
  253. punpckldq mm1,mm7 ; mm1=(40 50 60 70)
  254. punpckhdq mm6,mm7 ; mm6=(41 51 61 71)
  255. movq mm0,mm2 ; transpose coefficients(phase 2)
  256. punpckldq mm2,mm4 ; mm2=(42 52 62 72)
  257. punpckhdq mm0,mm4 ; mm0=(43 53 63 73)
  258. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
  259. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
  260. movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
  261. movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
  262. .nextcolumn:
  ; Advance by four input columns / four table columns / four workspace rows.
  263. add esi, byte 4*SIZEOF_JCOEF ; coef_block
  264. add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
  265. add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
  266. dec ecx ; ctr
  267. jnz near .columnloop
  268. ; ---- Pass 2: process rows from work array, store into output array.
  ; eax was clobbered by the zero-column test, so reload the frame address
  ; before fetching the remaining two arguments.
  269. mov eax, [original_ebp]
  270. lea esi, [workspace] ; JCOEF *wsptr
  271. mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
  272. mov eax, JDIMENSION [output_col(eax)]
  273. mov ecx, DCTSIZE/4 ; ctr
  274. alignx 16,7
  ; Each iteration produces four complete output rows of eight samples.
  ; Same butterfly as pass 1, but the data is already dequantized and there
  ; is no zero-AC shortcut.
  275. .rowloop:
  276. ; -- Even part
  277. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  278. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  279. movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  280. movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  281. movq mm4,mm0
  282. movq mm5,mm1
  283. psubw mm0,mm2 ; mm0=tmp11
  284. psubw mm1,mm3
  285. paddw mm4,mm2 ; mm4=tmp10
  286. paddw mm5,mm3 ; mm5=tmp13
  287. psllw mm1,PRE_MULTIPLY_SCALE_BITS
  288. pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
  289. psubw mm1,mm5 ; mm1=tmp12
  290. movq mm6,mm4
  291. movq mm7,mm0
  292. psubw mm4,mm5 ; mm4=tmp3
  293. psubw mm0,mm1 ; mm0=tmp2
  294. paddw mm6,mm5 ; mm6=tmp0
  295. paddw mm7,mm1 ; mm7=tmp1
  296. movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
  297. movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
  298. ; -- Odd part
  299. movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  300. movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  301. movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  302. movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  303. movq mm4,mm2
  304. movq mm0,mm5
  305. psubw mm2,mm1 ; mm2=z12
  306. psubw mm5,mm3 ; mm5=z10
  307. paddw mm4,mm1 ; mm4=z11
  308. paddw mm0,mm3 ; mm0=z13
  309. movq mm1,mm5 ; mm1=z10(unscaled)
  310. psllw mm2,PRE_MULTIPLY_SCALE_BITS
  311. psllw mm5,PRE_MULTIPLY_SCALE_BITS
  312. movq mm3,mm4
  313. psubw mm4,mm0
  314. paddw mm3,mm0 ; mm3=tmp7
  315. psllw mm4,PRE_MULTIPLY_SCALE_BITS
  316. pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
  317. ; To avoid overflow...
  318. ;
  319. ; (Original)
  320. ; tmp12 = -2.613125930 * z10 + z5;
  321. ;
  322. ; (This implementation)
  323. ; tmp12 = (-1.613125930 - 1) * z10 + z5;
  324. ; = -1.613125930 * z10 - z10 + z5;
  325. movq mm0,mm5
  326. paddw mm5,mm2
  327. pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
  328. pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
  329. pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
  330. psubw mm0,mm1
  331. psubw mm2,mm5 ; mm2=tmp10
  332. paddw mm0,mm5 ; mm0=tmp12
  333. ; -- Final output stage
  ; Each result is arithmetically shifted right by PASS1_BITS+3 and then
  ; packed from words to bytes with signed saturation (packsswb).
  334. psubw mm0,mm3 ; mm0=tmp6
  335. movq mm1,mm6
  336. movq mm5,mm7
  337. paddw mm6,mm3 ; mm6=data0=(00 10 20 30)
  338. paddw mm7,mm0 ; mm7=data1=(01 11 21 31)
  339. psraw mm6,(PASS1_BITS+3) ; descale
  340. psraw mm7,(PASS1_BITS+3) ; descale
  341. psubw mm1,mm3 ; mm1=data7=(07 17 27 37)
  342. psubw mm5,mm0 ; mm5=data6=(06 16 26 36)
  343. psraw mm1,(PASS1_BITS+3) ; descale
  344. psraw mm5,(PASS1_BITS+3) ; descale
  345. psubw mm4,mm0 ; mm4=tmp5
  346. packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36)
  347. packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37)
  348. movq mm3, MMWORD [wk(0)] ; mm3=tmp2
  349. movq mm0, MMWORD [wk(1)] ; mm0=tmp3
  350. paddw mm2,mm4 ; mm2=tmp4
  351. movq mm5,mm3
  352. movq mm1,mm0
  353. paddw mm3,mm4 ; mm3=data2=(02 12 22 32)
  354. paddw mm0,mm2 ; mm0=data4=(04 14 24 34)
  355. psraw mm3,(PASS1_BITS+3) ; descale
  356. psraw mm0,(PASS1_BITS+3) ; descale
  357. psubw mm5,mm4 ; mm5=data5=(05 15 25 35)
  358. psubw mm1,mm2 ; mm1=data3=(03 13 23 33)
  359. psraw mm5,(PASS1_BITS+3) ; descale
  360. psraw mm1,(PASS1_BITS+3) ; descale
  361. movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
  362. packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34)
  363. packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35)
  ; Add CENTERJSAMPLE to every packed byte (wrap-around byte add).
  364. paddb mm6,mm4
  365. paddb mm7,mm4
  366. paddb mm3,mm4
  367. paddb mm1,mm4
  ; Three-phase byte transpose so that each register ends up holding one
  ; complete output row of eight samples.
  368. movq mm2,mm6 ; transpose coefficients(phase 1)
  369. punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31)
  370. punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37)
  371. movq mm0,mm3 ; transpose coefficients(phase 1)
  372. punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33)
  373. punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35)
  374. movq mm5,mm6 ; transpose coefficients(phase 2)
  375. punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13)
  376. punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33)
  377. movq mm4,mm0 ; transpose coefficients(phase 2)
  378. punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17)
  379. punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37)
  380. movq mm7,mm6 ; transpose coefficients(phase 3)
  381. punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07)
  382. punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17)
  383. movq mm1,mm5 ; transpose coefficients(phase 3)
  384. punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27)
  385. punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37)
  ; ebx is borrowed as a second row pointer below, so the GOT address is
  ; saved around the stores.  Each row is written at row_ptr + output_col.
  386. pushpic ebx ; save GOT address
  387. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
  388. mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
  389. movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
  390. movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
  391. mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
  392. mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
  393. movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
  394. movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
  395. poppic ebx ; restore GOT address
  ; Advance by four workspace columns and four output row pointers.
  396. add esi, byte 4*SIZEOF_JCOEF ; wsptr
  397. add edi, byte 4*SIZEOF_JSAMPROW
  398. dec ecx ; ctr
  399. jnz near .rowloop
  ; ---- Epilogue: leave MMX state, restore saved registers, and unwind the
  ; aligned frame through the address stored at [ebp+0].
  400. emms ; empty MMX state
  401. pop edi
  402. pop esi
  403. ; pop edx ; need not be preserved
  404. ; pop ecx ; need not be preserved
  405. pop ebx
  406. mov esp,ebp ; esp <- aligned ebp
  407. pop esp ; esp <- original ebp
  408. pop ebp
  409. ret
  410. ; For some reason, the OS X linker does not honor the request to align the
  411. ; segment unless we do this.
  412. align 16