  ;
  ; jfdctint.asm - accurate integer FDCT (MMX)
  ;
  ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  ;
  ; Based on the x86 SIMD extension for IJG JPEG library
  ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  ;
  ; This file should be assembled with NASM (Netwide Assembler),
  ; can *not* be assembled with Microsoft's MASM or any compatible
  ; assembler (including Borland's Turbo Assembler).
  ; NASM is available from http://nasm.sourceforge.net/ or
  ; http://sourceforge.net/project/showfiles.php?group_id=6208
  ;
  ; This file contains a slow-but-accurate integer implementation of the
  ; forward DCT (Discrete Cosine Transform). The following code is based
  ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
  ; more details.
  ;
  ; [TAB8]
  %include "jsimdext.inc"
  %include "jdct.inc"

  ; --------------------------------------------------------------------------
  ; Fixed-point parameters.
  ;
  ; CONST_BITS   number of fractional bits carried by the F_x_xxx constants
  ;              (each constant below is its real value times 2^CONST_BITS,
  ;              rounded to an integer).
  ; PASS1_BITS   extra fractional bits left in the pass 1 (row) results; the
  ;              routine below shifts data0/data4 left by PASS1_BITS in pass 1
  ;              and shifts them right by PASS1_BITS again in pass 2.
  ; DESCALE_P1   right shift applied to 32-bit products in pass 1.
  ; DESCALE_P2   right shift applied to 32-bit products in pass 2.

  %define CONST_BITS      13
  %define PASS1_BITS      2

  %define DESCALE_P1      (CONST_BITS-PASS1_BITS)
  %define DESCALE_P2      (CONST_BITS+PASS1_BITS)

  %if CONST_BITS == 13
  ; Pre-computed values for the default CONST_BITS.
  F_0_298 equ      2446           ; FIX(0.298631336)
  F_0_390 equ      3196           ; FIX(0.390180644)
  F_0_541 equ      4433           ; FIX(0.541196100)
  F_0_765 equ      6270           ; FIX(0.765366865)
  F_0_899 equ      7373           ; FIX(0.899976223)
  F_1_175 equ      9633           ; FIX(1.175875602)
  F_1_501 equ     12299           ; FIX(1.501321110)
  F_1_847 equ     15137           ; FIX(1.847759065)
  F_1_961 equ     16069           ; FIX(1.961570560)
  F_2_053 equ     16819           ; FIX(2.053119869)
  F_2_562 equ     20995           ; FIX(2.562915447)
  F_3_072 equ     25172           ; FIX(3.072711026)
  %else
  ; NASM cannot do compile-time arithmetic on floating-point constants.
  ; The integer literals below are the constants scaled by 2^30; DESCALE
  ; rounds them to CONST_BITS fractional bits (add half, then shift right).
  %define DESCALE(x,n)    (((x)+(1<<((n)-1)))>>(n))
  F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
  F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
  F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
  F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
  F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
  F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
  F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
  F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
  F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
  F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
  F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
  F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
  %endif
  ; --------------------------------------------------------------------------
  ; Constant table for jsimd_fdct_islow_mmx.
  ;
  ; PW_* entries hold one coefficient pair repeated twice (4 words = one
  ; mmword) and are used as the memory operand of pmaddwd, so each pair
  ; (a, b) turns interleaved inputs (x, y) into the dword a*x + b*y.
  ; The names give the coefficient magnitudes in hundredths, "M" meaning
  ; negative: e.g. F130 = 0.541+0.765, MF130 = 0.541-1.847.
  ; pmaddwd treats its operands as signed 16-bit words, so every value here
  ; must fit that range; this holds for the CONST_BITS == 13 values.
  ;
  ; PD_DESCALE_P1/P2 are the dword rounding biases added before the
  ; arithmetic right shift by DESCALE_P1/DESCALE_P2.
  ; PW_DESCALE_P2X is the word rounding bias added before the 16-bit
  ; right shift by PASS1_BITS in pass 2.

          SECTION   SEG_CONST

          alignz    16
          global    EXTN(jconst_fdct_islow_mmx)

  EXTN(jconst_fdct_islow_mmx):

  PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541
  PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)
  PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175
  PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)
  PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899
  PW_MF089_F060   times 2 dw  -F_0_899, (F_1_501-F_0_899)
  PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562
  PW_MF256_F050   times 2 dw  -F_2_562, (F_3_072-F_2_562)
  PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)
  PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)
  PW_DESCALE_P2X  times 4 dw  1 << (PASS1_BITS-1)

          alignz    16
  ; --------------------------------------------------------------------------
          SECTION   SEG_TEXT
          BITS      32
  ;
  ; Perform the forward DCT on one block of samples.
  ;
  ; GLOBAL(void)
  ; jsimd_fdct_islow_mmx (DCTELEM *data)
  ;
  ; In:     data  = first stack argument, read as [frame base + 8].
  ;                 The block is transformed in place: every result is stored
  ;                 back through the same pointer it was loaded from.
  ; Out:    nothing is returned in a register.
  ; Clobb:  at least eax, ecx, edx, mm0-mm7 and the flags.  ebx is used as the
  ;         GOT base and is wrapped in pushpic/poppic (macros from
  ;         jsimdext.inc).  emms is executed before returning.
  ; Stack:  WK_NUM mmword scratch slots wk(0)..wk(WK_NUM-1) below an ebp that
  ;         has been aligned down to SIZEOF_MMWORD.
  ;
  ; Structure:
  ;   Pass 1 runs DCTSIZE/4 times; each iteration takes four rows (both
  ;   mmword halves), transposes them so that each register holds one
  ;   column position for the four rows, performs the 1-D DCT, and stores the
  ;   results left-shifted/descaled so that PASS1_BITS extra fractional bits
  ;   remain.  The stores leave each 4x4 sub-block transposed in memory, as
  ;   the register-content comments at the top of pass 2 show.
  ;   Pass 2 runs DCTSIZE/4 times; each iteration takes four columns (one
  ;   mmword from each of rows 0..7), transposes again, performs the 1-D DCT
  ;   and removes the PASS1_BITS scaling.
  ;

  %define data(b)         (b)+8                   ; DCTELEM *data
  %define original_ebp    ebp+0
  %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
  %define WK_NUM          2

          align     16
          global    EXTN(jsimd_fdct_islow_mmx)

  EXTN(jsimd_fdct_islow_mmx):
          push      ebp
          mov       eax, esp                      ; eax = frame base: [eax]=saved ebp,
                                                  ;   [eax+8]=data
          sub       esp, byte 4                   ; room for the saved frame base
          and       esp, byte (-SIZEOF_MMWORD)    ; align to 64 bits
          mov       [esp], eax                    ; [original_ebp] = frame base
          mov       ebp, esp                      ; ebp = aligned ebp
          lea       esp, [wk(0)]                  ; reserve wk[WK_NUM]
          pushpic   ebx
  ;       push      ecx                           ; need not be preserved
  ;       push      edx                           ; need not be preserved
  ;       push      esi                           ; unused
  ;       push      edi                           ; unused

          get_GOT   ebx                           ; get GOT address

          ; ---- Pass 1: process rows.

          mov       edx, POINTER [data(eax)]      ; (DCTELEM *)
          mov       ecx, DCTSIZE/4                ; ecx = iterations left (4 rows each)
          alignx    16,7
  .rowloop:

          movq      mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
          movq      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
          movq      mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
          movq      mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]

          ; mm0=(20 21 22 23), mm2=(24 25 26 27)
          ; mm1=(30 31 32 33), mm3=(34 35 36 37)

          movq      mm4, mm0                      ; transpose coefficients(phase 1)
          punpcklwd mm0, mm1                      ; mm0=(20 30 21 31)
          punpckhwd mm4, mm1                      ; mm4=(22 32 23 33)
          movq      mm5, mm2                      ; transpose coefficients(phase 1)
          punpcklwd mm2, mm3                      ; mm2=(24 34 25 35)
          punpckhwd mm5, mm3                      ; mm5=(26 36 27 37)

          movq      mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
          movq      mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
          movq      mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
          movq      mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]

          ; mm6=(00 01 02 03), mm1=(04 05 06 07)
          ; mm7=(10 11 12 13), mm3=(14 15 16 17)

          movq      MMWORD [wk(0)], mm4           ; wk(0)=(22 32 23 33)
          movq      MMWORD [wk(1)], mm2           ; wk(1)=(24 34 25 35)

          movq      mm4, mm6                      ; transpose coefficients(phase 1)
          punpcklwd mm6, mm7                      ; mm6=(00 10 01 11)
          punpckhwd mm4, mm7                      ; mm4=(02 12 03 13)
          movq      mm2, mm1                      ; transpose coefficients(phase 1)
          punpcklwd mm1, mm3                      ; mm1=(04 14 05 15)
          punpckhwd mm2, mm3                      ; mm2=(06 16 07 17)

          movq      mm7, mm6                      ; transpose coefficients(phase 2)
          punpckldq mm6, mm0                      ; mm6=(00 10 20 30)=data0
          punpckhdq mm7, mm0                      ; mm7=(01 11 21 31)=data1
          movq      mm3, mm2                      ; transpose coefficients(phase 2)
          punpckldq mm2, mm5                      ; mm2=(06 16 26 36)=data6
          punpckhdq mm3, mm5                      ; mm3=(07 17 27 37)=data7

          movq      mm0, mm7
          movq      mm5, mm6
          psubw     mm7, mm2                      ; mm7=data1-data6=tmp6
          psubw     mm6, mm3                      ; mm6=data0-data7=tmp7
          paddw     mm0, mm2                      ; mm0=data1+data6=tmp1
          paddw     mm5, mm3                      ; mm5=data0+data7=tmp0

          movq      mm2, MMWORD [wk(0)]           ; mm2=(22 32 23 33)
          movq      mm3, MMWORD [wk(1)]           ; mm3=(24 34 25 35)
          movq      MMWORD [wk(0)], mm7           ; wk(0)=tmp6
          movq      MMWORD [wk(1)], mm6           ; wk(1)=tmp7

          movq      mm7, mm4                      ; transpose coefficients(phase 2)
          punpckldq mm4, mm2                      ; mm4=(02 12 22 32)=data2
          punpckhdq mm7, mm2                      ; mm7=(03 13 23 33)=data3
          movq      mm6, mm1                      ; transpose coefficients(phase 2)
          punpckldq mm1, mm3                      ; mm1=(04 14 24 34)=data4
          punpckhdq mm6, mm3                      ; mm6=(05 15 25 35)=data5

          movq      mm2, mm7
          movq      mm3, mm4
          paddw     mm7, mm1                      ; mm7=data3+data4=tmp3
          paddw     mm4, mm6                      ; mm4=data2+data5=tmp2
          psubw     mm2, mm1                      ; mm2=data3-data4=tmp4
          psubw     mm3, mm6                      ; mm3=data2-data5=tmp5

          ; -- Even part

          movq      mm1, mm5
          movq      mm6, mm0
          paddw     mm5, mm7                      ; mm5=tmp10
          paddw     mm0, mm4                      ; mm0=tmp11
          psubw     mm1, mm7                      ; mm1=tmp13
          psubw     mm6, mm4                      ; mm6=tmp12

          movq      mm7, mm5
          paddw     mm5, mm0                      ; mm5=tmp10+tmp11
          psubw     mm7, mm0                      ; mm7=tmp10-tmp11

          psllw     mm5, PASS1_BITS               ; mm5=data0
          psllw     mm7, PASS1_BITS               ; mm7=data4

          movq      MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
          movq      MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7

          ; (Original)
          ; z1 = (tmp12 + tmp13) * 0.541196100;
          ; data2 = z1 + tmp13 * 0.765366865;
          ; data6 = z1 + tmp12 * -1.847759065;
          ;
          ; (This implementation)
          ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
          ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

          movq      mm4, mm1                      ; mm1=tmp13
          movq      mm0, mm1
          punpcklwd mm4, mm6                      ; mm6=tmp12
          punpckhwd mm0, mm6
          movq      mm1, mm4
          movq      mm6, mm0
          pmaddwd   mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
          pmaddwd   mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
          pmaddwd   mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
          pmaddwd   mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H

          paddd     mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
          paddd     mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
          psrad     mm4, DESCALE_P1
          psrad     mm0, DESCALE_P1
          paddd     mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
          paddd     mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
          psrad     mm1, DESCALE_P1
          psrad     mm6, DESCALE_P1

          packssdw  mm4, mm0                      ; mm4=data2
          packssdw  mm1, mm6                      ; mm1=data6

          movq      MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
          movq      MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1

          ; -- Odd part

          movq      mm5, MMWORD [wk(0)]           ; mm5=tmp6
          movq      mm7, MMWORD [wk(1)]           ; mm7=tmp7

          movq      mm0, mm2                      ; mm2=tmp4
          movq      mm6, mm3                      ; mm3=tmp5
          paddw     mm0, mm5                      ; mm0=z3
          paddw     mm6, mm7                      ; mm6=z4

          ; (Original)
          ; z5 = (z3 + z4) * 1.175875602;
          ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
          ; z3 += z5; z4 += z5;
          ;
          ; (This implementation)
          ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
          ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

          movq      mm4, mm0
          movq      mm1, mm0
          punpcklwd mm4, mm6
          punpckhwd mm1, mm6
          movq      mm0, mm4
          movq      mm6, mm1
          pmaddwd   mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
          pmaddwd   mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
          pmaddwd   mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
          pmaddwd   mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H

          movq      MMWORD [wk(0)], mm4           ; wk(0)=z3L
          movq      MMWORD [wk(1)], mm1           ; wk(1)=z3H

          ; (Original)
          ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
          ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
          ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
          ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
          ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
          ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
          ;
          ; (This implementation)
          ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
          ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
          ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
          ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
          ; data7 = tmp4 + z3; data5 = tmp5 + z4;
          ; data3 = tmp6 + z3; data1 = tmp7 + z4;

          movq      mm4, mm2
          movq      mm1, mm2
          punpcklwd mm4, mm7
          punpckhwd mm1, mm7
          movq      mm2, mm4
          movq      mm7, mm1
          pmaddwd   mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
          pmaddwd   mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
          pmaddwd   mm2, [GOTOFF(ebx,PW_MF089_F060)]  ; mm2=tmp7L
          pmaddwd   mm7, [GOTOFF(ebx,PW_MF089_F060)]  ; mm7=tmp7H

          paddd     mm4, MMWORD [wk(0)]           ; mm4=data7L
          paddd     mm1, MMWORD [wk(1)]           ; mm1=data7H
          paddd     mm2, mm0                      ; mm2=data1L
          paddd     mm7, mm6                      ; mm7=data1H

          paddd     mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
          paddd     mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
          psrad     mm4, DESCALE_P1
          psrad     mm1, DESCALE_P1
          paddd     mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
          paddd     mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
          psrad     mm2, DESCALE_P1
          psrad     mm7, DESCALE_P1

          packssdw  mm4, mm1                      ; mm4=data7
          packssdw  mm2, mm7                      ; mm2=data1

          movq      MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
          movq      MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

          movq      mm1, mm3
          movq      mm7, mm3
          punpcklwd mm1, mm5
          punpckhwd mm7, mm5
          movq      mm3, mm1
          movq      mm5, mm7
          pmaddwd   mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
          pmaddwd   mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
          pmaddwd   mm3, [GOTOFF(ebx,PW_MF256_F050)]  ; mm3=tmp6L
          pmaddwd   mm5, [GOTOFF(ebx,PW_MF256_F050)]  ; mm5=tmp6H

          paddd     mm1, mm0                      ; mm1=data5L
          paddd     mm7, mm6                      ; mm7=data5H
          paddd     mm3, MMWORD [wk(0)]           ; mm3=data3L
          paddd     mm5, MMWORD [wk(1)]           ; mm5=data3H

          paddd     mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
          paddd     mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
          psrad     mm1, DESCALE_P1
          psrad     mm7, DESCALE_P1
          paddd     mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
          paddd     mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
          psrad     mm3, DESCALE_P1
          psrad     mm5, DESCALE_P1

          packssdw  mm1, mm7                      ; mm1=data5
          packssdw  mm3, mm5                      ; mm3=data3

          movq      MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
          movq      MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

          add       edx, byte 4*DCTSIZE*SIZEOF_DCTELEM    ; advance by four rows
          dec       ecx
          jnz       near .rowloop

          ; ---- Pass 2: process columns.

          mov       edx, POINTER [data(eax)]      ; (DCTELEM *)
          mov       ecx, DCTSIZE/4                ; ecx = iterations left (4 columns each)
          alignx    16,7
  .columnloop:

          movq      mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
          movq      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
          movq      mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
          movq      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]

          ; mm0=(02 12 22 32), mm2=(42 52 62 72)
          ; mm1=(03 13 23 33), mm3=(43 53 63 73)

          movq      mm4, mm0                      ; transpose coefficients(phase 1)
          punpcklwd mm0, mm1                      ; mm0=(02 03 12 13)
          punpckhwd mm4, mm1                      ; mm4=(22 23 32 33)
          movq      mm5, mm2                      ; transpose coefficients(phase 1)
          punpcklwd mm2, mm3                      ; mm2=(42 43 52 53)
          punpckhwd mm5, mm3                      ; mm5=(62 63 72 73)

          movq      mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
          movq      mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
          movq      mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
          movq      mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]

          ; mm6=(00 10 20 30), mm1=(40 50 60 70)
          ; mm7=(01 11 21 31), mm3=(41 51 61 71)

          movq      MMWORD [wk(0)], mm4           ; wk(0)=(22 23 32 33)
          movq      MMWORD [wk(1)], mm2           ; wk(1)=(42 43 52 53)

          movq      mm4, mm6                      ; transpose coefficients(phase 1)
          punpcklwd mm6, mm7                      ; mm6=(00 01 10 11)
          punpckhwd mm4, mm7                      ; mm4=(20 21 30 31)
          movq      mm2, mm1                      ; transpose coefficients(phase 1)
          punpcklwd mm1, mm3                      ; mm1=(40 41 50 51)
          punpckhwd mm2, mm3                      ; mm2=(60 61 70 71)

          movq      mm7, mm6                      ; transpose coefficients(phase 2)
          punpckldq mm6, mm0                      ; mm6=(00 01 02 03)=data0
          punpckhdq mm7, mm0                      ; mm7=(10 11 12 13)=data1
          movq      mm3, mm2                      ; transpose coefficients(phase 2)
          punpckldq mm2, mm5                      ; mm2=(60 61 62 63)=data6
          punpckhdq mm3, mm5                      ; mm3=(70 71 72 73)=data7

          movq      mm0, mm7
          movq      mm5, mm6
          psubw     mm7, mm2                      ; mm7=data1-data6=tmp6
          psubw     mm6, mm3                      ; mm6=data0-data7=tmp7
          paddw     mm0, mm2                      ; mm0=data1+data6=tmp1
          paddw     mm5, mm3                      ; mm5=data0+data7=tmp0

          movq      mm2, MMWORD [wk(0)]           ; mm2=(22 23 32 33)
          movq      mm3, MMWORD [wk(1)]           ; mm3=(42 43 52 53)
          movq      MMWORD [wk(0)], mm7           ; wk(0)=tmp6
          movq      MMWORD [wk(1)], mm6           ; wk(1)=tmp7

          movq      mm7, mm4                      ; transpose coefficients(phase 2)
          punpckldq mm4, mm2                      ; mm4=(20 21 22 23)=data2
          punpckhdq mm7, mm2                      ; mm7=(30 31 32 33)=data3
          movq      mm6, mm1                      ; transpose coefficients(phase 2)
          punpckldq mm1, mm3                      ; mm1=(40 41 42 43)=data4
          punpckhdq mm6, mm3                      ; mm6=(50 51 52 53)=data5

          movq      mm2, mm7
          movq      mm3, mm4
          paddw     mm7, mm1                      ; mm7=data3+data4=tmp3
          paddw     mm4, mm6                      ; mm4=data2+data5=tmp2
          psubw     mm2, mm1                      ; mm2=data3-data4=tmp4
          psubw     mm3, mm6                      ; mm3=data2-data5=tmp5

          ; -- Even part

          movq      mm1, mm5
          movq      mm6, mm0
          paddw     mm5, mm7                      ; mm5=tmp10
          paddw     mm0, mm4                      ; mm0=tmp11
          psubw     mm1, mm7                      ; mm1=tmp13
          psubw     mm6, mm4                      ; mm6=tmp12

          movq      mm7, mm5
          paddw     mm5, mm0                      ; mm5=tmp10+tmp11
          psubw     mm7, mm0                      ; mm7=tmp10-tmp11

          paddw     mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
          paddw     mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
          psraw     mm5, PASS1_BITS               ; mm5=data0
          psraw     mm7, PASS1_BITS               ; mm7=data4

          movq      MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
          movq      MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7

          ; (Original)
          ; z1 = (tmp12 + tmp13) * 0.541196100;
          ; data2 = z1 + tmp13 * 0.765366865;
          ; data6 = z1 + tmp12 * -1.847759065;
          ;
          ; (This implementation)
          ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
          ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

          movq      mm4, mm1                      ; mm1=tmp13
          movq      mm0, mm1
          punpcklwd mm4, mm6                      ; mm6=tmp12
          punpckhwd mm0, mm6
          movq      mm1, mm4
          movq      mm6, mm0
          pmaddwd   mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
          pmaddwd   mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
          pmaddwd   mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
          pmaddwd   mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H

          paddd     mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
          paddd     mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
          psrad     mm4, DESCALE_P2
          psrad     mm0, DESCALE_P2
          paddd     mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
          paddd     mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
          psrad     mm1, DESCALE_P2
          psrad     mm6, DESCALE_P2

          packssdw  mm4, mm0                      ; mm4=data2
          packssdw  mm1, mm6                      ; mm1=data6

          movq      MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
          movq      MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1

          ; -- Odd part

          movq      mm5, MMWORD [wk(0)]           ; mm5=tmp6
          movq      mm7, MMWORD [wk(1)]           ; mm7=tmp7

          movq      mm0, mm2                      ; mm2=tmp4
          movq      mm6, mm3                      ; mm3=tmp5
          paddw     mm0, mm5                      ; mm0=z3
          paddw     mm6, mm7                      ; mm6=z4

          ; (Original)
          ; z5 = (z3 + z4) * 1.175875602;
          ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
          ; z3 += z5; z4 += z5;
          ;
          ; (This implementation)
          ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
          ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

          movq      mm4, mm0
          movq      mm1, mm0
          punpcklwd mm4, mm6
          punpckhwd mm1, mm6
          movq      mm0, mm4
          movq      mm6, mm1
          pmaddwd   mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
          pmaddwd   mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
          pmaddwd   mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
          pmaddwd   mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H

          movq      MMWORD [wk(0)], mm4           ; wk(0)=z3L
          movq      MMWORD [wk(1)], mm1           ; wk(1)=z3H

          ; (Original)
          ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
          ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
          ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
          ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
          ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
          ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
          ;
          ; (This implementation)
          ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
          ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
          ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
          ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
          ; data7 = tmp4 + z3; data5 = tmp5 + z4;
          ; data3 = tmp6 + z3; data1 = tmp7 + z4;

          movq      mm4, mm2
          movq      mm1, mm2
          punpcklwd mm4, mm7
          punpckhwd mm1, mm7
          movq      mm2, mm4
          movq      mm7, mm1
          pmaddwd   mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
          pmaddwd   mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
          pmaddwd   mm2, [GOTOFF(ebx,PW_MF089_F060)]  ; mm2=tmp7L
          pmaddwd   mm7, [GOTOFF(ebx,PW_MF089_F060)]  ; mm7=tmp7H

          paddd     mm4, MMWORD [wk(0)]           ; mm4=data7L
          paddd     mm1, MMWORD [wk(1)]           ; mm1=data7H
          paddd     mm2, mm0                      ; mm2=data1L
          paddd     mm7, mm6                      ; mm7=data1H

          paddd     mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
          paddd     mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
          psrad     mm4, DESCALE_P2
          psrad     mm1, DESCALE_P2
          paddd     mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
          paddd     mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
          psrad     mm2, DESCALE_P2
          psrad     mm7, DESCALE_P2

          packssdw  mm4, mm1                      ; mm4=data7
          packssdw  mm2, mm7                      ; mm2=data1

          movq      MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
          movq      MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

          movq      mm1, mm3
          movq      mm7, mm3
          punpcklwd mm1, mm5
          punpckhwd mm7, mm5
          movq      mm3, mm1
          movq      mm5, mm7
          pmaddwd   mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
          pmaddwd   mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
          pmaddwd   mm3, [GOTOFF(ebx,PW_MF256_F050)]  ; mm3=tmp6L
          pmaddwd   mm5, [GOTOFF(ebx,PW_MF256_F050)]  ; mm5=tmp6H

          paddd     mm1, mm0                      ; mm1=data5L
          paddd     mm7, mm6                      ; mm7=data5H
          paddd     mm3, MMWORD [wk(0)]           ; mm3=data3L
          paddd     mm5, MMWORD [wk(1)]           ; mm5=data3H

          paddd     mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
          paddd     mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
          psrad     mm1, DESCALE_P2
          psrad     mm7, DESCALE_P2
          paddd     mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
          paddd     mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
          psrad     mm3, DESCALE_P2
          psrad     mm5, DESCALE_P2

          packssdw  mm1, mm7                      ; mm1=data5
          packssdw  mm3, mm5                      ; mm3=data3

          movq      MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
          movq      MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

          add       edx, byte 4*SIZEOF_DCTELEM    ; advance by four columns
          dec       ecx
          jnz       near .columnloop

          emms                                    ; empty MMX state

  ;       pop       edi                           ; unused
  ;       pop       esi                           ; unused
  ;       pop       edx                           ; need not be preserved
  ;       pop       ecx                           ; need not be preserved
          poppic    ebx
          mov       esp, ebp                      ; esp <- aligned ebp
          pop       esp                           ; esp <- frame base saved at [original_ebp]
          pop       ebp                           ; restore the caller's ebp
          ret

  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
          align     16