jdsample-mmx.asm 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737
  1. ;
  2. ; jdsample.asm - upsampling (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on the x86 SIMD extension for IJG JPEG library
  7. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  8. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  9. ;
  10. ; This file should be assembled with NASM (Netwide Assembler),
  11. ; can *not* be assembled with Microsoft's MASM or any compatible
  12. ; assembler (including Borland's Turbo Assembler).
  13. ; NASM is available from http://nasm.sourceforge.net/ or
  14. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  15. ;
  16. ; [TAB8]
  17. %include "jsimdext.inc"
  18. ; --------------------------------------------------------------------------
  ; Constant table shared by the fancy (triangle filter) upsampling routines.
  ; Every entry is four identical 16-bit words, so each one can be used
  ; directly as a 64-bit MMX memory operand.
  SECTION SEG_CONST

          alignz  16
          global  EXTN(jconst_fancy_upsample_mmx)

  EXTN(jconst_fancy_upsample_mmx):

  PW_ONE:     dw      1, 1, 1, 1      ; h2v1 fancy: bias for the even outputs
  PW_TWO:     dw      2, 2, 2, 2      ; h2v1 fancy: bias for the odd outputs
  PW_THREE:   dw      3, 3, 3, 3      ; weight of the nearer sample
  PW_SEVEN:   dw      7, 7, 7, 7      ; h2v2 fancy: bias for the odd outputs
  PW_EIGHT:   dw      8, 8, 8, 8      ; h2v2 fancy: bias for the even outputs

          alignz  16
  29. ; --------------------------------------------------------------------------
  SECTION SEG_TEXT
          BITS    32

  ;
  ; jsimd_h2v1_fancy_upsample_mmx
  ;
  ; Fancy upsampling for the common case of 2:1 horizontal and 1:1 vertical.
  ;
  ; The filter is linear interpolation between pixel centers (a "triangle
  ; filter"), a compromise between speed and visual quality.  Output pixel
  ; centers lie 1/4 and 3/4 of the way between input pixel centers, so for an
  ; input row s[] the code computes
  ;
  ;       out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
  ;       out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
  ;
  ; At the left edge s[-1] is taken to be s[0].  At the right edge the
  ; neighbour is a copy of the last sample of the final block.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
  ;                                JDIMENSION downsampled_width,
  ;                                JSAMPARRAY input_data,
  ;                                JSAMPARRAY *output_data_ptr);
  ;
  ; Behaviour visible in the code:
  ;   * Returns at once when downsampled_width or max_v_samp_factor is zero.
  ;   * max_v_samp_factor rows are processed; both the input and the output
  ;     row pointer advance by one row per iteration.
  ;   * The column count is rounded up to a multiple of SIZEOF_MMWORD and the
  ;     row is processed one MMWORD of input (two MMWORDs of output) at a
  ;     time, so whole blocks are read and written even past the nominal
  ;     width.
  ;   * When the width is not a multiple of SIZEOF_MMWORD, one byte (a copy of
  ;     the last real sample) is WRITTEN into the input row at index
  ;     downsampled_width.
  ;
  ; Register roles inside the loops:
  ;   eax = columns left (rounded up)      ecx = rows left
  ;   esi = input row array / inptr        edi = output row array / outptr
  ;   ebx = GOT address for GOTOFF()       dl  = scratch for the dummy sample
  ;   mm0 = zero                           mm7 = left neighbour carried over
  ;   mm6 = right neighbour for the current block
  ;
  ; Lane diagrams in the comments list the lowest-addressed element first.
  ; NOTE(review): they assume SIZEOF_MMWORD is 8 and BYTE_BIT is 8; both come
  ; from jsimdext.inc -- confirm there.
  ;

  %define max_v_samp(b)       (b)+8       ; int max_v_samp_factor
  %define downsamp_width(b)   (b)+12      ; JDIMENSION downsampled_width
  %define input_data(b)       (b)+16      ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b)+20      ; JSAMPARRAY *output_data_ptr

          align   16
          global  EXTN(jsimd_h2v1_fancy_upsample_mmx)

  EXTN(jsimd_h2v1_fancy_upsample_mmx):
          ; ---- prologue: ecx and edx are used without being saved ----
          push    ebp
          mov     ebp, esp
          pushpic ebx
          push    esi
          push    edi

          get_GOT ebx                             ; ebx = GOT address

          ; ---- nothing to do for an empty width or zero rows ----
          mov     eax, JDIMENSION [downsamp_width(ebp)]   ; eax = column count
          test    eax, eax
          jz      near .exit

          mov     ecx, INT [max_v_samp(ebp)]      ; ecx = row count
          test    ecx, ecx
          jz      near .exit

          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input rows
          mov     edi, POINTER [output_data_ptr(ebp)]
          mov     edi, JSAMPARRAY [edi]                   ; edi -> output rows

          alignx  16, 7
  .row_loop:
          ; Save the per-row state that the column loop consumes.
          push    eax                             ; column count
          push    edi
          push    esi

          mov     esi, JSAMPROW [esi]             ; esi = current input row
          mov     edi, JSAMPROW [edi]             ; edi = current output row

          ; Width not a multiple of the block size: duplicate the last real
          ; sample just past the end so it serves as its own right neighbour.
          test    eax, SIZEOF_MMWORD-1
          jz      short .no_pad
          mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
          mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
  .no_pad:
          pxor    mm0, mm0                        ; mm0 = all zero bits

          ; Left neighbour for the first block: sample 0 of the row itself.
          pcmpeqb mm7, mm7                        ; all one bits
          psrlq   mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; keep only the lowest byte
          pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]

          ; Round the column count up to a whole number of blocks.
          add     eax, byte SIZEOF_MMWORD-1
          and     eax, byte -SIZEOF_MMWORD
          cmp     eax, byte SIZEOF_MMWORD
          ja      short .next_block               ; more than one block left

          alignx  16, 7
  .last_block:
          ; Final block: the right neighbour is this block's own last sample,
          ; left in place in the highest byte.
          pcmpeqb mm6, mm6
          psllq   mm6, (SIZEOF_MMWORD-1)*BYTE_BIT ; keep only the highest byte
          pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
          jmp     short .filter

          alignx  16, 7
  .next_block:
          ; Inner block: the right neighbour is the first sample of the next
          ; block, moved up into the highest byte.
          movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
          psllq   mm6, (SIZEOF_MMWORD-1)*BYTE_BIT

  .filter:
          ; Build the "previous" and "next" sample vectors for this block.
          movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
          movq    mm2, mm1
          movq    mm3, mm1                        ; mm1=( 0 1 2 3 4 5 6 7)
          psllq   mm2, BYTE_BIT                   ; mm2=( - 0 1 2 3 4 5 6)
          psrlq   mm3, BYTE_BIT                   ; mm3=( 1 2 3 4 5 6 7 -)

          por     mm2, mm7                        ; mm2=(-1 0 1 2 3 4 5 6)
          por     mm3, mm6                        ; mm3=( 1 2 3 4 5 6 7 8)

          ; Carry this block's last sample over as the next left neighbour.
          movq    mm7, mm1
          psrlq   mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)

          ; Widen all three vectors from bytes to 16-bit words.
          movq      mm4, mm1
          punpcklbw mm1, mm0                      ; mm1=( 0 1 2 3)
          punpckhbw mm4, mm0                      ; mm4=( 4 5 6 7)
          movq      mm5, mm2
          punpcklbw mm2, mm0                      ; mm2=(-1 0 1 2)
          punpckhbw mm5, mm0                      ; mm5=( 3 4 5 6)
          movq      mm6, mm3
          punpcklbw mm3, mm0                      ; mm3=( 1 2 3 4)
          punpckhbw mm6, mm0                      ; mm6=( 5 6 7 8)

          ; 3*current, previous+1, next+2.
          pmullw  mm1, [GOTOFF(ebx,PW_THREE)]
          pmullw  mm4, [GOTOFF(ebx,PW_THREE)]
          paddw   mm2, [GOTOFF(ebx,PW_ONE)]
          paddw   mm5, [GOTOFF(ebx,PW_ONE)]
          paddw   mm3, [GOTOFF(ebx,PW_TWO)]
          paddw   mm6, [GOTOFF(ebx,PW_TWO)]

          ; Even outputs: (3*cur + prev + 1) >> 2.
          paddw   mm2, mm1
          paddw   mm5, mm4
          psrlw   mm2, 2                          ; mm2=OutLE=( 0  2  4  6)
          psrlw   mm5, 2                          ; mm5=OutHE=( 8 10 12 14)

          ; Odd outputs: (3*cur + next + 2) >> 2.
          paddw   mm3, mm1
          paddw   mm6, mm4
          psrlw   mm3, 2                          ; mm3=OutLO=( 1  3  5  7)
          psrlw   mm6, 2                          ; mm6=OutHO=( 9 11 13 15)

          ; Interleave: odd results go into the high byte of every word.
          psllw   mm3, BYTE_BIT
          psllw   mm6, BYTE_BIT
          por     mm2, mm3                        ; mm2=OutL=( 0 .. 7)
          por     mm5, mm6                        ; mm5=OutH=( 8 .. 15)

          movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
          movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5

          ; Advance one input block / two output blocks.
          sub     eax, byte SIZEOF_MMWORD
          add     esi, byte 1*SIZEOF_MMWORD       ; inptr
          add     edi, byte 2*SIZEOF_MMWORD       ; outptr
          cmp     eax, byte SIZEOF_MMWORD
          ja      near .next_block                ; at least two blocks remain
          test    eax, eax
          jnz     near .last_block                ; exactly one block remains

          ; ---- row finished: restore the row state and step to the next ----
          pop     esi
          pop     edi
          pop     eax

          add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
          add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
          dec     ecx
          jg      near .row_loop

          emms                                    ; empty MMX state

  .exit:
          pop     edi
          pop     esi
          poppic  ebx
          pop     ebp
          ret
  162. ; --------------------------------------------------------------------------
  ;
  ; jsimd_h2v2_fancy_upsample_mmx
  ;
  ; Fancy upsampling for the common case of 2:1 horizontal and 2:1 vertical.
  ; This is again a triangle filter, applied in both directions.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
  ;                                JDIMENSION downsampled_width,
  ;                                JSAMPARRAY input_data,
  ;                                JSAMPARRAY *output_data_ptr);
  ;
  ; Each input row r yields two output rows.  Per block of SIZEOF_MMWORD
  ; input samples the work is split in two passes:
  ;
  ;   1. Vertical pass, producing 16-bit intermediates:
  ;          Int0[i] = 3 * row[r][i] + row[r-1][i]    (upper output row)
  ;          Int1[i] = 3 * row[r][i] + row[r+1][i]    (lower output row)
  ;      The intermediates are parked in the output rows themselves and are
  ;      later overwritten in place by the final samples.
  ;
  ;   2. Horizontal pass, applied to Int0 and to Int1:
  ;          out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
  ;          out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
  ;      At the left edge Int[-1] is taken to be Int[0]; in the final block
  ;      the right neighbour is that block's own last intermediate.
  ;
  ; Behaviour visible in the code:
  ;   * Returns at once when downsampled_width or max_v_samp_factor is zero.
  ;   * The row counter drops by 2 per iteration while the input pointer
  ;     advances by one row and the output pointer by two rows.
  ;   * The row pointers one slot before and one slot after the current input
  ;     row are dereferenced on every iteration, including the first.
  ;   * When the width is not a multiple of SIZEOF_MMWORD, one dummy sample (a
  ;     copy of the last real one) is WRITTEN at index downsampled_width into
  ;     all three input rows.
  ;   * The column count is rounded up to whole blocks; whole blocks are read
  ;     and written.
  ;
  ; Stack frame: ebp is re-based onto an address aligned to SIZEOF_MMWORD.
  ; [ebp] holds the pre-alignment stack pointer (through which the arguments
  ; are reached), wk(0)..wk(WK_NUM-1) sit directly below ebp, and gotptr sits
  ; directly below wk(0).
  ;
  ;   wk(0) / wk(1)  left-neighbour word for the upper / lower row (lane 0)
  ;   wk(2) / wk(3)  right-neighbour word for the upper / lower row (lane 3)
  ;
  ; Register roles inside the column loop:
  ;   eax = columns left (rounded up)
  ;   ecx = input row above      ebx = current input row
  ;   esi = input row below      edx = upper output row
  ;   edi = lower output row
  ; ebx is temporarily swapped for the GOT address around every GOTOFF() use
  ; by the pushpic/movpic/poppic macros.
  ; NOTE(review): these macros and get_GOT are defined in jsimdext.inc and
  ; presumably expand to nothing in non-PIC builds -- confirm there.
  ;
  ; Lane diagrams list the lowest-addressed element first.
  ;

  %define max_v_samp(b)       (b)+8       ; int max_v_samp_factor
  %define downsamp_width(b)   (b)+12      ; JDIMENSION downsampled_width
  %define input_data(b)       (b)+16      ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b)+20      ; JSAMPARRAY *output_data_ptr

  %define WK_NUM              4
  %define original_ebp        ebp+0
  %define wk(i)               ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
  %define gotptr              wk(0)-SIZEOF_POINTER           ; void *gotptr

          align   16
          global  EXTN(jsimd_h2v2_fancy_upsample_mmx)

  EXTN(jsimd_h2v2_fancy_upsample_mmx):
          ; ---- prologue: build an MMWORD-aligned frame ----
          push    ebp
          mov     eax, esp                        ; eax -> saved ebp (arg base)
          sub     esp, byte 4
          and     esp, byte (-SIZEOF_MMWORD)      ; align the frame
          mov     [esp], eax                      ; remember the unaligned esp
          mov     ebp, esp                        ; ebp = aligned frame base
          lea     esp, [wk(0)]                    ; reserve the wk[] slots
          pushpic eax                             ; reserve the gotptr slot
          push    ebx
          push    esi
          push    edi
          ; ecx and edx are used without being saved.

          get_GOT ebx                             ; ebx = GOT address
          movpic  POINTER [gotptr], ebx           ; keep it for the loops

          ; Arguments are addressed through edx because ebp was re-based.
          ; This relies on eax still holding the value set above.
          mov     edx, eax
          mov     eax, JDIMENSION [downsamp_width(edx)]   ; eax = column count
          test    eax, eax
          jz      near .exit

          mov     ecx, INT [max_v_samp(edx)]      ; ecx = row count
          test    ecx, ecx
          jz      near .exit

          mov     esi, JSAMPARRAY [input_data(edx)]       ; esi -> input rows
          mov     edi, POINTER [output_data_ptr(edx)]
          mov     edi, JSAMPARRAY [edi]                   ; edi -> output rows

          alignx  16, 7
  .row_loop:
          ; Save the per-row state; all four registers are reused below.
          push    eax                             ; column count
          push    ecx                             ; row count
          push    edi
          push    esi

          mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; ecx = row above
          mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; ebx = current row
          mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = row below
          mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; edx = upper output
          mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; edi = lower output

          ; Width not a multiple of the block size: duplicate the last real
          ; sample of each of the three input rows just past its end.
          test    eax, SIZEOF_MMWORD-1
          jz      short .no_pad
          push    edx                             ; dl is needed as scratch
          mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
          mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
          mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
          mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
          mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
          mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl
          pop     edx
  .no_pad:
          ; ---- vertical pass for the first block of the row ----
          movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
          movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
          movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]

          pushpic ebx
          movpic  ebx, POINTER [gotptr]           ; ebx = GOT address

          pxor      mm3, mm3                      ; mm3 = all zero bits
          movq      mm4, mm0
          punpcklbw mm0, mm3                      ; mm0=row[ 0][0]( 0 1 2 3)
          punpckhbw mm4, mm3                      ; mm4=row[ 0][0]( 4 5 6 7)
          movq      mm5, mm1
          punpcklbw mm1, mm3                      ; mm1=row[-1][0]( 0 1 2 3)
          punpckhbw mm5, mm3                      ; mm5=row[-1][0]( 4 5 6 7)
          movq      mm6, mm2
          punpcklbw mm2, mm3                      ; mm2=row[+1][0]( 0 1 2 3)
          punpckhbw mm6, mm3                      ; mm6=row[+1][0]( 4 5 6 7)

          pmullw  mm0, [GOTOFF(ebx,PW_THREE)]
          pmullw  mm4, [GOTOFF(ebx,PW_THREE)]

          pcmpeqb mm7, mm7
          psrlq   mm7, (SIZEOF_MMWORD-2)*BYTE_BIT ; mask for the lowest word

          paddw   mm1, mm0                        ; mm1=Int0L=( 0 1 2 3)
          paddw   mm5, mm4                        ; mm5=Int0H=( 4 5 6 7)
          paddw   mm2, mm0                        ; mm2=Int1L=( 0 1 2 3)
          paddw   mm6, mm4                        ; mm6=Int1H=( 4 5 6 7)

          ; Park the intermediates in the output rows.
          movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
          movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5
          movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
          movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6

          ; Left neighbour of the first block is its own first intermediate.
          pand    mm1, mm7                        ; mm1=( 0 - - -)
          pand    mm2, mm7                        ; mm2=( 0 - - -)
          movq    MMWORD [wk(0)], mm1
          movq    MMWORD [wk(1)], mm2

          poppic  ebx

          ; Round the column count up to a whole number of blocks.
          add     eax, byte SIZEOF_MMWORD-1
          and     eax, byte -SIZEOF_MMWORD
          cmp     eax, byte SIZEOF_MMWORD
          ja      short .next_block               ; more than one block left

          alignx  16, 7
  .last_block:
          ; ---- final block: right neighbour = own last intermediate ----
          pushpic ebx
          movpic  ebx, POINTER [gotptr]           ; ebx = GOT address

          pcmpeqb mm1, mm1
          psllq   mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mask for the highest word
          movq    mm2, mm1
          pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
          pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)
          movq    MMWORD [wk(2)], mm1
          movq    MMWORD [wk(3)], mm2
          jmp     short .filter

          alignx  16, 7
  .next_block:
          ; ---- vertical pass for the block after the current one ----
          movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
          movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
          movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]

          pushpic ebx
          movpic  ebx, POINTER [gotptr]           ; ebx = GOT address

          pxor      mm3, mm3                      ; mm3 = all zero bits
          movq      mm4, mm0
          punpcklbw mm0, mm3                      ; mm0=row[ 0][1]( 0 1 2 3)
          punpckhbw mm4, mm3                      ; mm4=row[ 0][1]( 4 5 6 7)
          movq      mm5, mm1
          punpcklbw mm1, mm3                      ; mm1=row[-1][1]( 0 1 2 3)
          punpckhbw mm5, mm3                      ; mm5=row[-1][1]( 4 5 6 7)
          movq      mm6, mm2
          punpcklbw mm2, mm3                      ; mm2=row[+1][1]( 0 1 2 3)
          punpckhbw mm6, mm3                      ; mm6=row[+1][1]( 4 5 6 7)

          pmullw  mm0, [GOTOFF(ebx,PW_THREE)]
          pmullw  mm4, [GOTOFF(ebx,PW_THREE)]

          paddw   mm1, mm0                        ; mm1=Int0L=( 0 1 2 3)
          paddw   mm5, mm4                        ; mm5=Int0H=( 4 5 6 7)
          paddw   mm2, mm0                        ; mm2=Int1L=( 0 1 2 3)
          paddw   mm6, mm4                        ; mm6=Int1H=( 4 5 6 7)

          ; Park them in the NEXT output block; the current block's slots
          ; still hold its own intermediates.
          movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1
          movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5
          movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
          movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6

          ; Right neighbour of the current block = first word of the next.
          psllq   mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
          psllq   mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
          movq    MMWORD [wk(2)], mm1
          movq    MMWORD [wk(3)], mm2

  .filter:
          ; ---- horizontal pass, upper output row (Int0) ----
          movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
          movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)

          movq    mm0, mm7
          movq    mm4, mm3
          psrlq   mm0, 2*BYTE_BIT                 ; mm0=( 1 2 3 -)
          psllq   mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
          movq    mm5, mm7
          movq    mm6, mm3
          psrlq   mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
          psllq   mm6, 2*BYTE_BIT                 ; mm6=( - 4 5 6)

          por     mm0, mm4                        ; mm0=( 1 2 3 4)
          por     mm5, mm6                        ; mm5=( 3 4 5 6)

          movq    mm1, mm7
          movq    mm2, mm3
          psllq   mm1, 2*BYTE_BIT                 ; mm1=( - 0 1 2)
          psrlq   mm2, 2*BYTE_BIT                 ; mm2=( 5 6 7 -)
          movq    mm4, mm3
          psrlq   mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)

          por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
          por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)

          movq    MMWORD [wk(0)], mm4             ; left neighbour for next block

          pmullw  mm7, [GOTOFF(ebx,PW_THREE)]
          pmullw  mm3, [GOTOFF(ebx,PW_THREE)]
          paddw   mm1, [GOTOFF(ebx,PW_EIGHT)]
          paddw   mm5, [GOTOFF(ebx,PW_EIGHT)]
          paddw   mm0, [GOTOFF(ebx,PW_SEVEN)]
          paddw   mm2, [GOTOFF(ebx,PW_SEVEN)]

          ; Even outputs: (3*Int + prev + 8) >> 4.
          paddw   mm1, mm7
          paddw   mm5, mm3
          psrlw   mm1, 4                          ; mm1=Out0LE=( 0  2  4  6)
          psrlw   mm5, 4                          ; mm5=Out0HE=( 8 10 12 14)

          ; Odd outputs: (3*Int + next + 7) >> 4.
          paddw   mm0, mm7
          paddw   mm2, mm3
          psrlw   mm0, 4                          ; mm0=Out0LO=( 1  3  5  7)
          psrlw   mm2, 4                          ; mm2=Out0HO=( 9 11 13 15)

          ; Interleave: odd results go into the high byte of every word.
          psllw   mm0, BYTE_BIT
          psllw   mm2, BYTE_BIT
          por     mm1, mm0                        ; mm1=Out0L=( 0 .. 7)
          por     mm5, mm2                        ; mm5=Out0H=( 8 .. 15)

          movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
          movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5

          ; ---- horizontal pass, lower output row (Int1) ----
          movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
          movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)

          movq    mm7, mm6
          movq    mm3, mm4
          psrlq   mm7, 2*BYTE_BIT                 ; mm7=( 1 2 3 -)
          psllq   mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
          movq    mm0, mm6
          movq    mm2, mm4
          psrlq   mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
          psllq   mm2, 2*BYTE_BIT                 ; mm2=( - 4 5 6)

          por     mm7, mm3                        ; mm7=( 1 2 3 4)
          por     mm0, mm2                        ; mm0=( 3 4 5 6)

          movq    mm1, mm6
          movq    mm5, mm4
          psllq   mm1, 2*BYTE_BIT                 ; mm1=( - 0 1 2)
          psrlq   mm5, 2*BYTE_BIT                 ; mm5=( 5 6 7 -)
          movq    mm3, mm4
          psrlq   mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)

          por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
          por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)

          movq    MMWORD [wk(1)], mm3             ; left neighbour for next block

          pmullw  mm6, [GOTOFF(ebx,PW_THREE)]
          pmullw  mm4, [GOTOFF(ebx,PW_THREE)]
          paddw   mm1, [GOTOFF(ebx,PW_EIGHT)]
          paddw   mm0, [GOTOFF(ebx,PW_EIGHT)]
          paddw   mm7, [GOTOFF(ebx,PW_SEVEN)]
          paddw   mm5, [GOTOFF(ebx,PW_SEVEN)]

          ; Even outputs.
          paddw   mm1, mm6
          paddw   mm0, mm4
          psrlw   mm1, 4                          ; mm1=Out1LE=( 0  2  4  6)
          psrlw   mm0, 4                          ; mm0=Out1HE=( 8 10 12 14)

          ; Odd outputs.
          paddw   mm7, mm6
          paddw   mm5, mm4
          psrlw   mm7, 4                          ; mm7=Out1LO=( 1  3  5  7)
          psrlw   mm5, 4                          ; mm5=Out1HO=( 9 11 13 15)

          psllw   mm7, BYTE_BIT
          psllw   mm5, BYTE_BIT
          por     mm1, mm7                        ; mm1=Out1L=( 0 .. 7)
          por     mm0, mm5                        ; mm0=Out1H=( 8 .. 15)

          movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
          movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0

          poppic  ebx                             ; ebx = current input row again

          ; Advance one input block / two output blocks.
          sub     eax, byte SIZEOF_MMWORD
          add     ecx, byte 1*SIZEOF_MMWORD       ; row above
          add     ebx, byte 1*SIZEOF_MMWORD       ; current row
          add     esi, byte 1*SIZEOF_MMWORD       ; row below
          add     edx, byte 2*SIZEOF_MMWORD       ; upper output row
          add     edi, byte 2*SIZEOF_MMWORD       ; lower output row
          cmp     eax, byte SIZEOF_MMWORD
          ja      near .next_block                ; at least two blocks remain
          test    eax, eax
          jnz     near .last_block                ; exactly one block remains

          ; ---- row finished: restore the row state and step to the next ----
          pop     esi
          pop     edi
          pop     ecx
          pop     eax

          add     esi, byte 1*SIZEOF_JSAMPROW     ; one input row forward
          add     edi, byte 2*SIZEOF_JSAMPROW     ; two output rows forward
          sub     ecx, byte 2                     ; two output rows done
          jg      near .row_loop

          emms                                    ; empty MMX state

  .exit:
          ; ---- epilogue: unwind the aligned frame ----
          pop     edi
          pop     esi
          pop     ebx
          mov     esp, ebp                        ; drop wk[] and gotptr
          pop     esp                             ; back to the unaligned stack
          pop     ebp
          ret
  428. ; --------------------------------------------------------------------------
  ;
  ; jsimd_h2v1_upsample_mmx
  ;
  ; Fast upsampling for the common case of 2:1 horizontal and 1:1 vertical.
  ; This is a plain box filter: every input sample is written out twice.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
  ;                          JDIMENSION output_width,
  ;                          JSAMPARRAY input_data,
  ;                          JSAMPARRAY *output_data_ptr);
  ;
  ; Behaviour visible in the code:
  ;   * output_width is rounded up to a multiple of 2*SIZEOF_MMWORD; the
  ;     routine returns at once if that is zero or if max_v_samp_factor is
  ;     zero.
  ;   * max_v_samp_factor rows are processed; the input and output row
  ;     pointers both advance by one row per iteration.
  ;   * The column loop is unrolled twice.  Each half reads one MMWORD of
  ;     input, stores two MMWORDs of output and takes 2*SIZEOF_MMWORD off the
  ;     column counter.
  ;
  ; Register roles:
  ;   edx = rounded-up output width      eax = output columns left in the row
  ;   ecx = rows left
  ;   esi = input row array / inptr      edi = output row array / outptr
  ; ebx is not used; ecx and edx are used without being saved.
  ;

  %define max_v_samp(b)       (b)+8       ; int max_v_samp_factor
  %define output_width(b)     (b)+12      ; JDIMENSION output_width
  %define input_data(b)       (b)+16      ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b)+20      ; JSAMPARRAY *output_data_ptr

          align   16
          global  EXTN(jsimd_h2v1_upsample_mmx)

  EXTN(jsimd_h2v1_upsample_mmx):
          push    ebp
          mov     ebp, esp
          push    esi
          push    edi

          ; edx = output width rounded up to whole 2-MMWORD groups.
          mov     edx, JDIMENSION [output_width(ebp)]
          add     edx, byte (2*SIZEOF_MMWORD)-1
          and     edx, byte -(2*SIZEOF_MMWORD)
          jz      short .exit                     ; nothing to write

          mov     ecx, INT [max_v_samp(ebp)]      ; ecx = row count
          test    ecx, ecx
          jz      short .exit

          mov     edi, POINTER [output_data_ptr(ebp)]
          mov     edi, JSAMPARRAY [edi]                   ; edi -> output rows
          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input rows

          alignx  16, 7
  .row_loop:
          mov     eax, edx                        ; eax = columns left
          push    edi
          push    esi
          mov     edi, JSAMPROW [edi]             ; edi = current output row
          mov     esi, JSAMPROW [esi]             ; esi = current input row

          alignx  16, 7
  .col_loop:
          ; First half: duplicate input bytes 0..7 into output bytes 0..15.
          movq      mm0, MMWORD [esi+0*SIZEOF_MMWORD]
          movq      mm1, mm0
          punpcklbw mm0, mm0                      ; low four samples, doubled
          movq      MMWORD [edi+0*SIZEOF_MMWORD], mm0
          punpckhbw mm1, mm1                      ; high four samples, doubled
          movq      MMWORD [edi+1*SIZEOF_MMWORD], mm1

          sub     eax, byte 2*SIZEOF_MMWORD
          jz      short .row_done

          ; Second half: the same for the following MMWORD of input.
          movq      mm2, MMWORD [esi+1*SIZEOF_MMWORD]
          movq      mm3, mm2
          punpcklbw mm2, mm2
          movq      MMWORD [edi+2*SIZEOF_MMWORD], mm2
          punpckhbw mm3, mm3
          movq      MMWORD [edi+3*SIZEOF_MMWORD], mm3

          sub     eax, byte 2*SIZEOF_MMWORD
          jz      short .row_done

          add     edi, byte 4*SIZEOF_MMWORD       ; outptr
          add     esi, byte 2*SIZEOF_MMWORD       ; inptr
          jmp     short .col_loop

          alignx  16, 7
  .row_done:
          pop     esi
          pop     edi

          add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
          add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
          dec     ecx
          jg      short .row_loop

          emms                                    ; empty MMX state

  .exit:
          pop     edi
          pop     esi
          pop     ebp
          ret
  508. ; --------------------------------------------------------------------------
  ;
  ; jsimd_h2v2_upsample_mmx
  ;
  ; Fast upsampling for the common case of 2:1 horizontal and 2:1 vertical.
  ; This is a plain box filter: every input sample is written twice into each
  ; of two consecutive output rows.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
  ;                          JDIMENSION output_width,
  ;                          JSAMPARRAY input_data,
  ;                          JSAMPARRAY *output_data_ptr);
  ;
  ; Behaviour visible in the code:
  ;   * output_width is rounded up to a multiple of 2*SIZEOF_MMWORD; the
  ;     routine returns at once if that is zero or if max_v_samp_factor is
  ;     zero.
  ;   * The row counter drops by 2 per iteration while the input pointer
  ;     advances by one row and the output pointer by two rows.
  ;   * The column loop is unrolled twice.  Each half reads one MMWORD of
  ;     input and stores two MMWORDs into each of the two output rows.
  ;
  ; Register roles:
  ;   edx = rounded-up output width      eax = output columns left in the row
  ;   ecx = rows left                    esi = input row array / inptr
  ;   ebx = upper output row (outptr0)   edi = output row array / outptr1
  ; ecx and edx are used without being saved.
  ;

  %define max_v_samp(b)       (b)+8       ; int max_v_samp_factor
  %define output_width(b)     (b)+12      ; JDIMENSION output_width
  %define input_data(b)       (b)+16      ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b)+20      ; JSAMPARRAY *output_data_ptr

          align   16
          global  EXTN(jsimd_h2v2_upsample_mmx)

  EXTN(jsimd_h2v2_upsample_mmx):
          push    ebp
          mov     ebp, esp
          push    ebx
          push    esi
          push    edi

          ; edx = output width rounded up to whole 2-MMWORD groups.
          mov     edx, JDIMENSION [output_width(ebp)]
          add     edx, byte (2*SIZEOF_MMWORD)-1
          and     edx, byte -(2*SIZEOF_MMWORD)
          jz      near .exit                      ; nothing to write

          mov     ecx, INT [max_v_samp(ebp)]      ; ecx = row count
          test    ecx, ecx
          jz      short .exit

          mov     edi, POINTER [output_data_ptr(ebp)]
          mov     edi, JSAMPARRAY [edi]                   ; edi -> output rows
          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input rows

          alignx  16, 7
  .row_loop:
          mov     eax, edx                        ; eax = columns left
          push    edi
          push    esi
          mov     esi, JSAMPROW [esi]                     ; esi = input row
          mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; ebx = upper output
          mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; edi = lower output

          alignx  16, 7
  .col_loop:
          ; First half: duplicate input bytes 0..7 into both output rows.
          movq      mm0, MMWORD [esi+0*SIZEOF_MMWORD]
          movq      mm1, mm0
          punpcklbw mm0, mm0                      ; low four samples, doubled
          movq      MMWORD [ebx+0*SIZEOF_MMWORD], mm0
          punpckhbw mm1, mm1                      ; high four samples, doubled
          movq      MMWORD [ebx+1*SIZEOF_MMWORD], mm1
          movq      MMWORD [edi+0*SIZEOF_MMWORD], mm0
          movq      MMWORD [edi+1*SIZEOF_MMWORD], mm1

          sub     eax, byte 2*SIZEOF_MMWORD
          jz      short .row_done

          ; Second half: the same for the following MMWORD of input.
          movq      mm2, MMWORD [esi+1*SIZEOF_MMWORD]
          movq      mm3, mm2
          punpcklbw mm2, mm2
          movq      MMWORD [ebx+2*SIZEOF_MMWORD], mm2
          punpckhbw mm3, mm3
          movq      MMWORD [ebx+3*SIZEOF_MMWORD], mm3
          movq      MMWORD [edi+2*SIZEOF_MMWORD], mm2
          movq      MMWORD [edi+3*SIZEOF_MMWORD], mm3

          sub     eax, byte 2*SIZEOF_MMWORD
          jz      short .row_done

          add     esi, byte 2*SIZEOF_MMWORD       ; inptr
          add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
          add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
          jmp     short .col_loop

          alignx  16, 7
  .row_done:
          pop     esi
          pop     edi

          add     esi, byte 1*SIZEOF_JSAMPROW     ; one input row forward
          add     edi, byte 2*SIZEOF_JSAMPROW     ; two output rows forward
          sub     ecx, byte 2                     ; two output rows done
          jg      short .row_loop

          emms                                    ; empty MMX state

  .exit:
          pop     edi
          pop     esi
          pop     ebx
          pop     ebp
          ret
  ; Keep this trailing directive.  Per the original author's note, the OS X
  ; linker does not honor the request to align the segment unless the file
  ; ends with it (the reason is not known).
          align   16