; jdsample-sse2.asm (26 KB)
  1. ;
  2. ; jdsample.asm - upsampling (SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on the x86 SIMD extension for IJG JPEG library
  7. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  8. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  9. ;
  10. ; This file should be assembled with NASM (Netwide Assembler),
  11. ; can *not* be assembled with Microsoft's MASM or any compatible
  12. ; assembler (including Borland's Turbo Assembler).
  13. ; NASM is available from http://nasm.sourceforge.net/ or
  14. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  15. ;
  16. ; [TAB8]
  17. %include "jsimdext.inc"
  18. ; --------------------------------------------------------------------------
  SECTION SEG_CONST

          alignz  16
          global  EXTN(jconst_fancy_upsample_sse2)

  ; Packed-word constants shared by the two "fancy" upsamplers in this file.
  ; Every entry is eight identical 16-bit lanes (one XMM word):
  ;   PW_THREE          weight applied to the nearer sample
  ;   PW_ONE / PW_TWO   rounding terms added before the ">> 2" (h2v1)
  ;   PW_SEVEN/PW_EIGHT rounding terms added before the ">> 4" (h2v2)
  EXTN(jconst_fancy_upsample_sse2):

  PW_ONE:         dw      1, 1, 1, 1, 1, 1, 1, 1
  PW_TWO:         dw      2, 2, 2, 2, 2, 2, 2, 2
  PW_THREE:       dw      3, 3, 3, 3, 3, 3, 3, 3
  PW_SEVEN:       dw      7, 7, 7, 7, 7, 7, 7, 7
  PW_EIGHT:       dw      8, 8, 8, 8, 8, 8, 8, 8

          alignz  16

  ; --------------------------------------------------------------------------
  SECTION SEG_TEXT
  BITS    32
  32. ;
  33. ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  34. ;
  35. ; The upsampling algorithm is linear interpolation between pixel centers,
  36. ; also known as a "triangle filter". This is a good compromise between
  37. ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
  38. ; of the way between input pixel centers.
  39. ;
  40. ; GLOBAL(void)
  41. ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
  42. ; JDIMENSION downsampled_width,
  43. ; JSAMPARRAY input_data,
  44. ; JSAMPARRAY *output_data_ptr);
  45. ;
  46. %define max_v_samp(b) (b)+8 ; int max_v_samp_factor
  47. %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
  48. %define input_data(b) (b)+16 ; JSAMPARRAY input_data
  49. %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
  ; The argument offsets above are relative to ebp as set up by the
  ; "push ebp / mov ebp,esp" prologue (saved ebp at +0, return address at +4).
  ;
  ; Register roles inside the function:
  ;   eax  = column counter (input samples left in the row, rounded up to a
  ;          multiple of SIZEOF_XMMWORD before the column loop)
  ;   ecx  = row counter (max_v_samp_factor rows are processed)
  ;   esi  = input row-pointer array; inside a row, the input pointer
  ;   edi  = output row-pointer array; inside a row, the output pointer
  ;   ebx  = GOT base for the GOTOFF() constant references
  ;   xmm0 = all zeros (used to widen bytes to words)
  ;   xmm7 = left neighbour ("sample -1") of the current block, in byte lane 0
  ;   xmm6 = right neighbour ("sample 16") of the current block, in byte lane 15
  ;
  ; Per input sample s[i] two output samples are produced:
  ;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
  ;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
  ; At the row edges the missing neighbour is replaced by the edge sample itself.
  ; All XMMWORD accesses use movdqa/pand with memory operands, so the row
  ; buffers must be 16-byte aligned.
  50. align 16
  51. global EXTN(jsimd_h2v1_fancy_upsample_sse2)
  52. EXTN(jsimd_h2v1_fancy_upsample_sse2):
  53. push ebp
  54. mov ebp,esp
  55. pushpic ebx ; macro from jsimdext.inc; presumably saves ebx only in PIC builds - confirm
  56. ; push ecx ; need not be preserved
  57. ; push edx ; need not be preserved
  58. push esi
  59. push edi
  60. get_GOT ebx ; get GOT address
  61. mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
  62. test eax,eax
  63. jz near .return ; zero width: nothing to do
  64. mov ecx, INT [max_v_samp(ebp)] ; rowctr
  65. test ecx,ecx
  66. jz near .return ; zero rows: nothing to do
  67. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  68. mov edi, POINTER [output_data_ptr(ebp)]
  69. mov edi, JSAMPARRAY [edi] ; output_data
  70. alignx 16,7
  71. .rowloop:
  ; Save the per-row state; eax/esi/edi are reused as counters/pointers below
  ; and restored (pop esi / pop edi / pop eax) at the end of the row.
  72. push eax ; colctr
  73. push edi
  74. push esi
  75. mov esi, JSAMPROW [esi] ; inptr
  76. mov edi, JSAMPROW [edi] ; outptr
  ; If the width is not a multiple of the XMM word size, duplicate the last real
  ; sample one position past the end so it serves as the right neighbour of the
  ; final sample. Note this writes one byte beyond downsampled_width in the
  ; input row (the row buffer is assumed to have room for it).
  77. test eax, SIZEOF_XMMWORD-1
  78. jz short .skip
  79. mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
  80. mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
  81. .skip:
  82. pxor xmm0,xmm0 ; xmm0=(all 0's)
  ; xmm7 = byte-lane-0 mask ANDed with the first block = sample 0 alone in
  ; lane 0. It acts as "sample -1" for the first block (left edge replication).
  83. pcmpeqb xmm7,xmm7
  84. psrldq xmm7,(SIZEOF_XMMWORD-1)
  85. pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
  ; Round the column count up to a whole number of XMM words.
  86. add eax, byte SIZEOF_XMMWORD-1
  87. and eax, byte -SIZEOF_XMMWORD
  88. cmp eax, byte SIZEOF_XMMWORD
  89. ja short .columnloop ; more than one block: take the normal path
  90. alignx 16,7
  91. .columnloop_last:
  ; Last (or only) block of the row: there is no following block, so the right
  ; neighbour "sample 16" is this block's own sample 15, kept in byte lane 15.
  92. pcmpeqb xmm6,xmm6
  93. pslldq xmm6,(SIZEOF_XMMWORD-1)
  94. pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
  95. jmp short .upsample
  96. alignx 16,7
  97. .columnloop:
  ; Not the last block: the right neighbour is sample 0 of the next block,
  ; moved into byte lane 15.
  98. movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
  99. pslldq xmm6,(SIZEOF_XMMWORD-1)
  100. .upsample:
  101. movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
  102. movdqa xmm2,xmm1
  103. movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
  104. pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
  105. psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
  106. por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
  107. por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
  ; Carry this block's sample 15 over as the left neighbour of the next block.
  108. movdqa xmm7,xmm1
  109. psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
  ; Widen current / previous / next samples from bytes to words (low and high
  ; halves separately) by interleaving with the zero register.
  110. movdqa xmm4,xmm1
  111. punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
  112. punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
  113. movdqa xmm5,xmm2
  114. punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
  115. punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
  116. movdqa xmm6,xmm3
  117. punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
  118. punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
  119. pmullw xmm1,[GOTOFF(ebx,PW_THREE)] ; xmm1 = 3*cur (low half)
  120. pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; xmm4 = 3*cur (high half)
  121. paddw xmm2,[GOTOFF(ebx,PW_ONE)] ; prev + 1 (rounding for even outputs)
  122. paddw xmm5,[GOTOFF(ebx,PW_ONE)]
  123. paddw xmm3,[GOTOFF(ebx,PW_TWO)] ; next + 2 (rounding for odd outputs)
  124. paddw xmm6,[GOTOFF(ebx,PW_TWO)]
  125. paddw xmm2,xmm1 ; 3*cur + prev + 1
  126. paddw xmm5,xmm4
  127. psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
  128. psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
  129. paddw xmm3,xmm1 ; 3*cur + next + 2
  130. paddw xmm6,xmm4
  131. psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
  132. psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
  ; Interleave: odd outputs go to the high byte of each word, even outputs stay
  ; in the low byte, giving 32 consecutive output samples.
  133. psllw xmm3,BYTE_BIT
  134. psllw xmm6,BYTE_BIT
  135. por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
  136. por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
  137. movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
  138. movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
  ; Advance one input block (two output blocks) and pick the next path:
  ; more than one block left -> .columnloop, exactly one -> .columnloop_last,
  ; none -> row finished.
  139. sub eax, byte SIZEOF_XMMWORD
  140. add esi, byte 1*SIZEOF_XMMWORD ; inptr
  141. add edi, byte 2*SIZEOF_XMMWORD ; outptr
  142. cmp eax, byte SIZEOF_XMMWORD
  143. ja near .columnloop
  144. test eax,eax
  145. jnz near .columnloop_last
  ; Row done: restore the row-pointer arrays and the original column count,
  ; then step both arrays to the next row.
  146. pop esi
  147. pop edi
  148. pop eax
  149. add esi, byte SIZEOF_JSAMPROW ; input_data
  150. add edi, byte SIZEOF_JSAMPROW ; output_data
  151. dec ecx ; rowctr
  152. jg near .rowloop
  153. .return:
  154. pop edi
  155. pop esi
  156. ; pop edx ; need not be preserved
  157. ; pop ecx ; need not be preserved
  158. poppic ebx ; counterpart of the pushpic in the prologue
  159. pop ebp
  160. ret
  161. ; --------------------------------------------------------------------------
  162. ;
  163. ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  164. ; Again a triangle filter; see comments for h2v1 case, above.
  165. ;
  166. ; GLOBAL(void)
  167. ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
  168. ; JDIMENSION downsampled_width,
  169. ; JSAMPARRAY input_data,
  170. ; JSAMPARRAY *output_data_ptr);
  171. ;
  172. %define max_v_samp(b) (b)+8 ; int max_v_samp_factor
  173. %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
  174. %define input_data(b) (b)+16 ; JSAMPARRAY input_data
  175. %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
  176. %define original_ebp ebp+0
  177. %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
  178. %define WK_NUM 4
  179. %define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
  ; Frame layout (ebp is 16-byte aligned in this function):
  ;   [ebp+0]          unaligned frame pointer saved by the prologue
  ;   wk(0)..wk(3)     four XMM-word scratch slots directly below ebp
  ;   gotptr           one pointer-sized slot directly below wk(0)
  ; The argument macros are applied to the unaligned frame pointer (edx), not
  ; to the aligned ebp.
  ;
  ; Scratch slot usage:
  ;   wk(0) / wk(1) = left neighbour ("column -1") of the current block for the
  ;                   upper / lower output row, as a word in lane 0
  ;   wk(2) / wk(3) = right neighbour ("column 16") for the upper / lower output
  ;                   row, as a word in lane 7
  ;
  ; Algorithm, per input row and per block of 16 input samples:
  ;   vertical pass:   Int0 = 3*row[0] + row[-1]   (upper output row)
  ;                    Int1 = 3*row[0] + row[+1]   (lower output row)
  ;     The 16-bit intermediates are stored temporarily in the output rows.
  ;   horizontal pass: out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
  ;                    out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
  ; At the left/right edges the missing neighbour is the edge column itself.
  ; The code reads input_data[-1] and input_data[+1], so those row pointers
  ; must be valid. All XMMWORD accesses are aligned (movdqa / pand mem).
  180. align 16
  181. global EXTN(jsimd_h2v2_fancy_upsample_sse2)
  182. EXTN(jsimd_h2v2_fancy_upsample_sse2):
  183. push ebp
  184. mov eax,esp ; eax = original ebp (esp right after the push: [eax]=saved ebp, args from eax+8)
  185. sub esp, byte 4 ; reserve the slot that will hold eax
  186. and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
  187. mov [esp],eax ; saved at [aligned ebp+0], reloaded by "pop esp" in the epilogue
  188. mov ebp,esp ; ebp = aligned ebp
  189. lea esp, [wk(0)] ; allocate the wk[] scratch area below ebp
  190. pushpic eax ; make room for the GOT address (gotptr slot; pushed value is overwritten below)
  191. push ebx
  192. ; push ecx ; need not be preserved
  193. ; push edx ; need not be preserved
  194. push esi
  195. push edi
  196. get_GOT ebx ; get GOT address
  197. movpic POINTER [gotptr], ebx ; save GOT address
  198. mov edx,eax ; edx = original ebp
  199. mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
  200. test eax,eax
  201. jz near .return
  202. mov ecx, INT [max_v_samp(edx)] ; rowctr
  203. test ecx,ecx
  204. jz near .return
  205. mov esi, JSAMPARRAY [input_data(edx)] ; input_data
  206. mov edi, POINTER [output_data_ptr(edx)]
  207. mov edi, JSAMPARRAY [edi] ; output_data
  208. alignx 16,7
  209. .rowloop:
  ; Save per-row state. Every general register is reused inside the row:
  ; ecx/ebx/esi become the three input row pointers, edx/edi the two output
  ; row pointers, eax the remaining column count.
  210. push eax ; colctr
  211. push ecx
  212. push edi
  213. push esi
  214. mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
  215. mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
  216. mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
  217. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
  218. mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
  ; If the width is not a multiple of the XMM word size, duplicate the last real
  ; sample of each of the three input rows one position past the end (writes one
  ; byte beyond downsampled_width in each row). edx is saved because dl is the
  ; temporary while edx holds outptr0.
  219. test eax, SIZEOF_XMMWORD-1
  220. jz short .skip
  221. push edx
  222. mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
  223. mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
  224. mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
  225. mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
  226. mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
  227. mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
  228. pop edx
  229. .skip:
  230. ; -- process the first column block
  ; Vertical pass for block 0; results go to the first two XMM words of each
  ; output row.
  231. movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
  232. movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
  233. movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
  234. pushpic ebx ; ebx (inptr0) is borrowed as GOT base until the matching poppic
  235. movpic ebx, POINTER [gotptr] ; load GOT address
  236. pxor xmm3,xmm3 ; xmm3=(all 0's)
  237. movdqa xmm4,xmm0
  238. punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
  239. punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
  240. movdqa xmm5,xmm1
  241. punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
  242. punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
  243. movdqa xmm6,xmm2
  244. punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
  245. punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
  246. pmullw xmm0,[GOTOFF(ebx,PW_THREE)] ; 3*row[0] (low half)
  247. pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; 3*row[0] (high half)
  248. pcmpeqb xmm7,xmm7
  249. psrldq xmm7,(SIZEOF_XMMWORD-2) ; xmm7 = mask selecting word lane 0 only
  250. paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
  251. paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
  252. paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
  253. paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
  254. movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
  255. movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
  256. movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
  257. movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
  ; Left edge: column 0 doubles as its own left neighbour ("column -1").
  258. pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
  259. pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
  260. movdqa XMMWORD [wk(0)], xmm1
  261. movdqa XMMWORD [wk(1)], xmm2
  262. poppic ebx ; ebx = inptr0 again
  ; Round the column count up to a whole number of XMM words.
  263. add eax, byte SIZEOF_XMMWORD-1
  264. and eax, byte -SIZEOF_XMMWORD
  265. cmp eax, byte SIZEOF_XMMWORD
  266. ja short .columnloop ; more than one block: take the normal path
  267. alignx 16,7
  268. .columnloop_last:
  269. ; -- process the last column block
  ; Right edge: there is no following block, so column 15 of the current
  ; intermediates (word lane 7 of the high half) is its own right neighbour.
  270. pushpic ebx ; balanced by the poppic at the end of .upsample
  271. movpic ebx, POINTER [gotptr] ; load GOT address
  272. pcmpeqb xmm1,xmm1
  273. pslldq xmm1,(SIZEOF_XMMWORD-2) ; mask selecting word lane 7 only
  274. movdqa xmm2,xmm1
  275. pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
  276. pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
  277. movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
  278. movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
  279. jmp near .upsample
  280. alignx 16,7
  281. .columnloop:
  282. ; -- process the next column block
  ; Vertical pass for the block AFTER the current one; its intermediates are
  ; stored at XMM words 2..3 of the output rows, and its column 0 becomes the
  ; right neighbour ("column 16") of the current block.
  283. movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
  284. movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
  285. movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
  286. pushpic ebx ; balanced by the poppic at the end of .upsample
  287. movpic ebx, POINTER [gotptr] ; load GOT address
  288. pxor xmm3,xmm3 ; xmm3=(all 0's)
  289. movdqa xmm4,xmm0
  290. punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
  291. punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
  292. movdqa xmm5,xmm1
  293. punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
  294. punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
  295. movdqa xmm6,xmm2
  296. punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
  297. punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
  298. pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
  299. pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
  300. paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
  301. paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
  302. paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
  303. paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
  304. movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
  305. movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
  306. movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
  307. movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
  308. pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
  309. pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
  310. movdqa XMMWORD [wk(2)], xmm1
  311. movdqa XMMWORD [wk(3)], xmm2
  312. .upsample:
  ; Horizontal pass over the current block. On entry ebx holds the GOT base
  ; (set by whichever path jumped or fell through to here).
  313. ; -- process the upper row
  314. movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
  315. movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
  316. movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
  317. movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
  318. psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
  319. pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
  320. movdqa xmm5,xmm7
  321. movdqa xmm6,xmm3
  322. psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
  323. pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
  324. por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
  325. por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
  326. movdqa xmm1,xmm7
  327. movdqa xmm2,xmm3
  328. pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
  329. psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
  330. movdqa xmm4,xmm3
  331. psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
  332. por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
  333. por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
  334. movdqa XMMWORD [wk(0)], xmm4 ; column 15 becomes "column -1" of the next block
  335. pmullw xmm7,[GOTOFF(ebx,PW_THREE)] ; 3*cur (low half)
  336. pmullw xmm3,[GOTOFF(ebx,PW_THREE)] ; 3*cur (high half)
  337. paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] ; prev + 8 (rounding for even outputs)
  338. paddw xmm5,[GOTOFF(ebx,PW_EIGHT)]
  339. paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] ; next + 7 (rounding for odd outputs)
  340. paddw xmm2,[GOTOFF(ebx,PW_SEVEN)]
  341. paddw xmm1,xmm7
  342. paddw xmm5,xmm3
  343. psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
  344. psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
  345. paddw xmm0,xmm7
  346. paddw xmm2,xmm3
  347. psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
  348. psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
  ; Interleave: odd outputs into the high byte of each word, even in the low.
  349. psllw xmm0,BYTE_BIT
  350. psllw xmm2,BYTE_BIT
  351. por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
  352. por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
  ; The final samples overwrite the intermediates of this block in place.
  353. movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
  354. movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
  355. ; -- process the lower row
  356. movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
  357. movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
  358. movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
  359. movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
  360. psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
  361. pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
  362. movdqa xmm0,xmm6
  363. movdqa xmm2,xmm4
  364. psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
  365. pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
  366. por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
  367. por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
  368. movdqa xmm1,xmm6
  369. movdqa xmm5,xmm4
  370. pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
  371. psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
  372. movdqa xmm3,xmm4
  373. psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
  374. por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
  375. por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
  376. movdqa XMMWORD [wk(1)], xmm3 ; column 15 becomes "column -1" of the next block
  377. pmullw xmm6,[GOTOFF(ebx,PW_THREE)] ; 3*cur (low half)
  378. pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; 3*cur (high half)
  379. paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] ; prev + 8 (rounding for even outputs)
  380. paddw xmm0,[GOTOFF(ebx,PW_EIGHT)]
  381. paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] ; next + 7 (rounding for odd outputs)
  382. paddw xmm5,[GOTOFF(ebx,PW_SEVEN)]
  383. paddw xmm1,xmm6
  384. paddw xmm0,xmm4
  385. psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
  386. psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
  387. paddw xmm7,xmm6
  388. paddw xmm5,xmm4
  389. psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
  390. psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
  391. psllw xmm7,BYTE_BIT
  392. psllw xmm5,BYTE_BIT
  393. por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
  394. por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
  395. movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
  396. movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
  397. poppic ebx ; ebx = inptr0 again (must precede the "add ebx" below)
  ; Advance one input block (two output blocks) and pick the next path:
  ; more than one block left -> .columnloop, exactly one -> .columnloop_last,
  ; none -> row finished.
  398. sub eax, byte SIZEOF_XMMWORD
  399. add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
  400. add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
  401. add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
  402. add edx, byte 2*SIZEOF_XMMWORD ; outptr0
  403. add edi, byte 2*SIZEOF_XMMWORD ; outptr1
  404. cmp eax, byte SIZEOF_XMMWORD
  405. ja near .columnloop
  406. test eax,eax
  407. jnz near .columnloop_last
  ; Row done: restore the saved state. One input row yields two output rows, so
  ; the output array advances by two entries and rowctr drops by two.
  408. pop esi
  409. pop edi
  410. pop ecx
  411. pop eax
  412. add esi, byte 1*SIZEOF_JSAMPROW ; input_data
  413. add edi, byte 2*SIZEOF_JSAMPROW ; output_data
  414. sub ecx, byte 2 ; rowctr
  415. jg near .rowloop
  416. .return:
  417. pop edi
  418. pop esi
  419. ; pop edx ; need not be preserved
  420. ; pop ecx ; need not be preserved
  421. pop ebx
  ; Discard wk[] and the gotptr slot, then reload the unaligned frame pointer
  ; that the prologue stored at [ebp+0].
  422. mov esp,ebp ; esp <- aligned ebp
  423. pop esp ; esp <- original ebp
  424. pop ebp
  425. ret
  426. ; --------------------------------------------------------------------------
  427. ;
  428. ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
  429. ; It's still a box filter.
  430. ;
  431. ; GLOBAL(void)
  432. ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
  433. ; JDIMENSION output_width,
  434. ; JSAMPARRAY input_data,
  435. ; JSAMPARRAY *output_data_ptr);
  436. ;
  %define max_v_samp(b)       (b)+8     ; int max_v_samp_factor
  %define output_width(b)     (b)+12    ; JDIMENSION output_width
  %define input_data(b)       (b)+16    ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b)+20    ; JSAMPARRAY *output_data_ptr

  ; Box-filter 2:1 horizontal upsampling: every input sample is written twice.
  ;
  ; Register roles:
  ;   edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
  ;   ecx = rows still to process (starts at max_v_samp_factor)
  ;   eax = output bytes still to produce in the current row
  ;   esi = input row-pointer array / current input pointer
  ;   edi = output row-pointer array / current output pointer
  ; ebx is not used; ecx and edx are not preserved for the caller.
  ; All XMMWORD accesses are aligned (movdqa). Because the width is rounded up,
  ; whole XMM words are read and written even for a partial final group.

          align   16
          global  EXTN(jsimd_h2v1_upsample_sse2)

  EXTN(jsimd_h2v1_upsample_sse2):
          push    ebp
          mov     ebp, esp
          push    esi
          push    edi

          ; Round the requested width up; leave if the result is zero.
          mov     edx, JDIMENSION [output_width(ebp)]
          add     edx, byte (2*SIZEOF_XMMWORD)-1
          and     edx, byte -(2*SIZEOF_XMMWORD)
          jz      short .exit

          mov     ecx, INT [max_v_samp(ebp)]              ; ecx = row count
          test    ecx, ecx
          jz      short .exit

          mov     edi, POINTER [output_data_ptr(ebp)]
          mov     edi, JSAMPARRAY [edi]                   ; edi -> output row pointers
          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row pointers
          alignx  16,7

  .each_row:
          push    edi                                     ; keep the row-pointer arrays
          push    esi
          mov     eax, edx                                ; eax = bytes to emit this row
          mov     edi, JSAMPROW [edi]                     ; edi = output row
          mov     esi, JSAMPROW [esi]                     ; esi = input row
          alignx  16,7

  .expand:
          ; First input XMM word -> two output XMM words.
          movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
          movdqa  xmm1, xmm0
          punpckhbw xmm1, xmm1                            ; upper 8 bytes, each doubled
          punpcklbw xmm0, xmm0                            ; lower 8 bytes, each doubled
          movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
          movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
          sub     eax, byte 2*SIZEOF_XMMWORD
          jz      short .row_done

          ; Second input XMM word -> next two output XMM words.
          movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
          movdqa  xmm3, xmm2
          punpckhbw xmm3, xmm3
          punpcklbw xmm2, xmm2
          movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
          movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
          sub     eax, byte 2*SIZEOF_XMMWORD
          jz      short .row_done

          ; More to do: step past the two input / four output words handled.
          add     edi, byte 4*SIZEOF_XMMWORD              ; output pointer
          add     esi, byte 2*SIZEOF_XMMWORD              ; input pointer
          jmp     short .expand
          alignx  16,7

  .row_done:
          pop     esi
          pop     edi
          add     edi, byte SIZEOF_JSAMPROW               ; next output row pointer
          add     esi, byte SIZEOF_JSAMPROW               ; next input row pointer
          dec     ecx                                     ; one row finished
          jg      short .each_row

  .exit:
          pop     edi
          pop     esi
          pop     ebp
          ret
  505. ; --------------------------------------------------------------------------
  506. ;
  507. ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
  508. ; It's still a box filter.
  509. ;
  510. ; GLOBAL(void)
  511. ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
  512. ; JDIMENSION output_width,
  513. ; JSAMPARRAY input_data,
  514. ; JSAMPARRAY *output_data_ptr);
  515. ;
  %define max_v_samp(b)       (b)+8     ; int max_v_samp_factor
  %define output_width(b)     (b)+12    ; JDIMENSION output_width
  %define input_data(b)       (b)+16    ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b)+20    ; JSAMPARRAY *output_data_ptr

  ; Box-filter 2:1 horizontal and 2:1 vertical upsampling: every input sample
  ; is written twice horizontally, and the expanded row is stored into two
  ; consecutive output rows.
  ;
  ; Register roles:
  ;   edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
  ;   ecx = output rows still to produce (starts at max_v_samp_factor,
  ;         decreases by 2 per input row)
  ;   eax = output bytes still to produce in the current row pair
  ;   esi = input row-pointer array / current input pointer
  ;   ebx = pointer into the first output row of the pair
  ;   edi = output row-pointer array / pointer into the second output row
  ; ecx and edx are not preserved for the caller.
  ; All XMMWORD accesses are aligned (movdqa). Because the width is rounded up,
  ; whole XMM words are read and written even for a partial final group.

          align   16
          global  EXTN(jsimd_h2v2_upsample_sse2)

  EXTN(jsimd_h2v2_upsample_sse2):
          push    ebp
          mov     ebp, esp
          push    ebx
          push    esi
          push    edi

          ; Round the requested width up; leave if the result is zero.
          mov     edx, JDIMENSION [output_width(ebp)]
          add     edx, byte (2*SIZEOF_XMMWORD)-1
          and     edx, byte -(2*SIZEOF_XMMWORD)
          jz      near .exit

          mov     ecx, INT [max_v_samp(ebp)]              ; ecx = output row count
          test    ecx, ecx
          jz      near .exit

          mov     edi, POINTER [output_data_ptr(ebp)]
          mov     edi, JSAMPARRAY [edi]                   ; edi -> output row pointers
          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row pointers
          alignx  16,7

  .each_row_pair:
          push    edi                                     ; keep the row-pointer arrays
          push    esi
          mov     eax, edx                                ; eax = bytes to emit per row
          mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; ebx = first output row
          mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; edi = second output row
          mov     esi, JSAMPROW [esi]                     ; esi = input row
          alignx  16,7

  .expand:
          ; First input XMM word -> two output XMM words, stored to both rows.
          movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
          movdqa  xmm1, xmm0
          punpckhbw xmm1, xmm1                            ; upper 8 bytes, each doubled
          punpcklbw xmm0, xmm0                            ; lower 8 bytes, each doubled
          movdqa  XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
          movdqa  XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
          movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
          movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
          sub     eax, byte 2*SIZEOF_XMMWORD
          jz      short .pair_done

          ; Second input XMM word -> next two output XMM words, both rows.
          movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
          movdqa  xmm3, xmm2
          punpckhbw xmm3, xmm3
          punpcklbw xmm2, xmm2
          movdqa  XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
          movdqa  XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
          movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
          movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
          sub     eax, byte 2*SIZEOF_XMMWORD
          jz      short .pair_done

          ; More to do: step past the two input / four output words handled.
          add     ebx, byte 4*SIZEOF_XMMWORD              ; first output row pointer
          add     edi, byte 4*SIZEOF_XMMWORD              ; second output row pointer
          add     esi, byte 2*SIZEOF_XMMWORD              ; input pointer
          jmp     short .expand
          alignx  16,7

  .pair_done:
          pop     esi
          pop     edi
          add     edi, byte 2*SIZEOF_JSAMPROW             ; skip the two rows just written
          add     esi, byte 1*SIZEOF_JSAMPROW             ; next input row pointer
          sub     ecx, byte 2                             ; two output rows finished
          jg      short .each_row_pair

  .exit:
          pop     edi
          pop     esi
          pop     ebx
          pop     ebp
          ret

  ; Trailing alignment directive: according to the original note, the OS X
  ; linker does not honor the segment alignment request without it.
          align   16