;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
  %include "jsimdext.inc"

  ; --------------------------------------------------------------------------
  ; Constant table for the "fancy" (triangle filter) upsamplers in this file.
  ; Every entry is eight identical 16-bit words, i.e. one full XMM register,
  ; and is read with [rel ...] addressing from the code below.
  ;
          SECTION     SEG_CONST

          alignz      16
          global      EXTN(jconst_fancy_upsample_sse2)

  EXTN(jconst_fancy_upsample_sse2):

  PW_ONE      times 8 dw 1            ; h2v1 fancy: bias added for even outputs
  PW_TWO      times 8 dw 2            ; h2v1 fancy: bias added for odd outputs
  PW_THREE    times 8 dw 3            ; weight applied to the nearer sample
  PW_SEVEN    times 8 dw 7            ; h2v2 fancy: bias added for odd outputs
  PW_EIGHT    times 8 dw 8            ; h2v2 fancy: bias added for even outputs

          alignz      16

  ; --------------------------------------------------------------------------
          SECTION     SEG_TEXT
          BITS        64
  ;
  ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  ;
  ; The upsampling algorithm is linear interpolation between pixel centers,
  ; also known as a "triangle filter".  This is a good compromise between
  ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
  ; of the way between input pixel centers.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
  ;                                 JDIMENSION downsampled_width,
  ;                                 JSAMPARRAY input_data,
  ;                                 JSAMPARRAY *output_data_ptr);
  ;
  ; r10 = int max_v_samp_factor
  ; r11 = JDIMENSION downsampled_width
  ; r12 = JSAMPARRAY input_data
  ; r13 = JSAMPARRAY *output_data_ptr
  ;
  ; Per 16 input samples s[i] the routine emits 32 output samples:
  ;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
  ;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
  ; with s[-1] taken as s[0] for the first block and s[16] taken as s[15]
  ; for the last block of a row.
  ;
  ; Rows are accessed with movdqa, so every input and output row pointer must
  ; be 16-byte aligned.  When the width is not a multiple of 16 one extra
  ; byte is written just past the end of each input row.
  ;
  ; Returns immediately if either downsampled_width or max_v_samp_factor is 0.

          align       16
          global      EXTN(jsimd_h2v1_fancy_upsample_sse2)

  EXTN(jsimd_h2v1_fancy_upsample_sse2):
          push        rbp
          mov         rax, rsp                ; rax is overwritten right after
                                              ; collect_args, so only that macro
                                              ; (jsimdext.inc) can be its consumer
          mov         rbp, rsp
          collect_args

          mov         eax, r11d               ; colctr (unsigned, zero-extended)
          test        rax, rax
          jz          near .return

          ; max_v_samp_factor is a C int: only the low 32 bits of the
          ; argument register are defined, so sign-extend them instead of
          ; trusting all 64 bits of r10.
          movsxd      rcx, r10d               ; rowctr
          test        rcx, rcx
          jz          near .return

          mov         rsi, r12                ; input_data
          mov         rdi, r13
          mov         rdi, JSAMPARRAY [rdi]   ; output_data

  .rowloop:
          push        rax                     ; colctr
          push        rdi
          push        rsi

          mov         rsi, JSAMPROW [rsi]     ; inptr
          mov         rdi, JSAMPROW [rdi]     ; outptr

          ; If the width is not a multiple of 16, replicate the last real
          ; sample one position further so the final block has a valid
          ; right-hand neighbour inside the loaded vector.
          test        rax, SIZEOF_XMMWORD-1
          jz          short .skip
          mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
          mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
  .skip:
          pxor        xmm0, xmm0              ; xmm0=(all 0's)

          ; Left edge: keep only byte 0 of the first block; it serves as
          ; the "sample -1" neighbour of sample 0.
          pcmpeqb     xmm7, xmm7
          psrldq      xmm7, (SIZEOF_XMMWORD-1)
          pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]

          ; Round colctr up to a whole number of 16-sample blocks.
          add         rax, byte SIZEOF_XMMWORD-1
          and         rax, byte -SIZEOF_XMMWORD
          cmp         rax, byte SIZEOF_XMMWORD
          ja          short .columnloop

  .columnloop_last:
          ; Right edge: byte 15 of this block doubles as "sample 16".
          pcmpeqb     xmm6, xmm6
          pslldq      xmm6, (SIZEOF_XMMWORD-1)
          pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
          jmp         short .upsample

  .columnloop:
          ; Interior block: byte 0 of the next block is "sample 16",
          ; moved into the top byte position.
          movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
          pslldq      xmm6, (SIZEOF_XMMWORD-1)

  .upsample:
          movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
          movdqa      xmm2, xmm1
          movdqa      xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
          pslldq      xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
          psrldq      xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

          por         xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
          por         xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

          ; Carry byte 15 forward as the next block's "sample -1".
          movdqa      xmm7, xmm1
          psrldq      xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)

          ; Widen bytes to words (xmm0 is zero).
          movdqa      xmm4, xmm1
          punpcklbw   xmm1, xmm0              ; xmm1=( 0  1  2  3  4  5  6  7)
          punpckhbw   xmm4, xmm0              ; xmm4=( 8  9 10 11 12 13 14 15)
          movdqa      xmm5, xmm2
          punpcklbw   xmm2, xmm0              ; xmm2=(-1  0  1  2  3  4  5  6)
          punpckhbw   xmm5, xmm0              ; xmm5=( 7  8  9 10 11 12 13 14)
          movdqa      xmm6, xmm3
          punpcklbw   xmm3, xmm0              ; xmm3=( 1  2  3  4  5  6  7  8)
          punpckhbw   xmm6, xmm0              ; xmm6=( 9 10 11 12 13 14 15 16)

          pmullw      xmm1, [rel PW_THREE]
          pmullw      xmm4, [rel PW_THREE]
          paddw       xmm2, [rel PW_ONE]
          paddw       xmm5, [rel PW_ONE]
          paddw       xmm3, [rel PW_TWO]
          paddw       xmm6, [rel PW_TWO]

          paddw       xmm2, xmm1
          paddw       xmm5, xmm4
          psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
          psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
          paddw       xmm3, xmm1
          paddw       xmm6, xmm4
          psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
          psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

          ; Interleave: odd results go to the high byte of each word.
          psllw       xmm3, BYTE_BIT
          psllw       xmm6, BYTE_BIT
          por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
          por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

          movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
          movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

          sub         rax, byte SIZEOF_XMMWORD
          add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
          add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
          cmp         rax, byte SIZEOF_XMMWORD
          ja          near .columnloop        ; more than one block left
          test        rax, rax
          jnz         near .columnloop_last   ; exactly one block left

          pop         rsi
          pop         rdi
          pop         rax

          add         rsi, byte SIZEOF_JSAMPROW   ; input_data
          add         rdi, byte SIZEOF_JSAMPROW   ; output_data
          dec         rcx                         ; rowctr
          jg          near .rowloop

  .return:
          uncollect_args
          pop         rbp
          ret
  ; --------------------------------------------------------------------------
  ;
  ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  ; Again a triangle filter; see comments for h2v1 case, above.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
  ;                                 JDIMENSION downsampled_width,
  ;                                 JSAMPARRAY input_data,
  ;                                 JSAMPARRAY *output_data_ptr);
  ;
  ; r10 = int max_v_samp_factor
  ; r11 = JDIMENSION downsampled_width
  ; r12 = JSAMPARRAY input_data
  ; r13 = JSAMPARRAY *output_data_ptr
  ;
  ; Each input row produces two output rows.  First a vertical pass forms
  ; 16-bit intermediates
  ;   Int0 = 3*row[0] + row[-1]        (upper output row)
  ;   Int1 = 3*row[0] + row[+1]        (lower output row)
  ; which are parked in the output rows themselves.  A horizontal pass then
  ; overwrites them with the final bytes:
  ;   out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
  ;   out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
  ; with the edge intermediate replicated at the left and right ends.
  ;
  ; The code reads input_data[-1] and input_data[+1] relative to each row,
  ; so both neighbouring row pointers must be valid.  All row accesses use
  ; movdqa and therefore need 16-byte aligned row pointers.  When the width
  ; is not a multiple of 16, one byte past the end of each of the three
  ; input rows is written.
  ;
  ; Returns immediately if either downsampled_width or max_v_samp_factor is 0.
  ;
  ; wk[] layout (aligned scratch below rbp):
  ;   wk(0) = left-neighbour word for the upper row (Int0 "-1")
  ;   wk(1) = left-neighbour word for the lower row (Int1 "-1")
  ;   wk(2) = right-neighbour word for the upper row (Int0 "16")
  ;   wk(3) = right-neighbour word for the lower row (Int1 "16")

  %define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
  %define WK_NUM  4

          align       16
          global      EXTN(jsimd_h2v2_fancy_upsample_sse2)

  EXTN(jsimd_h2v2_fancy_upsample_sse2):
          push        rbp
          mov         rax, rsp                ; rax = rsp after the push
                                              ;       (address of the saved rbp)
          ; Carve out a 16-byte aligned slot below the saved rbp.  With an
          ; 8-byte aligned stack the slot never overlaps the saved rbp.
          sub         rsp, byte 4
          and         rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
          mov         [rsp], rax              ; remember the unaligned rsp
          mov         rbp, rsp                ; rbp = aligned frame base
          lea         rsp, [wk(0)]            ; reserve wk[WK_NUM] below rbp
          collect_args
          push        rbx

          mov         eax, r11d               ; colctr (unsigned, zero-extended)
          test        rax, rax
          jz          near .return

          ; max_v_samp_factor is a C int: only the low 32 bits of the
          ; argument register are defined, so sign-extend them instead of
          ; trusting all 64 bits of r10.
          movsxd      rcx, r10d               ; rowctr
          test        rcx, rcx
          jz          near .return

          mov         rsi, r12                ; input_data
          mov         rdi, r13
          mov         rdi, JSAMPARRAY [rdi]   ; output_data

  .rowloop:
          push        rax                     ; colctr
          push        rcx                     ; rowctr (rcx is reused below)
          push        rdi
          push        rsi

          mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
          mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
          mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
          mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
          mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

          ; If the width is not a multiple of 16, replicate the last real
          ; sample of each of the three input rows one position further.
          test        rax, SIZEOF_XMMWORD-1
          jz          short .skip
          push        rdx                     ; dl is used as scratch
          mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
          mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
          mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
          mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
          mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
          mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
          pop         rdx
  .skip:
          ; -- process the first column block

          movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
          movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
          movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

          pxor        xmm3, xmm3              ; xmm3=(all 0's)
          movdqa      xmm4, xmm0
          punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
          punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
          movdqa      xmm5, xmm1
          punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
          punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
          movdqa      xmm6, xmm2
          punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
          punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

          pmullw      xmm0, [rel PW_THREE]
          pmullw      xmm4, [rel PW_THREE]

          pcmpeqb     xmm7, xmm7
          psrldq      xmm7, (SIZEOF_XMMWORD-2) ; xmm7 = mask selecting word 0

          paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
          paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
          paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
          paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

          movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
          movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
          movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
          movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

          ; Left edge: word 0 of each intermediate is its own "-1" neighbour.
          pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
          pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

          movdqa      XMMWORD [wk(0)], xmm1
          movdqa      XMMWORD [wk(1)], xmm2

          ; Round colctr up to a whole number of 16-sample blocks.
          add         rax, byte SIZEOF_XMMWORD-1
          and         rax, byte -SIZEOF_XMMWORD
          cmp         rax, byte SIZEOF_XMMWORD
          ja          short .columnloop

  .columnloop_last:
          ; -- process the last column block
          ; Right edge: word 15 of each intermediate doubles as word "16".

          pcmpeqb     xmm1, xmm1
          pslldq      xmm1, (SIZEOF_XMMWORD-2)
          movdqa      xmm2, xmm1

          pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
          pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

          movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
          movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

          jmp         near .upsample

  .columnloop:
          ; -- process the next column block
          ; Vertical pass for the block after the current one; its word 0
          ; becomes the current block's right-hand neighbour.

          movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
          movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
          movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

          pxor        xmm3, xmm3              ; xmm3=(all 0's)
          movdqa      xmm4, xmm0
          punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
          punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
          movdqa      xmm5, xmm1
          punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
          punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
          movdqa      xmm6, xmm2
          punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
          punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

          pmullw      xmm0, [rel PW_THREE]
          pmullw      xmm4, [rel PW_THREE]

          paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
          paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
          paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
          paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

          movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
          movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
          movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
          movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

          pslldq      xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
          pslldq      xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)

          movdqa      XMMWORD [wk(2)], xmm1
          movdqa      XMMWORD [wk(3)], xmm2

  .upsample:
          ; -- process the upper row

          movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
          movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

          movdqa      xmm0, xmm7              ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
          movdqa      xmm4, xmm3              ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
          psrldq      xmm0, 2                 ; xmm0=( 1  2  3  4  5  6  7 --)
          pslldq      xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
          movdqa      xmm5, xmm7
          movdqa      xmm6, xmm3
          psrldq      xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
          pslldq      xmm6, 2                 ; xmm6=(--  8  9 10 11 12 13 14)

          por         xmm0, xmm4              ; xmm0=( 1  2  3  4  5  6  7  8)
          por         xmm5, xmm6              ; xmm5=( 7  8  9 10 11 12 13 14)

          movdqa      xmm1, xmm7
          movdqa      xmm2, xmm3
          pslldq      xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
          psrldq      xmm2, 2                 ; xmm2=( 9 10 11 12 13 14 15 --)
          movdqa      xmm4, xmm3
          psrldq      xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)

          por         xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
          por         xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

          movdqa      XMMWORD [wk(0)], xmm4   ; next block's "-1" (upper row)

          pmullw      xmm7, [rel PW_THREE]
          pmullw      xmm3, [rel PW_THREE]
          paddw       xmm1, [rel PW_EIGHT]
          paddw       xmm5, [rel PW_EIGHT]
          paddw       xmm0, [rel PW_SEVEN]
          paddw       xmm2, [rel PW_SEVEN]

          paddw       xmm1, xmm7
          paddw       xmm5, xmm3
          psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
          psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
          paddw       xmm0, xmm7
          paddw       xmm2, xmm3
          psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
          psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

          psllw       xmm0, BYTE_BIT
          psllw       xmm2, BYTE_BIT
          por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
          por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

          movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
          movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

          ; -- process the lower row

          movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
          movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

          movdqa      xmm7, xmm6              ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
          movdqa      xmm3, xmm4              ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
          psrldq      xmm7, 2                 ; xmm7=( 1  2  3  4  5  6  7 --)
          pslldq      xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
          movdqa      xmm0, xmm6
          movdqa      xmm2, xmm4
          psrldq      xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
          pslldq      xmm2, 2                 ; xmm2=(--  8  9 10 11 12 13 14)

          por         xmm7, xmm3              ; xmm7=( 1  2  3  4  5  6  7  8)
          por         xmm0, xmm2              ; xmm0=( 7  8  9 10 11 12 13 14)

          movdqa      xmm1, xmm6
          movdqa      xmm5, xmm4
          pslldq      xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
          psrldq      xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
          movdqa      xmm3, xmm4
          psrldq      xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)

          por         xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
          por         xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

          movdqa      XMMWORD [wk(1)], xmm3   ; next block's "-1" (lower row)

          pmullw      xmm6, [rel PW_THREE]
          pmullw      xmm4, [rel PW_THREE]
          paddw       xmm1, [rel PW_EIGHT]
          paddw       xmm0, [rel PW_EIGHT]
          paddw       xmm7, [rel PW_SEVEN]
          paddw       xmm5, [rel PW_SEVEN]

          paddw       xmm1, xmm6
          paddw       xmm0, xmm4
          psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
          psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
          paddw       xmm7, xmm6
          paddw       xmm5, xmm4
          psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
          psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

          psllw       xmm7, BYTE_BIT
          psllw       xmm5, BYTE_BIT
          por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
          por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

          movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
          movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

          sub         rax, byte SIZEOF_XMMWORD
          add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
          add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
          add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
          add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
          add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
          cmp         rax, byte SIZEOF_XMMWORD
          ja          near .columnloop        ; more than one block left
          test        rax, rax
          jnz         near .columnloop_last   ; exactly one block left

          pop         rsi
          pop         rdi
          pop         rcx
          pop         rax

          add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
          add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
          sub         rcx, byte 2                  ; rowctr (two output rows done)
          jg          near .rowloop

  .return:
          pop         rbx
          uncollect_args
          mov         rsp, rbp                ; rsp <- aligned frame base
          pop         rsp                     ; rsp <- value saved at entry
                                              ;        (address of the saved rbp)
          pop         rbp
          ret
  ; --------------------------------------------------------------------------
  ;
  ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
  ; It's still a box filter.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
  ;                           JDIMENSION output_width,
  ;                           JSAMPARRAY input_data,
  ;                           JSAMPARRAY *output_data_ptr);
  ;
  ; r10 = int max_v_samp_factor
  ; r11 = JDIMENSION output_width
  ; r12 = JSAMPARRAY input_data
  ; r13 = JSAMPARRAY *output_data_ptr
  ;
  ; Every input byte is written twice in a row (punpck{l,h}bw of a register
  ; with itself).  output_width is rounded up to a multiple of 32 bytes, so
  ; up to 31 bytes beyond output_width may be written per row.  All row
  ; accesses use movdqa and therefore need 16-byte aligned row pointers.
  ;
  ; Returns immediately if either output_width or max_v_samp_factor is 0.

          align       16
          global      EXTN(jsimd_h2v1_upsample_sse2)

  EXTN(jsimd_h2v1_upsample_sse2):
          push        rbp
          mov         rax, rsp                ; rax is overwritten inside .rowloop
                                              ; before any other read, so only
                                              ; collect_args can be its consumer
          mov         rbp, rsp
          collect_args

          mov         edx, r11d               ; output_width (zero-extended)
          add         rdx, byte (2*SIZEOF_XMMWORD)-1
          and         rdx, byte -(2*SIZEOF_XMMWORD)  ; round up to 32-byte units
          jz          near .return

          ; max_v_samp_factor is a C int: only the low 32 bits of the
          ; argument register are defined, so sign-extend them instead of
          ; trusting all 64 bits of r10.
          movsxd      rcx, r10d               ; rowctr
          test        rcx, rcx
          jz          short .return

          mov         rsi, r12                ; input_data
          mov         rdi, r13
          mov         rdi, JSAMPARRAY [rdi]   ; output_data

  .rowloop:
          push        rdi
          push        rsi

          mov         rsi, JSAMPROW [rsi]     ; inptr
          mov         rdi, JSAMPROW [rdi]     ; outptr
          mov         rax, rdx                ; colctr (output bytes remaining)

  .columnloop:
          ; Loop body is unrolled twice; the pointers advance only after
          ; both halves have run.
          movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

          movdqa      xmm1, xmm0
          punpcklbw   xmm0, xmm0              ; duplicate samples 0..7
          punpckhbw   xmm1, xmm1              ; duplicate samples 8..15

          movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
          movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

          sub         rax, byte 2*SIZEOF_XMMWORD
          jz          short .nextrow

          movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

          movdqa      xmm3, xmm2
          punpcklbw   xmm2, xmm2
          punpckhbw   xmm3, xmm3

          movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
          movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

          sub         rax, byte 2*SIZEOF_XMMWORD
          jz          short .nextrow

          add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
          add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
          jmp         short .columnloop

  .nextrow:
          pop         rsi
          pop         rdi

          add         rsi, byte SIZEOF_JSAMPROW   ; input_data
          add         rdi, byte SIZEOF_JSAMPROW   ; output_data
          dec         rcx                         ; rowctr
          jg          short .rowloop

  .return:
          uncollect_args
          pop         rbp
          ret
  ; --------------------------------------------------------------------------
  ;
  ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
  ; It's still a box filter.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
  ;                           JDIMENSION output_width,
  ;                           JSAMPARRAY input_data,
  ;                           JSAMPARRAY *output_data_ptr);
  ;
  ; r10 = int max_v_samp_factor
  ; r11 = JDIMENSION output_width
  ; r12 = JSAMPARRAY input_data
  ; r13 = JSAMPARRAY *output_data_ptr
  ;
  ; Every input byte is written twice in a row, and the resulting row is
  ; stored to two consecutive output rows.  output_width is rounded up to a
  ; multiple of 32 bytes, so up to 31 bytes beyond output_width may be
  ; written per row.  All row accesses use movdqa and therefore need 16-byte
  ; aligned row pointers.
  ;
  ; Returns immediately if either output_width or max_v_samp_factor is 0.

          align       16
          global      EXTN(jsimd_h2v2_upsample_sse2)

  EXTN(jsimd_h2v2_upsample_sse2):
          push        rbp
          mov         rax, rsp                ; rax is overwritten inside .rowloop
                                              ; before any other read, so only
                                              ; collect_args can be its consumer
          mov         rbp, rsp
          collect_args
          push        rbx

          mov         edx, r11d               ; output_width (zero-extended)
          add         rdx, byte (2*SIZEOF_XMMWORD)-1
          and         rdx, byte -(2*SIZEOF_XMMWORD)  ; round up to 32-byte units
          jz          near .return

          ; max_v_samp_factor is a C int: only the low 32 bits of the
          ; argument register are defined, so sign-extend them instead of
          ; trusting all 64 bits of r10.
          movsxd      rcx, r10d               ; rowctr
          test        rcx, rcx
          jz          near .return

          mov         rsi, r12                ; input_data
          mov         rdi, r13
          mov         rdi, JSAMPARRAY [rdi]   ; output_data

  .rowloop:
          push        rdi
          push        rsi

          mov         rsi, JSAMPROW [rsi]                    ; inptr
          mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
          mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
          mov         rax, rdx                ; colctr (output bytes remaining)

  .columnloop:
          ; Loop body is unrolled twice; the pointers advance only after
          ; both halves have run.
          movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

          movdqa      xmm1, xmm0
          punpcklbw   xmm0, xmm0              ; duplicate samples 0..7
          punpckhbw   xmm1, xmm1              ; duplicate samples 8..15

          movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
          movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
          movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
          movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

          sub         rax, byte 2*SIZEOF_XMMWORD
          jz          short .nextrow

          movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

          movdqa      xmm3, xmm2
          punpcklbw   xmm2, xmm2
          punpckhbw   xmm3, xmm3

          movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
          movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
          movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
          movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

          sub         rax, byte 2*SIZEOF_XMMWORD
          jz          short .nextrow

          add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
          add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
          add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
          jmp         short .columnloop

  .nextrow:
          pop         rsi
          pop         rdi

          add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
          add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
          sub         rcx, byte 2                  ; rowctr (two output rows done)
          jg          near .rowloop

  .return:
          pop         rbx
          uncollect_args
          pop         rbp
          ret

  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
          align       16