jcsample-sse2-64.asm 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. ;
  2. ; jcsample.asm - downsampling (64-bit SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2009, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler),
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; [TAB8]
  18. %include "jsimdext.inc"
  19. ; --------------------------------------------------------------------------
  20. SECTION SEG_TEXT
  21. BITS 64
  ;
  ; Downsample pixel values of a single component.
  ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
  ; without smoothing.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
  ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
  ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
  ;
  ; Register roles once collect_args has run:
  ;   r10 = JDIMENSION image_width
  ;   r11 = int max_v_samp_factor
  ;   r12 = JDIMENSION v_samp_factor
  ;   r13 = JDIMENSION width_blocks
  ;   r14 = JSAMPARRAY input_data
  ;   r15 = JSAMPARRAY output_data
  ;
  ; Operation:
  ;   1. output_cols = width_blocks * 8.  Each of the first max_v_samp_factor
  ;      input rows is padded on the right, by repeating its last sample,
  ;      from image_width up to output_cols * 2 bytes.
  ;   2. For each of v_samp_factor rows:
  ;        out[i] = (in[2*i] + in[2*i+1] + bias) >> 1
  ;      where bias is 0 for even output columns and 1 for odd ones.
  ;
  ; Requirements that follow from the instructions used:
  ;   - Row pointers are accessed with movdqa, so every input and output row
  ;     must start on a 16-byte boundary.
  ;   - Input rows must have room for output_cols * 2 bytes (padding is
  ;     written in place).  Output is stored 16 bytes at a time, so when
  ;     output_cols is not a multiple of 16 the final store covers 8 bytes
  ;     beyond output_cols.
  ;
  ; Registers written by the code below: rax, rcx, rdx, rsi, rdi,
  ; xmm0-xmm3, xmm6, xmm7, and the flags (DF is cleared).
  ; NOTE(review): rsi, rdi, xmm6 and xmm7 are callee-saved in the Microsoft
  ; x64 convention; confirm that collect_args/uncollect_args (jsimdext.inc)
  ; preserve them on WIN64 builds.
  ;
          align   16
          global  EXTN(jsimd_h2v1_downsample_sse2)

  EXTN(jsimd_h2v1_downsample_sse2):
          push    rbp
          mov     rax, rsp                ; copy of rsp taken before collect_args
                                          ; (presumably used by that macro to
                                          ; reach stack arguments - confirm)
          mov     rbp, rsp
          collect_args

          mov     ecx, r13d               ; width_blocks, zero-extended
          shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
          jz      near .return            ; zero output columns: nothing to do

          mov     edx, r10d               ; rdx = image_width, zero-extended

          ; -- expand_right_edge

          push    rcx                     ; keep output_cols for the second phase
          shl     rcx, 1                  ; output_cols * 2 = padded input width
          sub     rcx, rdx                ; rcx = bytes of padding per row
          jle     short .expand_end       ; rows already wide enough

          ; max_v_samp_factor is a C int: only the low 32 bits of r11 are
          ; defined by the calling convention, so sign-extend them instead of
          ; trusting the full 64-bit register as the row counter.
          movsxd  rax, r11d               ; rax = rows to pad
          test    rax, rax
          jle     short .expand_end       ; zero or negative: no rows to pad

          cld                             ; rep stosb must run forward
          mov     rsi, r14                ; input_data
  .expandloop:
          push    rax                     ; al is about to be used as fill value
          push    rcx                     ; rep stosb consumes rcx

          mov     rdi, JSAMPROW [rsi]     ; start of this input row
          add     rdi, rdx                ; first byte past the real samples
          mov     al, JSAMPLE [rdi-1]     ; last real sample = fill value

          rep stosb                       ; write rcx copies of al at [rdi]

          pop     rcx
          pop     rax

          add     rsi, byte SIZEOF_JSAMPROW
          dec     rax
          jg      short .expandloop

  .expand_end:
          pop     rcx                     ; output_cols

          ; -- h2v1_downsample

          mov     eax, r12d               ; rowctr
          test    eax, eax
          jle     near .return

          mov     edx, 0x00010000         ; bias pattern (only edx is read)
          movd    xmm7, edx
          pcmpeqw xmm6, xmm6              ; all ones
          pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
          psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

          mov     rsi, r14                ; input_data
          mov     rdi, r15                ; output_data
  .rowloop:
          push    rcx                     ; output_cols, reloaded for every row
          push    rdi
          push    rsi

          mov     rsi, JSAMPROW [rsi]     ; inptr
          mov     rdi, JSAMPROW [rdi]     ; outptr

          cmp     rcx, byte SIZEOF_XMMWORD
          jae     short .columnloop

  .columnloop_r8:
          ; Fewer than SIZEOF_XMMWORD output columns remain: load a single
          ; input vector, treat the second one as zero, and set rcx so the
          ; subtraction below leaves exactly zero.
          movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
          pxor    xmm1, xmm1
          mov     rcx, SIZEOF_XMMWORD
          jmp     short .downsample

  .columnloop:
          movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
          movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

  .downsample:
          movdqa  xmm2, xmm0
          movdqa  xmm3, xmm1

          pand    xmm0, xmm6              ; even-indexed samples as words
          psrlw   xmm2, BYTE_BIT          ; odd-indexed samples as words
          pand    xmm1, xmm6
          psrlw   xmm3, BYTE_BIT

          paddw   xmm0, xmm2              ; pair sums
          paddw   xmm1, xmm3
          paddw   xmm0, xmm7              ; + alternating 0/1 bias
          paddw   xmm1, xmm7
          psrlw   xmm0, 1                 ; / 2
          psrlw   xmm1, 1

          packuswb xmm0, xmm1             ; 16 words -> 16 output bytes

          movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

          sub     rcx, byte SIZEOF_XMMWORD        ; outcol
          add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
          add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
          cmp     rcx, byte SIZEOF_XMMWORD
          jae     short .columnloop
          test    rcx, rcx
          jnz     short .columnloop_r8    ; partial vector left over

          pop     rsi
          pop     rdi
          pop     rcx

          add     rsi, byte SIZEOF_JSAMPROW       ; input_data
          add     rdi, byte SIZEOF_JSAMPROW       ; output_data
          dec     rax                             ; rowctr
          jg      near .rowloop

  .return:
          uncollect_args
          pop     rbp
          ret
  ; --------------------------------------------------------------------------
  ;
  ; Downsample pixel values of a single component.
  ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
  ; without smoothing.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
  ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
  ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
  ;
  ; Register roles once collect_args has run:
  ;   r10 = JDIMENSION image_width
  ;   r11 = int max_v_samp_factor
  ;   r12 = JDIMENSION v_samp_factor
  ;   r13 = JDIMENSION width_blocks
  ;   r14 = JSAMPARRAY input_data
  ;   r15 = JSAMPARRAY output_data
  ;
  ; Operation:
  ;   1. output_cols = width_blocks * 8.  Each of the first max_v_samp_factor
  ;      input rows is padded on the right, by repeating its last sample,
  ;      from image_width up to output_cols * 2 bytes.
  ;   2. For each of v_samp_factor output rows, two consecutive input rows
  ;      (in0, in1) are combined:
  ;        out[i] = (in0[2*i] + in0[2*i+1] + in1[2*i] + in1[2*i+1] + bias) >> 2
  ;      where bias is 1 for even output columns and 2 for odd ones.
  ;
  ; Requirements that follow from the instructions used:
  ;   - Row pointers are accessed with movdqa, so every input and output row
  ;     must start on a 16-byte boundary.
  ;   - Input rows must have room for output_cols * 2 bytes (padding is
  ;     written in place).  Output is stored 16 bytes at a time, so when
  ;     output_cols is not a multiple of 16 the final store covers 8 bytes
  ;     beyond output_cols.
  ;
  ; Registers written by the code below: rax, rcx, rdx, rsi, rdi,
  ; xmm0-xmm7, and the flags (DF is cleared).
  ; NOTE(review): rsi, rdi, xmm6 and xmm7 are callee-saved in the Microsoft
  ; x64 convention; confirm that collect_args/uncollect_args (jsimdext.inc)
  ; preserve them on WIN64 builds.
  ;
          align   16
          global  EXTN(jsimd_h2v2_downsample_sse2)

  EXTN(jsimd_h2v2_downsample_sse2):
          push    rbp
          mov     rax, rsp                ; copy of rsp taken before collect_args
                                          ; (presumably used by that macro to
                                          ; reach stack arguments - confirm)
          mov     rbp, rsp
          collect_args

          mov     ecx, r13d               ; width_blocks, zero-extended
          shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
          jz      near .return            ; zero output columns: nothing to do

          mov     edx, r10d               ; rdx = image_width, zero-extended

          ; -- expand_right_edge

          push    rcx                     ; keep output_cols for the second phase
          shl     rcx, 1                  ; output_cols * 2 = padded input width
          sub     rcx, rdx                ; rcx = bytes of padding per row
          jle     short .expand_end       ; rows already wide enough

          ; max_v_samp_factor is a C int: only the low 32 bits of r11 are
          ; defined by the calling convention, so sign-extend them instead of
          ; trusting the full 64-bit register as the row counter.
          movsxd  rax, r11d               ; rax = rows to pad
          test    rax, rax
          jle     short .expand_end       ; zero or negative: no rows to pad

          cld                             ; rep stosb must run forward
          mov     rsi, r14                ; input_data
  .expandloop:
          push    rax                     ; al is about to be used as fill value
          push    rcx                     ; rep stosb consumes rcx

          mov     rdi, JSAMPROW [rsi]     ; start of this input row
          add     rdi, rdx                ; first byte past the real samples
          mov     al, JSAMPLE [rdi-1]     ; last real sample = fill value

          rep stosb                       ; write rcx copies of al at [rdi]

          pop     rcx
          pop     rax

          add     rsi, byte SIZEOF_JSAMPROW
          dec     rax
          jg      short .expandloop

  .expand_end:
          pop     rcx                     ; output_cols

          ; -- h2v2_downsample

          mov     eax, r12d               ; rowctr
          test    eax, eax                ; judge the 32-bit value that was loaded
          jle     near .return

          mov     edx, 0x00020001         ; bias pattern (only edx is read)
          movd    xmm7, edx
          pcmpeqw xmm6, xmm6              ; all ones
          pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
          psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

          mov     rsi, r14                ; input_data
          mov     rdi, r15                ; output_data
  .rowloop:
          push    rcx                     ; output_cols, reloaded for every row
          push    rdi
          push    rsi

          mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
          mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
          mov     rdi, JSAMPROW [rdi]                     ; outptr

          cmp     rcx, byte SIZEOF_XMMWORD
          jae     short .columnloop

  .columnloop_r8:
          ; Fewer than SIZEOF_XMMWORD output columns remain: load a single
          ; vector from each input row, treat the second pair as zero, and
          ; set rcx so the subtraction below leaves exactly zero.
          movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
          movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
          pxor    xmm2, xmm2
          pxor    xmm3, xmm3
          mov     rcx, SIZEOF_XMMWORD
          jmp     short .downsample

  .columnloop:
          movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
          movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
          movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
          movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

  .downsample:
          ; First vector pair: horizontal pair sums for row 0 and row 1.
          movdqa  xmm4, xmm0
          movdqa  xmm5, xmm1
          pand    xmm0, xmm6              ; even-indexed samples as words
          psrlw   xmm4, BYTE_BIT          ; odd-indexed samples as words
          pand    xmm1, xmm6
          psrlw   xmm5, BYTE_BIT
          paddw   xmm0, xmm4
          paddw   xmm1, xmm5

          ; Second vector pair: same treatment.
          movdqa  xmm4, xmm2
          movdqa  xmm5, xmm3
          pand    xmm2, xmm6
          psrlw   xmm4, BYTE_BIT
          pand    xmm3, xmm6
          psrlw   xmm5, BYTE_BIT
          paddw   xmm2, xmm4
          paddw   xmm3, xmm5

          paddw   xmm0, xmm1              ; row 0 + row 1 -> 2x2 block sums
          paddw   xmm2, xmm3
          paddw   xmm0, xmm7              ; + alternating 1/2 bias
          paddw   xmm2, xmm7
          psrlw   xmm0, 2                 ; / 4
          psrlw   xmm2, 2

          packuswb xmm0, xmm2             ; 16 words -> 16 output bytes

          movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

          sub     rcx, byte SIZEOF_XMMWORD        ; outcol
          add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
          add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
          add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
          cmp     rcx, byte SIZEOF_XMMWORD
          jae     near .columnloop
          test    rcx, rcx
          jnz     near .columnloop_r8     ; partial vector left over

          pop     rsi
          pop     rdi
          pop     rcx

          add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
          add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
          dec     rax                             ; rowctr
          jg      near .rowloop

  .return:
          uncollect_args
          pop     rbp
          ret
  ; Keep this trailing alignment directive.  According to the original note,
  ; the OS X linker does not honor the request to align the segment unless
  ; the file ends with it.
          align   16