; jcsample-sse2.asm (10 KB)
  1. ;
  2. ; jcsample.asm - downsampling (SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on the x86 SIMD extension for IJG JPEG library
  7. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  8. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  9. ;
  10. ; This file should be assembled with NASM (Netwide Assembler),
  11. ; can *not* be assembled with Microsoft's MASM or any compatible
  12. ; assembler (including Borland's Turbo Assembler).
  13. ; NASM is available from http://nasm.sourceforge.net/ or
  14. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  15. ;
  16. ; [TAB8]
  17. %include "jsimdext.inc"
  ; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

;
; jsimd_h2v1_downsample_sse2
;
; Downsample the pixel values of a single component, 2:1 horizontally and
; 1:1 vertically, without smoothing.  Every output sample is the mean of two
; horizontally adjacent input samples.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_blocks * 8.  When output_cols * 2 is larger than
;      image_width, the first max_v_samp_factor input rows are padded in
;      place: the sample at index image_width-1 is repeated
;      (output_cols * 2 - image_width) times, starting at index image_width.
;   2. For v_samp_factor rows, 32 input samples are reduced to 16 output
;      samples per step:  out[i] = (in[2i] + in[2i+1] + bias) >> 1, where the
;      bias alternates 0, 1, 0, 1, ... across the output columns.
;
; Notes:
;   * Rows are read and written with movdqa, which requires the row buffers
;     to be 16-byte aligned.
;   * When 8 output columns remain, a full 16 bytes are still stored; the
;     upper 8 of them are zero.
;   * esi and edi are saved and restored.  eax, ecx, edx, xmm0-xmm3, xmm6 and
;     xmm7 are overwritten; ebx is not used.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    ebp
        mov     ebp, esp
        push    esi                     ; ebx is never touched; ecx and edx
        push    edi                     ; are used without being saved

        mov     ecx, JDIMENSION [width_blks(ebp)]
        mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
        jz      near .done              ; zero blocks: nothing to do

        ; ---- step 1: pad the right edge of the input rows ----

        push    ecx                     ; keep output_cols for step 2
        add     ecx, ecx                ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = number of samples to append
        jle     short .edge_done        ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows still to pad
        test    eax, eax
        jle     short .edge_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> row pointers
        cld                             ; make rep stosb move upwards
        alignx  16,7
.pad_row:
        push    ecx                     ; rep stosb consumes ecx ...
        push    eax                     ; ... and al carries the fill value

        mov     edi, JSAMPROW [esi]     ; edi = start of the current row
        add     esi, byte SIZEOF_JSAMPROW       ; step to the next row pointer
        add     edi, edx                ; edi = row + image_width
        mov     al, JSAMPLE [edi-1]     ; al = last sample inside the image
        rep stosb                       ; append ecx copies of it

        pop     eax
        pop     ecx
        dec     eax
        jg      short .pad_row
.edge_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- step 2: average horizontal pairs ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = row counter
        test    eax, eax
        jle     near .done

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT          ; xmm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00010000         ; rounding bias, one dword = {0, 1}
        movd    xmm7, edx
        pshufd  xmm7, xmm7, 0x00        ; xmm7 = {0, 1, 0, 1, 0, 1, 0, 1}

        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output rows
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input rows
        alignx  16,7
.next_row:
        push    ecx                     ; output_cols is restored per row
        push    edi                     ; row-pointer cursors are restored
        push    esi                     ; after the column loop

        mov     edi, JSAMPROW [edi]     ; edi = outptr
        mov     esi, JSAMPROW [esi]     ; esi = inptr

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        alignx  16,7
.tail_chunk:
        ; Fewer than 16 output columns remain: use one input vector, treat
        ; the second one as zero, and pretend exactly 16 columns are left
        ; so that the counter reaches zero after this pass.
        mov     ecx, SIZEOF_XMMWORD
        pxor    xmm1, xmm1
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        jmp     short .average
        alignx  16,7
.full_chunk:
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
.average:
        ; first input vector -> 8 word results in xmm0
        movdqa  xmm2, xmm0
        pand    xmm0, xmm6              ; even-indexed samples
        psrlw   xmm2, BYTE_BIT          ; odd-indexed samples
        paddw   xmm0, xmm2              ; pair sums
        paddw   xmm0, xmm7              ; + bias
        psrlw   xmm0, 1                 ; / 2

        ; second input vector -> 8 word results in xmm1
        movdqa  xmm3, xmm1
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT
        paddw   xmm1, xmm3
        paddw   xmm1, xmm7
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; 16 words -> 16 output bytes
        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        sub     ecx, byte SIZEOF_XMMWORD        ; output columns left
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        test    ecx, ecx
        jnz     short .tail_chunk

        pop     esi
        pop     edi
        pop     ecx

        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        dec     eax                             ; row counter
        jg      near .next_row

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
  ; --------------------------------------------------------------------------
;
; jsimd_h2v2_downsample_sse2
;
; Downsample the pixel values of a single component, 2:1 horizontally and
; 2:1 vertically, without smoothing.  Every output sample is the mean of a
; 2x2 group of input samples taken from two consecutive input rows.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_blocks * 8.  When output_cols * 2 is larger than
;      image_width, the first max_v_samp_factor input rows are padded in
;      place: the sample at index image_width-1 is repeated
;      (output_cols * 2 - image_width) times, starting at index image_width.
;   2. v_samp_factor output rows are produced; each one consumes two input
;      rows r0 and r1:
;        out[i] = (r0[2i] + r0[2i+1] + r1[2i] + r1[2i+1] + bias) >> 2
;      where the bias alternates 1, 2, 1, 2, ... across the output columns.
;
; Notes:
;   * Rows are read and written with movdqa, which requires the row buffers
;     to be 16-byte aligned.
;   * When 8 output columns remain, a full 16 bytes are still stored; the
;     upper 8 of them are zero.
;   * esi and edi are saved and restored.  eax, ecx, edx and xmm0-xmm7 are
;     overwritten; ebx is not used.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    ebp
        mov     ebp, esp
        push    esi                     ; ebx is never touched; ecx and edx
        push    edi                     ; are used without being saved

        mov     ecx, JDIMENSION [width_blks(ebp)]
        mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
        jz      near .done              ; zero blocks: nothing to do

        ; ---- step 1: pad the right edge of the input rows ----

        push    ecx                     ; keep output_cols for step 2
        add     ecx, ecx                ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = number of samples to append
        jle     short .edge_done        ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows still to pad
        test    eax, eax
        jle     short .edge_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> row pointers
        cld                             ; make rep stosb move upwards
        alignx  16,7
.pad_row:
        push    ecx                     ; rep stosb consumes ecx ...
        push    eax                     ; ... and al carries the fill value

        mov     edi, JSAMPROW [esi]     ; edi = start of the current row
        add     esi, byte SIZEOF_JSAMPROW       ; step to the next row pointer
        add     edi, edx                ; edi = row + image_width
        mov     al, JSAMPLE [edi-1]     ; al = last sample inside the image
        rep stosb                       ; append ecx copies of it

        pop     eax
        pop     ecx
        dec     eax
        jg      short .pad_row
.edge_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- step 2: average 2x2 groups ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output row counter
        test    eax, eax
        jle     near .done

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT          ; xmm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00020001         ; rounding bias, one dword = {1, 2}
        movd    xmm7, edx
        pshufd  xmm7, xmm7, 0x00        ; xmm7 = {1, 2, 1, 2, 1, 2, 1, 2}

        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output rows
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input rows
        alignx  16,7
.next_row:
        push    ecx                     ; output_cols is restored per row
        push    edi                     ; row-pointer cursors are restored
        push    esi                     ; after the column loop

        mov     edi, JSAMPROW [edi]                     ; edi = outptr
        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = inptr1

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        alignx  16,7
.tail_chunk:
        ; Fewer than 16 output columns remain: use one vector from each
        ; input row, treat the second pair as zero, and pretend exactly 16
        ; columns are left so that the counter reaches zero after this pass.
        mov     ecx, SIZEOF_XMMWORD
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        jmp     short .average
        alignx  16,7
.full_chunk:
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
.average:
        ; first vector of each row -> 8 word results in xmm0
        movdqa  xmm4, xmm0
        pand    xmm0, xmm6              ; row 0, even-indexed samples
        psrlw   xmm4, BYTE_BIT          ; row 0, odd-indexed samples
        paddw   xmm0, xmm4              ; row 0 pair sums
        movdqa  xmm5, xmm1
        pand    xmm1, xmm6              ; row 1, even-indexed samples
        psrlw   xmm5, BYTE_BIT          ; row 1, odd-indexed samples
        paddw   xmm1, xmm5              ; row 1 pair sums
        paddw   xmm0, xmm1              ; 2x2 sums
        paddw   xmm0, xmm7              ; + bias
        psrlw   xmm0, 2                 ; / 4

        ; second vector of each row -> 8 word results in xmm2
        movdqa  xmm4, xmm2
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        paddw   xmm2, xmm4
        movdqa  xmm5, xmm3
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm3, xmm5
        paddw   xmm2, xmm3
        paddw   xmm2, xmm7
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; 16 words -> 16 output bytes
        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        sub     ecx, byte SIZEOF_XMMWORD        ; output columns left
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .full_chunk
        test    ecx, ecx
        jnz     near .tail_chunk

        pop     esi
        pop     edi
        pop     ecx

        add     edi, byte 1*SIZEOF_JSAMPROW     ; one output row produced
        add     esi, byte 2*SIZEOF_JSAMPROW     ; two input rows consumed
        dec     eax                             ; output row counter
        jg      near .next_row

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16