jcsample-mmx.asm 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. ;
  2. ; jcsample.asm - downsampling (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on the x86 SIMD extension for IJG JPEG library
  7. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  8. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  9. ;
  10. ; This file should be assembled with NASM (Netwide Assembler),
  11. ; can *not* be assembled with Microsoft's MASM or any compatible
  12. ; assembler (including Borland's Turbo Assembler).
  13. ; NASM is available from http://nasm.sourceforge.net/ or
  14. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  15. ;
  16. ; [TAB8]
  17. %include "jsimdext.inc"
  18. ; --------------------------------------------------------------------------
  19. SECTION SEG_TEXT ; code goes in the text segment; SEG_TEXT is presumably defined by jsimdext.inc -- confirm
  20. BITS 32 ; assemble everything below as 32-bit code
  ;
  ; jsimd_h2v1_downsample_mmx
  ;
  ; Downsample one component by 2:1 horizontally and 1:1 vertically, with no
  ; smoothing.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
  ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
  ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
  ;
  ; All arguments are read from the stack relative to ebp (see the accessor
  ; macros below).
  ;
  ; Phase 1 - right-edge padding:
  ;   output_cols = width_blocks * 8.  When output_cols*2 > image_width, the
  ;   last sample of each of the first max_v_samp_factor input rows is
  ;   replicated into the (output_cols*2 - image_width) bytes that follow it.
  ;
  ; Phase 2 - downsampling:
  ;   For each of v_samp_factor rows, every horizontal pair (a, b) becomes
  ;   (a + b + bias) >> 1, where bias alternates 0, 1, 0, 1, ... across the
  ;   output samples.  Eight output samples are produced per loop iteration
  ;   (assuming SIZEOF_MMWORD is 8, as the shift by 3 above suggests).
  ;
  ; Early exits: output_cols == 0 (nothing is touched), or v_samp_factor <= 0
  ; as a signed value (padding has already been applied by then).
  ;
  ; Saved/restored: ebp, esi, edi.
  ; Clobbered:      eax, ecx, edx, flags, mm0-mm3, mm6, mm7.
  ; ebx is never used.  emms is executed whenever MMX registers were written.
  ;

  ; Stack-argument accessors; (b) is the frame base register.
  %define img_width(b)    (b)+8           ; JDIMENSION image_width
  %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
  %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
  %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
  %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
  %define output_data(b)  (b)+28          ; JSAMPARRAY output_data

          align   16
          global  EXTN(jsimd_h2v1_downsample_mmx)

  EXTN(jsimd_h2v1_downsample_mmx):
          push    ebp
          mov     ebp, esp
          push    esi
          push    edi

          mov     ecx, JDIMENSION [width_blks(ebp)]
          shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
          jz      near .done              ; no blocks -> nothing to do

          mov     edx, JDIMENSION [img_width(ebp)]

          ; ---- Phase 1: pad the right edge of the input rows ----

          push    ecx                     ; park output_cols for phase 2
          add     ecx, ecx                ; ecx = output_cols * 2
          sub     ecx, edx                ; ecx = bytes to append per row
          jle     short .pad_done         ; rows are already wide enough

          mov     eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
          test    eax, eax
          jle     short .pad_done

          mov     esi, JSAMPARRAY [input_data(ebp)]   ; esi -> current row pointer
          cld                             ; stosb must walk upward

          alignx  16,7
  .pad_row:
          push    ecx                     ; rep stosb consumes ecx ...
          push    eax                     ; ... and al is needed for the fill value

          mov     edi, JSAMPROW [esi]
          add     edi, edx                ; edi = first byte past the real samples
          mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row
          rep stosb                       ; replicate it ecx times

          pop     eax
          pop     ecx

          add     esi, byte SIZEOF_JSAMPROW
          dec     eax
          jg      short .pad_row

  .pad_done:
          pop     ecx                     ; ecx = output_cols again

          ; ---- Phase 2: 2:1 horizontal averaging ----

          mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to produce
          test    eax, eax
          jle     near .done

          pcmpeqw mm6, mm6                ; mm6 = all ones
          mov     edx, 0x00010000         ; two words: {0, 1}
          psrlw   mm6, BYTE_BIT           ; mm6 = 0x00FF in every word (low-byte mask)
          movd    mm7, edx
          punpckldq mm7, mm7              ; mm7 = bias words {0, 1, 0, 1}

          mov     edi, JSAMPARRAY [output_data(ebp)]  ; edi -> current output row pointer
          mov     esi, JSAMPARRAY [input_data(ebp)]   ; esi -> current input row pointer

          alignx  16,7
  .next_row:
          push    esi                     ; row-pointer cursors and the column
          push    edi                     ; count are reused inside the inner
          push    ecx                     ; loop, so stash all three

          mov     edi, JSAMPROW [edi]     ; edi = write cursor in the output row
          mov     esi, JSAMPROW [esi]     ; esi = read cursor in the input row

          alignx  16,7
  .next_group:
          movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
          movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]

          movq    mm2, mm0
          movq    mm3, mm1
          psrlw   mm2, BYTE_BIT           ; mm2/mm3 = samples at odd offsets
          psrlw   mm3, BYTE_BIT
          pand    mm0, mm6                ; mm0/mm1 = samples at even offsets
          pand    mm1, mm6

          paddw   mm0, mm2                ; pair sums, one per word
          paddw   mm1, mm3
          paddw   mm0, mm7                ; add alternating rounding bias
          paddw   mm1, mm7
          psrlw   mm0, 1                  ; divide by two
          psrlw   mm1, 1

          packuswb mm0, mm1               ; narrow the eight words back to bytes
          movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

          add     edi, byte 1*SIZEOF_MMWORD
          add     esi, byte 2*SIZEOF_MMWORD
          sub     ecx, byte SIZEOF_MMWORD ; columns still to write in this row
          jnz     short .next_group

          pop     ecx
          pop     edi
          pop     esi

          add     edi, byte SIZEOF_JSAMPROW   ; step to the next output row
          add     esi, byte SIZEOF_JSAMPROW   ; step to the next input row
          dec     eax
          jg      short .next_row

          emms                            ; leave the FPU/MMX state clean

  .done:
          pop     edi
          pop     esi
          pop     ebp
          ret
  132. ; --------------------------------------------------------------------------
  ;
  ; jsimd_h2v2_downsample_mmx
  ;
  ; Downsample one component by 2:1 horizontally and 2:1 vertically, with no
  ; smoothing.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
  ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
  ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
  ;
  ; All arguments are read from the stack relative to ebp (see the accessor
  ; macros below).
  ;
  ; Phase 1 - right-edge padding:
  ;   output_cols = width_blocks * 8.  When output_cols*2 > image_width, the
  ;   last sample of each of the first max_v_samp_factor input rows is
  ;   replicated into the (output_cols*2 - image_width) bytes that follow it.
  ;
  ; Phase 2 - downsampling:
  ;   Each output row is built from two consecutive input rows.  Every 2x2
  ;   group (a, b / c, d) becomes (a + b + c + d + bias) >> 2, where bias
  ;   alternates 1, 2, 1, 2, ... across the output samples.  v_samp_factor
  ;   output rows are produced, eight output samples per loop iteration
  ;   (assuming SIZEOF_MMWORD is 8, as the shift by 3 above suggests).
  ;
  ; Early exits: output_cols == 0 (nothing is touched), or v_samp_factor <= 0
  ; as a signed value (padding has already been applied by then).
  ;
  ; Saved/restored: ebp, esi, edi.
  ; Clobbered:      eax, ecx, edx, flags, mm0-mm7.
  ; ebx is never used.  emms is executed whenever MMX registers were written.
  ;

  ; Stack-argument accessors; (b) is the frame base register.
  %define img_width(b)    (b)+8           ; JDIMENSION image_width
  %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
  %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
  %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
  %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
  %define output_data(b)  (b)+28          ; JSAMPARRAY output_data

          align   16
          global  EXTN(jsimd_h2v2_downsample_mmx)

  EXTN(jsimd_h2v2_downsample_mmx):
          push    ebp
          mov     ebp, esp
          push    esi
          push    edi

          mov     ecx, JDIMENSION [width_blks(ebp)]
          shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
          jz      near .done              ; no blocks -> nothing to do

          mov     edx, JDIMENSION [img_width(ebp)]

          ; ---- Phase 1: pad the right edge of the input rows ----

          push    ecx                     ; park output_cols for phase 2
          add     ecx, ecx                ; ecx = output_cols * 2
          sub     ecx, edx                ; ecx = bytes to append per row
          jle     short .pad_done         ; rows are already wide enough

          mov     eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
          test    eax, eax
          jle     short .pad_done

          mov     esi, JSAMPARRAY [input_data(ebp)]   ; esi -> current row pointer
          cld                             ; stosb must walk upward

          alignx  16,7
  .pad_row:
          push    ecx                     ; rep stosb consumes ecx ...
          push    eax                     ; ... and al is needed for the fill value

          mov     edi, JSAMPROW [esi]
          add     edi, edx                ; edi = first byte past the real samples
          mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row
          rep stosb                       ; replicate it ecx times

          pop     eax
          pop     ecx

          add     esi, byte SIZEOF_JSAMPROW
          dec     eax
          jg      short .pad_row

  .pad_done:
          pop     ecx                     ; ecx = output_cols again

          ; ---- Phase 2: 2x2 box averaging ----

          mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows left to produce
          test    eax, eax
          jle     near .done

          pcmpeqw mm6, mm6                ; mm6 = all ones
          mov     edx, 0x00020001         ; two words: {1, 2}
          psrlw   mm6, BYTE_BIT           ; mm6 = 0x00FF in every word (low-byte mask)
          movd    mm7, edx
          punpckldq mm7, mm7              ; mm7 = bias words {1, 2, 1, 2}

          mov     edi, JSAMPARRAY [output_data(ebp)]  ; edi -> current output row pointer
          mov     esi, JSAMPARRAY [input_data(ebp)]   ; esi -> current input row-pair pointer

          alignx  16,7
  .next_row:
          push    esi                     ; row-pointer cursors and the column
          push    edi                     ; count are reused inside the inner
          push    ecx                     ; loop, so stash all three

          mov     edi, JSAMPROW [edi]                     ; edi = output write cursor
          mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = upper input row cursor
          mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = lower input row cursor

          alignx  16,7
  .next_group:
          movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]   ; upper row, first half
          movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]   ; upper row, second half
          movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]   ; lower row, first half
          movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]   ; lower row, second half

          ; horizontal pair sums for the first half of both rows
          movq    mm4, mm0
          movq    mm5, mm1
          psrlw   mm4, BYTE_BIT           ; samples at odd offsets
          psrlw   mm5, BYTE_BIT
          pand    mm0, mm6                ; samples at even offsets
          pand    mm1, mm6
          paddw   mm0, mm4
          paddw   mm1, mm5

          ; horizontal pair sums for the second half (mm4/mm5 are free again)
          movq    mm4, mm2
          movq    mm5, mm3
          psrlw   mm4, BYTE_BIT
          psrlw   mm5, BYTE_BIT
          pand    mm2, mm6
          pand    mm3, mm6
          paddw   mm2, mm4
          paddw   mm3, mm5

          ; fold the two rows together, round, and scale
          paddw   mm0, mm1                ; 2x2 sums, first half
          paddw   mm0, mm7
          paddw   mm2, mm3                ; 2x2 sums, second half
          paddw   mm2, mm7
          psrlw   mm0, 2                  ; divide by four
          psrlw   mm2, 2

          packuswb mm0, mm2               ; narrow the eight words back to bytes
          movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

          add     edi, byte 1*SIZEOF_MMWORD
          add     edx, byte 2*SIZEOF_MMWORD
          add     esi, byte 2*SIZEOF_MMWORD
          sub     ecx, byte SIZEOF_MMWORD ; columns still to write in this row
          jnz     near .next_group

          pop     ecx
          pop     edi
          pop     esi

          add     edi, byte 1*SIZEOF_JSAMPROW ; next output row
          add     esi, byte 2*SIZEOF_JSAMPROW ; skip the two input rows just consumed
          dec     eax
          jg      near .next_row

          emms                            ; leave the FPU/MMX state clean

  .done:
          pop     edi
          pop     esi
          pop     ebp
          ret
  258. ; For some reason, the OS X linker does not honor the request to align the
  259. ; segment unless we do this.
  260. align 16 ; pad the end of this file's code out to a 16-byte boundary