jquant-sse.asm 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. ;
  2. ; jquant.asm - sample data conversion and quantization (SSE & MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on the x86 SIMD extension for IJG JPEG library
  7. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  8. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  9. ;
  10. ; This file should be assembled with NASM (Netwide Assembler),
  11. ; can *not* be assembled with Microsoft's MASM or any compatible
  12. ; assembler (including Borland's Turbo Assembler).
  13. ; NASM is available from http://nasm.sourceforge.net/ or
  14. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  15. ;
  16. ; [TAB8]
  17. %include "jsimdext.inc"
  18. %include "jdct.inc"
  19. ; --------------------------------------------------------------------------
  SECTION SEG_TEXT
  BITS    32

  ; --------------------------------------------------------------------------
  ; jsimd_convsamp_float_sse
  ;
  ;   GLOBAL(void)
  ;   jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
  ;                             FAST_FLOAT *workspace);
  ;
  ; Purpose:
  ;   Load sample data into the workspace, applying the unsigned->signed
  ;   conversion on the way.  For every row pointer in sample_data, eight
  ;   bytes starting at column start_col are read, 0x80 is subtracted from
  ;   each byte, the results are sign-extended to 32-bit integers, converted
  ;   to single-precision floats and written to workspace.
  ;
  ; Calling convention (32-bit, arguments on the stack, ebp frame):
  ;   [ebp+8]   sample_data
  ;   [ebp+12]  start_col
  ;   [ebp+16]  workspace
  ;
  ; Registers:
  ;   ebx, esi, edi are saved and restored here.
  ;   eax, ecx, edx are used without being saved (need not be preserved).
  ;   mm0-mm7 and xmm0-xmm7 are overwritten; emms is issued before return.
  ;
  ; Notes:
  ;   * Two rows are handled per iteration, DCTSIZE/2 iterations in total.
  ;   * workspace is written with movaps, so it has to be 16-byte aligned.
  ;   * NOTE(review): the 8-byte movq per row presumably relies on JSAMPLE
  ;     being one byte and DCTSIZE being 8 -- confirm against jdct.inc.
  ;   * NOTE(review): XMMBLOCK(m,n,base,size) comes from the include files;
  ;     it presumably addresses the n-th group of four elements in row m.
  ;   * In the lane comments below '*' marks a byte whose content does not
  ;     matter: it is shifted out by the following psrad.
  ; --------------------------------------------------------------------------

  %define sample_data   ebp+8           ; JSAMPARRAY sample_data
  %define start_col     ebp+12          ; JDIMENSION start_col
  %define workspace     ebp+16          ; FAST_FLOAT *workspace

          align   16
          global  EXTN(jsimd_convsamp_float_sse)

  EXTN(jsimd_convsamp_float_sse):
          ; ---- prologue: frame + the registers this routine must keep ----
          push    ebp
          mov     ebp, esp
          push    ebx
          push    esi
          push    edi
          ; (ecx and edx are deliberately not saved: need not be preserved)

          ; ---- fetch arguments and set up the loop counter ----
          mov     esi, JSAMPARRAY [sample_data]   ; esi -> current row pointer (JSAMPROW *)
          mov     eax, JDIMENSION [start_col]     ; eax =  first column to read
          mov     edi, POINTER [workspace]        ; edi -> output floats (FAST_FLOAT *)
          mov     ecx, DCTSIZE/2                  ; ecx =  row pairs still to do

          ; ---- build the centering constant without touching memory ----
          pcmpeqw  mm7, mm7                       ; every word = 0xFFFF
          psllw    mm7, 7                         ; every word = 0xFF80
          packsswb mm7, mm7                       ; every byte = 0x80 (PB_CENTERJSAMPLE)

          alignx  16,7
  .next_row_pair:
          ; Read both rows of this pair before anything is written out.
          mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; ebx -> first row  (JSAMPLE *)
          mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; edx -> second row (JSAMPLE *)
          movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
          movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

          psubb   mm0, mm7                        ; mm0 = (01234567), now signed
          psubb   mm1, mm7                        ; mm1 = (89ABCDEF), now signed

          ; ---- first row: samples 0..7 -> xmm0, xmm2 ----
          punpcklbw mm2, mm0                      ; mm2 = (*0*1*2*3)
          punpckhbw mm0, mm0                      ; mm0 = (*4*5*6*7)

          punpcklwd mm4, mm2                      ; mm4 = (***0***1)
          punpckhwd mm2, mm2                      ; mm2 = (***2***3)
          psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4 = (01) as int32
          psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2 = (23) as int32
          cvtpi2ps  xmm0, mm4                     ; xmm0 = (01**)
          cvtpi2ps  xmm1, mm2                     ; xmm1 = (23**)

          punpcklwd mm5, mm0                      ; mm5 = (***4***5)
          punpckhwd mm0, mm0                      ; mm0 = (***6***7)
          psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5 = (45) as int32
          psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0 = (67) as int32
          cvtpi2ps  xmm2, mm5                     ; xmm2 = (45**)
          cvtpi2ps  xmm3, mm0                     ; xmm3 = (67**)

          movlhps   xmm0, xmm1                    ; xmm0 = (0123)
          movlhps   xmm2, xmm3                    ; xmm2 = (4567)

          ; ---- second row: samples 8..F -> xmm4, xmm6 ----
          punpcklbw mm3, mm1                      ; mm3 = (*8*9*A*B)
          punpckhbw mm1, mm1                      ; mm1 = (*C*D*E*F)

          punpcklwd mm6, mm3                      ; mm6 = (***8***9)
          punpckhwd mm3, mm3                      ; mm3 = (***A***B)
          psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6 = (89) as int32
          psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3 = (AB) as int32
          cvtpi2ps  xmm4, mm6                     ; xmm4 = (89**)
          cvtpi2ps  xmm5, mm3                     ; xmm5 = (AB**)

          punpcklwd mm4, mm1                      ; mm4 = (***C***D)  (mm4 is free again)
          punpckhwd mm1, mm1                      ; mm1 = (***E***F)
          psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4 = (CD) as int32
          psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1 = (EF) as int32
          cvtpi2ps  xmm6, mm4                     ; xmm6 = (CD**)
          cvtpi2ps  xmm7, mm1                     ; xmm7 = (EF**)

          movlhps   xmm4, xmm5                    ; xmm4 = (89AB)
          movlhps   xmm6, xmm7                    ; xmm6 = (CDEF)

          ; ---- write the sixteen floats of this row pair ----
          movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
          movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
          movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
          movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

          ; ---- step to the next two rows ----
          add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; output: two rows further
          add     esi, byte 2*SIZEOF_JSAMPROW             ; input: two row pointers further
          dec     ecx
          jnz     near .next_row_pair

          emms                                    ; leave MMX state clean for FPU users

          ; ---- epilogue: mirror of the prologue ----
          pop     edi
          pop     esi
          pop     ebx
          pop     ebp
          ret
  105. ; --------------------------------------------------------------------------
  ; --------------------------------------------------------------------------
  ; jsimd_quantize_float_sse
  ;
  ;   GLOBAL(void)
  ;   jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
  ;                             FAST_FLOAT *workspace);
  ;
  ; Purpose:
  ;   Quantize/descale the coefficients and store them into coef_block.
  ;   Each float in workspace is multiplied by the float at the same
  ;   position in divisors, converted to a 32-bit integer (cvtps2pi),
  ;   narrowed to 16 bits with signed saturation (packssdw) and stored.
  ;
  ; Calling convention (32-bit, arguments on the stack, ebp frame):
  ;   [ebp+8]   coef_block
  ;   [ebp+12]  divisors
  ;   [ebp+16]  workspace
  ;
  ; Registers:
  ;   esi, edi are saved and restored here.
  ;   eax, edx are used without being saved; ebx and ecx are not touched.
  ;   mm0-mm7 and xmm0-xmm7 are overwritten; emms is issued before return.
  ;
  ; Notes:
  ;   * Sixteen coefficients are handled per iteration, DCTSIZE2/16
  ;     iterations in total.
  ;   * workspace and divisors are accessed with movaps / mulps memory
  ;     operands, so both have to be 16-byte aligned.
  ;   * NOTE(review): since the values are multiplied, divisors presumably
  ;     holds reciprocals of the quantization steps -- confirm with caller.
  ;   * NOTE(review): XMMBLOCK / MMBLOCK come from the include files; they
  ;     presumably address the n-th group of four elements in row m.
  ; --------------------------------------------------------------------------

  %define coef_block    ebp+8           ; JCOEFPTR coef_block
  %define divisors      ebp+12          ; FAST_FLOAT *divisors
  %define workspace     ebp+16          ; FAST_FLOAT *workspace

          align   16
          global  EXTN(jsimd_quantize_float_sse)

  EXTN(jsimd_quantize_float_sse):
          ; ---- prologue ----
          push    ebp
          mov     ebp, esp
          push    esi
          push    edi
          ; (ebx/ecx are unused; edx need not be preserved)

          ; ---- fetch arguments and set up the loop counter ----
          mov     edi, JCOEFPTR [coef_block]      ; edi -> output coefficients
          mov     edx, POINTER [divisors]         ; edx -> per-coefficient multipliers
          mov     esi, POINTER [workspace]        ; esi -> input floats
          mov     eax, DCTSIZE2/16                ; eax =  16-element groups still to do

          alignx  16,7
  .next_group:
          ; ---- load four vectors of four floats each ----
          movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
          movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
          movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
          movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]

          ; ---- scale each by the matching divisor entries ----
          mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
          mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
          mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
          mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

          ; ---- cvtps2pi only sees the low two lanes, so copy the upper
          ;      two lanes of every vector into a scratch register ----
          movhlps xmm4, xmm0
          movhlps xmm5, xmm1
          movhlps xmm6, xmm2
          movhlps xmm7, xmm3

          ; ---- float -> int32, lower halves ----
          cvtps2pi mm0, xmm0
          cvtps2pi mm1, xmm1
          cvtps2pi mm2, xmm2
          cvtps2pi mm3, xmm3

          ; ---- float -> int32, upper halves ----
          cvtps2pi mm4, xmm4
          cvtps2pi mm5, xmm5
          cvtps2pi mm6, xmm6
          cvtps2pi mm7, xmm7

          ; ---- int32 -> int16 with signed saturation; each result holds
          ;      four coefficients (lower pair first, then upper pair) ----
          packssdw mm0, mm4
          packssdw mm1, mm5
          packssdw mm2, mm6
          packssdw mm3, mm7

          ; ---- write the sixteen coefficients of this group ----
          movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
          movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
          movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
          movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

          ; ---- step all three pointers by sixteen elements ----
          add     edi, byte 16*SIZEOF_JCOEF
          add     edx, byte 16*SIZEOF_FAST_FLOAT
          add     esi, byte 16*SIZEOF_FAST_FLOAT
          dec     eax
          jnz     short .next_group

          emms                                    ; leave MMX state clean for FPU users

          ; ---- epilogue: mirror of the prologue ----
          pop     edi
          pop     esi
          pop     ebp
          ret

  ; The OS X linker does not honor the segment alignment request unless the
  ; end of the segment is padded explicitly, hence this trailing directive.
          align   16