;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%define program_name vpx

%include "third_party/x86inc/x86inc.asm"

SECTION .text

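; All kernels below compute a sum of absolute differences (SAD) between a
; source block and a reference block. As a rough scalar reference (an
; illustration only; the name is hypothetical and this snippet is not part
; of this file or its build):
;
;   unsigned int scalar_sad(const uint8_t *src, int src_stride,
;                           const uint8_t *ref, int ref_stride,
;                           int w, int h) {
;     unsigned int sad = 0;
;     for (int y = 0; y < h; y++) {
;       for (int x = 0; x < w; x++)
;         sad += src[x] > ref[x] ? src[x] - ref[x] : ref[x] - src[x];
;       src += src_stride;
;       ref += ref_stride;
;     }
;     return sad;
;   }
;
; The *_avg variants first average the reference bytes with a second
; predictor (pavgb against second_pred) before taking the SAD against src.
;
; SAD_FN emits the prologue shared by all block sizes:
;   %1, %2 = block width and height (used to form the sad%1x%2 symbol name)
;   %3     = number of general-purpose registers requested from cglobal
;            (5, or 7 when the src_stride3/ref_stride3 registers are needed)
;   %4     = 0 for plain SAD, 1 for the *_avg variant taking second_pred
; It also sign-extends the 32-bit stride arguments (movsxdifnidn) and, in
; the 7-register case, precomputes 3*stride in the *_stride3 registers.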
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea          src_stride3q, [src_strideq*3]
  lea          ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vpx_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
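;
; Each .loop iteration covers one 64-pixel row: four unaligned 16-byte
; loads from ref (averaged with second_pred via pavgb in the avg case),
; psadbw against the matching src bytes, and the four partial sums are
; folded into m0. After the loop, movhlps/paddd combine the two 64-bit
; halves of m0 and the total is returned in eax.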
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov          n_rowsd, %1
  pxor         m0, m0

.loop:
  movu         m1, [refq]
  movu         m2, [refq+16]
  movu         m3, [refq+32]
  movu         m4, [refq+48]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+16]
  psadbw       m3, [srcq+32]
  psadbw       m4, [srcq+48]
  paddd        m1, m2
  paddd        m3, m4
  add          refq, ref_strideq
  paddd        m0, m1
  add          srcq, src_strideq
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64    ; sad64x64_sse2
SAD64XN 32    ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vpx_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
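;
; Same scheme as SAD64XN, but each .loop iteration covers two 32-pixel
; rows (two 16-byte loads per row), so the row counter is preloaded with
; height/2 and the src/ref pointers advance by 2*stride per iteration.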
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov          n_rowsd, %1/2
  pxor         m0, m0

.loop:
  movu         m1, [refq]
  movu         m2, [refq+16]
  movu         m3, [refq+ref_strideq]
  movu         m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+16]
  psadbw       m3, [srcq+src_strideq]
  psadbw       m4, [srcq+src_strideq+16]
  paddd        m1, m2
  paddd        m3, m4
  lea          refq, [refq+ref_strideq*2]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*2]
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64    ; sad32x64_sse2
SAD32XN 32    ; sad32x32_sse2
SAD32XN 16    ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vpx_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
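;
; Each .loop iteration covers four 16-pixel rows, addressed through
; ref_strideq, ref_strideq*2 and the precomputed ref_stride3q (and the
; equivalents for src); the row counter is height/4 and both pointers
; advance by 4*stride per iteration.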
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov          n_rowsd, %1/4
  pxor         m0, m0

.loop:
  movu         m1, [refq]
  movu         m2, [refq+ref_strideq]
  movu         m3, [refq+ref_strideq*2]
  movu         m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+src_strideq]
  psadbw       m3, [srcq+src_strideq*2]
  psadbw       m4, [srcq+src_stride3q]
  paddd        m1, m2
  paddd        m3, m4
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32    ; sad16x32_sse2
SAD16XN 16    ; sad16x16_sse2
SAD16XN 8     ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1  ; sad16x8_avg_sse2

; unsigned int vpx_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
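;
; Rows are only 8 bytes wide, so movh/movhps pack two rows into a single
; xmm register; each .loop iteration therefore handles four rows in two
; registers. The src rows are packed the same way into registers before
; psadbw, since one 16-byte operand spans two separate rows.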
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov          n_rowsd, %1/4
  pxor         m0, m0

.loop:
  movh         m1, [refq]
  movhps       m1, [refq+ref_strideq]
  movh         m2, [refq+ref_strideq*2]
  movhps       m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  lea          second_predq, [second_predq+mmsize*2]
%endif
  movh         m3, [srcq]
  movhps       m3, [srcq+src_strideq]
  movh         m4, [srcq+src_strideq*2]
  movhps       m4, [srcq+src_stride3q]
  psadbw       m1, m3
  psadbw       m2, m4
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  paddd        m0, m2
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16    ; sad8x16_sse2
SAD8XN 8     ; sad8x8_sse2
SAD8XN 4     ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1  ; sad8x8_avg_sse2
SAD8XN 4, 1  ; sad8x4_avg_sse2

; unsigned int vpx_sad4x{4,8}_sse(uint8_t *src, int src_stride,
;                                 uint8_t *ref, int ref_stride);
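;
; 4-byte rows: four movd loads are packed into pairs with punpckldq so
; each psadbw covers two rows. This variant is built with INIT_MMX sse,
; so the m# registers are 8-byte MMX registers (mmsize == 8) and the
; result already sits in the low dword of m0; no movhlps fold is needed
; before the final movd.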
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov          n_rowsd, %1/4
  pxor         m0, m0

.loop:
  movd         m1, [refq]
  movd         m2, [refq+ref_strideq]
  movd         m3, [refq+ref_strideq*2]
  movd         m4, [refq+ref_stride3q]
  punpckldq    m1, m2
  punpckldq    m3, m4
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m3, [second_predq+mmsize*1]
  lea          second_predq, [second_predq+mmsize*2]
%endif
  movd         m2, [srcq]
  movd         m5, [srcq+src_strideq]
  movd         m4, [srcq+src_strideq*2]
  movd         m6, [srcq+src_stride3q]
  punpckldq    m2, m5
  punpckldq    m4, m6
  psadbw       m1, m2
  psadbw       m3, m4
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movd         eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN 8    ; sad4x8_sse
SAD4XN 4    ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse