compare_win.cc

/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    pxor       xmm0, xmm0
    pxor       xmm5, xmm5

  wloop:
    movdqu     xmm1, [eax]
    lea        eax, [eax + 16]
    movdqu     xmm2, [edx]
    lea        edx, [edx + 16]
    movdqa     xmm3, xmm1  // abs trick
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    jg         wloop

    pshufd     xmm1, xmm0, 0xee
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 0x01
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
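// Reference sketch (illustrative, not compiled): a scalar equivalent of what
// the SIMD loops compute, i.e. the sum of squared byte differences. The name
// SumSquareError_Scalar is hypothetical; the SIMD versions assume count is a
// multiple of 16 (SSE2) or 32 (AVX2).
#if 0
static uint32 SumSquareError_Scalar(const uint8* src_a, const uint8* src_b,
                                    int count) {
  uint32 sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];
    sse += (uint32)(diff * diff);  // accumulate (a - b)^2
  }
  return sse;
}
#endif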
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)

__declspec(naked)
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    vpxor      ymm0, ymm0, ymm0  // sum
    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    sub        edx, eax

  wloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + edx]
    lea        eax, [eax + 32]
    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
    vpsubusb   ymm2, ymm2, ymm1
    vpor       ymm1, ymm2, ymm3
    vpunpcklbw ymm2, ymm1, ymm5  // u16. mutates order.
    vpunpckhbw ymm1, ymm1, ymm5
    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
    vpmaddwd   ymm1, ymm1, ymm1
    vpaddd     ymm0, ymm0, ymm1
    vpaddd     ymm0, ymm0, ymm2
    sub        ecx, 32
    jg         wloop

    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpermq     ymm1, ymm0, 0x02  // high + low lane.
    vpaddd     ymm0, ymm0, ymm1
    vmovd      eax, xmm0
    vzeroupper
    ret
  }
}
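// Reference sketch (illustrative, not compiled): one iteration of the AVX2
// loop above expressed with intrinsics. |a - b| is formed with saturating
// subtracts in both directions, widened to u16 against zero, then pmaddwd
// squares and pairwise-adds into u32 accumulators. The function name is
// hypothetical; it would require <immintrin.h>.
#if 0
static __m256i SumSquareError_AVX2_Step(__m256i sum, __m256i a, __m256i b) {
  const __m256i zero = _mm256_setzero_si256();
  __m256i diff = _mm256_or_si256(_mm256_subs_epu8(a, b),
                                 _mm256_subs_epu8(b, a));  // |a - b|
  __m256i lo = _mm256_unpacklo_epi8(diff, zero);  // low bytes of each lane
  __m256i hi = _mm256_unpackhi_epi8(diff, zero);  // high bytes of each lane
  sum = _mm256_add_epi32(sum, _mm256_madd_epi16(lo, lo));  // square + pair add
  sum = _mm256_add_epi32(sum, _mm256_madd_epi16(hi, hi));
  return sum;
}
#endif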
#endif  // _MSC_VER >= 1700

uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16

uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
  0xa3476dc1,  // 33 ^ 14
  0x3b4039a1,  // 33 ^ 13
  0x4f5f0981,  // 33 ^ 12
};
uvec32 kHashMul1 = {
  0x30f35d61,  // 33 ^ 11
  0x855cb541,  // 33 ^ 10
  0x040a9121,  // 33 ^ 9
  0x747c7101,  // 33 ^ 8
};
uvec32 kHashMul2 = {
  0xec41d4e1,  // 33 ^ 7
  0x4cfa3cc1,  // 33 ^ 6
  0x025528a1,  // 33 ^ 5
  0x00121881,  // 33 ^ 4
};
uvec32 kHashMul3 = {
  0x00008c61,  // 33 ^ 3
  0x00000441,  // 33 ^ 2
  0x00000021,  // 33 ^ 1
  0x00000001,  // 33 ^ 0
};
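// The tables above hold the powers of 33 needed to fold 16 bytes of the
// Bernstein (djb2) hash per iteration. Reference sketch (illustrative, not
// compiled) of the scalar hash; the name HashDjb2_Scalar is hypothetical.
#if 0
static uint32 HashDjb2_Scalar(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];  // djb2: hash = hash * 33 + c
  }
  return hash;
}
#endif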
__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
    movd       xmm0, [esp + 12]  // seed
    pxor       xmm7, xmm7        // constant 0 for unpck
    movdqa     xmm6, xmmword ptr kHash16x33

  wloop:
    movdqu     xmm1, [eax]       // src[0-15]
    lea        eax, [eax + 16]
    pmulld     xmm0, xmm6        // hash *= 33 ^ 16
    movdqa     xmm5, xmmword ptr kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7        // src[0-7]
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7        // src[0-3]
    pmulld     xmm3, xmm5
    movdqa     xmm5, xmmword ptr kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7        // src[4-7]
    pmulld     xmm4, xmm5
    movdqa     xmm5, xmmword ptr kHashMul2
    punpckhbw  xmm1, xmm7        // src[8-15]
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7        // src[8-11]
    pmulld     xmm2, xmm5
    movdqa     xmm5, xmmword ptr kHashMul3
    punpckhwd  xmm1, xmm7        // src[12-15]
    pmulld     xmm1, xmm5
    paddd      xmm3, xmm4        // add 16 results
    paddd      xmm1, xmm2
    paddd      xmm1, xmm3
    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1
    sub        ecx, 16
    jg         wloop

    movd       eax, xmm0         // return hash
    ret
  }
}
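// Reference sketch (illustrative, not compiled): the 16-byte block computed
// per iteration above. Unrolling hash = hash * 33 + c over 16 bytes gives
// hash * 33^16 + sum(src[i] * 33^(15 - i)), which is what kHash16x33 and
// kHashMul0..3 encode. The name HashDjb2_Block16 is hypothetical.
#if 0
static uint32 HashDjb2_Block16(const uint8* src, uint32 hash) {
  uint32 mul = 1u;  // 33 ^ 0
  uint32 sum = 0u;
  int i;
  for (i = 15; i >= 0; --i) {
    sum += src[i] * mul;  // src[i] * 33 ^ (15 - i)
    mul *= 33u;
  }
  return hash * mul + sum;  // mul is now 33 ^ 16 (kHash16x33)
}
#endif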
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked)
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
    vmovd      xmm0, [esp + 12]  // seed

  wloop:
    vpmovzxbd  xmm3, [eax]       // src[0-3]
    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
    vpmovzxbd  xmm4, [eax + 4]   // src[4-7]
    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
    vpmovzxbd  xmm2, [eax + 8]   // src[8-11]
    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
    lea        eax, [eax + 16]
    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
    vpaddd     xmm3, xmm3, xmm4  // add 16 results
    vpaddd     xmm1, xmm1, xmm2
    vpaddd     xmm1, xmm1, xmm3
    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
    vpaddd     xmm1, xmm1, xmm2
    vpshufd    xmm2, xmm1, 0x01
    vpaddd     xmm1, xmm1, xmm2
    vpaddd     xmm0, xmm0, xmm1
    sub        ecx, 16
    jg         wloop

    vmovd      eax, xmm0         // return hash
    vzeroupper
    ret
  }
}
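// Reference sketch (illustrative, not compiled): the vpmovzxbd loads above
// zero-extend 4 bytes straight to u32 lanes, replacing the unpack-against-zero
// steps used by the SSE4.1 version. A hypothetical intrinsics equivalent
// (would require <smmintrin.h> and <string.h>):
#if 0
static __m128i LoadU8x4AsU32x4(const uint8* p) {
  int v;
  memcpy(&v, p, 4);                                // unaligned 4-byte load
  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(v));  // pmovzxbd
}
#endif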
#endif  // _MSC_VER >= 1700

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif