rotate_win.cc 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. /*
  2. * Copyright 2013 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. #include "libyuv/rotate_row.h"
  12. #ifdef __cplusplus
  13. namespace libyuv {
  14. extern "C" {
  15. #endif
  16. // This module is for 32 bit Visual C x86 and clangcl
  17. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
  // TransposeWx8_SSSE3: transposes an 8-row strip of bytes, 8 columns per pass.
  // Each pass of convertloop reads 8 bytes from each of 8 consecutive source
  // rows (src_stride apart) and writes 8 destination rows (dst_stride apart)
  // of 8 bytes each; destination row j receives column j of the source block.
  // Three interleave rounds (byte, word, dword) perform the shuffle. palignr
  // with identical source and destination and a shift of 8 swaps the two
  // 64-bit halves of a register, exposing the upper half to the next
  // punpckl* / movq.
  // Register roles: eax = source cursor, edi = src_stride, edx = destination
  // cursor, esi = dst_stride, ecx = remaining width, ebp = source address for
  // the next pass.
  // The function is naked: the asm block saves and restores edi, esi and ebp
  // itself and reads its arguments relative to esp.
  // The width test sits at the bottom of the loop, so one pass always runs.
  18. __declspec(naked)
  19. void TransposeWx8_SSSE3(const uint8* src, int src_stride,
  20. uint8* dst, int dst_stride, int width) {
  21. __asm {
  22. push edi // edi, esi and ebp are used below and popped before ret
  23. push esi
  24. push ebp
  25. mov eax, [esp + 12 + 4] // src (12 = the three pushes above)
  26. mov edi, [esp + 12 + 8] // src_stride
  27. mov edx, [esp + 12 + 12] // dst
  28. mov esi, [esp + 12 + 16] // dst_stride
  29. mov ecx, [esp + 12 + 20] // width
  30. // Read in the data from the source pointer.
  31. // First round: interleave the bytes of each pair of adjacent rows.
  32. align 4
  33. convertloop:
  34. movq xmm0, qword ptr [eax] // row 0, 8 bytes
  35. lea ebp, [eax + 8] // remember where the next 8 columns start
  36. movq xmm1, qword ptr [eax + edi] // row 1
  37. lea eax, [eax + 2 * edi] // step down two rows
  38. punpcklbw xmm0, xmm1 // xmm0 = rows 0/1 interleaved bytewise
  39. movq xmm2, qword ptr [eax] // row 2
  40. movdqa xmm1, xmm0
  41. palignr xmm1, xmm1, 8 // xmm1 low half = upper half of xmm0 (columns 4-7)
  42. movq xmm3, qword ptr [eax + edi] // row 3
  43. lea eax, [eax + 2 * edi]
  44. punpcklbw xmm2, xmm3 // xmm2 = rows 2/3 interleaved bytewise
  45. movdqa xmm3, xmm2
  46. movq xmm4, qword ptr [eax] // row 4
  47. palignr xmm3, xmm3, 8 // xmm3 low half = rows 2/3, columns 4-7
  48. movq xmm5, qword ptr [eax + edi] // row 5
  49. punpcklbw xmm4, xmm5 // xmm4 = rows 4/5 interleaved bytewise
  50. lea eax, [eax + 2 * edi]
  51. movdqa xmm5, xmm4
  52. movq xmm6, qword ptr [eax] // row 6
  53. palignr xmm5, xmm5, 8 // xmm5 low half = rows 4/5, columns 4-7
  54. movq xmm7, qword ptr [eax + edi] // row 7
  55. punpcklbw xmm6, xmm7 // xmm6 = rows 6/7 interleaved bytewise
  56. mov eax, ebp // source cursor = first row, next 8 columns
  57. movdqa xmm7, xmm6
  58. palignr xmm7, xmm7, 8 // xmm7 low half = rows 6/7, columns 4-7
  59. // Second round: interleave 16-bit words to combine groups of four rows.
  60. punpcklwd xmm0, xmm2 // rows 0-3, columns 0-3
  61. punpcklwd xmm1, xmm3 // rows 0-3, columns 4-7
  62. movdqa xmm2, xmm0
  63. movdqa xmm3, xmm1
  64. palignr xmm2, xmm2, 8 // xmm2 low half = rows 0-3, columns 2-3
  65. palignr xmm3, xmm3, 8 // xmm3 low half = rows 0-3, columns 6-7
  66. punpcklwd xmm4, xmm6 // rows 4-7, columns 0-3
  67. punpcklwd xmm5, xmm7 // rows 4-7, columns 4-7
  68. movdqa xmm6, xmm4
  69. movdqa xmm7, xmm5
  70. palignr xmm6, xmm6, 8 // xmm6 low half = rows 4-7, columns 2-3
  71. palignr xmm7, xmm7, 8 // xmm7 low half = rows 4-7, columns 6-7
  72. // Third round: interleave dwords so each 64-bit half is one full column.
  73. // Write to the destination pointer, two destination rows per register.
  74. punpckldq xmm0, xmm4 // low half = column 0, high half = column 1
  75. movq qword ptr [edx], xmm0 // destination row 0
  76. movdqa xmm4, xmm0
  77. palignr xmm4, xmm4, 8 // bring column 1 into the low half
  78. movq qword ptr [edx + esi], xmm4 // destination row 1
  79. lea edx, [edx + 2 * esi] // step down two destination rows
  80. punpckldq xmm2, xmm6 // low half = column 2, high half = column 3
  81. movdqa xmm6, xmm2
  82. palignr xmm6, xmm6, 8
  83. movq qword ptr [edx], xmm2 // destination row 2
  84. punpckldq xmm1, xmm5 // low half = column 4, high half = column 5
  85. movq qword ptr [edx + esi], xmm6 // destination row 3
  86. lea edx, [edx + 2 * esi]
  87. movdqa xmm5, xmm1
  88. movq qword ptr [edx], xmm1 // destination row 4
  89. palignr xmm5, xmm5, 8
  90. punpckldq xmm3, xmm7 // low half = column 6, high half = column 7
  91. movq qword ptr [edx + esi], xmm5 // destination row 5
  92. lea edx, [edx + 2 * esi]
  93. movq qword ptr [edx], xmm3 // destination row 6
  94. movdqa xmm7, xmm3
  95. palignr xmm7, xmm7, 8
  96. sub ecx, 8 // 8 columns done; the flags set here feed the jg below
  97. movq qword ptr [edx + esi], xmm7 // destination row 7 (movq leaves flags alone)
  98. lea edx, [edx + 2 * esi] // lea leaves flags alone as well
  99. jg convertloop // repeat while the remaining width is > 0
  100. pop ebp // restore in reverse push order
  101. pop esi
  102. pop edi
  103. ret
  104. }
  105. }
  // TransposeUVWx8_SSE2: transposes an 8-row strip of byte-interleaved data
  // and splits it into two destinations.
  // Each pass of convertloop reads 16 bytes from each of 8 consecutive source
  // rows (src_stride apart). After three interleave rounds (byte, word,
  // dword) every register holds one even-indexed source column in its low
  // 64 bits and the following odd-indexed column in its high 64 bits; the low
  // half is stored to dst_a and the high half to dst_b. One pass writes 8 rows
  // of 8 bytes to each destination and decrements w by 8.
  // Register roles: eax = source cursor, edi = src_stride, edx / esi = dst_a
  // cursor / stride, ebx / ebp = dst_b cursor / stride, ecx = remaining w.
  // All eight xmm registers are live, so a 16-byte stack slot at [esp] is used
  // to spill one register at a time.
  // The function is naked: the asm block saves and restores ebx, esi, edi,
  // ebp and esp itself. The w test sits at the bottom of the loop, so one pass
  // always runs.
  106. __declspec(naked)
  107. void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
  108. uint8* dst_a, int dst_stride_a,
  109. uint8* dst_b, int dst_stride_b,
  110. int w) {
  111. __asm {
  112. push ebx // ebx, esi, edi and ebp are used below and popped before ret
  113. push esi
  114. push edi
  115. push ebp
  116. mov eax, [esp + 16 + 4] // src (16 = the four pushes above)
  117. mov edi, [esp + 16 + 8] // src_stride
  118. mov edx, [esp + 16 + 12] // dst_a
  119. mov esi, [esp + 16 + 16] // dst_stride_a
  120. mov ebx, [esp + 16 + 20] // dst_b
  121. mov ebp, [esp + 16 + 24] // dst_stride_b
  122. mov ecx, esp // keep the unaligned esp so it can be restored
  123. sub esp, 4 + 16 // 4 bytes for the saved esp + 16-byte spill slot
  124. and esp, ~15 // round esp down to a 16-byte boundary
  125. mov [esp + 16], ecx // saved esp sits just above the spill slot
  126. mov ecx, [ecx + 16 + 28] // w (ecx still holds the pre-alignment esp)
  127. align 4
  128. convertloop:
  129. // Read in the data from the source pointer.
  130. // First round: interleave the bytes of each pair of adjacent rows.
  131. movdqu xmm0, [eax] // row 0, 16 bytes
  132. movdqu xmm1, [eax + edi] // row 1
  133. lea eax, [eax + 2 * edi] // step down two rows
  134. movdqa xmm7, xmm0 // use xmm7 as temp register.
  135. punpcklbw xmm0, xmm1 // xmm0 = rows 0/1, source bytes 0-7
  136. punpckhbw xmm7, xmm1
  137. movdqa xmm1, xmm7 // xmm1 = rows 0/1, source bytes 8-15
  138. movdqu xmm2, [eax] // row 2
  139. movdqu xmm3, [eax + edi] // row 3
  140. lea eax, [eax + 2 * edi]
  141. movdqa xmm7, xmm2
  142. punpcklbw xmm2, xmm3 // xmm2 = rows 2/3, source bytes 0-7
  143. punpckhbw xmm7, xmm3
  144. movdqa xmm3, xmm7 // xmm3 = rows 2/3, source bytes 8-15
  145. movdqu xmm4, [eax] // row 4
  146. movdqu xmm5, [eax + edi] // row 5
  147. lea eax, [eax + 2 * edi]
  148. movdqa xmm7, xmm4
  149. punpcklbw xmm4, xmm5 // xmm4 = rows 4/5, source bytes 0-7
  150. punpckhbw xmm7, xmm5
  151. movdqa xmm5, xmm7 // xmm5 = rows 4/5, source bytes 8-15
  152. movdqu xmm6, [eax] // row 6
  153. movdqu xmm7, [eax + edi] // row 7
  154. lea eax, [eax + 2 * edi] // eax is now 8 rows below the pass start
  155. movdqu [esp], xmm5 // backup xmm5 (no free xmm register left)
  156. neg edi // temporarily -src_stride for the lea below
  157. movdqa xmm5, xmm6 // use xmm5 as temp register.
  158. punpcklbw xmm6, xmm7 // xmm6 = rows 6/7, source bytes 0-7
  159. punpckhbw xmm5, xmm7
  160. movdqa xmm7, xmm5 // xmm7 = rows 6/7, source bytes 8-15
  161. lea eax, [eax + 8 * edi + 16] // back up 8 rows, advance 16 bytes
  162. neg edi // restore +src_stride
  163. // Second round: interleave 16-bit words to combine groups of four rows.
  164. movdqa xmm5, xmm0
  165. punpcklwd xmm0, xmm2 // xmm0 = rows 0-3, source bytes 0-3
  166. punpckhwd xmm5, xmm2
  167. movdqa xmm2, xmm5 // xmm2 = rows 0-3, source bytes 4-7
  168. movdqa xmm5, xmm1
  169. punpcklwd xmm1, xmm3 // xmm1 = rows 0-3, source bytes 8-11
  170. punpckhwd xmm5, xmm3
  171. movdqa xmm3, xmm5 // xmm3 = rows 0-3, source bytes 12-15
  172. movdqa xmm5, xmm4
  173. punpcklwd xmm4, xmm6 // xmm4 = rows 4-7, source bytes 0-3
  174. punpckhwd xmm5, xmm6
  175. movdqa xmm6, xmm5 // xmm6 = rows 4-7, source bytes 4-7
  176. movdqu xmm5, [esp] // restore xmm5
  177. movdqu [esp], xmm6 // backup xmm6
  178. movdqa xmm6, xmm5 // use xmm6 as temp register.
  179. punpcklwd xmm5, xmm7 // xmm5 = rows 4-7, source bytes 8-11
  180. punpckhwd xmm6, xmm7
  181. movdqa xmm7, xmm6 // xmm7 = rows 4-7, source bytes 12-15
  182. // Third round: interleave dwords so each 64-bit half is one full column.
  183. // Write to the destination pointers: low half to dst_a, high half to dst_b.
  184. movdqa xmm6, xmm0
  185. punpckldq xmm0, xmm4 // low = source byte 0 column, high = byte 1 column
  186. punpckhdq xmm6, xmm4
  187. movdqa xmm4, xmm6 // low = source byte 2 column, high = byte 3 column
  188. movdqu xmm6, [esp] // restore xmm6
  189. movlpd qword ptr [edx], xmm0 // dst_a row 0
  190. movhpd qword ptr [ebx], xmm0 // dst_b row 0
  191. movlpd qword ptr [edx + esi], xmm4 // dst_a row 1
  192. lea edx, [edx + 2 * esi] // step dst_a down two rows
  193. movhpd qword ptr [ebx + ebp], xmm4 // dst_b row 1
  194. lea ebx, [ebx + 2 * ebp] // step dst_b down two rows
  195. movdqa xmm0, xmm2 // use xmm0 as the temp register.
  196. punpckldq xmm2, xmm6 // low = source byte 4 column, high = byte 5 column
  197. movlpd qword ptr [edx], xmm2 // dst_a row 2
  198. movhpd qword ptr [ebx], xmm2 // dst_b row 2
  199. punpckhdq xmm0, xmm6 // low = source byte 6 column, high = byte 7 column
  200. movlpd qword ptr [edx + esi], xmm0 // dst_a row 3
  201. lea edx, [edx + 2 * esi]
  202. movhpd qword ptr [ebx + ebp], xmm0 // dst_b row 3
  203. lea ebx, [ebx + 2 * ebp]
  204. movdqa xmm0, xmm1 // use xmm0 as the temp register.
  205. punpckldq xmm1, xmm5 // low = source byte 8 column, high = byte 9 column
  206. movlpd qword ptr [edx], xmm1 // dst_a row 4
  207. movhpd qword ptr [ebx], xmm1 // dst_b row 4
  208. punpckhdq xmm0, xmm5 // low = source byte 10 column, high = byte 11 column
  209. movlpd qword ptr [edx + esi], xmm0 // dst_a row 5
  210. lea edx, [edx + 2 * esi]
  211. movhpd qword ptr [ebx + ebp], xmm0 // dst_b row 5
  212. lea ebx, [ebx + 2 * ebp]
  213. movdqa xmm0, xmm3 // use xmm0 as the temp register.
  214. punpckldq xmm3, xmm7 // low = source byte 12 column, high = byte 13 column
  215. movlpd qword ptr [edx], xmm3 // dst_a row 6
  216. movhpd qword ptr [ebx], xmm3 // dst_b row 6
  217. punpckhdq xmm0, xmm7 // low = source byte 14 column, high = byte 15 column
  218. sub ecx, 8 // 8 byte pairs done; the flags set here feed the jg below
  219. movlpd qword ptr [edx + esi], xmm0 // dst_a row 7 (stores and lea leave flags alone)
  220. lea edx, [edx + 2 * esi]
  221. movhpd qword ptr [ebx + ebp], xmm0 // dst_b row 7
  222. lea ebx, [ebx + 2 * ebp]
  223. jg convertloop // repeat while the remaining w is > 0
  224. mov esp, [esp + 16] // drop the aligned scratch area, restore the original esp
  225. pop ebp // restore in reverse push order
  226. pop edi
  227. pop esi
  228. pop ebx
  229. ret
  230. }
  231. }
  232. #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
  233. #ifdef __cplusplus
  234. } // extern "C"
  235. } // namespace libyuv
  236. #endif