ImageScalingSSE2.cpp
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or in the case of a and b: (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * carries of the individual numbers, since the sum of 3 bits can only ever
 * have a carry of one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1, or,
 * assuming overflows and underflows are avoided, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4 input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 * avg = ((sum >> 1) + carry + d >> 1) >> 1
 * avg = ((a + b + c) >> 1 + d >> 1) >> 1
 * avg = ((a + b + c + d) >> 2)
 *
 * An additional fact used in the SSE versions is that we can trivially
 * convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
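
/* As a small illustrative sketch of the rounded-to-truncated conversion above
 * (the helper names here are for illustration only and are not used by the
 * scalers below), the scalar per-byte equivalent could look like this, with
 * ScalarRoundedAvg standing in for what _mm_avg_epu8 does on each byte:
 */
MOZ_ALWAYS_INLINE uint8_t ScalarRoundedAvg(uint8_t a, uint8_t b)
{
  // f(a, b) = (a + b + 1) >> 1, e.g. f(3, 4) == 4.
  return uint8_t((a + b + 1) >> 1);
}

MOZ_ALWAYS_INLINE uint8_t ScalarTruncatedAvg(uint8_t a, uint8_t b)
{
  // g(a, b) = ~f(~a, ~b) == (a + b) >> 1, e.g. g(3, 4) == 3.
  return uint8_t(~ScalarRoundedAvg(uint8_t(~a), uint8_t(~b)));
}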

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here; MSVC does not allow passing more than 3
 * __m128i arguments on the stack, and it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010; it does -not- inline
 * with just the inline directive.
 */
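
/* Produces 4 output pixels from two rows of 8 source pixels: a and b hold 8
 * consecutive pixels of the upper row, c and d the corresponding pixels of
 * the lower row. The shuffles below first separate the even- and odd-indexed
 * pixels of each row, so each 32-bit lane of the result is the average of a
 * 2x2 block (see the derivation at the top of the file).
 */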
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)))

  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));
  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}
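
// Truncated per-byte average of two registers holding 4 pixels each; used by
// the vertical-only halving pass below.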
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}
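
// Averages the 4 horizontal pixel pairs contained in 8 consecutive pixels
// (a holds pixels 0-3, b holds pixels 4-7); used by the horizontal-only
// halving pass below.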
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}
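
// Scalar fallback: byte-wise average of four packed 32-bit pixels, using the
// sum/carry scheme described at the top of the file.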
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Not having a byte based average instruction means we should mask to avoid
  // underflow.
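  // For example, with single bytes a = 0x10, b = 0x20, c = 0x30 and d = 0x40:
  // sum == 0x00 and carry == 0x30, the first average below yields 0x20 and
  // the second yields 0x28, which is (0x10 + 0x20 + 0x30 + 0x40) >> 2.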
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}

// Simple 2-pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}

namespace mozilla {
namespace gfx {
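
// Halves the image in both dimensions: each destination pixel is the average
// of a 2x2 block of source pixels.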
void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;

    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
    //
    // Potentially we only have to do this in the last row, since overflowing
    // by 8 pixels in an earlier row appears to be harmless: it doesn't touch
    // invalid memory, even when reading from and writing to the same surface.
    // In practice we only do this when doing an additional downscale pass, and
    // in that situation we have unused stride to write into harmlessly.
    // I do not believe the additional code complexity would be worth it though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}
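
// Halves the image vertically only: each destination pixel is the average of
// two vertically adjacent source pixels.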
void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // This line doesn't align well.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;

    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The overflow considerations from the previous function apply here as well.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}
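
// Halves the image horizontally only: each destination pixel is the average
// of two horizontally adjacent source pixels.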
void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;

    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The overflow considerations from the previous functions apply here as well.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

} // namespace gfx
} // namespace mozilla