BlurNEON.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. /* This Source Code Form is subject to the terms of the Mozilla Public
  2. * License, v. 2.0. If a copy of the MPL was not distributed with this
  3. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Blur.h"
#include <arm_neon.h>
#include <string.h>
  6. namespace mozilla {
  7. namespace gfx {
  8. MOZ_ALWAYS_INLINE
  9. uint16x4_t Divide(uint32x4_t aValues, uint32x2_t aDivisor)
  10. {
  11. uint64x2_t roundingAddition = vdupq_n_u64(int64_t(1) << 31);
  12. uint64x2_t multiplied21 = vmull_u32(vget_low_u32(aValues), aDivisor);
  13. uint64x2_t multiplied43 = vmull_u32(vget_high_u32(aValues), aDivisor);
  14. return vqmovn_u32(vcombine_u32(vshrn_n_u64(vaddq_u64(multiplied21, roundingAddition), 32),
  15. vshrn_n_u64(vaddq_u64(multiplied43, roundingAddition), 32)));
  16. }
  17. MOZ_ALWAYS_INLINE
  18. uint16x4_t BlurFourPixels(const uint32x4_t& aTopLeft, const uint32x4_t& aTopRight,
  19. const uint32x4_t& aBottomRight, const uint32x4_t& aBottomLeft,
  20. const uint32x2_t& aDivisor)
  21. {
  22. uint32x4_t values = vaddq_u32(vsubq_u32(vsubq_u32(aBottomRight, aTopRight), aBottomLeft), aTopLeft);
  23. return Divide(values, aDivisor);
  24. }
  25. MOZ_ALWAYS_INLINE
  26. void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
  27. int32_t aSourceWidth, int32_t aLeftInflation,
  28. int32_t aRightInflation)
  29. {
  30. int32_t currentRowSum = 0;
  31. for (int x = 0; x < aLeftInflation; x++) {
  32. currentRowSum += aSource[0];
  33. aDest[x] = currentRowSum;
  34. }
  35. for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
  36. currentRowSum += aSource[(x - aLeftInflation)];
  37. aDest[x] = currentRowSum;
  38. }
  39. for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
  40. currentRowSum += aSource[aSourceWidth - 1];
  41. aDest[x] = currentRowSum;
  42. }
  43. }
  44. MOZ_ALWAYS_INLINE void
  45. GenerateIntegralImage_NEON(int32_t aLeftInflation, int32_t aRightInflation,
  46. int32_t aTopInflation, int32_t aBottomInflation,
  47. uint32_t *aIntegralImage, size_t aIntegralImageStride,
  48. uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
  49. {
  50. MOZ_ASSERT(!(aLeftInflation & 3));
  51. uint32_t stride32bit = aIntegralImageStride / 4;
  52. IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
  53. aSize.height + aTopInflation + aBottomInflation);
  54. LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);
  55. for (int y = 1; y < aTopInflation + 1; y++) {
  56. uint32_t *intRow = aIntegralImage + (y * stride32bit);
  57. uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
  58. uint32_t *intFirstRow = aIntegralImage;
  59. for (int x = 0; x < integralImageSize.width; x += 4) {
  60. uint32x4_t firstRow = vld1q_u32(intFirstRow + x);
  61. uint32x4_t previousRow = vld1q_u32(intPrevRow + x);
  62. vst1q_u32(intRow + x, vaddq_u32(firstRow, previousRow));
  63. }
  64. }
  65. for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
  66. uint32x4_t currentRowSum = vdupq_n_u32(0);
  67. uint32_t *intRow = aIntegralImage + (y * stride32bit);
  68. uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
  69. uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);
  70. uint32_t pixel = sourceRow[0];
  71. for (int x = 0; x < aLeftInflation; x += 4) {
  72. uint32_t temp[4];
  73. temp[0] = pixel;
  74. temp[1] = temp[0] + pixel;
  75. temp[2] = temp[1] + pixel;
  76. temp[3] = temp[2] + pixel;
  77. uint32x4_t sumPixels = vld1q_u32(temp);
  78. sumPixels = vaddq_u32(sumPixels, currentRowSum);
  79. currentRowSum = vdupq_n_u32(vgetq_lane_u32(sumPixels, 3));
  80. vst1q_u32(intRow + x, vaddq_u32(sumPixels, vld1q_u32(intPrevRow + x)));
  81. }
  82. for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
  83. // It's important to shuffle here. When we exit this loop currentRowSum
  84. // has to be set to sumPixels, so that the following loop can get the
  85. // correct pixel for the currentRowSum. The highest order pixel in
  86. // currentRowSum could've originated from accumulation in the stride.
  87. currentRowSum = vdupq_n_u32(vgetq_lane_u32(currentRowSum, 3));
  88. uint32_t temp[4];
  89. temp[0] = *(sourceRow + (x - aLeftInflation));
  90. temp[1] = temp[0] + *(sourceRow + (x - aLeftInflation) + 1);
  91. temp[2] = temp[1] + *(sourceRow + (x - aLeftInflation) + 2);
  92. temp[3] = temp[2] + *(sourceRow + (x - aLeftInflation) + 3);
  93. uint32x4_t sumPixels = vld1q_u32(temp);
  94. sumPixels = vaddq_u32(sumPixels, currentRowSum);
  95. currentRowSum = sumPixels;
  96. vst1q_u32(intRow + x, vaddq_u32(sumPixels, vld1q_u32(intPrevRow + x)));
  97. }
  98. pixel = sourceRow[aSize.width - 1];
  99. int x = (aSize.width + aLeftInflation);
  100. if ((aSize.width & 3)) {
  101. // Deal with unaligned portion. Get the correct pixel from currentRowSum,
  102. // see explanation above.
  103. uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
  104. for (; x < integralImageSize.width; x++) {
  105. // We could be unaligned here!
  106. if (!(x & 3)) {
  107. // aligned!
  108. currentRowSum = vdupq_n_u32(intCurrentRowSum);
  109. break;
  110. }
  111. intCurrentRowSum += pixel;
  112. intRow[x] = intPrevRow[x] + intCurrentRowSum;
  113. }
  114. } else {
  115. currentRowSum = vdupq_n_u32(vgetq_lane_u32(currentRowSum, 3));
  116. }
  117. for (; x < integralImageSize.width; x += 4) {
  118. uint32_t temp[4];
  119. temp[0] = pixel;
  120. temp[1] = temp[0] + pixel;
  121. temp[2] = temp[1] + pixel;
  122. temp[3] = temp[2] + pixel;
  123. uint32x4_t sumPixels = vld1q_u32(temp);
  124. sumPixels = vaddq_u32(sumPixels, currentRowSum);
  125. currentRowSum = vdupq_n_u32(vgetq_lane_u32(sumPixels, 3));
  126. vst1q_u32(intRow + x, vaddq_u32(sumPixels, vld1q_u32(intPrevRow + x)));
  127. }
  128. }
  129. if (aBottomInflation) {
  130. // Store the last valid row of our source image in the last row of
  131. // our integral image. This will be overwritten with the correct values
  132. // in the upcoming loop.
  133. LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
  134. aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);
  135. for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
  136. uint32_t *intRow = aIntegralImage + (y * stride32bit);
  137. uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
  138. uint32_t *intLastRow = aIntegralImage + (integralImageSize.height - 1) * stride32bit;
  139. for (int x = 0; x < integralImageSize.width; x += 4) {
  140. vst1q_u32(intRow + x,
  141. vaddq_u32(vld1q_u32(intLastRow + x),
  142. vld1q_u32(intPrevRow + x)));
  143. }
  144. }
  145. }
  146. }
  147. /**
  148. * Attempt to do an in-place box blur using an integral image.
  149. */
  150. void
  151. AlphaBoxBlur::BoxBlur_NEON(uint8_t* aData,
  152. int32_t aLeftLobe,
  153. int32_t aRightLobe,
  154. int32_t aTopLobe,
  155. int32_t aBottomLobe,
  156. uint32_t *aIntegralImage,
  157. size_t aIntegralImageStride)
  158. {
  159. IntSize size = GetSize();
  160. MOZ_ASSERT(size.height > 0);
  161. // Our 'left' or 'top' lobe will include the current pixel. i.e. when
  162. // looking at an integral image the value of a pixel at 'x,y' is calculated
  163. // using the value of the integral image values above/below that.
  164. aLeftLobe++;
  165. aTopLobe++;
  166. int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
  167. MOZ_ASSERT(boxSize > 0);
  168. if (boxSize == 1) {
  169. return;
  170. }
  171. uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
  172. uint32_t stride32bit = aIntegralImageStride / 4;
  173. int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
  174. GenerateIntegralImage_NEON(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
  175. aIntegralImage, aIntegralImageStride, aData,
  176. mStride, size);
  177. uint32x2_t divisor = vdup_n_u32(reciprocal);
  178. // This points to the start of the rectangle within the IntegralImage that overlaps
  179. // the surface being blurred.
  180. uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
  181. IntRect skipRect = mSkipRect;
  182. int32_t stride = mStride;
  183. uint8_t *data = aData;
  184. for (int32_t y = 0; y < size.height; y++) {
  185. bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
  186. uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
  187. uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
  188. uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
  189. uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
  190. int32_t x = 0;
  191. // Process 16 pixels at a time for as long as possible.
  192. for (; x <= size.width - 16; x += 16) {
  193. if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
  194. x = skipRect.XMost() - 16;
  195. // Trigger early jump on coming loop iterations, this will be reset
  196. // next line anyway.
  197. inSkipRectY = false;
  198. continue;
  199. }
  200. uint32x4_t topLeft;
  201. uint32x4_t topRight;
  202. uint32x4_t bottomRight;
  203. uint32x4_t bottomLeft;
  204. topLeft = vld1q_u32(topLeftBase + x);
  205. topRight = vld1q_u32(topRightBase + x);
  206. bottomRight = vld1q_u32(bottomRightBase + x);
  207. bottomLeft = vld1q_u32(bottomLeftBase + x);
  208. uint16x4_t result1 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
  209. topLeft = vld1q_u32(topLeftBase + x + 4);
  210. topRight = vld1q_u32(topRightBase + x + 4);
  211. bottomRight = vld1q_u32(bottomRightBase + x + 4);
  212. bottomLeft = vld1q_u32(bottomLeftBase + x + 4);
  213. uint16x4_t result2 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
  214. topLeft = vld1q_u32(topLeftBase + x + 8);
  215. topRight = vld1q_u32(topRightBase + x + 8);
  216. bottomRight = vld1q_u32(bottomRightBase + x + 8);
  217. bottomLeft = vld1q_u32(bottomLeftBase + x + 8);
  218. uint16x4_t result3 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
  219. topLeft = vld1q_u32(topLeftBase + x + 12);
  220. topRight = vld1q_u32(topRightBase + x + 12);
  221. bottomRight = vld1q_u32(bottomRightBase + x + 12);
  222. bottomLeft = vld1q_u32(bottomLeftBase + x + 12);
  223. uint16x4_t result4 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
  224. uint8x8_t combine1 = vqmovn_u16(vcombine_u16(result1, result2));
  225. uint8x8_t combine2 = vqmovn_u16(vcombine_u16(result3, result4));
  226. uint8x16_t final = vcombine_u8(combine1, combine2);
  227. vst1q_u8(data + stride * y + x, final);
  228. }
  229. // Process the remaining pixels 4 bytes at a time.
  230. for (; x < size.width; x += 4) {
  231. if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
  232. x = skipRect.XMost() - 4;
  233. // Trigger early jump on coming loop iterations, this will be reset
  234. // next line anyway.
  235. inSkipRectY = false;
  236. continue;
  237. }
  238. uint32x4_t topLeft = vld1q_u32(topLeftBase + x);
  239. uint32x4_t topRight = vld1q_u32(topRightBase + x);
  240. uint32x4_t bottomRight = vld1q_u32(bottomRightBase + x);
  241. uint32x4_t bottomLeft = vld1q_u32(bottomLeftBase + x);
  242. uint16x4_t result = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
  243. uint32x2_t final = vreinterpret_u32_u8(vmovn_u16(vcombine_u16(result, vdup_n_u16(0))));
  244. *(uint32_t*)(data + stride * y + x) = vget_lane_u32(final, 0);
  245. }
  246. }
  247. }
  248. }
  249. }