sad4d_neon.c

/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
// and vec_sum_ref_hi.
static void sad_neon_64(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32,
                        const uint8x16_t vec_src_48,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
                             vget_low_u8(vec_ref_32));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
                             vget_high_u8(vec_ref_32));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
                             vget_low_u8(vec_ref_48));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
                             vget_high_u8(vec_ref_48));
}

// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
}
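
// Compute the SAD of one 64x64 source block against each of the four
// reference blocks ref[0..3], writing the four sums to res[0..3]. The uint16
// accumulators cannot overflow: each lane receives at most
// 64 rows * 4 absolute differences * 255 = 65280.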
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);

    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
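
// 32x32 variant: two 16-byte loads per row, accumulated via sad_neon_32
// (worst case per uint16 lane: 32 rows * 2 * 255 = 16320).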
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);

    sad_neon_32(vec_src_00, vec_src_16, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
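
// 16x16 variant: each row is a single 16-byte load, so the absolute
// differences are accumulated inline instead of through a helper.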
void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
    const uint8x16_t vec_ref3 = vld1q_u8(ref3);

    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref0));
    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref0));
    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref1));
    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref1));
    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref2));
    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref2));
    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref3));
    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref3));

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}