// Tables.cpp
  #include "Tables.hpp"
  // 8 rows x 4 columns of signed offsets. In every row, columns 2 and 3 are
  // the negations of columns 0 and 1.
  // NOTE(review): the magnitudes match the ETC1 intensity modifier table --
  // confirm against the codec specification before relying on that.
  const int32_t g_table[8][4] = {
      {  2,   8,  -2,   -8 },
      {  5,  17,  -5,  -17 },
      {  9,  29,  -9,  -29 },
      { 13,  42, -13,  -42 },
      { 18,  60, -18,  -60 },
      { 24,  80, -24,  -80 },
      { 33, 106, -33, -106 },
      { 47, 183, -47, -183 }
  };
  // Same layout as g_table (8 rows x 4 columns), with every entry multiplied
  // by 256 and stored as a 64-bit integer.
  const int64_t g_table256[8][4] = {
      {  2*256,   8*256,  -2*256,   -8*256 },
      {  5*256,  17*256,  -5*256,  -17*256 },
      {  9*256,  29*256,  -9*256,  -29*256 },
      { 13*256,  42*256, -13*256,  -42*256 },
      { 18*256,  60*256, -18*256,  -60*256 },
      { 24*256,  80*256, -24*256,  -80*256 },
      { 33*256, 106*256, -33*256, -106*256 },
      { 47*256, 183*256, -47*256, -183*256 }
  };
  // 4 rows x 16 columns. Row r contains only the two values 2*r and 2*r + 1:
  // rows 0 and 2 hold the odd value in columns 0..7 and the even value in
  // columns 8..15; rows 1 and 3 alternate odd/even in pairs of columns.
  const uint32_t g_id[4][16] = {
      { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 },
      { 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 },
      { 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 }
  };
  // 16 entries; entry i equals i * 0x11, i.e. the 4-bit value i repeated in
  // both nibbles of a byte (0x0 -> 0x00, 0x1 -> 0x11, ..., 0xF -> 0xFF).
  const uint32_t g_avg2[16] = {
      0x00, 0x11, 0x22, 0x33,
      0x44, 0x55, 0x66, 0x77,
      0x88, 0x99, 0xAA, 0xBB,
      0xCC, 0xDD, 0xEE, 0xFF
  };
  // 64 words arranged as four groups of 16.
  //  - The upper 16 bits are constant within a group: 0x8080, 0x8000, 0x0080
  //    and 0x0000 for groups 0..3.
  //  - The lower 16 bits are either 0x0402 or 0xE002 and follow the same
  //    16-entry pattern in every group (0xE002 at positions 7, 10, 11, 13,
  //    14 and 15; 0x0402 elsewhere).
  const uint32_t g_flags[64] = {
      // group 0: upper half 0x8080
      0x80800402, 0x80800402, 0x80800402, 0x80800402,
      0x80800402, 0x80800402, 0x80800402, 0x8080E002,
      0x80800402, 0x80800402, 0x8080E002, 0x8080E002,
      0x80800402, 0x8080E002, 0x8080E002, 0x8080E002,
      // group 1: upper half 0x8000
      0x80000402, 0x80000402, 0x80000402, 0x80000402,
      0x80000402, 0x80000402, 0x80000402, 0x8000E002,
      0x80000402, 0x80000402, 0x8000E002, 0x8000E002,
      0x80000402, 0x8000E002, 0x8000E002, 0x8000E002,
      // group 2: upper half 0x0080
      0x00800402, 0x00800402, 0x00800402, 0x00800402,
      0x00800402, 0x00800402, 0x00800402, 0x0080E002,
      0x00800402, 0x00800402, 0x0080E002, 0x0080E002,
      0x00800402, 0x0080E002, 0x0080E002, 0x0080E002,
      // group 3: upper half 0x0000
      0x00000402, 0x00000402, 0x00000402, 0x00000402,
      0x00000402, 0x00000402, 0x00000402, 0x0000E002,
      0x00000402, 0x00000402, 0x0000E002, 0x0000E002,
      0x00000402, 0x0000E002, 0x0000E002, 0x0000E002
  };
  // 16 rows x 8 columns of signed offsets. In every row columns 0..3 are
  // negative and decreasing, and column c + 4 equals -(column c) - 1, so
  // column 3 is the row minimum and column 7 the row maximum.
  // NOTE(review): the values match the EAC alpha modifier table used by
  // ETC2 -- confirm against the codec specification.
  const int32_t g_alpha[16][8] = {
      { -3, -6,  -9, -15, 2, 5, 8, 14 },
      { -3, -7, -10, -13, 2, 6, 9, 12 },
      { -2, -5,  -8, -13, 1, 4, 7, 12 },
      { -2, -4,  -6, -13, 1, 3, 5, 12 },
      { -3, -6,  -8, -12, 2, 5, 7, 11 },
      { -3, -7,  -9, -11, 2, 6, 8, 10 },
      { -4, -7,  -8, -11, 3, 6, 7, 10 },
      { -3, -5,  -8, -11, 2, 4, 7, 10 },
      { -2, -6,  -8, -10, 1, 5, 7,  9 },
      { -2, -5,  -8, -10, 1, 4, 7,  9 },
      { -2, -4,  -8, -10, 1, 3, 7,  9 },
      { -2, -5,  -7, -10, 1, 4, 6,  9 },
      { -3, -4,  -7, -10, 2, 3, 6,  9 },
      { -1, -2,  -3, -10, 0, 1, 2,  9 },
      { -4, -6,  -8,  -9, 3, 5, 7,  8 },
      { -3, -5,  -7,  -9, 2, 4, 6,  8 }
  };
  82. const int32_t g_alphaRange[16] = {
  83. 0x100FF / ( 1 + g_alpha[0][7] - g_alpha[0][3] ),
  84. 0x100FF / ( 1 + g_alpha[1][7] - g_alpha[1][3] ),
  85. 0x100FF / ( 1 + g_alpha[2][7] - g_alpha[2][3] ),
  86. 0x100FF / ( 1 + g_alpha[3][7] - g_alpha[3][3] ),
  87. 0x100FF / ( 1 + g_alpha[4][7] - g_alpha[4][3] ),
  88. 0x100FF / ( 1 + g_alpha[5][7] - g_alpha[5][3] ),
  89. 0x100FF / ( 1 + g_alpha[6][7] - g_alpha[6][3] ),
  90. 0x100FF / ( 1 + g_alpha[7][7] - g_alpha[7][3] ),
  91. 0x100FF / ( 1 + g_alpha[8][7] - g_alpha[8][3] ),
  92. 0x100FF / ( 1 + g_alpha[9][7] - g_alpha[9][3] ),
  93. 0x100FF / ( 1 + g_alpha[10][7] - g_alpha[10][3] ),
  94. 0x100FF / ( 1 + g_alpha[11][7] - g_alpha[11][3] ),
  95. 0x100FF / ( 1 + g_alpha[12][7] - g_alpha[12][3] ),
  96. 0x100FF / ( 1 + g_alpha[13][7] - g_alpha[13][3] ),
  97. 0x100FF / ( 1 + g_alpha[14][7] - g_alpha[14][3] ),
  98. 0x100FF / ( 1 + g_alpha[15][7] - g_alpha[15][3] ),
  99. };
  #ifdef __SSE4_1__
  // SSE copies of the scalar tables, compiled only when __SSE4_1__ is defined.
  // The _mm_setr_* intrinsics take their arguments lowest lane first, so the
  // lane order of every vector below equals the argument order.

  // Element c (c = 0, 1) holds column c of g_table for rows 0..7, one row per
  // 16-bit lane.
  const __m128i g_table_SIMD[2] =
  {
      _mm_setr_epi16( 2,  5,  9, 13, 18, 24,  33,  47 ),
      _mm_setr_epi16( 8, 17, 29, 42, 60, 80, 106, 183 )
  };

  // Same layout as g_table_SIMD with every value multiplied by 128.
  const __m128i g_table128_SIMD[2] =
  {
      _mm_setr_epi16( 2*128,  5*128,  9*128, 13*128, 18*128, 24*128,  33*128,  47*128 ),
      _mm_setr_epi16( 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128 )
  };

  // g_table columns 0 and 1 multiplied by 256, in 32-bit lanes. Elements 0 and
  // 1 cover rows 0..3 (column 0, then column 1); elements 2 and 3 cover rows
  // 4..7 in the same column order.
  const __m128i g_table256_SIMD[4] =
  {
      _mm_setr_epi32(  2*256,  5*256,   9*256,  13*256 ),
      _mm_setr_epi32(  8*256, 17*256,  29*256,  42*256 ),
      _mm_setr_epi32( 18*256, 24*256,  33*256,  47*256 ),
      _mm_setr_epi32( 60*256, 80*256, 106*256, 183*256 )
  };

  // Element r holds row r of g_alpha, one column per 16-bit lane. The helper
  // macro packs one row and is undefined again right after the table.
  #define G_ALPHA_ROW_EPI16( r ) _mm_setr_epi16( \
      g_alpha[r][0], g_alpha[r][1], g_alpha[r][2], g_alpha[r][3], \
      g_alpha[r][4], g_alpha[r][5], g_alpha[r][6], g_alpha[r][7] )
  const __m128i g_alpha_SIMD[16] = {
      G_ALPHA_ROW_EPI16(  0 ), G_ALPHA_ROW_EPI16(  1 ), G_ALPHA_ROW_EPI16(  2 ), G_ALPHA_ROW_EPI16(  3 ),
      G_ALPHA_ROW_EPI16(  4 ), G_ALPHA_ROW_EPI16(  5 ), G_ALPHA_ROW_EPI16(  6 ), G_ALPHA_ROW_EPI16(  7 ),
      G_ALPHA_ROW_EPI16(  8 ), G_ALPHA_ROW_EPI16(  9 ), G_ALPHA_ROW_EPI16( 10 ), G_ALPHA_ROW_EPI16( 11 ),
      G_ALPHA_ROW_EPI16( 12 ), G_ALPHA_ROW_EPI16( 13 ), G_ALPHA_ROW_EPI16( 14 ), G_ALPHA_ROW_EPI16( 15 )
  };
  #undef G_ALPHA_ROW_EPI16

  // Six entries of g_alphaRange followed by two zero lanes. Indices 0, 1, 4,
  // 5, 8 and 14 are the first occurrences of the six distinct values that
  // g_alphaRange contains (row spans 30, 26, 24, 22, 20 and 18 in g_alpha),
  // so this vector is deliberately not the first eight entries.
  const __m128i g_alphaRange_SIMD = _mm_setr_epi16(
      g_alphaRange[0],
      g_alphaRange[1],
      g_alphaRange[4],
      g_alphaRange[5],
      g_alphaRange[8],
      g_alphaRange[14],
      0,
      0 );
  #endif
  #ifdef __AVX2__
  // AVX2 copies of the alpha tables, compiled only when __AVX2__ is defined.
  // _mm256_setr_epi16 takes its arguments lowest lane first.

  // Transposed g_alpha: element c holds column c of g_alpha for rows 0..15,
  // one row per 16-bit lane. The helper macro packs one column and is
  // undefined again right after the table.
  #define G_ALPHA_COLUMN_EPI16( c ) _mm256_setr_epi16( \
      g_alpha[ 0][c], g_alpha[ 1][c], g_alpha[ 2][c], g_alpha[ 3][c], \
      g_alpha[ 4][c], g_alpha[ 5][c], g_alpha[ 6][c], g_alpha[ 7][c], \
      g_alpha[ 8][c], g_alpha[ 9][c], g_alpha[10][c], g_alpha[11][c], \
      g_alpha[12][c], g_alpha[13][c], g_alpha[14][c], g_alpha[15][c] )
  const __m256i g_alpha_AVX[8] = {
      G_ALPHA_COLUMN_EPI16( 0 ), G_ALPHA_COLUMN_EPI16( 1 ),
      G_ALPHA_COLUMN_EPI16( 2 ), G_ALPHA_COLUMN_EPI16( 3 ),
      G_ALPHA_COLUMN_EPI16( 4 ), G_ALPHA_COLUMN_EPI16( 5 ),
      G_ALPHA_COLUMN_EPI16( 6 ), G_ALPHA_COLUMN_EPI16( 7 )
  };
  #undef G_ALPHA_COLUMN_EPI16

  // All sixteen entries of g_alphaRange in index order, one per 16-bit lane.
  const __m256i g_alphaRange_AVX = _mm256_setr_epi16(
      g_alphaRange[ 0], g_alphaRange[ 1], g_alphaRange[ 2], g_alphaRange[ 3],
      g_alphaRange[ 4], g_alphaRange[ 5], g_alphaRange[ 6], g_alphaRange[ 7],
      g_alphaRange[ 8], g_alphaRange[ 9], g_alphaRange[10], g_alphaRange[11],
      g_alphaRange[12], g_alphaRange[13], g_alphaRange[14], g_alphaRange[15]
  );
  #endif
  #ifdef __ARM_NEON
  // NEON copies of the scalar tables, compiled only when __ARM_NEON is
  // defined. The vectors are brace-initialised, one value per lane.

  // Element c (c = 0, 1) holds column c of g_table for rows 0..7, multiplied
  // by 128, one row per 16-bit lane.
  const int16x8_t g_table128_NEON[2] =
  {
      { 2*128,  5*128,  9*128, 13*128, 18*128, 24*128,  33*128,  47*128 },
      { 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128 }
  };

  // g_table columns 0 and 1 multiplied by 256, in 32-bit lanes. Elements 0 and
  // 1 cover rows 0..3 (column 0, then column 1); elements 2 and 3 cover rows
  // 4..7 in the same column order.
  const int32x4_t g_table256_NEON[4] =
  {
      {  2*256,  5*256,   9*256,  13*256 },
      {  8*256, 17*256,  29*256,  42*256 },
      { 18*256, 24*256,  33*256,  47*256 },
      { 60*256, 80*256, 106*256, 183*256 }
  };

  // Row-for-row literal copy of g_alpha as 16-bit lanes; the two tables must
  // be kept in sync by hand.
  const int16x8_t g_alpha_NEON[16] =
  {
      { -3, -6,  -9, -15, 2, 5, 8, 14 },
      { -3, -7, -10, -13, 2, 6, 9, 12 },
      { -2, -5,  -8, -13, 1, 4, 7, 12 },
      { -2, -4,  -6, -13, 1, 3, 5, 12 },
      { -3, -6,  -8, -12, 2, 5, 7, 11 },
      { -3, -7,  -9, -11, 2, 6, 8, 10 },
      { -4, -7,  -8, -11, 3, 6, 7, 10 },
      { -3, -5,  -8, -11, 2, 4, 7, 10 },
      { -2, -6,  -8, -10, 1, 5, 7,  9 },
      { -2, -5,  -8, -10, 1, 4, 7,  9 },
      { -2, -4,  -8, -10, 1, 3, 7,  9 },
      { -2, -5,  -7, -10, 1, 4, 6,  9 },
      { -3, -4,  -7, -10, 2, 3, 6,  9 },
      { -1, -2,  -3, -10, 0, 1, 2,  9 },
      { -4, -6,  -8,  -9, 3, 5, 7,  8 },
      { -3, -5,  -7,  -9, 2, 4, 6,  8 }
  };

  // Six entries of g_alphaRange, narrowed to int16_t, followed by two zero
  // lanes. Indices 0, 1, 4, 5, 8 and 14 are the first occurrences of the six
  // distinct values that g_alphaRange contains, so this vector is
  // deliberately not the first eight entries.
  const int16x8_t g_alphaRange_NEON =
  {
      (int16_t)g_alphaRange[0],
      (int16_t)g_alphaRange[1],
      (int16_t)g_alphaRange[4],
      (int16_t)g_alphaRange[5],
      (int16_t)g_alphaRange[8],
      (int16_t)g_alphaRange[14],
      0,
      0
  };
  #endif