// jpgd_idct.h
// Copyright 2009 Intel Corporation
// All Rights Reserved
//
// Permission is granted to use, copy, distribute and prepare derivative works of this
// software for any purpose and without fee, provided, that the above copyright notice
// and this statement appear in all copies. Intel makes no representations about the
// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS."
// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not
// assume any responsibility for any errors which may appear in this software nor any
// responsibility to update it.
//
// From:
// https://software.intel.com/sites/default/files/m/d/4/1/d/8/UsingIntelAVXToImplementIDCT-r1_5.pdf
// https://software.intel.com/file/29048
//
// Requires SSE2 (this file uses 128-bit integer intrinsics)
//
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <immintrin.h>
#include <stdint.h>   // uint8_t
#ifdef _MSC_VER
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#else
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif
#define BITS_INV_ACC 4
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC); // 1 << (SHIFT_INV_ROW-1)
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3); // 1 << (SHIFT_INV_COL-1)
const short IRND_INV_CORR = IRND_INV_COL - 1; // correction -1.0 and round
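// Sanity check of the rounding constants for the current BITS_INV_ACC = 4
// (note added for clarity, not part of the original Intel source):
//   SHIFT_INV_ROW = 12, so 1 << (SHIFT_INV_ROW-1) = 2048 = 1024 * (6 - 4)
//   SHIFT_INV_COL = 5,  so 1 << (SHIFT_INV_COL-1) = 16   = 16 * (4 - 3)
//   IRND_INV_CORR = 16 - 1 = 15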
JPGD_SIMD_ALIGN(short, shortM128_one_corr[8]) = {1, 1, 1, 1, 1, 1, 1, 1};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_row[8]) = {IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_col[8]) = {IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8])= {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tan(1*pi/16) * (1<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tan(2*pi/16) * (1<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tan(3*pi/16) * (1<<16) + 0.5, wrapped to 16 bits
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195}; // cos(4*pi/16) * (1<<16) + 0.5, wrapped to 16 bits
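// How the fixed-point constants above are derived (explanatory note, not part
// of the original Intel source): each is round(f * 65536) stored in a signed
// 16-bit lane, so values of 32768 or more wrap around. For example:
//   tan(1*pi/16) ~ 0.19891  ->  0.19891 * 65536 ~ 13036
//   tan(2*pi/16) ~ 0.41421  ->  0.41421 * 65536 ~ 27146
//   tan(3*pi/16) ~ 0.66818  ->  0.66818 * 65536 ~ 43790  -> 43790 - 65536 = -21746
//   cos(4*pi/16) ~ 0.70711  ->  0.70711 * 65536 ~ 46341  -> 46341 - 65536 = -19195
// _mm_mulhi_epi16 against these keeps the high 16 bits of the product, i.e. an
// approximate multiply by f; the wrapped constants effectively multiply by
// (f - 1), and the column pass adds the original operand back in to compensate.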
//-----------------------------------------------------------------------------
// Table for rows 0,4 - constants are multiplied by cos_4_16
// w15 w14 w11 w10 w07 w06 w03 w02
// w29 w28 w25 w24 w21 w20 w17 w16
// w31 w30 w27 w26 w23 w22 w19 w18
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_04[]) = {
    16384, 21407, 16384, 8867,
    16384, -8867, 16384, -21407, // w13 w12 w09 w08
    16384, 8867, -16384, -21407, // w07 w06 w03 w02
    -16384, 21407, 16384, -8867, // w15 w14 w11 w10
    22725, 19266, 19266, -4520, // w21 w20 w17 w16
    12873, -22725, 4520, -12873, // w29 w28 w25 w24
    12873, 4520, -22725, -12873, // w23 w22 w19 w18
    4520, 19266, 19266, -22725}; // w31 w30 w27 w26
// Table for rows 1,7 - constants are multiplied by cos_1_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_17[]) = {
    22725, 29692, 22725, 12299,
    22725, -12299, 22725, -29692, // w13 w12 w09 w08
    22725, 12299, -22725, -29692, // w07 w06 w03 w02
    -22725, 29692, 22725, -12299, // w15 w14 w11 w10
    31521, 26722, 26722, -6270, // w21 w20 w17 w16
    17855, -31521, 6270, -17855, // w29 w28 w25 w24
    17855, 6270, -31521, -17855, // w23 w22 w19 w18
    6270, 26722, 26722, -31521}; // w31 w30 w27 w26
// Table for rows 2,6 - constants are multiplied by cos_2_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_26[]) = {
    21407, 27969, 21407, 11585,
    21407, -11585, 21407, -27969, // w13 w12 w09 w08
    21407, 11585, -21407, -27969, // w07 w06 w03 w02
    -21407, 27969, 21407, -11585, // w15 w14 w11 w10
    29692, 25172, 25172, -5906, // w21 w20 w17 w16
    16819, -29692, 5906, -16819, // w29 w28 w25 w24
    16819, 5906, -29692, -16819, // w23 w22 w19 w18
    5906, 25172, 25172, -29692}; // w31 w30 w27 w26
// Table for rows 3,5 - constants are multiplied by cos_3_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
    19266, 25172, 19266, 10426,
    19266, -10426, 19266, -25172, // w13 w12 w09 w08
    19266, 10426, -19266, -25172, // w07 w06 w03 w02
    -19266, 25172, 19266, -10426, // w15 w14 w11 w10
    26722, 22654, 22654, -5315, // w21 w20 w17 w16
    15137, -26722, 5315, -15137, // w29 w28 w25 w24
    15137, 5315, -26722, -15137, // w23 w22 w19 w18
    5315, 22654, 22654, -26722}; // w31 w30 w27 w26
JPGD_SIMD_ALIGN(short, shortM128_128[8]) = { 128, 128, 128, 128, 128, 128, 128, 128 }; // +128 level shift applied to every output sample
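// idctSSEShortU8: 2-D 8x8 inverse DCT of one block of 16-bit coefficients,
// producing 64 unsigned 8-bit samples (level-shifted by +128).
// Overview added for readability; the kernel below is the original Intel code:
//   pass 1 - four row passes, each transforming two input rows at a time with
//            _mm_madd_epi16 against the precomputed weight tables, rounding
//            with shortM128_round_inv_row and shifting right by SHIFT_INV_ROW;
//   pass 2 - a column pass over the eight intermediate rows using the tangent/
//            cosine constants with _mm_mulhi_epi16 butterflies, shifting right
//            by SHIFT_INV_COL, adding 128 and packing to unsigned bytes.
// Both pInput and pOutputUB must be 16-byte aligned.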
void idctSSEShortU8(const short *pInput, uint8_t * pOutputUB)
{
    __m128i r_xmm0, r_xmm4;
    __m128i r_xmm1, r_xmm2, r_xmm3, r_xmm5, r_xmm6, r_xmm7;
    __m128i row0, row1, row2, row3, row4, row5, row6, row7;
    //Table pointers for the row passes; switched to the 35/17 tables further down
    short * pTab_i_04 = shortM128_tab_i_04;
    short * pTab_i_26 = shortM128_tab_i_26;
    //Input rows 0 and 2
    r_xmm0 = _mm_load_si128((__m128i *) pInput);
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[2*8]));
    // *** Work on the data in xmm0
    //low shuffle mask = 0xd8 = 11 01 10 00
    //get short 2 and short 0 into the least-significant 32 bits
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    // copy short 2 and short 0 to all locations
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    // multiply-add those copies with the first 8 table constants
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    // shuffle mask = 0x55 = 01 01 01 01
    // copy short 3 and short 1 to all locations
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    // high shuffle mask = 0xd8 = 11 01 10 00
    // get short 6 and short 4 into bit positions 64-95
    // get short 7 and short 5 into bit positions 96-127
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    // multiply-add short 3 and short 1 with table constants 16..23
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    // shuffle mask = 0xaa = 10 10 10 10
    // copy short 6 and short 4 to all locations
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    // shuffle mask = 0xff = 11 11 11 11
    // copy short 7 and short 5 to all locations
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    // multiply-add short 6 and short 4 with table constants 8..15
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    // *** Work on the data in xmm4
    // high shuffle mask = 0xd8 = 11 01 10 00
    // get short 6 and short 4 into bit positions 64-95
    // get short 7 and short 5 into bit positions 96-127
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    // add the row rounding constant, 1 << (SHIFT_INV_ROW-1)
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row0 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row2 = _mm_packs_epi32(r_xmm4, r_xmm6);
    //Input rows 4 and 6
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[4*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[6*8]));
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row4 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row6 = _mm_packs_epi32(r_xmm4, r_xmm6);
    //Input rows 3 and 1 - these use the 35 and 17 tables
    pTab_i_04 = shortM128_tab_i_35;
    pTab_i_26 = shortM128_tab_i_17;
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[3*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[1*8]));
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row3 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row1 = _mm_packs_epi32(r_xmm4, r_xmm6);
    //Input rows 5 and 7
    r_xmm0 = _mm_load_si128((__m128i *) (&pInput[5*8]));
    r_xmm4 = _mm_load_si128((__m128i *) (&pInput[7*8]));
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm2 = r_xmm1;
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm6 = r_xmm5;
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row5 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row7 = _mm_packs_epi32(r_xmm4, r_xmm6);
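    // Column transform (note added for readability): row0..row7 now hold the
    // row-transformed coefficients; the butterflies below combine them with the
    // tangent/cosine constants, shift right by SHIFT_INV_COL, and the results
    // are captured in the locals r0..r7 before the final +128 shift and pack.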
    r_xmm1 = _mm_load_si128((__m128i *) shortM128_tg_3_16);
    r_xmm2 = row5;
    r_xmm3 = row3;
    r_xmm0 = _mm_mulhi_epi16(row5, r_xmm1);
    r_xmm1 = _mm_mulhi_epi16(r_xmm1, r_xmm3);
    r_xmm5 = _mm_load_si128((__m128i *) shortM128_tg_1_16);
    r_xmm6 = row7;
    r_xmm4 = _mm_mulhi_epi16(row7, r_xmm5);
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm2);
    r_xmm5 = _mm_mulhi_epi16(r_xmm5, row1);
    r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm3);
    r_xmm7 = row6;
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm3);
    r_xmm3 = _mm_load_si128((__m128i *) shortM128_tg_2_16);
    r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm1);
    r_xmm7 = _mm_mulhi_epi16(r_xmm7, r_xmm3);
    r_xmm1 = r_xmm0;
    r_xmm3 = _mm_mulhi_epi16(r_xmm3, row2);
    r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm6);
    r_xmm4 = _mm_adds_epi16(r_xmm4, row1);
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm4);
    r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i *) shortM128_one_corr));
    r_xmm4 = _mm_subs_epi16(r_xmm4, r_xmm1);
    r_xmm6 = r_xmm5;
    r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm2);
    r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_one_corr));
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
    //Intermediate results, needed later
    __m128i temp3, temp7;
    temp7 = r_xmm0;
    r_xmm1 = r_xmm4;
    r_xmm0 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
    r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm5);
    r_xmm2 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
    r_xmm2 = _mm_mulhi_epi16(r_xmm2, r_xmm4);
    //Intermediate results, needed later
    temp3 = r_xmm6;
    r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm5);
    r_xmm7 = _mm_adds_epi16(r_xmm7, row2);
    r_xmm3 = _mm_subs_epi16(r_xmm3, row6);
    r_xmm6 = row0;
    r_xmm0 = _mm_mulhi_epi16(r_xmm0, r_xmm1);
    r_xmm5 = row4;
    r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm6);
    r_xmm6 = _mm_subs_epi16(r_xmm6, row4);
    r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm2);
    r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i *) shortM128_one_corr));
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm1);
    r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i *) shortM128_one_corr));
    r_xmm2 = r_xmm5;
    r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm7);
    r_xmm1 = r_xmm6;
    r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_round_inv_col));
    r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm7);
    r_xmm7 = temp7;
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm3);
    r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i *) shortM128_round_inv_col));
    r_xmm7 = _mm_adds_epi16(r_xmm7, r_xmm5);
    r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
    r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm3);
    r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i *) shortM128_round_inv_corr));
    r_xmm3 = r_xmm6;
    r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i *) shortM128_round_inv_corr));
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm4);
    //Store results for row 0
    //_mm_store_si128((__m128i *) pOutput, r_xmm7);
    __m128i r0 = r_xmm7;
    r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
    r_xmm7 = r_xmm1;
    r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm0);
    //Store results for row 1
    //_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6);
    __m128i r1 = r_xmm6;
    r_xmm1 = _mm_srai_epi16(r_xmm1, SHIFT_INV_COL);
    r_xmm6 = temp3;
    r_xmm7 = _mm_subs_epi16(r_xmm7, r_xmm0);
    r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
    //Store results for row 2
    //_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1);
    __m128i r2 = r_xmm1;
    r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
    r_xmm5 = _mm_srai_epi16(r_xmm5, SHIFT_INV_COL);
    //Store results for row 7
    //_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5);
    __m128i r7 = r_xmm5;
    r_xmm3 = _mm_subs_epi16(r_xmm3, r_xmm4);
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
    r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
    r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
    r_xmm2 = _mm_srai_epi16(r_xmm2, SHIFT_INV_COL);
    //Store results for row 3
    //_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6);
    __m128i r3 = r_xmm6;
    r_xmm3 = _mm_srai_epi16(r_xmm3, SHIFT_INV_COL);
    //Store results for rows 4, 5, and 6
    //_mm_store_si128((__m128i *) (&pOutput[4*8]), r_xmm2);
    //_mm_store_si128((__m128i *) (&pOutput[5*8]), r_xmm7);
    //_mm_store_si128((__m128i *) (&pOutput[6*8]), r_xmm3);
    __m128i r4 = r_xmm2;
    __m128i r5 = r_xmm7;
    __m128i r6 = r_xmm3;
    // Level shift: add 128 to every 16-bit sample before packing to bytes
    r0 = _mm_add_epi16(*(const __m128i *)shortM128_128, r0);
    r1 = _mm_add_epi16(*(const __m128i *)shortM128_128, r1);
    r2 = _mm_add_epi16(*(const __m128i *)shortM128_128, r2);
    r3 = _mm_add_epi16(*(const __m128i *)shortM128_128, r3);
    r4 = _mm_add_epi16(*(const __m128i *)shortM128_128, r4);
    r5 = _mm_add_epi16(*(const __m128i *)shortM128_128, r5);
    r6 = _mm_add_epi16(*(const __m128i *)shortM128_128, r6);
    r7 = _mm_add_epi16(*(const __m128i *)shortM128_128, r7);
    // Saturate to 0..255 and store the 8x8 result as four 16-byte (two-row) writes
    ((__m128i *)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
    ((__m128i *)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
    ((__m128i *)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
    ((__m128i *)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
}
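
//-----------------------------------------------------------------------------
// Example usage (illustrative sketch only, not part of the original header).
// It assumes the caller has already dequantized and de-zigzagged one 8x8 block
// into row-major order; `coeffs` and `pixels` are hypothetical names.
//
//   JPGD_SIMD_ALIGN(short, coeffs[64]);    // 64 dequantized DCT coefficients
//   JPGD_SIMD_ALIGN(uint8_t, pixels[64]);  // receives samples in 0..255
//   // ... fill coeffs ...
//   idctSSEShortU8(coeffs, pixels);        // pixels[y * 8 + x] = IDCT result + 128
//-----------------------------------------------------------------------------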