/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <math.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"

#include "vpx_ports/mem.h"

#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"

#include "vp9/encoder/vp9_dct.h"
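
// Note: the cospi_*_64 / sinpi_*_9 constants used throughout this file
// (declared in vp9/common/vp9_idct.h) are trigonometric values scaled by
// 2^DCT_CONST_BITS; fdct_round_shift() removes that scaling again, with
// rounding, after each fixed-point butterfly stage.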
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  // TODO(debargha, peter.derivaz): Find new bounds for this assert
  // and make the bounds consts.
  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
  return rv;
}

void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {
  tran_high_t step[4];
  tran_high_t temp1, temp2;
  step[0] = input[0] + input[3];
  step[1] = input[1] + input[2];
  step[2] = input[1] - input[2];
  step[3] = input[0] - input[3];
  temp1 = (step[0] + step[1]) * cospi_16_64;
  temp2 = (step[0] - step[1]) * cospi_16_64;
  output[0] = (tran_low_t)fdct_round_shift(temp1);
  output[2] = (tran_low_t)fdct_round_shift(temp2);
  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
  output[1] = (tran_low_t)fdct_round_shift(temp1);
  output[3] = (tran_low_t)fdct_round_shift(temp2);
}
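
// DC-only 4x4 forward transform: only output[0] is computed. The block sum is
// doubled so the DC term matches the gain of the full vp9_fdct4x4_c path.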
void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  tran_low_t sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += input[r * stride + c];
  output[0] = sum << 1;
  output[1] = 0;
}

void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  tran_low_t intermediate[4 * 4];
  const int16_t *in_pass0 = input;
  const tran_low_t *in = NULL;
  tran_low_t *out = intermediate;
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    tran_high_t input[4];      // canbe16
    tran_high_t step[4];       // canbe16
    tran_high_t temp1, temp2;  // needs32
    int i;
    for (i = 0; i < 4; ++i) {
      // Load inputs.
      if (0 == pass) {
        input[0] = in_pass0[0 * stride] * 16;
        input[1] = in_pass0[1 * stride] * 16;
        input[2] = in_pass0[2 * stride] * 16;
        input[3] = in_pass0[3 * stride] * 16;
        if (i == 0 && input[0]) {
          input[0] += 1;
        }
      } else {
        input[0] = in[0 * 4];
        input[1] = in[1 * 4];
        input[2] = in[2 * 4];
        input[3] = in[3 * 4];
      }
      // Transform.
      step[0] = input[0] + input[3];
      step[1] = input[1] + input[2];
      step[2] = input[1] - input[2];
      step[3] = input[0] - input[3];
      temp1 = (step[0] + step[1]) * cospi_16_64;
      temp2 = (step[0] - step[1]) * cospi_16_64;
      out[0] = (tran_low_t)fdct_round_shift(temp1);
      out[2] = (tran_low_t)fdct_round_shift(temp2);
      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
      out[1] = (tran_low_t)fdct_round_shift(temp1);
      out[3] = (tran_low_t)fdct_round_shift(temp2);
      // Do next column (which is a transposed row in second/horizontal pass)
      in_pass0++;
      in++;
      out += 4;
    }
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
  }
  {
    int i, j;
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
    }
  }
}
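
// 4-point ADST (asymmetric discrete sine transform), used as the column/row
// 1-D transform when tx_type selects ADST for that direction.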
void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {
  tran_high_t x0, x1, x2, x3;
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  x0 = input[0];
  x1 = input[1];
  x2 = input[2];
  x3 = input[3];
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_4_9 * x0;
  s2 = sinpi_2_9 * x1;
  s3 = sinpi_1_9 * x1;
  s4 = sinpi_3_9 * x2;
  s5 = sinpi_4_9 * x3;
  s6 = sinpi_2_9 * x3;
  s7 = x0 + x1 - x3;
  x0 = s0 + s2 + s5;
  x1 = sinpi_3_9 * s7;
  x2 = s1 - s3 + s6;
  x3 = s4;
  s0 = x0 + x3;
  s1 = x1;
  s2 = x2 - x3;
  s3 = x2 - x0 + x3;
  // 1-D transform scaling factor is sqrt(2).
  output[0] = (tran_low_t)fdct_round_shift(s0);
  output[1] = (tran_low_t)fdct_round_shift(s1);
  output[2] = (tran_low_t)fdct_round_shift(s2);
  output[3] = (tran_low_t)fdct_round_shift(s3);
}
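
// 4x4 forward hybrid transform: DCT_DCT takes the dedicated path above, other
// tx_type values pick the column/row 1-D transforms from the FHT_4 table.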
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
                  int stride, int tx_type) {
  if (tx_type == DCT_DCT) {
    vp9_fdct4x4_c(input, output, stride);
  } else {
    tran_low_t out[4 * 4];
    int i, j;
    tran_low_t temp_in[4], temp_out[4];
    const transform_2d ht = FHT_4[tx_type];
    // Columns
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        temp_in[j] = input[j * stride + i] * 16;
      if (i == 0 && temp_in[0])
        temp_in[0] += 1;
      ht.cols(temp_in, temp_out);
      for (j = 0; j < 4; ++j)
        out[j * 4 + i] = temp_out[j];
    }
    // Rows
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        temp_in[j] = out[j + i * 4];
      ht.rows(temp_in, temp_out);
      for (j = 0; j < 4; ++j)
        output[j + i * 4] = (temp_out[j] + 1) >> 2;
    }
  }
}

void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
  tran_high_t t0, t1, t2, t3;                  // needs32
  tran_high_t x0, x1, x2, x3;                  // canbe16
  // stage 1
  s0 = input[0] + input[7];
  s1 = input[1] + input[6];
  s2 = input[2] + input[5];
  s3 = input[3] + input[4];
  s4 = input[3] - input[4];
  s5 = input[2] - input[5];
  s6 = input[1] - input[6];
  s7 = input[0] - input[7];
  // fdct4(step, step);
  x0 = s0 + s3;
  x1 = s1 + s2;
  x2 = s1 - s2;
  x3 = s0 - s3;
  t0 = (x0 + x1) * cospi_16_64;
  t1 = (x0 - x1) * cospi_16_64;
  t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
  t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
  output[0] = (tran_low_t)fdct_round_shift(t0);
  output[2] = (tran_low_t)fdct_round_shift(t2);
  output[4] = (tran_low_t)fdct_round_shift(t1);
  output[6] = (tran_low_t)fdct_round_shift(t3);
  // Stage 2
  t0 = (s6 - s5) * cospi_16_64;
  t1 = (s6 + s5) * cospi_16_64;
  t2 = (tran_low_t)fdct_round_shift(t0);
  t3 = (tran_low_t)fdct_round_shift(t1);
  // Stage 3
  x0 = s4 + t2;
  x1 = s4 - t2;
  x2 = s7 - t3;
  x3 = s7 + t3;
  // Stage 4
  t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
  t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
  t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
  output[1] = (tran_low_t)fdct_round_shift(t0);
  output[3] = (tran_low_t)fdct_round_shift(t2);
  output[5] = (tran_low_t)fdct_round_shift(t1);
  output[7] = (tran_low_t)fdct_round_shift(t3);
}
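
// DC-only 8x8 forward transform: the plain block sum already matches the DC
// gain of the full vp9_fdct8x8_c, so no extra scaling is applied.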
void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  tran_low_t sum = 0;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      sum += input[r * stride + c];
  output[0] = sum;
  output[1] = 0;
}

void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
  int i, j;
  tran_low_t intermediate[64];
  // Transform columns
  {
    tran_low_t *output = intermediate;
    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    tran_high_t t0, t1, t2, t3;                  // needs32
    tran_high_t x0, x1, x2, x3;                  // canbe16
    int i;
    for (i = 0; i < 8; i++) {
      // stage 1
      s0 = (input[0 * stride] + input[7 * stride]) * 4;
      s1 = (input[1 * stride] + input[6 * stride]) * 4;
      s2 = (input[2 * stride] + input[5 * stride]) * 4;
      s3 = (input[3 * stride] + input[4 * stride]) * 4;
      s4 = (input[3 * stride] - input[4 * stride]) * 4;
      s5 = (input[2 * stride] - input[5 * stride]) * 4;
      s6 = (input[1 * stride] - input[6 * stride]) * 4;
      s7 = (input[0 * stride] - input[7 * stride]) * 4;
      // fdct4(step, step);
      x0 = s0 + s3;
      x1 = s1 + s2;
      x2 = s1 - s2;
      x3 = s0 - s3;
      t0 = (x0 + x1) * cospi_16_64;
      t1 = (x0 - x1) * cospi_16_64;
      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
      // Stage 2
      t0 = (s6 - s5) * cospi_16_64;
      t1 = (s6 + s5) * cospi_16_64;
      t2 = fdct_round_shift(t0);
      t3 = fdct_round_shift(t1);
      // Stage 3
      x0 = s4 + t2;
      x1 = s4 - t2;
      x2 = s7 - t3;
      x3 = s7 + t3;
      // Stage 4
      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
      input++;
      output++;
    }
  }
  // Rows
  for (i = 0; i < 8; ++i) {
    vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);
    for (j = 0; j < 8; ++j)
      final_output[j + i * 8] /= 2;
  }
}
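
// Fused 8x8 forward transform + scalar quantization. The transform half
// mirrors vp9_fdct8x8_c above; the quantization half writes qcoeff/dqcoeff in
// scan order and reports the end-of-block position through eob_ptr.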
void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr,
                         uint16_t *eob_ptr,
                         const int16_t *scan, const int16_t *iscan) {
  int eob = -1;
  int i, j;
  tran_low_t intermediate[64];
  // Transform columns
  {
    tran_low_t *output = intermediate;
    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    tran_high_t t0, t1, t2, t3;                  // needs32
    tran_high_t x0, x1, x2, x3;                  // canbe16
    int i;
    for (i = 0; i < 8; i++) {
      // stage 1
      s0 = (input[0 * stride] + input[7 * stride]) * 4;
      s1 = (input[1 * stride] + input[6 * stride]) * 4;
      s2 = (input[2 * stride] + input[5 * stride]) * 4;
      s3 = (input[3 * stride] + input[4 * stride]) * 4;
      s4 = (input[3 * stride] - input[4 * stride]) * 4;
      s5 = (input[2 * stride] - input[5 * stride]) * 4;
      s6 = (input[1 * stride] - input[6 * stride]) * 4;
      s7 = (input[0 * stride] - input[7 * stride]) * 4;
      // fdct4(step, step);
      x0 = s0 + s3;
      x1 = s1 + s2;
      x2 = s1 - s2;
      x3 = s0 - s3;
      t0 = (x0 + x1) * cospi_16_64;
      t1 = (x0 - x1) * cospi_16_64;
      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
      // Stage 2
      t0 = (s6 - s5) * cospi_16_64;
      t1 = (s6 + s5) * cospi_16_64;
      t2 = fdct_round_shift(t0);
      t3 = fdct_round_shift(t1);
      // Stage 3
      x0 = s4 + t2;
      x1 = s4 - t2;
      x2 = s7 - t3;
      x3 = s7 + t3;
      // Stage 4
      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
      input++;
      output++;
    }
  }
  // Rows
  for (i = 0; i < 8; ++i) {
    vp9_fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
    for (j = 0; j < 8; ++j)
      coeff_ptr[j + i * 8] /= 2;
  }
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)iscan;
  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
  if (!skip_block) {
    // Quantization pass: All coefficients with index >= zero_flag are
    // skippable. Note: zero_flag can be zero.
    for (i = 0; i < n_coeffs; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
      if (tmp)
        eob = i;
    }
  }
  *eob_ptr = eob + 1;
}
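
// DC-only 16x16 forward transform: sum >> 1 matches the DC gain of the full
// vp9_fdct16x16_c path.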
void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  tran_low_t sum = 0;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      sum += input[r * stride + c];
  output[0] = sum >> 1;
  output[1] = 0;
}

void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  tran_low_t intermediate[256];
  const int16_t *in_pass0 = input;
  const tran_low_t *in = NULL;
  tran_low_t *out = intermediate;
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    tran_high_t step1[8];      // canbe16
    tran_high_t step2[8];      // canbe16
    tran_high_t step3[8];      // canbe16
    tran_high_t input[8];      // canbe16
    tran_high_t temp1, temp2;  // needs32
    int i;
    for (i = 0; i < 16; i++) {
      if (0 == pass) {
        // Calculate input for the first 8 results.
        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
        // Calculate input for the next 8 results.
        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
      } else {
        // Calculate input for the first 8 results.
        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
        // Calculate input for the next 8 results.
        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
      }
      // Work on the first eight values; fdct8(input, even_results);
      {
        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
        tran_high_t t0, t1, t2, t3;                  // needs32
        tran_high_t x0, x1, x2, x3;                  // canbe16
        // stage 1
        s0 = input[0] + input[7];
        s1 = input[1] + input[6];
        s2 = input[2] + input[5];
        s3 = input[3] + input[4];
        s4 = input[3] - input[4];
        s5 = input[2] - input[5];
        s6 = input[1] - input[6];
        s7 = input[0] - input[7];
        // fdct4(step, step);
        x0 = s0 + s3;
        x1 = s1 + s2;
        x2 = s1 - s2;
        x3 = s0 - s3;
        t0 = (x0 + x1) * cospi_16_64;
        t1 = (x0 - x1) * cospi_16_64;
        t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
        out[0] = (tran_low_t)fdct_round_shift(t0);
        out[4] = (tran_low_t)fdct_round_shift(t2);
        out[8] = (tran_low_t)fdct_round_shift(t1);
        out[12] = (tran_low_t)fdct_round_shift(t3);
        // Stage 2
        t0 = (s6 - s5) * cospi_16_64;
        t1 = (s6 + s5) * cospi_16_64;
        t2 = fdct_round_shift(t0);
        t3 = fdct_round_shift(t1);
        // Stage 3
        x0 = s4 + t2;
        x1 = s4 - t2;
        x2 = s7 - t3;
        x3 = s7 + t3;
        // Stage 4
        t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
        t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
        t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
        out[2] = (tran_low_t)fdct_round_shift(t0);
        out[6] = (tran_low_t)fdct_round_shift(t2);
        out[10] = (tran_low_t)fdct_round_shift(t1);
        out[14] = (tran_low_t)fdct_round_shift(t3);
      }
      // Work on the next eight values; step1 -> odd_results
      {
        // step 2
        temp1 = (step1[5] - step1[2]) * cospi_16_64;
        temp2 = (step1[4] - step1[3]) * cospi_16_64;
        step2[2] = fdct_round_shift(temp1);
        step2[3] = fdct_round_shift(temp2);
        temp1 = (step1[4] + step1[3]) * cospi_16_64;
        temp2 = (step1[5] + step1[2]) * cospi_16_64;
        step2[4] = fdct_round_shift(temp1);
        step2[5] = fdct_round_shift(temp2);
        // step 3
        step3[0] = step1[0] + step2[3];
        step3[1] = step1[1] + step2[2];
        step3[2] = step1[1] - step2[2];
        step3[3] = step1[0] - step2[3];
        step3[4] = step1[7] - step2[4];
        step3[5] = step1[6] - step2[5];
        step3[6] = step1[6] + step2[5];
        step3[7] = step1[7] + step2[4];
        // step 4
        temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
        temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
        step2[1] = fdct_round_shift(temp1);
        step2[2] = fdct_round_shift(temp2);
        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
        temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
        step2[5] = fdct_round_shift(temp1);
        step2[6] = fdct_round_shift(temp2);
        // step 5
        step1[0] = step3[0] + step2[1];
        step1[1] = step3[0] - step2[1];
        step1[2] = step3[3] + step2[2];
        step1[3] = step3[3] - step2[2];
        step1[4] = step3[4] - step2[5];
        step1[5] = step3[4] + step2[5];
        step1[6] = step3[7] - step2[6];
        step1[7] = step3[7] + step2[6];
        // step 6
        temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
        out[1] = (tran_low_t)fdct_round_shift(temp1);
        out[9] = (tran_low_t)fdct_round_shift(temp2);
        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
        temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
        out[5] = (tran_low_t)fdct_round_shift(temp1);
        out[13] = (tran_low_t)fdct_round_shift(temp2);
        temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
        out[3] = (tran_low_t)fdct_round_shift(temp1);
        out[11] = (tran_low_t)fdct_round_shift(temp2);
        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
        temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
        out[7] = (tran_low_t)fdct_round_shift(temp1);
        out[15] = (tran_low_t)fdct_round_shift(temp2);
      }
      // Do next column (which is a transposed row in second/horizontal pass)
      in++;
      in_pass0++;
      out += 16;
    }
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
  }
}
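
// 8-point ADST used by the 8x8 hybrid transform below.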
void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];
  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
  x0 = fdct_round_shift(s0 + s4);
  x1 = fdct_round_shift(s1 + s5);
  x2 = fdct_round_shift(s2 + s6);
  x3 = fdct_round_shift(s3 + s7);
  x4 = fdct_round_shift(s0 - s4);
  x5 = fdct_round_shift(s1 - s5);
  x6 = fdct_round_shift(s2 - s6);
  x7 = fdct_round_shift(s3 - s7);
  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = fdct_round_shift(s4 + s6);
  x5 = fdct_round_shift(s5 + s7);
  x6 = fdct_round_shift(s4 - s6);
  x7 = fdct_round_shift(s5 - s7);
  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);
  x2 = fdct_round_shift(s2);
  x3 = fdct_round_shift(s3);
  x6 = fdct_round_shift(s6);
  x7 = fdct_round_shift(s7);
  output[0] = (tran_low_t)x0;
  output[1] = (tran_low_t)-x4;
  output[2] = (tran_low_t)x6;
  output[3] = (tran_low_t)-x2;
  output[4] = (tran_low_t)x3;
  output[5] = (tran_low_t)-x7;
  output[6] = (tran_low_t)x5;
  output[7] = (tran_low_t)-x1;
}
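
// 8x8 forward hybrid transform: same structure as vp9_fht4x4_c, with the 1-D
// transforms taken from the FHT_8 table and a final right shift by one
// (rounding toward zero for negative values).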
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
                  int stride, int tx_type) {
  if (tx_type == DCT_DCT) {
    vp9_fdct8x8_c(input, output, stride);
  } else {
    tran_low_t out[64];
    int i, j;
    tran_low_t temp_in[8], temp_out[8];
    const transform_2d ht = FHT_8[tx_type];
    // Columns
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j)
        temp_in[j] = input[j * stride + i] * 4;
      ht.cols(temp_in, temp_out);
      for (j = 0; j < 8; ++j)
        out[j * 8 + i] = temp_out[j];
    }
    // Rows
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j)
        temp_in[j] = out[j + i * 8];
      ht.rows(temp_in, temp_out);
      for (j = 0; j < 8; ++j)
        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    }
  }
}

/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
   pixel. */
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
  int i;
  tran_high_t a1, b1, c1, d1, e1;
  const int16_t *ip_pass0 = input;
  const tran_low_t *ip = NULL;
  tran_low_t *op = output;
  for (i = 0; i < 4; i++) {
    a1 = ip_pass0[0 * stride];
    b1 = ip_pass0[1 * stride];
    c1 = ip_pass0[2 * stride];
    d1 = ip_pass0[3 * stride];
    a1 += b1;
    d1 = d1 - c1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= c1;
    d1 += b1;
    op[0] = (tran_low_t)a1;
    op[4] = (tran_low_t)c1;
    op[8] = (tran_low_t)d1;
    op[12] = (tran_low_t)b1;
    ip_pass0++;
    op++;
  }
  ip = output;
  op = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[0];
    b1 = ip[1];
    c1 = ip[2];
    d1 = ip[3];
    a1 += b1;
    d1 -= c1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= c1;
    d1 += b1;
    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
    ip += 4;
    op += 4;
  }
}

// Rewrote to use same algorithm as others.
void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {
  tran_high_t step1[8];      // canbe16
  tran_high_t step2[8];      // canbe16
  tran_high_t step3[8];      // canbe16
  tran_high_t input[8];      // canbe16
  tran_high_t temp1, temp2;  // needs32
  // step 1
  input[0] = in[0] + in[15];
  input[1] = in[1] + in[14];
  input[2] = in[2] + in[13];
  input[3] = in[3] + in[12];
  input[4] = in[4] + in[11];
  input[5] = in[5] + in[10];
  input[6] = in[6] + in[ 9];
  input[7] = in[7] + in[ 8];
  step1[0] = in[7] - in[ 8];
  step1[1] = in[6] - in[ 9];
  step1[2] = in[5] - in[10];
  step1[3] = in[4] - in[11];
  step1[4] = in[3] - in[12];
  step1[5] = in[2] - in[13];
  step1[6] = in[1] - in[14];
  step1[7] = in[0] - in[15];
  // fdct8(step, step);
  {
    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    tran_high_t t0, t1, t2, t3;                  // needs32
    tran_high_t x0, x1, x2, x3;                  // canbe16
    // stage 1
    s0 = input[0] + input[7];
    s1 = input[1] + input[6];
    s2 = input[2] + input[5];
    s3 = input[3] + input[4];
    s4 = input[3] - input[4];
    s5 = input[2] - input[5];
    s6 = input[1] - input[6];
    s7 = input[0] - input[7];
    // fdct4(step, step);
    x0 = s0 + s3;
    x1 = s1 + s2;
    x2 = s1 - s2;
    x3 = s0 - s3;
    t0 = (x0 + x1) * cospi_16_64;
    t1 = (x0 - x1) * cospi_16_64;
    t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    out[0] = (tran_low_t)fdct_round_shift(t0);
    out[4] = (tran_low_t)fdct_round_shift(t2);
    out[8] = (tran_low_t)fdct_round_shift(t1);
    out[12] = (tran_low_t)fdct_round_shift(t3);
    // Stage 2
    t0 = (s6 - s5) * cospi_16_64;
    t1 = (s6 + s5) * cospi_16_64;
    t2 = fdct_round_shift(t0);
    t3 = fdct_round_shift(t1);
    // Stage 3
    x0 = s4 + t2;
    x1 = s4 - t2;
    x2 = s7 - t3;
    x3 = s7 + t3;
    // Stage 4
    t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
    t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
    out[2] = (tran_low_t)fdct_round_shift(t0);
    out[6] = (tran_low_t)fdct_round_shift(t2);
    out[10] = (tran_low_t)fdct_round_shift(t1);
    out[14] = (tran_low_t)fdct_round_shift(t3);
  }
  // step 2
  temp1 = (step1[5] - step1[2]) * cospi_16_64;
  temp2 = (step1[4] - step1[3]) * cospi_16_64;
  step2[2] = fdct_round_shift(temp1);
  step2[3] = fdct_round_shift(temp2);
  temp1 = (step1[4] + step1[3]) * cospi_16_64;
  temp2 = (step1[5] + step1[2]) * cospi_16_64;
  step2[4] = fdct_round_shift(temp1);
  step2[5] = fdct_round_shift(temp2);
  // step 3
  step3[0] = step1[0] + step2[3];
  step3[1] = step1[1] + step2[2];
  step3[2] = step1[1] - step2[2];
  step3[3] = step1[0] - step2[3];
  step3[4] = step1[7] - step2[4];
  step3[5] = step1[6] - step2[5];
  step3[6] = step1[6] + step2[5];
  step3[7] = step1[7] + step2[4];
  // step 4
  temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
  temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
  step2[1] = fdct_round_shift(temp1);
  step2[2] = fdct_round_shift(temp2);
  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
  temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
  step2[5] = fdct_round_shift(temp1);
  step2[6] = fdct_round_shift(temp2);
  // step 5
  step1[0] = step3[0] + step2[1];
  step1[1] = step3[0] - step2[1];
  step1[2] = step3[3] + step2[2];
  step1[3] = step3[3] - step2[2];
  step1[4] = step3[4] - step2[5];
  step1[5] = step3[4] + step2[5];
  step1[6] = step3[7] - step2[6];
  step1[7] = step3[7] + step2[6];
  // step 6
  temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
  out[1] = (tran_low_t)fdct_round_shift(temp1);
  out[9] = (tran_low_t)fdct_round_shift(temp2);
  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
  temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
  out[5] = (tran_low_t)fdct_round_shift(temp1);
  out[13] = (tran_low_t)fdct_round_shift(temp2);
  temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
  out[3] = (tran_low_t)fdct_round_shift(temp1);
  out[11] = (tran_low_t)fdct_round_shift(temp2);
  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
  temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
  out[7] = (tran_low_t)fdct_round_shift(temp1);
  out[15] = (tran_low_t)fdct_round_shift(temp2);
}

void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];
  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
  x0 = fdct_round_shift(s0 + s8);
  x1 = fdct_round_shift(s1 + s9);
  x2 = fdct_round_shift(s2 + s10);
  x3 = fdct_round_shift(s3 + s11);
  x4 = fdct_round_shift(s4 + s12);
  x5 = fdct_round_shift(s5 + s13);
  x6 = fdct_round_shift(s6 + s14);
  x7 = fdct_round_shift(s7 + s15);
  x8 = fdct_round_shift(s0 - s8);
  x9 = fdct_round_shift(s1 - s9);
  x10 = fdct_round_shift(s2 - s10);
  x11 = fdct_round_shift(s3 - s11);
  x12 = fdct_round_shift(s4 - s12);
  x13 = fdct_round_shift(s5 - s13);
  x14 = fdct_round_shift(s6 - s14);
  x15 = fdct_round_shift(s7 - s15);
  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = fdct_round_shift(s8 + s12);
  x9 = fdct_round_shift(s9 + s13);
  x10 = fdct_round_shift(s10 + s14);
  x11 = fdct_round_shift(s11 + s15);
  x12 = fdct_round_shift(s8 - s12);
  x13 = fdct_round_shift(s9 - s13);
  x14 = fdct_round_shift(s10 - s14);
  x15 = fdct_round_shift(s11 - s15);
  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = fdct_round_shift(s4 + s6);
  x5 = fdct_round_shift(s5 + s7);
  x6 = fdct_round_shift(s4 - s6);
  x7 = fdct_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = fdct_round_shift(s12 + s14);
  x13 = fdct_round_shift(s13 + s15);
  x14 = fdct_round_shift(s12 - s14);
  x15 = fdct_round_shift(s13 - s15);
  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);
  x2 = fdct_round_shift(s2);
  x3 = fdct_round_shift(s3);
  x6 = fdct_round_shift(s6);
  x7 = fdct_round_shift(s7);
  x10 = fdct_round_shift(s10);
  x11 = fdct_round_shift(s11);
  x14 = fdct_round_shift(s14);
  x15 = fdct_round_shift(s15);
  output[0] = (tran_low_t)x0;
  output[1] = (tran_low_t)-x8;
  output[2] = (tran_low_t)x12;
  output[3] = (tran_low_t)-x4;
  output[4] = (tran_low_t)x6;
  output[5] = (tran_low_t)x14;
  output[6] = (tran_low_t)x10;
  output[7] = (tran_low_t)x2;
  output[8] = (tran_low_t)x3;
  output[9] = (tran_low_t)x11;
  output[10] = (tran_low_t)x15;
  output[11] = (tran_low_t)x7;
  output[12] = (tran_low_t)x5;
  output[13] = (tran_low_t)-x13;
  output[14] = (tran_low_t)x9;
  output[15] = (tran_low_t)-x1;
}
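
// 16x16 forward hybrid transform: the column pass scales its results back
// with a rounded >> 2 before the row pass; the 1-D transforms come from the
// FHT_16 table.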
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
                    int stride, int tx_type) {
  if (tx_type == DCT_DCT) {
    vp9_fdct16x16_c(input, output, stride);
  } else {
    tran_low_t out[256];
    int i, j;
    tran_low_t temp_in[16], temp_out[16];
    const transform_2d ht = FHT_16[tx_type];
    // Columns
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = input[j * stride + i] * 4;
      ht.cols(temp_in, temp_out);
      for (j = 0; j < 16; ++j)
        out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
    }
    // Rows
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j + i * 16];
      ht.rows(temp_in, temp_out);
      for (j = 0; j < 16; ++j)
        output[j + i * 16] = temp_out[j];
    }
  }
}

static INLINE tran_high_t dct_32_round(tran_high_t input) {
  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
  // and make the bounds consts.
  // assert(-131072 <= rv && rv <= 131071);
  return rv;
}

static INLINE tran_high_t half_round_shift(tran_high_t input) {
  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
  return rv;
}
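
// 1-D 32-point forward DCT. When 'round' is nonzero, the intermediate values
// after stage 2 are scaled down by 4 (half_round_shift) so the remaining
// stages stay within 16 bits; vp9_fdct32x32_rd_c uses that mode for its row
// pass.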
void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
  tran_high_t step[32];
  // Stage 1
  step[0] = input[0] + input[(32 - 1)];
  step[1] = input[1] + input[(32 - 2)];
  step[2] = input[2] + input[(32 - 3)];
  step[3] = input[3] + input[(32 - 4)];
  step[4] = input[4] + input[(32 - 5)];
  step[5] = input[5] + input[(32 - 6)];
  step[6] = input[6] + input[(32 - 7)];
  step[7] = input[7] + input[(32 - 8)];
  step[8] = input[8] + input[(32 - 9)];
  step[9] = input[9] + input[(32 - 10)];
  step[10] = input[10] + input[(32 - 11)];
  step[11] = input[11] + input[(32 - 12)];
  step[12] = input[12] + input[(32 - 13)];
  step[13] = input[13] + input[(32 - 14)];
  step[14] = input[14] + input[(32 - 15)];
  step[15] = input[15] + input[(32 - 16)];
  step[16] = -input[16] + input[(32 - 17)];
  step[17] = -input[17] + input[(32 - 18)];
  step[18] = -input[18] + input[(32 - 19)];
  step[19] = -input[19] + input[(32 - 20)];
  step[20] = -input[20] + input[(32 - 21)];
  step[21] = -input[21] + input[(32 - 22)];
  step[22] = -input[22] + input[(32 - 23)];
  step[23] = -input[23] + input[(32 - 24)];
  step[24] = -input[24] + input[(32 - 25)];
  step[25] = -input[25] + input[(32 - 26)];
  step[26] = -input[26] + input[(32 - 27)];
  step[27] = -input[27] + input[(32 - 28)];
  step[28] = -input[28] + input[(32 - 29)];
  step[29] = -input[29] + input[(32 - 30)];
  step[30] = -input[30] + input[(32 - 31)];
  step[31] = -input[31] + input[(32 - 32)];
  // Stage 2
  output[0] = step[0] + step[16 - 1];
  output[1] = step[1] + step[16 - 2];
  output[2] = step[2] + step[16 - 3];
  output[3] = step[3] + step[16 - 4];
  output[4] = step[4] + step[16 - 5];
  output[5] = step[5] + step[16 - 6];
  output[6] = step[6] + step[16 - 7];
  output[7] = step[7] + step[16 - 8];
  output[8] = -step[8] + step[16 - 9];
  output[9] = -step[9] + step[16 - 10];
  output[10] = -step[10] + step[16 - 11];
  output[11] = -step[11] + step[16 - 12];
  output[12] = -step[12] + step[16 - 13];
  output[13] = -step[13] + step[16 - 14];
  output[14] = -step[14] + step[16 - 15];
  output[15] = -step[15] + step[16 - 16];
  output[16] = step[16];
  output[17] = step[17];
  output[18] = step[18];
  output[19] = step[19];
  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
  output[28] = step[28];
  output[29] = step[29];
  output[30] = step[30];
  output[31] = step[31];
  // Scale the magnitude down by 4 so the intermediate values stay within
  // the range of 16 bits.
  if (round) {
    output[0] = half_round_shift(output[0]);
    output[1] = half_round_shift(output[1]);
    output[2] = half_round_shift(output[2]);
    output[3] = half_round_shift(output[3]);
    output[4] = half_round_shift(output[4]);
    output[5] = half_round_shift(output[5]);
    output[6] = half_round_shift(output[6]);
    output[7] = half_round_shift(output[7]);
    output[8] = half_round_shift(output[8]);
    output[9] = half_round_shift(output[9]);
    output[10] = half_round_shift(output[10]);
    output[11] = half_round_shift(output[11]);
    output[12] = half_round_shift(output[12]);
    output[13] = half_round_shift(output[13]);
    output[14] = half_round_shift(output[14]);
    output[15] = half_round_shift(output[15]);
    output[16] = half_round_shift(output[16]);
    output[17] = half_round_shift(output[17]);
    output[18] = half_round_shift(output[18]);
    output[19] = half_round_shift(output[19]);
    output[20] = half_round_shift(output[20]);
    output[21] = half_round_shift(output[21]);
    output[22] = half_round_shift(output[22]);
    output[23] = half_round_shift(output[23]);
    output[24] = half_round_shift(output[24]);
    output[25] = half_round_shift(output[25]);
    output[26] = half_round_shift(output[26]);
    output[27] = half_round_shift(output[27]);
    output[28] = half_round_shift(output[28]);
    output[29] = half_round_shift(output[29]);
    output[30] = half_round_shift(output[30]);
    output[31] = half_round_shift(output[31]);
  }
  // Stage 3
  step[0] = output[0] + output[(8 - 1)];
  step[1] = output[1] + output[(8 - 2)];
  step[2] = output[2] + output[(8 - 3)];
  step[3] = output[3] + output[(8 - 4)];
  step[4] = -output[4] + output[(8 - 5)];
  step[5] = -output[5] + output[(8 - 6)];
  step[6] = -output[6] + output[(8 - 7)];
  step[7] = -output[7] + output[(8 - 8)];
  step[8] = output[8];
  step[9] = output[9];
  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
  step[14] = output[14];
  step[15] = output[15];
  step[16] = output[16] + output[23];
  step[17] = output[17] + output[22];
  step[18] = output[18] + output[21];
  step[19] = output[19] + output[20];
  step[20] = -output[20] + output[19];
  step[21] = -output[21] + output[18];
  step[22] = -output[22] + output[17];
  step[23] = -output[23] + output[16];
  step[24] = -output[24] + output[31];
  step[25] = -output[25] + output[30];
  step[26] = -output[26] + output[29];
  step[27] = -output[27] + output[28];
  step[28] = output[28] + output[27];
  step[29] = output[29] + output[26];
  step[30] = output[30] + output[25];
  step[31] = output[31] + output[24];
  // Stage 4
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = -step[2] + step[1];
  output[3] = -step[3] + step[0];
  output[4] = step[4];
  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
  output[7] = step[7];
  output[8] = step[8] + step[11];
  output[9] = step[9] + step[10];
  output[10] = -step[10] + step[9];
  output[11] = -step[11] + step[8];
  output[12] = -step[12] + step[15];
  output[13] = -step[13] + step[14];
  output[14] = step[14] + step[13];
  output[15] = step[15] + step[12];
  output[16] = step[16];
  output[17] = step[17];
  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
  output[22] = step[22];
  output[23] = step[23];
  output[24] = step[24];
  output[25] = step[25];
  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
  output[30] = step[30];
  output[31] = step[31];
  // Stage 5
  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
  step[4] = output[4] + output[5];
  step[5] = -output[5] + output[4];
  step[6] = -output[6] + output[7];
  step[7] = output[7] + output[6];
  step[8] = output[8];
  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
  step[11] = output[11];
  step[12] = output[12];
  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
  step[15] = output[15];
  step[16] = output[16] + output[19];
  step[17] = output[17] + output[18];
  step[18] = -output[18] + output[17];
  step[19] = -output[19] + output[16];
  step[20] = -output[20] + output[23];
  step[21] = -output[21] + output[22];
  step[22] = output[22] + output[21];
  step[23] = output[23] + output[20];
  step[24] = output[24] + output[27];
  step[25] = output[25] + output[26];
  step[26] = -output[26] + output[25];
  step[27] = -output[27] + output[24];
  step[28] = -output[28] + output[31];
  step[29] = -output[29] + output[30];
  step[30] = output[30] + output[29];
  step[31] = output[31] + output[28];
  // Stage 6
  output[0] = step[0];
  output[1] = step[1];
  output[2] = step[2];
  output[3] = step[3];
  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
  output[8] = step[8] + step[9];
  output[9] = -step[9] + step[8];
  output[10] = -step[10] + step[11];
  output[11] = step[11] + step[10];
  output[12] = step[12] + step[13];
  output[13] = -step[13] + step[12];
  output[14] = -step[14] + step[15];
  output[15] = step[15] + step[14];
  output[16] = step[16];
  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
  output[19] = step[19];
  output[20] = step[20];
  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
  output[23] = step[23];
  output[24] = step[24];
  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
  output[27] = step[27];
  output[28] = step[28];
  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
  output[31] = step[31];
  // Stage 7
  step[0] = output[0];
  step[1] = output[1];
  step[2] = output[2];
  step[3] = output[3];
  step[4] = output[4];
  step[5] = output[5];
  step[6] = output[6];
  step[7] = output[7];
  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
  step[16] = output[16] + output[17];
  step[17] = -output[17] + output[16];
  step[18] = -output[18] + output[19];
  step[19] = output[19] + output[18];
  step[20] = output[20] + output[21];
  step[21] = -output[21] + output[20];
  step[22] = -output[22] + output[23];
  step[23] = output[23] + output[22];
  step[24] = output[24] + output[25];
  step[25] = -output[25] + output[24];
  step[26] = -output[26] + output[27];
  step[27] = output[27] + output[26];
  step[28] = output[28] + output[29];
  step[29] = -output[29] + output[28];
  step[30] = -output[30] + output[31];
  step[31] = output[31] + output[30];
  // Final stage --- outputs indices are bit-reversed.
  output[0] = step[0];
  output[16] = step[1];
  output[8] = step[2];
  output[24] = step[3];
  output[4] = step[4];
  output[20] = step[5];
  output[12] = step[6];
  output[28] = step[7];
  output[2] = step[8];
  output[18] = step[9];
  output[10] = step[10];
  output[26] = step[11];
  output[6] = step[12];
  output[22] = step[13];
  output[14] = step[14];
  output[30] = step[15];
  output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
  output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
  output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
  output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
  output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}

void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  tran_low_t sum = 0;
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      sum += input[r * stride + c];
  output[0] = sum >> 3;
  output[1] = 0;
}

void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
  int i, j;
  tran_high_t output[32 * 32];
  // Columns
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = input[j * stride + i] * 4;
    vp9_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  }
  // Rows
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = output[j + i * 32];
    vp9_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      out[j + i * 32] =
          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
  }
}

// Note that although dct_32_round is used in the dct32 computation flow,
// this 2D fdct32x32 for the rate-distortion optimization loop operates
// within 16-bit precision.
void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
  int i, j;
  tran_high_t output[32 * 32];
  // Columns
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = input[j * stride + i] * 4;
    vp9_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      // TODO(cd): see quality impact of only doing
      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
      //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  }
  // Rows
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = output[j + i * 32];
    vp9_fdct32(temp_in, temp_out, 1);
    for (j = 0; j < 32; ++j)
      out[j + i * 32] = (tran_low_t)temp_out[j];
  }
}
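
// High-bitdepth builds reuse the C implementations above: these wrappers
// simply forward to the regular functions (tran_low_t is widened under
// CONFIG_VP9_HIGHBITDEPTH, so the same code serves both paths).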
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  vp9_fdct4x4_c(input, output, stride);
}

void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
                         int stride, int tx_type) {
  vp9_fht4x4_c(input, output, stride, tx_type);
}

void vp9_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
                            int stride) {
  vp9_fdct8x8_1_c(input, final_output, stride);
}

void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
                          int stride) {
  vp9_fdct8x8_c(input, final_output, stride);
}

void vp9_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
                              int stride) {
  vp9_fdct16x16_1_c(input, output, stride);
}

void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
                            int stride) {
  vp9_fdct16x16_c(input, output, stride);
}

void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
                         int stride, int tx_type) {
  vp9_fht8x8_c(input, output, stride, tx_type);
}

void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  vp9_fwht4x4_c(input, output, stride);
}

void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
                           int stride, int tx_type) {
  vp9_fht16x16_c(input, output, stride, tx_type);
}

void vp9_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
                              int stride) {
  vp9_fdct32x32_1_c(input, out, stride);
}

void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
  vp9_fdct32x32_c(input, out, stride);
}

void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
                               int stride) {
  vp9_fdct32x32_rd_c(input, out, stride);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH