vp9_mfqe.c

/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_postproc.h"
// TODO(jackychen): Replace this function with SSE2 code. There is
// one SSE2 implementation in vp8, so we will consider how to share it
// between vp8 and vp9.
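// Weighted blend of an src block into a dst block; MFQE uses it to mix
// the current frame (src) with the last decoded frame (dst):
//   dst = (src * src_weight + dst * dst_weight + rounding) >> MFQE_PRECISION,
// where src_weight + dst_weight == 1 << MFQE_PRECISION, so src_weight
// selects the fraction of the current frame kept in the output.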
static void filter_by_weight(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             int block_size, int src_weight) {
  const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
  const int rounding_bit = 1 << (MFQE_PRECISION - 1);
  int r, c;

  for (r = 0; r < block_size; r++) {
    for (c = 0; c < block_size; c++) {
      dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
                   >> MFQE_PRECISION;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
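// The _c suffix marks these as the C reference implementations; the rtcd
// dispatch tables may substitute SIMD variants for vp9_filter_by_weight8x8
// and vp9_filter_by_weight16x16 where such variants are available.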
void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride, int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
}

void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
}
static void filter_by_weight32x32(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
  vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
  vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
                            weight);
  vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
                            dst + dst_stride * 16, dst_stride, weight);
  vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
                            dst + dst_stride * 16 + 16, dst_stride, weight);
}
static void filter_by_weight64x64(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
  filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
  filter_by_weight32x32(src + 32, src_stride, dst + 32,
                        dst_stride, weight);
  filter_by_weight32x32(src + src_stride * 32, src_stride,
                        dst + dst_stride * 32, dst_stride, weight);
  filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
                        dst + dst_stride * 32 + 32, dst_stride, weight);
}
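// Blend one square luma block and its two half-size chroma blocks
// (4:2:0 layout) with the given weight. Only 16x16 and larger square
// sizes are handled; other sizes fall through untouched.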
static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
                          int yd_stride, const uint8_t *u, const uint8_t *v,
                          int uv_stride, uint8_t *ud, uint8_t *vd,
                          int uvd_stride, BLOCK_SIZE block_size,
                          int weight) {
  if (block_size == BLOCK_16X16) {
    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
  } else if (block_size == BLOCK_32X32) {
    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
  } else if (block_size == BLOCK_64X64) {
    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
    filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
  }
}
// TODO(jackychen): Determine whether to replace this with assembly code.
static void copy_mem8x8(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride) {
  int r;
  for (r = 0; r < 8; r++) {
    memcpy(dst, src, 8);
    src += src_stride;
    dst += dst_stride;
  }
}

static void copy_mem16x16(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  int r;
  for (r = 0; r < 16; r++) {
    memcpy(dst, src, 16);
    src += src_stride;
    dst += dst_stride;
  }
}
static void copy_mem32x32(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  copy_mem16x16(src, src_stride, dst, dst_stride);
  copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
  copy_mem16x16(src + src_stride * 16, src_stride,
                dst + dst_stride * 16, dst_stride);
  copy_mem16x16(src + src_stride * 16 + 16, src_stride,
                dst + dst_stride * 16 + 16, dst_stride);
}
static void copy_mem64x64(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  copy_mem32x32(src, src_stride, dst, dst_stride);
  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
  // Note: the bottom two quadrants must advance dst by dst_stride,
  // not src_stride; the two strides can differ between buffers.
  copy_mem32x32(src + src_stride * 32, src_stride,
                dst + dst_stride * 32, dst_stride);
  copy_mem32x32(src + src_stride * 32 + 32, src_stride,
                dst + dst_stride * 32 + 32, dst_stride);
}
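// Copy a square Y block together with its half-size U and V blocks
// (4:2:0), used when a partition is left untouched by MFQE.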
static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
                       uint8_t *vd, int yd_stride, int uvd_stride,
                       BLOCK_SIZE bs) {
  if (bs == BLOCK_16X16) {
    copy_mem16x16(y, y_stride, yd, yd_stride);
    copy_mem8x8(u, uv_stride, ud, uvd_stride);
    copy_mem8x8(v, uv_stride, vd, uvd_stride);
  } else if (bs == BLOCK_32X32) {
    copy_mem32x32(y, y_stride, yd, yd_stride);
    copy_mem16x16(u, uv_stride, ud, uvd_stride);
    copy_mem16x16(v, uv_stride, vd, uvd_stride);
  } else {  // BLOCK_64X64
    copy_mem64x64(y, y_stride, yd, yd_stride);
    copy_mem32x32(u, uv_stride, ud, uvd_stride);
    copy_mem32x32(v, uv_stride, vd, uvd_stride);
  }
}
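// Derive the per-pixel SAD and variance-difference thresholds for a block
// size. The SAD threshold shrinks as the block grows, and both thresholds
// grow with qdiff (current base q-index minus the last frame's), which
// strengthens the blend toward the last frame when the current frame is
// coded at a noticeably lower quality.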
static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
  const int adj = qdiff >> MFQE_PRECISION;
  if (bs == BLOCK_16X16) {
    *sad_thr = 7 + adj;
  } else if (bs == BLOCK_32X32) {
    *sad_thr = 6 + adj;
  } else {  // BLOCK_64X64
    *sad_thr = 5 + adj;
  }
  *vdiff_thr = 125 + qdiff;
}
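// MFQE one block. vdiff and sad are normalized to per-pixel values by a
// rounded divide by the pixel count (e.g. (x + 128) >> 8 for the 256
// pixels of a 16x16 block), so the thresholds from get_thr() are
// per-pixel as well.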
static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
                       const uint8_t *v, int y_stride, int uv_stride,
                       uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
                       int uvd_stride, int qdiff) {
  int sad, sad_thr, vdiff, vdiff_thr;
  uint32_t sse;

  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);

  if (bs == BLOCK_16X16) {
    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
    sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
  } else if (bs == BLOCK_32X32) {
    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
    sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
  } else /* if (bs == BLOCK_64X64) */ {
    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
    sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
  }
  // Requiring vdiff > sad * 3 ensures vdiff is not too small relative to
  // sad; otherwise the mismatch might be a lighting change in a smooth
  // area, where it is dangerous to do MFQE.
  if (sad > 1 && vdiff > sad * 3) {
    const int weight = 1 << MFQE_PRECISION;
    int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
    // When ifactor equals weight, no MFQE is done.
    if (ifactor > weight) {
      ifactor = weight;
    }
    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
                  uvd_stride, bs, ifactor);
  } else {
    // Copy the block from the current frame (i.e., no mfqe is done).
    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
               yd_stride, uvd_stride, bs);
  }
}
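// Note: VP9 motion vector components are stored in eighth-pel units, so
// mv_len_square <= 100 limits the motion to roughly 1.25 pixels.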
static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
  // Check the motion in the current block (for an inter frame), or the
  // motion in the co-located block in the last frame (for a keyframe).
  const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
                            mi->mbmi.mv[0].as_mv.row +
                            mi->mbmi.mv[0].as_mv.col *
                            mi->mbmi.mv[0].as_mv.col;
  const int mv_threshold = 100;
  return mi->mbmi.mode >= NEARESTMV &&  // Not an intra block
         cur_bs >= BLOCK_16X16 &&
         mv_len_square <= mv_threshold;
}
// Process each partition in a super block, recursively.
static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
                           const uint8_t *y, const uint8_t *u,
                           const uint8_t *v, int y_stride, int uv_stride,
                           uint8_t *yd, uint8_t *ud, uint8_t *vd,
                           int yd_stride, int uvd_stride) {
  int mi_offset, y_offset, uv_offset;
  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
  const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
  const int bsl = b_width_log2_lookup[bs];
  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
  const BLOCK_SIZE subsize = get_subsize(bs, partition);

  if (cur_bs < BLOCK_8X8) {
    // If there are blocks smaller than 8x8, they must be on the boundary.
    return;
  }

  // No MFQE on blocks smaller than 16x16.
  if (bs == BLOCK_16X16) {
    partition = PARTITION_NONE;
  }

  if (bs == BLOCK_64X64) {
    mi_offset = 4;
    y_offset = 32;
    uv_offset = 16;
  } else {
    mi_offset = 2;
    y_offset = 16;
    uv_offset = 8;
  }
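  // The offsets above are half of the current block: mi_offset in 8x8
  // mode-info units, y_offset in luma pixels and uv_offset in chroma
  // pixels (half of luma, since the buffers are 4:2:0).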
  switch (partition) {
    BLOCK_SIZE mfqe_bs, bs_tmp;
    case PARTITION_HORZ:
      if (bs == BLOCK_64X64) {
        mfqe_bs = BLOCK_64X32;
        bs_tmp = BLOCK_32X32;
      } else {
        mfqe_bs = BLOCK_32X16;
        bs_tmp = BLOCK_16X16;
      }
      if (mfqe_decision(mi, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
      }
      if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                   v + uv_offset * uv_stride, y_stride, uv_stride,
                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                   u + uv_offset * uv_stride + uv_offset,
                   v + uv_offset * uv_stride + uv_offset, y_stride,
                   uv_stride, yd + y_offset * yd_stride + y_offset,
                   ud + uv_offset * uvd_stride + uv_offset,
                   vd + uv_offset * uvd_stride + uv_offset,
                   yd_stride, uvd_stride, qdiff);
      }
      break;
    case PARTITION_VERT:
      if (bs == BLOCK_64X64) {
        mfqe_bs = BLOCK_32X64;
        bs_tmp = BLOCK_32X32;
      } else {
        mfqe_bs = BLOCK_16X32;
        bs_tmp = BLOCK_16X16;
      }
      if (mfqe_decision(mi, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                   v + uv_offset * uv_stride, y_stride, uv_stride,
                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
      }
      if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                   u + uv_offset * uv_stride + uv_offset,
                   v + uv_offset * uv_stride + uv_offset, y_stride,
                   uv_stride, yd + y_offset * yd_stride + y_offset,
                   ud + uv_offset * uvd_stride + uv_offset,
                   vd + uv_offset * uvd_stride + uv_offset,
                   yd_stride, uvd_stride, qdiff);
      }
      break;
    case PARTITION_NONE:
      if (mfqe_decision(mi, cur_bs)) {
        // Do mfqe on this partition.
        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
      } else {
        // Copy the block from the current frame (i.e., no mfqe is done).
        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
                   yd_stride, uvd_stride, bs);
      }
      break;
    case PARTITION_SPLIT:
      // Recursion on four square partitions, e.g. if bs is 64X64,
      // then look into four 32X32 blocks in it.
      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
                     yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
                     y + y_offset * y_stride, u + uv_offset * uv_stride,
                     v + uv_offset * uv_stride, y_stride, uv_stride,
                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
                     subsize, y + y_offset * y_stride + y_offset,
                     u + uv_offset * uv_stride + uv_offset,
                     v + uv_offset * uv_stride + uv_offset, y_stride,
                     uv_stride, yd + y_offset * yd_stride + y_offset,
                     ud + uv_offset * uvd_stride + uv_offset,
                     vd + uv_offset * uvd_stride + uv_offset,
                     yd_stride, uvd_stride);
      break;
    default:
      assert(0);
  }
}
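// Entry point: apply MFQE to every 64x64 superblock of the frame to show,
// writing the blended result into the post-processing buffer. Intra-only
// frames carry no motion vectors, so the mode info saved from the previous
// frame drives the per-block decisions instead.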
void vp9_mfqe(VP9_COMMON *cm) {
  int mi_row, mi_col;
  // Current decoded frame.
  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
  // Last decoded frame; it will store the MFQE result.
  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;

  // Loop through each super block.
  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
      MODE_INFO *mi;
      MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
      // Motion info from the last frame.
      MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
                           (mi_row * cm->mi_stride + mi_col);
      const uint32_t y_stride = show->y_stride;
      const uint32_t uv_stride = show->uv_stride;
      const uint32_t yd_stride = dest->y_stride;
      const uint32_t uvd_stride = dest->uv_stride;
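      // Convert the mode-info position into pixel offsets: one mode-info
      // unit spans 8x8 luma pixels (<< 3) and 4x4 chroma pixels (<< 2).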
      const uint32_t row_offset_y = mi_row << 3;
      const uint32_t row_offset_uv = mi_row << 2;
      const uint32_t col_offset_y = mi_col << 3;
      const uint32_t col_offset_uv = mi_col << 2;
      const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
                         col_offset_y;
      const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
                         col_offset_uv;
      const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
                         col_offset_uv;
      uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
      uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
                    col_offset_uv;
      uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
                    col_offset_uv;
      if (frame_is_intra_only(cm)) {
        mi = mi_prev;
      } else {
        mi = mi_local;
      }
      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
                     vd, yd_stride, uvd_stride);
    }
  }
}