ssim.cc

/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "../util/ssim.h"  // NOLINT

#include <math.h>  // For sqrt() and log10().
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;    // NOLINT
typedef unsigned short uint16;  // NOLINT
#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
    (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#include <emmintrin.h>
#endif

#ifdef _OPENMP
#include <omp.h>
#endif
// SSIM
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
  1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
static const double kiW[KERNEL + 1 + 1] = {
  1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
  1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
  1. / 1056.,  // 1 / sum(i:0..5, j..6) K[i]*K[j]
  1. / 957.,   // 1 / sum(i:0..4, j..6) K[i]*K[j]
  1. / 726.,   // 1 / sum(i:0..3, j..6) K[i]*K[j]
};
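
// Sanity check on the normalizers above: sum(K) = 1 + 3 + 7 + 11 + 7 + 3 + 1
// = 33, so the full 2-D kernel mass is 33 * 33 = 1089. Dropping the rightmost
// one, two or three columns leaves 32 * 33 = 1056, 29 * 33 = 957 and
// 22 * 33 = 726, matching the last three entries.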

#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
#define MAKE_WEIGHT(L) \
  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0),
  W1 = MAKE_WEIGHT(1),
  W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
  // ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT

#endif

// Common final expression for SSIM, once the weighted sums are known.
static double FinalizeSSIM(double iw, double xm, double ym,
                           double xxm, double xym, double yym) {
  const double iwx = xm * iw;
  const double iwy = ym * iw;
  double sxx = xxm * iw - iwx * iwx;
  double syy = yym * iw - iwy * iwy;
  // Small errors are possible, due to rounding. Clamp to zero.
  if (sxx < 0.) sxx = 0.;
  if (syy < 0.) syy = 0.;
  const double sxsy = sqrt(sxx * syy);
  const double sxy = xym * iw - iwx * iwy;
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
  const double c = (2. * sxsy + C22) / (sxx + syy + C22);
  const double s = (sxy + C33) / (sxsy + C33);
  return l * c * s;
}
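
// For reference, FinalizeSSIM() evaluates the usual three-term SSIM product
// of Wang et al. (2004), on weighted window statistics:
//
//   l = (2 * mu_x * mu_y + C1) / (mu_x^2 + mu_y^2 + C1)
//   c = (2 * sigma_x * sigma_y + C2) / (sigma_x^2 + sigma_y^2 + C2)
//   s = (sigma_xy + C3) / (sigma_x * sigma_y + C3)
//
// Here iw is the reciprocal of the total kernel weight, so iwx and iwy are
// the weighted means. Note this file uses C3 = (0.015 * 255)^2 rather than
// the common simplification C3 = C2 / 2.
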
// GetSSIM() does clipping.  GetSSIMFullKernel() does not.

// TODO(skal): use summed tables?
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// with a diff of 255, squared. The maximum error is thus 0x4388241,
// which fits into 32-bit integers.
double GetSSIM(const uint8 *org, const uint8 *rec,
               int xo, int yo, int W, int H, int stride) {
  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
  org += (yo - KERNEL) * stride;
  org += (xo - KERNEL);
  rec += (yo - KERNEL) * stride;
  rec += (xo - KERNEL);

  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
    const int Wy = K[y_];
    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
      const int Wxy = Wy * K[x_];
      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
        const int org_x = org[x_];
        const int rec_x = rec[x_];
        ws += Wxy;
        xm += Wxy * org_x;
        ym += Wxy * rec_x;
        xxm += Wxy * org_x * org_x;
        xym += Wxy * org_x * rec_x;
        yym += Wxy * rec_x * rec_x;
      }
    }
  }
  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}
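
// Note: unlike GetSSIMFullKernel() below, GetSSIM() re-derives the kernel
// mass 'ws' for windows clipped at the image border, so the 1. / ws
// normalizer stays exact there.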
double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
                         int xo, int yo, int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  org += yo * stride + xo;
  rec += yo * stride + xo;

  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of the upper-left (ul), upper-right (ur),
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // A symmetric kernel has the same weight at those four points.
      // - - - - - - -
      // - ul - - - ur -
      // - - - - - - -
      // - - - 0 - - -
      // - - - - - - -
      // - ll - - - lr -
      // - - - - - - -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];
      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of the up (u), down (d), left (l) and
    // right (r) points across the main axes (see the diagram below).
    // A symmetric kernel has the same weight at those four points.
    // - - - - - - -
    // - - - u - - -
    // - - - - - - -
    // - l - 0 - r -
    // - - - - - - -
    // - - - d - - -
    // - - - - - - -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];
    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly, the contribution of the center (x0, y0) point itself.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];
  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, and convert to 16bit, perform weighting
// and accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT) do { \
  const __m128i v0 = \
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
  const __m128i v1 = \
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
  const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
  const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
  const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
  const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
  x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
  y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
  x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
  y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
  xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
  xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
  yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
} while (0)

// Horizontal sum of the four 32-bit lanes of M into OUT.
#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \
  uint32 tmp[4]; \
  _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
  (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
} while (0)
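
  // Accumulate the seven kernel rows. The row weights are symmetric around
  // the center row (W3), so W2/W1/W0 are reused below it.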
  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}
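
// Note: area_weight must be the reciprocal of the kernel mass actually
// covering the window: kiW[0] for interior pixels, and kiW[k] for the
// zero-padded right-edge scratch buffers built in CalcSSIM() below.
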
static int start_max(int x, int y) { return (x > y) ? x : y; }

double CalcSSIM(const uint8 *org, const uint8 *rec,
                const int image_width, const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  // Top rows: the kernel is clipped against the image border.
  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+: SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels to a cache area, and pad this area with
      // zeros which won't contribute to the overall SSIM value (but we need
      // to pass the correct normalizing constant!). By using this cache, we
      // can still call GetSSIMFullKernel() instead of the slower GetSSIM().
      // NOTE: we could use a similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };

      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }

      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
      }
    }
  }

  // Bottom rows: the kernel is clipped again.
  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}

double CalcLSSIM(double ssim) {
  return -10.0 * log10(1.0 - ssim);
}
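
// Usage sketch (an assumption about the caller; not part of this file):
// CalcSSIM() returns the *sum* of per-pixel SSIM scores over the plane, so
// normalize by the pixel count before mapping to the log scale:
//
//   const double sum = CalcSSIM(org_plane, rec_plane, width, height);
//   const double mean_ssim = sum / (width * height);
//   const double lssim = CalcLSSIM(mean_ssim);  // -10 * log10(1 - ssim)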

#ifdef __cplusplus
}  // extern "C"
#endif