astcenc_compute_variance.cpp

// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and also significantly improves
 * numerical stability allowing us to use floats rather than doubles.
 */
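
// As background, the summed-area table (SAT) trick used below works as follows: once each buffer
// holds cumulative sums along every axis, the sum of any axis-aligned window can be read back
// with a handful of corner lookups, e.g. in 2D:
//
//     window_sum = S(y_hi, x_hi) - S(y_lo, x_hi) - S(y_hi, x_lo) + S(y_lo, x_lo)
//
// where the "lo" indices address the row/column just outside the window; the extra border of
// zeros written before the tables are built makes these edge lookups valid. The 3D case uses
// eight corner lookups in the same way.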

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *
 *     v0, v1, v2, ...
 *
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *
 *     v0, v0+v1, v0+v1+v2, ...
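 *
 * For example, the four-element array { 1, 2, 3, 4 } is rewritten in-place to { 1, 3, 6, 10 }.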
 *
 * @param d      The array to prefix-sum.
 * @param items  The number of items in the array.
 * @param stride The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
    vfloat4* d,
    size_t items,
    int stride
) {
    if (items < 2)
        return;

    size_t lc_stride = 2;
    size_t log2_stride = 1;

    // The reduction-tree loop
    do {
        size_t step = lc_stride >> 1;
        size_t start = lc_stride - 1;
        size_t iters = items >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }

        log2_stride += 1;
        lc_stride <<= 1;
    } while (lc_stride <= items);

    // The expansion-tree loop
    do {
        log2_stride -= 1;
        lc_stride >>= 1;

        size_t step = lc_stride >> 1;
        size_t start = step + lc_stride - 1;
        size_t iters = (items - step) >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }
    } while (lc_stride > 2);
}
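
// As a concrete illustration, for items = 8 and stride = 1 the two loops above combine the
// following elements:
//
//     reduce: d[1]+=d[0]  d[3]+=d[2]  d[5]+=d[4]  d[7]+=d[6]
//             d[3]+=d[1]  d[7]+=d[5]
//             d[7]+=d[3]
//     expand: d[5]+=d[3]
//             d[2]+=d[1]  d[4]+=d[3]  d[6]+=d[5]
//
// after which d[i] holds the sum of the first i + 1 input values.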

/* See header for documentation. */
void compute_pixel_region_variance(
    astcenc_contexti& ctx,
    const pixel_region_args& arg
) {
    // Unpack the memory structure into local variables
    const astcenc_image* img = arg.img;
    astcenc_swizzle swz = arg.swz;
    bool have_z = arg.have_z;

    int size_x = arg.size_x;
    int size_y = arg.size_y;
    int size_z = arg.size_z;

    int offset_x = arg.offset_x;
    int offset_y = arg.offset_y;
    int offset_z = arg.offset_z;

    int alpha_kernel_radius = arg.alpha_kernel_radius;

    float* input_alpha_averages = ctx.input_alpha_averages;
    vfloat4* work_memory = arg.work_memory;

    // Compute memory sizes and dimensions that we need
    int kernel_radius = alpha_kernel_radius;
    int kerneldim = 2 * kernel_radius + 1;
    int kernel_radius_xy = kernel_radius;
    int kernel_radius_z = have_z ? kernel_radius : 0;
    int padsize_x = size_x + kerneldim;
    int padsize_y = size_y + kerneldim;
    int padsize_z = size_z + (have_z ? kerneldim : 0);
    int sizeprod = padsize_x * padsize_y * padsize_z;

    int zd_start = have_z ? 1 : 0;

    vfloat4 *varbuf1 = work_memory;
    vfloat4 *varbuf2 = work_memory + sizeprod;

    // Scaling factors to apply to Y and Z for accesses into the work buffers
    int yst = padsize_x;
    int zst = padsize_x * padsize_y;

    // Scaling factors to apply to Y and Z for accesses into result buffers
    int ydt = img->dim_x;
    int zdt = img->dim_x * img->dim_y;

    // Macros to act as accessor functions for the work-memory
#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
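
    // Index (z, y, x) addresses the padded working volume with x varying fastest; this lets the
    // row, column, and slice prefix-sum passes below all reuse brent_kung_prefix_sum, with only
    // the stride (1, yst, or zst) changing between passes.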

    // Load N and N^2 values into the work buffers
    if (img->data_type == ASTCENC_TYPE_U8)
    {
        // Swizzle data structure 4 = ZERO, 5 = ONE
        uint8_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 255;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    uint8_t r = data[swz.r];
                    uint8_t g = data[swz.g];
                    uint8_t b = data[swz.b];
                    uint8_t a = data[swz.a];

                    vfloat4 d = vfloat4(r * (1.0f / 255.0f),
                                        g * (1.0f / 255.0f),
                                        b * (1.0f / 255.0f),
                                        a * (1.0f / 255.0f));

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else if (img->data_type == ASTCENC_TYPE_F16)
    {
        // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
        uint16_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 0x3C00;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
                    vfloat4 d = float16_to_float(di);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else // if (img->data_type == ASTCENC_TYPE_F32)
    {
        assert(img->data_type == ASTCENC_TYPE_F32);

        // Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
        float data[6];
        data[ASTCENC_SWZ_0] = 0.0f;
        data[ASTCENC_SWZ_1] = 1.0f;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            float* data32 = static_cast<float*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    float r = data[swz.r];
                    float g = data[swz.g];
                    float b = data[swz.b];
                    float a = data[swz.a];

                    vfloat4 d(r, g, b, a);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }

    // Pad with an extra layer of 0s; this forms the edge of the SAT tables
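    // (the zero border means that window lookups whose low corner lands on the edge of the
    // region read back zero rather than stepping outside the table).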
    vfloat4 vbz = vfloat4::zero();
    for (int z = 0; z < padsize_z; z++)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            VARBUF1(z, y, 0) = vbz;
            VARBUF2(z, y, 0) = vbz;
        }

        for (int x = 0; x < padsize_x; x++)
        {
            VARBUF1(z, 0, x) = vbz;
            VARBUF2(z, 0, x) = vbz;
        }
    }

    if (have_z)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            for (int x = 0; x < padsize_x; x++)
            {
                VARBUF1(0, y, x) = vbz;
                VARBUF2(0, y, x) = vbz;
            }
        }
    }

    // Generate summed-area tables for N and N^2; this is done in-place, using
    // a Brent-Kung parallel-prefix based algorithm to minimize precision loss
    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
            brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
        }
    }

    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int x = 1; x < padsize_x; x++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
            brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
        }
    }

    if (have_z)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            for (int x = 1; x < padsize_x; x++)
            {
                brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
                brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
            }
        }
    }

    // Compute a few constants used in the variance-calculation.
    float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
    float alpha_rsamples;

    if (have_z)
    {
        alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
    }
    else
    {
        alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
    }

    // Use the summed-area tables to compute the alpha averages for each neighborhood
    if (have_z)
    {
        for (int z = 0; z < size_z; z++)
        {
            int z_src = z + kernel_radius_z;
            int z_dst = z + offset_z;
            int z_low = z_src - alpha_kernel_radius;
            int z_high = z_src + alpha_kernel_radius + 1;

            for (int y = 0; y < size_y; y++)
            {
                int y_src = y + kernel_radius_xy;
                int y_dst = y + offset_y;
                int y_low = y_src - alpha_kernel_radius;
                int y_high = y_src + alpha_kernel_radius + 1;

                for (int x = 0; x < size_x; x++)
                {
                    int x_src = x + kernel_radius_xy;
                    int x_dst = x + offset_x;
                    int x_low = x_src - alpha_kernel_radius;
                    int x_high = x_src + alpha_kernel_radius + 1;

                    // Summed-area table lookups for alpha average
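                    // (eight-corner inclusion-exclusion over the 3D SAT gives the sum of the
                    // alpha lane over the kernel window in constant time)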
                    float vasum = ( VARBUF1(z_high, y_low,  x_low ).lane<3>()
                                  - VARBUF1(z_high, y_low,  x_high).lane<3>()
                                  - VARBUF1(z_high, y_high, x_low ).lane<3>()
                                  + VARBUF1(z_high, y_high, x_high).lane<3>()) -
                                  ( VARBUF1(z_low,  y_low,  x_low ).lane<3>()
                                  - VARBUF1(z_low,  y_low,  x_high).lane<3>()
                                  - VARBUF1(z_low,  y_high, x_low ).lane<3>()
                                  + VARBUF1(z_low,  y_high, x_high).lane<3>());

                    int out_index = z_dst * zdt + y_dst * ydt + x_dst;
                    input_alpha_averages[out_index] = (vasum * alpha_rsamples);
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < size_y; y++)
        {
            int y_src = y + kernel_radius_xy;
            int y_dst = y + offset_y;
            int y_low = y_src - alpha_kernel_radius;
            int y_high = y_src + alpha_kernel_radius + 1;

            for (int x = 0; x < size_x; x++)
            {
                int x_src = x + kernel_radius_xy;
                int x_dst = x + offset_x;
                int x_low = x_src - alpha_kernel_radius;
                int x_high = x_src + alpha_kernel_radius + 1;

                // Summed-area table lookups for alpha average
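                // (four-corner inclusion-exclusion over the 2D SAT gives the sum of the alpha
                // lane over the kernel window in constant time)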
                float vasum = VARBUF1(0, y_low,  x_low ).lane<3>()
                            - VARBUF1(0, y_low,  x_high).lane<3>()
                            - VARBUF1(0, y_high, x_low ).lane<3>()
                            + VARBUF1(0, y_high, x_high).lane<3>();

                int out_index = y_dst * ydt + x_dst;
                input_alpha_averages[out_index] = (vasum * alpha_rsamples);
            }
        }
    }
}

/* See header for documentation. */
unsigned int init_compute_averages(
    const astcenc_image& img,
    unsigned int alpha_kernel_radius,
    const astcenc_swizzle& swz,
    avg_args& ag
) {
    unsigned int size_x = img.dim_x;
    unsigned int size_y = img.dim_y;
    unsigned int size_z = img.dim_z;

    // Compute maximum block size and from that the working memory buffer size
    unsigned int kernel_radius = alpha_kernel_radius;
    unsigned int kerneldim = 2 * kernel_radius + 1;

    bool have_z = (size_z > 1);
    unsigned int max_blk_size_xy = have_z ? 16 : 32;
    unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

    unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
    unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

    // Perform block-wise average calculations across the image
    // Initialize fields which are not populated until later
    ag.arg.size_x = 0;
    ag.arg.size_y = 0;
    ag.arg.size_z = 0;
    ag.arg.offset_x = 0;
    ag.arg.offset_y = 0;
    ag.arg.offset_z = 0;
    ag.arg.work_memory = nullptr;

    ag.arg.img = &img;
    ag.arg.swz = swz;
    ag.arg.have_z = have_z;
    ag.arg.alpha_kernel_radius = alpha_kernel_radius;

    ag.img_size_x = size_x;
    ag.img_size_y = size_y;
    ag.img_size_z = size_z;
    ag.blk_size_xy = max_blk_size_xy;
    ag.blk_size_z = max_blk_size_z;
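
    // Space for two vfloat4 buffers (the values and their squares) covering the largest padded
    // block volume a task can see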
    ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;

    // The parallel task count
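    // (tasks are split over y and z only, so each task is expected to cover the full x extent of
    // its block row; hence the count is y block rows times z block slices)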
    unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
    unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
    return z_tasks * y_tasks;
}

#endif