  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2023 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  18. /**
  19. * @brief Functions to compress a symbolic block.
  20. */
  21. #include "astcenc_internal.h"
  22. #include "astcenc_diagnostic_trace.h"
  23. #include <cassert>
  24. /**
  25. * @brief Merge two planes of endpoints into a single vector.
  26. *
  27. * @param ep_plane1 The endpoints for plane 1.
  28. * @param ep_plane2 The endpoints for plane 2.
  29. * @param component_plane2 The color component for plane 2.
  30. * @param[out] result The merged output.
  31. */
  32. static void merge_endpoints(
  33. const endpoints& ep_plane1,
  34. const endpoints& ep_plane2,
  35. unsigned int component_plane2,
  36. endpoints& result
  37. ) {
  38. unsigned int partition_count = ep_plane1.partition_count;
  39. assert(partition_count == 1);
  40. vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
  41. result.partition_count = partition_count;
  42. result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
  43. result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
  44. }
  45. /**
  46. * @brief Attempt to improve weights given a chosen configuration.
  47. *
  48. * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
  49. * partition and per plane) and attempt to improve image quality by moving each weight up by one or
  50. * down by one quantization step.
  51. *
  52. * This is a specialized function which only supports operating on undecimated weight grids,
  53. * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
  54. * is needed less often.
  55. *
  56. * @param decode_mode The decode mode (LDR, HDR).
  57. * @param bsd The block size information.
  58. * @param blk The image block color data to compress.
  59. * @param[out] scb The symbolic compressed block output.
  60. */
  61. static bool realign_weights_undecimated(
  62. astcenc_profile decode_mode,
  63. const block_size_descriptor& bsd,
  64. const image_block& blk,
  65. symbolic_compressed_block& scb
  66. ) {
  67. // Get the partition descriptor
  68. unsigned int partition_count = scb.partition_count;
  69. const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
  70. // Get the quantization table
  71. const block_mode& bm = bsd.get_block_mode(scb.block_mode);
  72. unsigned int weight_quant_level = bm.quant_mode;
  73. const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
  74. unsigned int max_plane = bm.is_dual_plane;
  75. int plane2_component = scb.plane2_component;
  76. vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
  77. // Decode the color endpoints
  78. bool rgb_hdr;
  79. bool alpha_hdr;
  80. vint4 endpnt0[BLOCK_MAX_PARTITIONS];
  81. vint4 endpnt1[BLOCK_MAX_PARTITIONS];
  82. vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
  83. vfloat4 offset[BLOCK_MAX_PARTITIONS];
  84. promise(partition_count > 0);
  85. for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
  86. {
  87. unpack_color_endpoints(decode_mode,
  88. scb.color_formats[pa_idx],
  89. scb.color_values[pa_idx],
  90. rgb_hdr, alpha_hdr,
  91. endpnt0[pa_idx],
  92. endpnt1[pa_idx]);
  93. }
  94. uint8_t* dec_weights_uquant = scb.weights;
  95. bool adjustments = false;
  96. // For each plane and partition ...
  97. for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
  98. {
  99. for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
  100. {
  101. // Compute the endpoint delta for all components in current plane
  102. vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
  103. epd = select(epd, vint4::zero(), plane_mask);
  104. endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
  105. offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
  106. }
  107. // For each weight compute previous, current, and next errors
  108. promise(bsd.texel_count > 0);
  109. for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
  110. {
  111. int uqw = dec_weights_uquant[texel];
  112. uint32_t prev_and_next = qat.prev_next_values[uqw];
  113. int uqw_down = prev_and_next & 0xFF;
  114. int uqw_up = (prev_and_next >> 8) & 0xFF;
  115. // Interpolate the colors to create the diffs
  116. float weight_base = static_cast<float>(uqw);
  117. float weight_down = static_cast<float>(uqw_down - uqw);
  118. float weight_up = static_cast<float>(uqw_up - uqw);
  119. unsigned int partition = pi.partition_of_texel[texel];
  120. vfloat4 color_offset = offset[partition];
  121. vfloat4 color_base = endpnt0f[partition];
  122. vfloat4 color = color_base + color_offset * weight_base;
  123. vfloat4 orig_color = blk.texel(texel);
  124. vfloat4 error_weight = blk.channel_weight;
  125. vfloat4 color_diff = color - orig_color;
  126. vfloat4 color_diff_down = color_diff + color_offset * weight_down;
  127. vfloat4 color_diff_up = color_diff + color_offset * weight_up;
  128. float error_base = dot_s(color_diff * color_diff, error_weight);
  129. float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
  130. float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
  131. // Check if the prev or next error is better, and if so use it
  132. if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
  133. {
  134. dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
  135. adjustments = true;
  136. }
  137. else if ((error_down < error_base) && (uqw > 0))
  138. {
  139. dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
  140. adjustments = true;
  141. }
  142. }
  143. // Prepare iteration for plane 2
  144. dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
  145. plane_mask = ~plane_mask;
  146. }
  147. return adjustments;
  148. }
/**
 * @brief Attempt to improve weights given a chosen configuration.
 *
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
 * down by one quantization step.
 *
 * @param      decode_mode   The decode mode (LDR, HDR).
 * @param      bsd           The block size information.
 * @param      blk           The image block color data to compress.
 * @param[out] scb           The symbolic compressed block output.
 *
 * @return @c true if any weight was adjusted, @c false otherwise.
 */
static bool realign_weights_decimated(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	const image_block& blk,
	symbolic_compressed_block& scb
) {
	// Get the partition descriptor
	unsigned int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	// Get the quantization table
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	unsigned int weight_quant_level = bm.quant_mode;
	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];

	// Get the decimation table
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
	unsigned int weight_count = di.weight_count;
	// Undecimated grids must use the specialized realign_weights_undecimated() path
	assert(weight_count != bsd.texel_count);

	unsigned int max_plane = bm.is_dual_plane;
	int plane2_component = scb.plane2_component;
	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);

	// Decode the color endpoints
	bool rgb_hdr;
	bool alpha_hdr;
	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
	vfloat4 offset[BLOCK_MAX_PARTITIONS];

	promise(partition_count > 0);
	promise(weight_count > 0);

	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
	{
		unpack_color_endpoints(decode_mode,
		                       scb.color_formats[pa_idx],
		                       scb.color_values[pa_idx],
		                       rgb_hdr, alpha_hdr,
		                       endpnt0[pa_idx],
		                       endpnt1[pa_idx]);
	}

	uint8_t* dec_weights_uquant = scb.weights;
	bool adjustments = false;

	// For each plane and partition ...
	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
	{
		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
		{
			// Compute the endpoint delta for all components in current plane
			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
			epd = select(epd, vint4::zero(), plane_mask);

			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
		}

		// Create an unquantized weight grid for this decimation level, cached as floats so the
		// per-texel bilinear reconstruction below avoids repeated int->float conversions
		alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS];
		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
		{
			vint unquant_value(dec_weights_uquant + we_idx);
			vfloat unquant_valuef = int_to_float(unquant_value);
			storea(unquant_valuef, uq_weightsf + we_idx);
		}

		// For each weight compute previous, current, and next errors
		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
		{
			int uqw = dec_weights_uquant[we_idx];
			// Packed table: low byte = previous quant step, next byte = next quant step
			uint32_t prev_and_next = qat.prev_next_values[uqw];

			float uqw_base = uq_weightsf[we_idx];
			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);

			float uqw_diff_down = uqw_down - uqw_base;
			float uqw_diff_up = uqw_up - uqw_base;

			vfloat4 error_basev = vfloat4::zero();
			vfloat4 error_downv = vfloat4::zero();
			vfloat4 error_upv = vfloat4::zero();

			// Interpolate the colors to create the diffs; only the texels this weight
			// contributes to need to be re-evaluated
			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
			promise(texels_to_evaluate > 0);
			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
			{
				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];

				// Contribution of this weight to this texel's interpolated weight
				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];

				// Reconstruct the texel's interpolated weight from its four grid contributors
				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
				                   + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
				                  + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
				                   + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);

				// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
				// float weight = astc::flt_rd(weight_base + 0.5f);
				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
				// NOTE(review): algebraically these reduce to uqw_diff_* * tw_base; the
				// add/subtract of weight_base mirrors the rounded variant above and changes
				// float rounding — confirm before simplifying.
				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;

				unsigned int partition = pi.partition_of_texel[texel];
				vfloat4 color_offset = offset[partition];
				vfloat4 color_base = endpnt0f[partition];

				vfloat4 color = color_base + color_offset * weight_base;

				vfloat4 orig_color = blk.texel(texel);
				vfloat4 color_diff = color - orig_color;
				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
				vfloat4 color_up_diff = color_diff + color_offset * weight_up;

				error_basev += color_diff * color_diff;
				error_downv += color_down_diff * color_down_diff;
				error_upv += color_up_diff * color_up_diff;
			}

			vfloat4 error_weight = blk.channel_weight;
			float error_base = hadd_s(error_basev * error_weight);
			float error_down = hadd_s(error_downv * error_weight);
			float error_up = hadd_s(error_upv * error_weight);

			// Check if the prev or next error is better, and if so use it
			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
			{
				uq_weightsf[we_idx] = uqw_up;
				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
				adjustments = true;
			}
			else if ((error_down < error_base) && (uqw > 0))
			{
				uq_weightsf[we_idx] = uqw_down;
				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
				adjustments = true;
			}
		}

		// Prepare iteration for plane 2
		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
		plane_mask = ~plane_mask;
	}

	return adjustments;
}
/**
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
 *
 * @param      config                    The compressor configuration.
 * @param      bsd                       The block size information.
 * @param      blk                       The image block color data to compress.
 * @param      only_always               True if we only use "always" percentile block modes.
 * @param      tune_errorval_threshold   The error value threshold.
 * @param      partition_count           The partition count.
 * @param      partition_index           The partition index if @c partition_count is 2-4.
 * @param[out] scb                       The symbolic compressed block output.
 * @param[out] tmpbuf                    The quantized weights for plane 1.
 * @param      quant_limit               The maximum weight quantization level to consider.
 *
 * @return The lowest error value seen across all candidates tried for this partitioning.
 */
static float compress_symbolic_block_for_partition_1plane(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	bool only_always,
	float tune_errorval_threshold,
	unsigned int partition_count,
	unsigned int partition_index,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(partition_count > 0);
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);

	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	// Use the faster specialized diff function for the common 1 partition non-RGBM case
	auto compute_difference = &compute_symbolic_block_difference_1plane;
	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
	{
		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
	}

	const auto& pi = bsd.get_partition_info(partition_count, partition_index);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei = tmpbuf.ei1;
	compute_ideal_colors_and_weights_1plane(blk, pi, ei);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization
	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
	                                                : bsd.decimation_mode_count_selected;
	promise(max_decimation_modes > 0);
	for (unsigned int i = 0; i < max_decimation_modes; i++)
	{
		const auto& dm = bsd.get_decimation_mode(i);
		// Skip decimation modes not usable by any 1 plane mode at this quant limit
		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		compute_ideal_weights_for_decimation(
		    ei,
		    di,
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep(10.0f);
	for (unsigned int i = 0; i < partition_count; i++)
	{
		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
		min_ep = select(min_ep, ep, use_ep);
	}

	float min_wt_cutoff = hmin_s(min_ep);

	// For each mode, use the angular method to compute a shift
	compute_angular_endpoints_1plane(
	    only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	float* weight_low_value = tmpbuf.weight_low_value1;
	float* weight_high_value = tmpbuf.weight_high_value1;
	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	// For each mode (which specifies a decimation and a quantization):
	//     * Compute number of bits needed for the quantized weights
	//     * Generate an optimized set of quantized weights
	//     * Compute quantization errors for the mode
	// Bits left for color data per partition count, after block mode and partition index fields
	static const int8_t free_bits_for_partition_count[4] {
		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
	};

	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
	                                           : bsd.block_mode_count_1plane_selected;
	promise(max_block_modes > 0);
	for (unsigned int i = 0; i < max_block_modes; i++)
	{
		const block_mode& bm = bsd.block_modes[i];

		// Large error sentinel excludes this mode from later candidate selection
		if (bm.quant_mode > max_weight_quant)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		assert(!bm.is_dual_plane);
		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
		if (bitcount <= 0)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
		{
			weight_high_value[i] = 1.0f;
		}

		int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);

		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the weight mode
		compute_quantized_weights_for_decimation(
		    di,
		    weight_low_value[i], weight_high_value[i],
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
		    dec_weights_uquantf,
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
		    bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_1plane(
		    ei,
		    di,
		    dec_weights_uquantf);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	unsigned int candidate_count = compute_ideal_endpoint_formats(
	    pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
	    config.tune_candidate_limit, 0, max_block_modes,
	    partition_format_specifiers, block_mode_index,
	    color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		// Recompute the ideal color endpoints before storing them
		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];

		symbolic_compressed_block workscb;
		endpoints workep = ei.ep;

		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;

		for (unsigned int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight_src[j];
		}

		// Iterative refinement: recompute endpoints, requantize, realign weights, re-test
		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_1plane(
			    blk, pi, di, workscb.weights,
			    workep, rgbs_colors, rgbo_colors);

			// Quantize the chosen color, tracking if worth trying the mod value
			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
			for (unsigned int j = 0; j < partition_count; j++)
			{
				workscb.color_formats[j] = pack_color_endpoints(
				    workep.endpt0[j],
				    workep.endpt1[j],
				    rgbs_colors[j],
				    rgbo_colors[j],
				    partition_format_specifiers[i][j],
				    workscb.color_values[j],
				    color_quant_level[i]);

				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
			}

			// If all the color endpoint modes are the same, we get a few more bits to store colors;
			// let's see if we can take advantage of this: requantize all the colors and see if the
			// endpoint modes remain the same.
			workscb.color_formats_matched = 0;
			if (partition_count >= 2 && all_same)
			{
				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
				bool all_same_mod = true;
				for (unsigned int j = 0; j < partition_count; j++)
				{
					color_formats_mod[j] = pack_color_endpoints(
					    workep.endpt0[j],
					    workep.endpt1[j],
					    rgbs_colors[j],
					    rgbo_colors[j],
					    partition_format_specifiers[i][j],
					    colorvals[j],
					    color_quant_level_mod[i]);

					// Early out as soon as it's no longer possible to use mod
					if (color_formats_mod[j] != color_formats_mod[0])
					{
						all_same_mod = false;
						break;
					}
				}

				if (all_same_mod)
				{
					workscb.color_formats_matched = 1;
					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
					{
						for (unsigned int k = 0; k < 8; k++)
						{
							workscb.color_values[j][k] = colorvals[j][k];
						}

						workscb.color_formats[j] = color_formats_mod[j];
					}
				}
			}

			// Store header fields
			workscb.partition_count = static_cast<uint8_t>(partition_count);
			workscb.partition_index = static_cast<uint16_t>(partition_index);
			workscb.plane2_component = -1;
			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
			workscb.block_mode = qw_bm.mode_index;
			workscb.block_type = SYM_BTYPE_NONCONST;

			// Pre-realign test
			if (l == 0)
			{
				float errorval = compute_difference(config, bsd, workscb, blk);
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						i = candidate_count;
						break;
					}
				}
			}

			// Realign the weights, using the specialized path for undecimated grids
			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
				    config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
				    config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_difference(config, bsd, workscb, blk);
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
			// give benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					i = candidate_count;
					break;
				}
			}

			// Weight realignment reached a fixed point; further iterations won't change anything
			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}
/**
 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
 *
 * @param      config                   The compressor configuration.
 * @param      bsd                      The block size information.
 * @param      blk                      The image block color data to compress.
 * @param      tune_errorval_threshold  The error value threshold.
 * @param      plane2_component         The component index for the second plane of weights.
 * @param[out] scb                      The symbolic compressed block output.
 * @param      tmpbuf                   Preallocated scratch working buffers for the trial.
 * @param      quant_limit              The highest weight quantization level to consider.
 *
 * @return The best error value seen across all trialed candidate encodings.
 */
static float compress_symbolic_block_for_partition_2planes(
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	float tune_errorval_threshold,
	unsigned int plane2_component,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);
	promise(bsd.decimation_mode_count_selected > 0);

	// Clamp the weight quantization to QUANT_32 or the caller-provided limit, whichever is lower
	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei1 = tmpbuf.ei1;
	endpoints_and_weights& ei2 = tmpbuf.ei2;
	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization.
	// Plane 2 weights for decimation i are stored WEIGHTS_PLANE2_OFFSET after plane 1.
	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
	{
		const auto& dm = bsd.get_decimation_mode(i);
		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		compute_ideal_weights_for_decimation(
		    ei1,
		    di,
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);

		compute_ideal_weights_for_decimation(
		    ei2,
		    di,
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep1(10.0f);
	vfloat4 min_ep2(10.0f);

	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
	min_ep1 = select(min_ep1, ep1, use_ep1);

	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
	min_ep2 = select(min_ep2, ep2, use_ep2);

	// Mask for the single component carried by plane 2
	vfloat4 err_max(ERROR_CALC_DEFAULT);
	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);

	// Set the plane2 component to max error in ep1 so it cannot win the plane 1 min
	min_ep1 = select(min_ep1, err_max, err_mask);
	float min_wt_cutoff1 = hmin_s(min_ep1);

	// Set the minwt2 to the plane2 component min in ep2; all other lanes get max error
	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));

	compute_angular_endpoints_2planes(
	    bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	// For each mode (which specifies a decimation and a quantization):
	//     * Compute number of bits needed for the quantized weights
	//     * Generate an optimized set of quantized weights
	//     * Compute quantization errors for the mode

	float* weight_low_value1 = tmpbuf.weight_low_value1;
	float* weight_high_value1 = tmpbuf.weight_high_value1;
	float* weight_low_value2 = tmpbuf.weight_low_value2;
	float* weight_high_value2 = tmpbuf.weight_high_value2;

	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	// Dual-plane block modes are stored after the single-plane modes in the packed array
	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;

	for (unsigned int i = start_2plane; i < end_2plane; i++)
	{
		const block_mode& bm = bsd.block_modes[i];
		assert(bm.is_dual_plane);

		if (bm.quant_mode > max_weight_quant)
		{
			// Mark mode as unusable for the later format search
			qwt_errors[i] = 1e38f;
			continue;
		}

		// Bits left for color endpoints after the weights are paid for
		// (109 appears to be the dual-plane non-weight bit budget — confirm against ASTC spec)
		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);

		// Allow a small (2%) overshoot before clamping the high weight back to 1.0
		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
		{
			weight_high_value1[i] = 1.0f;
		}

		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
		{
			weight_high_value2[i] = 1.0f;
		}

		unsigned int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the mode
		compute_quantized_weights_for_decimation(
		    di,
		    weight_low_value1[i],
		    weight_high_value1[i],
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
		    dec_weights_uquantf,
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
		    bm.get_weight_quant_mode());

		compute_quantized_weights_for_decimation(
		    di,
		    weight_low_value2[i],
		    weight_high_value2[i],
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
		    bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_2planes(
		    ei1,
		    ei2,
		    di,
		    dec_weights_uquantf,
		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	endpoints epm;
	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);

	// Dual-plane blocks are always single partition (partition count 1, index 0)
	const auto& pi = bsd.get_partition_info(1, 0);
	unsigned int candidate_count = compute_ideal_endpoint_formats(
	    pi, blk, epm, qwt_bitcounts, qwt_errors,
	    config.tune_candidate_limit,
	    bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
	    partition_format_specifiers, block_mode_index,
	    color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
		       bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		vfloat4 rgbs_color;
		vfloat4 rgbo_color;

		symbolic_compressed_block workscb;
		endpoints workep = epm;

		// Copy the quantized weights for this candidate into the trial block
		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;

		for (int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight1_src[j];
			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
		}

		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_2planes(
			    blk, bsd, di,
			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
			    workep, rgbs_color, rgbo_color, plane2_component);

			// Quantize the chosen color
			workscb.color_formats[0] = pack_color_endpoints(
			    workep.endpt0[0],
			    workep.endpt1[0],
			    rgbs_color, rgbo_color,
			    partition_format_specifiers[i][0],
			    workscb.color_values[0],
			    color_quant_level[i]);

			// Store header fields
			workscb.partition_count = 1;
			workscb.partition_index = 0;
			workscb.quant_mode = color_quant_level[i];
			workscb.color_formats_matched = 0;
			workscb.block_mode = qw_bm.mode_index;
			workscb.plane2_component = static_cast<int8_t>(plane2_component);
			workscb.block_type = SYM_BTYPE_NONCONST;

			// Pre-realign test
			if (l == 0)
			{
				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
				// A negated ERROR_CALC_DEFAULT signals an unencodable trial block
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						i = candidate_count;
						break;
					}
				}
			}

			// Perform a final pass over the weights to try to improve them.
			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
				    config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
				    config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
			// A negated ERROR_CALC_DEFAULT signals an unencodable trial block
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
			// give benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					i = candidate_count;
					break;
				}
			}

			// Stop refining once realignment makes no further changes
			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}
  867. /**
  868. * @brief Determine the lowest cross-channel correlation factor.
  869. *
  870. * @param texels_per_block The number of texels in a block.
  871. * @param blk The image block color data to compress.
  872. *
  873. * @return Return the lowest correlation factor.
  874. */
  875. static float prepare_block_statistics(
  876. int texels_per_block,
  877. const image_block& blk
  878. ) {
  879. // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
  880. // of the matrix. The matrix is symmetric, so this is all we need for this use case.
  881. float rs = 0.0f;
  882. float gs = 0.0f;
  883. float bs = 0.0f;
  884. float as = 0.0f;
  885. float rr_var = 0.0f;
  886. float gg_var = 0.0f;
  887. float bb_var = 0.0f;
  888. float aa_var = 0.0f;
  889. float rg_cov = 0.0f;
  890. float rb_cov = 0.0f;
  891. float ra_cov = 0.0f;
  892. float gb_cov = 0.0f;
  893. float ga_cov = 0.0f;
  894. float ba_cov = 0.0f;
  895. float weight_sum = 0.0f;
  896. promise(texels_per_block > 0);
  897. for (int i = 0; i < texels_per_block; i++)
  898. {
  899. float weight = hadd_s(blk.channel_weight) / 4.0f;
  900. assert(weight >= 0.0f);
  901. weight_sum += weight;
  902. float r = blk.data_r[i];
  903. float g = blk.data_g[i];
  904. float b = blk.data_b[i];
  905. float a = blk.data_a[i];
  906. float rw = r * weight;
  907. rs += rw;
  908. rr_var += r * rw;
  909. rg_cov += g * rw;
  910. rb_cov += b * rw;
  911. ra_cov += a * rw;
  912. float gw = g * weight;
  913. gs += gw;
  914. gg_var += g * gw;
  915. gb_cov += b * gw;
  916. ga_cov += a * gw;
  917. float bw = b * weight;
  918. bs += bw;
  919. bb_var += b * bw;
  920. ba_cov += a * bw;
  921. float aw = a * weight;
  922. as += aw;
  923. aa_var += a * aw;
  924. }
  925. float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
  926. rr_var -= rs * (rs * rpt);
  927. rg_cov -= gs * (rs * rpt);
  928. rb_cov -= bs * (rs * rpt);
  929. ra_cov -= as * (rs * rpt);
  930. gg_var -= gs * (gs * rpt);
  931. gb_cov -= bs * (gs * rpt);
  932. ga_cov -= as * (gs * rpt);
  933. bb_var -= bs * (bs * rpt);
  934. ba_cov -= as * (bs * rpt);
  935. aa_var -= as * (as * rpt);
  936. // These will give a NaN if a channel is constant - these are fixed up in the next step
  937. rg_cov *= astc::rsqrt(rr_var * gg_var);
  938. rb_cov *= astc::rsqrt(rr_var * bb_var);
  939. ra_cov *= astc::rsqrt(rr_var * aa_var);
  940. gb_cov *= astc::rsqrt(gg_var * bb_var);
  941. ga_cov *= astc::rsqrt(gg_var * aa_var);
  942. ba_cov *= astc::rsqrt(bb_var * aa_var);
  943. if (astc::isnan(rg_cov)) rg_cov = 1.0f;
  944. if (astc::isnan(rb_cov)) rb_cov = 1.0f;
  945. if (astc::isnan(ra_cov)) ra_cov = 1.0f;
  946. if (astc::isnan(gb_cov)) gb_cov = 1.0f;
  947. if (astc::isnan(ga_cov)) ga_cov = 1.0f;
  948. if (astc::isnan(ba_cov)) ba_cov = 1.0f;
  949. float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
  950. lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
  951. lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
  952. lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
  953. lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));
  954. // Diagnostic trace points
  955. trace_add_data("min_r", blk.data_min.lane<0>());
  956. trace_add_data("max_r", blk.data_max.lane<0>());
  957. trace_add_data("min_g", blk.data_min.lane<1>());
  958. trace_add_data("max_g", blk.data_max.lane<1>());
  959. trace_add_data("min_b", blk.data_min.lane<2>());
  960. trace_add_data("max_b", blk.data_max.lane<2>());
  961. trace_add_data("min_a", blk.data_min.lane<3>());
  962. trace_add_data("max_a", blk.data_max.lane<3>());
  963. trace_add_data("cov_rg", fabsf(rg_cov));
  964. trace_add_data("cov_rb", fabsf(rb_cov));
  965. trace_add_data("cov_ra", fabsf(ra_cov));
  966. trace_add_data("cov_gb", fabsf(gb_cov));
  967. trace_add_data("cov_ga", fabsf(ga_cov));
  968. trace_add_data("cov_ba", fabsf(ba_cov));
  969. return lowest_correlation;
  970. }
  971. /* See header for documentation. */
  972. void compress_block(
  973. const astcenc_contexti& ctx,
  974. const image_block& blk,
  975. physical_compressed_block& pcb,
  976. compression_working_buffers& tmpbuf)
  977. {
  978. astcenc_profile decode_mode = ctx.config.profile;
  979. symbolic_compressed_block scb;
  980. const block_size_descriptor& bsd = *ctx.bsd;
  981. float lowest_correl;
  982. TRACE_NODE(node0, "block");
  983. trace_add_data("pos_x", blk.xpos);
  984. trace_add_data("pos_y", blk.ypos);
  985. trace_add_data("pos_z", blk.zpos);
  986. // Set stricter block targets for luminance data as we have more bits to play with
  987. bool block_is_l = blk.is_luminance();
  988. float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
  989. // Set slightly stricter block targets for lumalpha data as we have more bits to play with
  990. bool block_is_la = blk.is_luminancealpha();
  991. float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
  992. bool block_skip_two_plane = false;
  993. int max_partitions = ctx.config.tune_partition_count_limit;
  994. unsigned int requested_partition_indices[3] {
  995. ctx.config.tune_2partition_index_limit,
  996. ctx.config.tune_3partition_index_limit,
  997. ctx.config.tune_4partition_index_limit
  998. };
  999. unsigned int requested_partition_trials[3] {
  1000. ctx.config.tune_2partitioning_candidate_limit,
  1001. ctx.config.tune_3partitioning_candidate_limit,
  1002. ctx.config.tune_4partitioning_candidate_limit
  1003. };
  1004. #if defined(ASTCENC_DIAGNOSTICS)
  1005. // Do this early in diagnostic builds so we can dump uniform metrics
  1006. // for every block. Do it later in release builds to avoid redundant work!
  1007. float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
  1008. float error_threshold = ctx.config.tune_db_limit
  1009. * error_weight_sum
  1010. * block_is_l_scale
  1011. * block_is_la_scale;
  1012. lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
  1013. trace_add_data("lowest_correl", lowest_correl);
  1014. trace_add_data("tune_error_threshold", error_threshold);
  1015. #endif
  1016. // Detected a constant-color block
  1017. if (all(blk.data_min == blk.data_max))
  1018. {
  1019. TRACE_NODE(node1, "pass");
  1020. trace_add_data("partition_count", 0);
  1021. trace_add_data("plane_count", 1);
  1022. scb.partition_count = 0;
  1023. // Encode as FP16 if using HDR
  1024. if ((decode_mode == ASTCENC_PRF_HDR) ||
  1025. (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
  1026. {
  1027. scb.block_type = SYM_BTYPE_CONST_F16;
  1028. vint4 color_f16 = float_to_float16(blk.origin_texel);
  1029. store(color_f16, scb.constant_color);
  1030. }
  1031. // Encode as UNORM16 if NOT using HDR
  1032. else
  1033. {
  1034. scb.block_type = SYM_BTYPE_CONST_U16;
  1035. vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
  1036. vint4 color_u16 = float_to_int_rtn(color_f32);
  1037. store(color_u16, scb.constant_color);
  1038. }
  1039. trace_add_data("exit", "quality hit");
  1040. symbolic_to_physical(bsd, scb, pcb);
  1041. return;
  1042. }
  1043. #if !defined(ASTCENC_DIAGNOSTICS)
  1044. float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
  1045. float error_threshold = ctx.config.tune_db_limit
  1046. * error_weight_sum
  1047. * block_is_l_scale
  1048. * block_is_la_scale;
  1049. #endif
  1050. // Set SCB and mode errors to a very high error value
  1051. scb.errorval = ERROR_CALC_DEFAULT;
  1052. scb.block_type = SYM_BTYPE_ERROR;
  1053. float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
  1054. ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
  1055. };
  1056. float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
  1057. 0.0f,
  1058. ctx.config.tune_2partition_early_out_limit_factor,
  1059. ctx.config.tune_3partition_early_out_limit_factor,
  1060. 0.0f
  1061. };
  1062. // Trial using 1 plane of weights and 1 partition.
  1063. // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
  1064. // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
  1065. // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
  1066. // compression and slightly reduces image quality.
  1067. float errorval_mult[2] {
  1068. 1.0f / ctx.config.tune_mse_overshoot,
  1069. 1.0f
  1070. };
  1071. static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
  1072. // Only enable MODE0 fast path (trial 0) if 2D, and more than 25 texels
  1073. int start_trial = 1;
  1074. if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
  1075. {
  1076. start_trial = 0;
  1077. }
  1078. int quant_limit = QUANT_32;
  1079. for (int i = start_trial; i < 2; i++)
  1080. {
  1081. TRACE_NODE(node1, "pass");
  1082. trace_add_data("partition_count", 1);
  1083. trace_add_data("plane_count", 1);
  1084. trace_add_data("search_mode", i);
  1085. float errorval = compress_symbolic_block_for_partition_1plane(
  1086. ctx.config, bsd, blk, i == 0,
  1087. error_threshold * errorval_mult[i] * errorval_overshoot,
  1088. 1, 0, scb, tmpbuf, QUANT_32);
  1089. // Record the quant level so we can use the filter later searches
  1090. const auto& bm = bsd.get_block_mode(scb.block_mode);
  1091. quant_limit = bm.get_weight_quant_mode();
  1092. best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
  1093. if (errorval < (error_threshold * errorval_mult[i]))
  1094. {
  1095. trace_add_data("exit", "quality hit");
  1096. goto END_OF_TESTS;
  1097. }
  1098. }
  1099. #if !defined(ASTCENC_DIAGNOSTICS)
  1100. lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
  1101. #endif
  1102. block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
  1103. // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
  1104. // alpha is the most likely to be non-correlated if it is present in the data.
  1105. for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
  1106. {
  1107. TRACE_NODE(node1, "pass");
  1108. trace_add_data("partition_count", 1);
  1109. trace_add_data("plane_count", 2);
  1110. trace_add_data("plane_component", i);
  1111. if (block_skip_two_plane)
  1112. {
  1113. trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
  1114. continue;
  1115. }
  1116. if (blk.grayscale && i != 3)
  1117. {
  1118. trace_add_data("skip", "grayscale block");
  1119. continue;
  1120. }
  1121. if (blk.is_constant_channel(i))
  1122. {
  1123. trace_add_data("skip", "constant component");
  1124. continue;
  1125. }
  1126. float errorval = compress_symbolic_block_for_partition_2planes(
  1127. ctx.config, bsd, blk, error_threshold * errorval_overshoot,
  1128. i, scb, tmpbuf, quant_limit);
  1129. // If attempting two planes is much worse than the best one plane result
  1130. // then further two plane searches are unlikely to help so move on ...
  1131. if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
  1132. {
  1133. break;
  1134. }
  1135. if (errorval < error_threshold)
  1136. {
  1137. trace_add_data("exit", "quality hit");
  1138. goto END_OF_TESTS;
  1139. }
  1140. }
  1141. // Find best blocks for 2, 3 and 4 partitions
  1142. for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
  1143. {
  1144. unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
  1145. unsigned int requested_indices = requested_partition_indices[partition_count - 2];
  1146. unsigned int requested_trials = requested_partition_trials[partition_count - 2];
  1147. requested_trials = astc::min(requested_trials, requested_indices);
  1148. unsigned int actual_trials = find_best_partition_candidates(
  1149. bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
  1150. float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
  1151. for (unsigned int i = 0; i < actual_trials; i++)
  1152. {
  1153. TRACE_NODE(node1, "pass");
  1154. trace_add_data("partition_count", partition_count);
  1155. trace_add_data("partition_index", partition_indices[i]);
  1156. trace_add_data("plane_count", 1);
  1157. trace_add_data("search_mode", i);
  1158. float errorval = compress_symbolic_block_for_partition_1plane(
  1159. ctx.config, bsd, blk, false,
  1160. error_threshold * errorval_overshoot,
  1161. partition_count, partition_indices[i],
  1162. scb, tmpbuf, quant_limit);
  1163. best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
  1164. // If using N partitions doesn't improve much over using N-1 partitions then skip trying
  1165. // N+1. Error can dramatically improve if the data is correlated or non-correlated and
  1166. // aligns with a partitioning that suits that encoding, so for this inner loop check add
  1167. // a large error scale because the "other" trial could be a lot better.
  1168. float best_error = best_errorvals_for_pcount[partition_count - 1];
  1169. float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
  1170. if (best_error > (best_error_in_prev * best_error_scale))
  1171. {
  1172. trace_add_data("skip", "tune_partition_early_out_limit_factor");
  1173. goto END_OF_TESTS;
  1174. }
  1175. if (errorval < error_threshold)
  1176. {
  1177. trace_add_data("exit", "quality hit");
  1178. goto END_OF_TESTS;
  1179. }
  1180. }
  1181. // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
  1182. float best_error = best_errorvals_for_pcount[partition_count - 1];
  1183. float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
  1184. if (best_error > (best_error_in_prev * best_error_scale))
  1185. {
  1186. trace_add_data("skip", "tune_partition_early_out_limit_factor");
  1187. goto END_OF_TESTS;
  1188. }
  1189. }
  1190. trace_add_data("exit", "quality not hit");
  1191. END_OF_TESTS:
  1192. // If we still have an error block then convert to something we can encode
  1193. // TODO: Do something more sensible here, such as average color block
  1194. if (scb.block_type == SYM_BTYPE_ERROR)
  1195. {
  1196. #if defined(ASTCENC_DIAGNOSTICS)
  1197. static bool printed_once = false;
  1198. if (!printed_once)
  1199. {
  1200. printed_once = true;
  1201. printf("WARN: At least one block failed to find a valid encoding.\n"
  1202. " Try increasing compression quality settings.\n\n");
  1203. }
  1204. #endif
  1205. scb.block_type = SYM_BTYPE_CONST_U16;
  1206. vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
  1207. vint4 color_u16 = float_to_int_rtn(color_f32);
  1208. store(color_u16, scb.constant_color);
  1209. }
  1210. // Compress to a physical block
  1211. symbolic_to_physical(bsd, scb, pcb);
  1212. }
  1213. #endif