123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632 |
- // SPDX-License-Identifier: Apache-2.0
- // ----------------------------------------------------------------------------
- // Copyright 2011-2024 Arm Limited
- //
- // Licensed under the Apache License, Version 2.0 (the "License"); you may not
- // use this file except in compliance with the License. You may obtain a copy
- // of the License at:
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- // License for the specific language governing permissions and limitations
- // under the License.
- // ----------------------------------------------------------------------------
- /**
- * @brief Functions to decompress a symbolic block.
- */
#include "astcenc_internal.h"

#include <stdio.h>
#include <assert.h>
#include <string.h>
- /**
- * @brief Compute the integer linear interpolation of two color endpoints.
- *
- * @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
- * @param color0 The endpoint0 color.
- * @param color1 The endpoint1 color.
- * @param weights The interpolation weight (between 0 and 64).
- *
- * @return The interpolated color.
- */
- static vint4 lerp_color_int(
- vmask4 u8_mask,
- vint4 color0,
- vint4 color1,
- vint4 weights
- ) {
- vint4 weight1 = weights;
- vint4 weight0 = vint4(64) - weight1;
- vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
- color = asr<6>(color);
- // For decode_unorm8 values force the codec to bit replicate. This allows the
- // rest of the codec to assume the full 0xFFFF range for everything and ignore
- // the decode_mode setting
- vint4 color_u8 = asr<8>(color) * vint4(257);
- color = select(color, color_u8, u8_mask);
- return color;
- }
- /**
- * @brief Convert integer color value into a float value for the decoder.
- *
- * @param data The integer color value post-interpolation.
- * @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16).
- *
- * @return The float color value.
- */
- static inline vfloat4 decode_texel(
- vint4 data,
- vmask4 lns_mask
- ) {
- vint4 color_lns = vint4::zero();
- vint4 color_unorm = vint4::zero();
- if (any(lns_mask))
- {
- color_lns = lns_to_sf16(data);
- }
- if (!all(lns_mask))
- {
- color_unorm = unorm16_to_sf16(data);
- }
- // Pick components and then convert to FP16
- vint4 datai = select(color_unorm, color_lns, lns_mask);
- return float16_to_float(datai);
- }
- /* See header for documentation. */
- void unpack_weights(
- const block_size_descriptor& bsd,
- const symbolic_compressed_block& scb,
- const decimation_info& di,
- bool is_dual_plane,
- int weights_plane1[BLOCK_MAX_TEXELS],
- int weights_plane2[BLOCK_MAX_TEXELS]
- ) {
- // Safe to overshoot as all arrays are allocated to full size
- if (!is_dual_plane)
- {
- // Build full 64-entry weight lookup table
- vint4 tab0 = vint4::load(scb.weights + 0);
- vint4 tab1 = vint4::load(scb.weights + 16);
- vint4 tab2 = vint4::load(scb.weights + 32);
- vint4 tab3 = vint4::load(scb.weights + 48);
- vint tab0p, tab1p, tab2p, tab3p;
- vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
- for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
- {
- vint summed_value(8);
- vint weight_count(di.texel_weight_count + i);
- int max_weight_count = hmax(weight_count).lane<0>();
- promise(max_weight_count > 0);
- for (int j = 0; j < max_weight_count; j++)
- {
- vint texel_weights(di.texel_weights_tr[j] + i);
- vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
- summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
- }
- store(lsr<4>(summed_value), weights_plane1 + i);
- }
- }
- else
- {
- // Build a 32-entry weight lookup table per plane
- // Plane 1
- vint4 tab0_plane1 = vint4::load(scb.weights + 0);
- vint4 tab1_plane1 = vint4::load(scb.weights + 16);
- vint tab0_plane1p, tab1_plane1p;
- vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
- // Plane 2
- vint4 tab0_plane2 = vint4::load(scb.weights + 32);
- vint4 tab1_plane2 = vint4::load(scb.weights + 48);
- vint tab0_plane2p, tab1_plane2p;
- vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
- for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
- {
- vint sum_plane1(8);
- vint sum_plane2(8);
- vint weight_count(di.texel_weight_count + i);
- int max_weight_count = hmax(weight_count).lane<0>();
- promise(max_weight_count > 0);
- for (int j = 0; j < max_weight_count; j++)
- {
- vint texel_weights(di.texel_weights_tr[j] + i);
- vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
- sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
- sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
- }
- store(lsr<4>(sum_plane1), weights_plane1 + i);
- store(lsr<4>(sum_plane2), weights_plane2 + i);
- }
- }
- }
/**
 * @brief Return an FP32 NaN value for use in error colors.
 *
 * The bit pattern is 0xFFFFE000: sign set, exponent all ones, and the top ten
 * mantissa bits set. This NaN encoding will turn into 0xFFFF when converted to
 * an FP16 NaN.
 *
 * The bits are copied into the float with memcpy rather than written to one
 * union member and read back from another, because reading an inactive union
 * member is not defined behavior in ISO C++.
 *
 * @return The float color value.
 */
static float error_color_nan()
{
	static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits wide");

	const uint32_t bits = 0xFFFFE000U;
	float value;
	memcpy(&value, &bits, sizeof(value));
	return value;
}
- /* See header for documentation. */
- void decompress_symbolic_block(
- astcenc_profile decode_mode,
- const block_size_descriptor& bsd,
- int xpos,
- int ypos,
- int zpos,
- const symbolic_compressed_block& scb,
- image_block& blk
- ) {
- blk.xpos = xpos;
- blk.ypos = ypos;
- blk.zpos = zpos;
- blk.data_min = vfloat4::zero();
- blk.data_mean = vfloat4::zero();
- blk.data_max = vfloat4::zero();
- blk.grayscale = false;
- // If we detected an error-block, blow up immediately.
- if (scb.block_type == SYM_BTYPE_ERROR)
- {
- for (unsigned int i = 0; i < bsd.texel_count; i++)
- {
- blk.data_r[i] = error_color_nan();
- blk.data_g[i] = error_color_nan();
- blk.data_b[i] = error_color_nan();
- blk.data_a[i] = error_color_nan();
- blk.rgb_lns[i] = 0;
- blk.alpha_lns[i] = 0;
- }
- return;
- }
- if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
- (scb.block_type == SYM_BTYPE_CONST_U16))
- {
- vfloat4 color;
- uint8_t use_lns = 0;
- // UNORM16 constant color block
- if (scb.block_type == SYM_BTYPE_CONST_U16)
- {
- vint4 colori(scb.constant_color);
- // Determine the UNORM8 rounding on the decode
- vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
- // The real decoder would just use the top 8 bits, but we rescale
- // in to a 16-bit value that rounds correctly.
- vint4 colori_u8 = asr<8>(colori) * 257;
- colori = select(colori, colori_u8, u8_mask);
- vint4 colorf16 = unorm16_to_sf16(colori);
- color = float16_to_float(colorf16);
- }
- // FLOAT16 constant color block
- else
- {
- switch (decode_mode)
- {
- case ASTCENC_PRF_LDR_SRGB:
- case ASTCENC_PRF_LDR:
- color = vfloat4(error_color_nan());
- break;
- case ASTCENC_PRF_HDR_RGB_LDR_A:
- case ASTCENC_PRF_HDR:
- // Constant-color block; unpack from FP16 to FP32.
- color = float16_to_float(vint4(scb.constant_color));
- use_lns = 1;
- break;
- }
- }
- for (unsigned int i = 0; i < bsd.texel_count; i++)
- {
- blk.data_r[i] = color.lane<0>();
- blk.data_g[i] = color.lane<1>();
- blk.data_b[i] = color.lane<2>();
- blk.data_a[i] = color.lane<3>();
- blk.rgb_lns[i] = use_lns;
- blk.alpha_lns[i] = use_lns;
- }
- return;
- }
- // Get the appropriate partition-table entry
- int partition_count = scb.partition_count;
- const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
- // Get the appropriate block descriptors
- const auto& bm = bsd.get_block_mode(scb.block_mode);
- const auto& di = bsd.get_decimation_info(bm.decimation_mode);
- bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
- // Unquantize and undecimate the weights
- int plane1_weights[BLOCK_MAX_TEXELS];
- int plane2_weights[BLOCK_MAX_TEXELS];
- unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
- // Now that we have endpoint colors and weights, we can unpack texel colors
- int plane2_component = scb.plane2_component;
- vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
- vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
- for (int i = 0; i < partition_count; i++)
- {
- // Decode the color endpoints for this partition
- vint4 ep0;
- vint4 ep1;
- bool rgb_lns;
- bool a_lns;
- unpack_color_endpoints(decode_mode,
- scb.color_formats[i],
- scb.color_values[i],
- rgb_lns, a_lns,
- ep0, ep1);
- vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
- int texel_count = pi.partition_texel_count[i];
- for (int j = 0; j < texel_count; j++)
- {
- int tix = pi.texels_of_partition[i][j];
- vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
- vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
- vfloat4 colorf = decode_texel(color, lns_mask);
- blk.data_r[tix] = colorf.lane<0>();
- blk.data_g[tix] = colorf.lane<1>();
- blk.data_b[tix] = colorf.lane<2>();
- blk.data_a[tix] = colorf.lane<3>();
- }
- }
- }
- #if !defined(ASTCENC_DECOMPRESS_ONLY)
- /* See header for documentation. */
- float compute_symbolic_block_difference_2plane(
- const astcenc_config& config,
- const block_size_descriptor& bsd,
- const symbolic_compressed_block& scb,
- const image_block& blk
- ) {
- // If we detected an error-block, blow up immediately.
- if (scb.block_type == SYM_BTYPE_ERROR)
- {
- return ERROR_CALC_DEFAULT;
- }
- assert(scb.block_mode >= 0);
- assert(scb.partition_count == 1);
- assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
- // Get the appropriate block descriptor
- const block_mode& bm = bsd.get_block_mode(scb.block_mode);
- const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
- // Unquantize and undecimate the weights
- int plane1_weights[BLOCK_MAX_TEXELS];
- int plane2_weights[BLOCK_MAX_TEXELS];
- unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
- vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
- vfloat4 summa = vfloat4::zero();
- // Decode the color endpoints for this partition
- vint4 ep0;
- vint4 ep1;
- bool rgb_lns;
- bool a_lns;
- unpack_color_endpoints(config.profile,
- scb.color_formats[0],
- scb.color_values[0],
- rgb_lns, a_lns,
- ep0, ep1);
- vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
- // Unpack and compute error for each texel in the partition
- unsigned int texel_count = bsd.texel_count;
- for (unsigned int i = 0; i < texel_count; i++)
- {
- vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
- vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
- vfloat4 color = int_to_float(colori);
- vfloat4 oldColor = blk.texel(i);
- // Compare error using a perceptual decode metric for RGBM textures
- if (config.flags & ASTCENC_FLG_MAP_RGBM)
- {
- // Fail encodings that result in zero weight M pixels. Note that this can cause
- // "interesting" artifacts if we reject all useful encodings - we typically get max
- // brightness encodings instead which look just as bad. We recommend users apply a
- // bias to their stored M value, limiting the lower value to 16 or 32 to avoid
- // getting small M values post-quantization, but we can't prove it would never
- // happen, especially at low bit rates ...
- if (color.lane<3>() == 0.0f)
- {
- return -ERROR_CALC_DEFAULT;
- }
- // Compute error based on decoded RGBM color
- color = vfloat4(
- color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
- color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
- color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
- 1.0f
- );
- oldColor = vfloat4(
- oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
- oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
- oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
- 1.0f
- );
- }
- vfloat4 error = oldColor - color;
- error = min(abs(error), 1e15f);
- error = error * error;
- summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
- }
- return summa.lane<0>();
- }
- /* See header for documentation. */
- float compute_symbolic_block_difference_1plane(
- const astcenc_config& config,
- const block_size_descriptor& bsd,
- const symbolic_compressed_block& scb,
- const image_block& blk
- ) {
- assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
- // If we detected an error-block, blow up immediately.
- if (scb.block_type == SYM_BTYPE_ERROR)
- {
- return ERROR_CALC_DEFAULT;
- }
- assert(scb.block_mode >= 0);
- // Get the appropriate partition-table entry
- unsigned int partition_count = scb.partition_count;
- const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
- // Get the appropriate block descriptor
- const block_mode& bm = bsd.get_block_mode(scb.block_mode);
- const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
- // Unquantize and undecimate the weights
- int plane1_weights[BLOCK_MAX_TEXELS];
- unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
- vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
- vfloat4 summa = vfloat4::zero();
- for (unsigned int i = 0; i < partition_count; i++)
- {
- // Decode the color endpoints for this partition
- vint4 ep0;
- vint4 ep1;
- bool rgb_lns;
- bool a_lns;
- unpack_color_endpoints(config.profile,
- scb.color_formats[i],
- scb.color_values[i],
- rgb_lns, a_lns,
- ep0, ep1);
- // Unpack and compute error for each texel in the partition
- unsigned int texel_count = pi.partition_texel_count[i];
- for (unsigned int j = 0; j < texel_count; j++)
- {
- unsigned int tix = pi.texels_of_partition[i][j];
- vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
- vint4(plane1_weights[tix]));
- vfloat4 color = int_to_float(colori);
- vfloat4 oldColor = blk.texel(tix);
- // Compare error using a perceptual decode metric for RGBM textures
- if (config.flags & ASTCENC_FLG_MAP_RGBM)
- {
- // Fail encodings that result in zero weight M pixels. Note that this can cause
- // "interesting" artifacts if we reject all useful encodings - we typically get max
- // brightness encodings instead which look just as bad. We recommend users apply a
- // bias to their stored M value, limiting the lower value to 16 or 32 to avoid
- // getting small M values post-quantization, but we can't prove it would never
- // happen, especially at low bit rates ...
- if (color.lane<3>() == 0.0f)
- {
- return -ERROR_CALC_DEFAULT;
- }
- // Compute error based on decoded RGBM color
- color = vfloat4(
- color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
- color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
- color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
- 1.0f
- );
- oldColor = vfloat4(
- oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
- oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
- oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
- 1.0f
- );
- }
- vfloat4 error = oldColor - color;
- error = min(abs(error), 1e15f);
- error = error * error;
- summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
- }
- }
- return summa.lane<0>();
- }
- /* See header for documentation. */
- float compute_symbolic_block_difference_1plane_1partition(
- const astcenc_config& config,
- const block_size_descriptor& bsd,
- const symbolic_compressed_block& scb,
- const image_block& blk
- ) {
- // If we detected an error-block, blow up immediately.
- if (scb.block_type == SYM_BTYPE_ERROR)
- {
- return ERROR_CALC_DEFAULT;
- }
- assert(scb.block_mode >= 0);
- assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
- // Get the appropriate block descriptor
- const block_mode& bm = bsd.get_block_mode(scb.block_mode);
- const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
- // Unquantize and undecimate the weights
- ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
- unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
- // Decode the color endpoints for this partition
- vint4 ep0;
- vint4 ep1;
- bool rgb_lns;
- bool a_lns;
- unpack_color_endpoints(config.profile,
- scb.color_formats[0],
- scb.color_values[0],
- rgb_lns, a_lns,
- ep0, ep1);
- vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
- // Unpack and compute error for each texel in the partition
- vfloatacc summav = vfloatacc::zero();
- vint lane_id = vint::lane_id();
- unsigned int texel_count = bsd.texel_count;
- for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
- {
- // Compute EP1 contribution
- vint weight1 = vint::loada(plane1_weights + i);
- vint ep1_r = vint(ep1.lane<0>()) * weight1;
- vint ep1_g = vint(ep1.lane<1>()) * weight1;
- vint ep1_b = vint(ep1.lane<2>()) * weight1;
- vint ep1_a = vint(ep1.lane<3>()) * weight1;
- // Compute EP0 contribution
- vint weight0 = vint(64) - weight1;
- vint ep0_r = vint(ep0.lane<0>()) * weight0;
- vint ep0_g = vint(ep0.lane<1>()) * weight0;
- vint ep0_b = vint(ep0.lane<2>()) * weight0;
- vint ep0_a = vint(ep0.lane<3>()) * weight0;
- // Combine contributions
- vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
- vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
- vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
- vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
- // If using a U8 decode mode bit replicate top 8 bits
- // so rest of codec can assume 0xFFFF max range everywhere
- vint colori_r8 = asr<8>(colori_r) * vint(257);
- colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
- vint colori_g8 = asr<8>(colori_g) * vint(257);
- colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
- vint colori_b8 = asr<8>(colori_b) * vint(257);
- colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
- vint colori_a8 = asr<8>(colori_a) * vint(257);
- colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
- // Compute color diff
- vfloat color_r = int_to_float(colori_r);
- vfloat color_g = int_to_float(colori_g);
- vfloat color_b = int_to_float(colori_b);
- vfloat color_a = int_to_float(colori_a);
- vfloat color_orig_r = loada(blk.data_r + i);
- vfloat color_orig_g = loada(blk.data_g + i);
- vfloat color_orig_b = loada(blk.data_b + i);
- vfloat color_orig_a = loada(blk.data_a + i);
- vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
- vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
- vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
- vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
- // Compute squared error metric
- color_error_r = color_error_r * color_error_r;
- color_error_g = color_error_g * color_error_g;
- color_error_b = color_error_b * color_error_b;
- color_error_a = color_error_a * color_error_a;
- vfloat metric = color_error_r * blk.channel_weight.lane<0>()
- + color_error_g * blk.channel_weight.lane<1>()
- + color_error_b * blk.channel_weight.lane<2>()
- + color_error_a * blk.channel_weight.lane<3>();
- // Mask off bad lanes
- vmask mask = lane_id < vint(texel_count);
- lane_id += vint(ASTCENC_SIMD_WIDTH);
- haccumulate(summav, metric, mask);
- }
- return hadd_s(summav);
- }
- #endif
|