123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559 |
- // SPDX-License-Identifier: Apache-2.0
- // ----------------------------------------------------------------------------
- // Copyright 2011-2022 Arm Limited
- //
- // Licensed under the Apache License, Version 2.0 (the "License"); you may not
- // use this file except in compliance with the License. You may obtain a copy
- // of the License at:
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- // License for the specific language governing permissions and limitations
- // under the License.
- // ----------------------------------------------------------------------------
- /**
- * @brief Functions for creating in-memory ASTC image structures.
- */
- #include <cassert>
- #include <cstring>
- #include "astcenc_internal.h"
- /**
- * @brief Loader pipeline function type for data fetch from memory.
- */
- using pixel_loader = vfloat4(*)(const void*, int);
- /**
- * @brief Loader pipeline function type for swizzling data in a vector.
- */
- using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
- /**
- * @brief Loader pipeline function type for converting data in a vector to LNS.
- */
- using pixel_converter = vfloat4(*)(vfloat4, vmask4);
- /**
- * @brief Load a 8-bit UNORM texel from a data array.
- *
- * @param data The data pointer.
- * @param base_offset The index offset to the start of the pixel.
- */
- static vfloat4 load_texel_u8(
- const void* data,
- int base_offset
- ) {
- const uint8_t* data8 = static_cast<const uint8_t*>(data);
- return int_to_float(vint4(data8 + base_offset)) / 255.0f;
- }
- /**
- * @brief Load a 16-bit fp16 texel from a data array.
- *
- * @param data The data pointer.
- * @param base_offset The index offset to the start of the pixel.
- */
- static vfloat4 load_texel_f16(
- const void* data,
- int base_offset
- ) {
- const uint16_t* data16 = static_cast<const uint16_t*>(data);
- int r = data16[base_offset ];
- int g = data16[base_offset + 1];
- int b = data16[base_offset + 2];
- int a = data16[base_offset + 3];
- return float16_to_float(vint4(r, g, b, a));
- }
- /**
- * @brief Load a 32-bit float texel from a data array.
- *
- * @param data The data pointer.
- * @param base_offset The index offset to the start of the pixel.
- */
- static vfloat4 load_texel_f32(
- const void* data,
- int base_offset
- ) {
- const float* data32 = static_cast<const float*>(data);
- return vfloat4(data32 + base_offset);
- }
- /**
- * @brief Dummy no-op swizzle function.
- *
- * @param data The source RGBA vector to swizzle.
- * @param swz The swizzle to use.
- */
- static vfloat4 swz_texel_skip(
- vfloat4 data,
- const astcenc_swizzle& swz
- ) {
- (void)swz;
- return data;
- }
- /**
- * @brief Swizzle a texel into a new arrangement.
- *
- * @param data The source RGBA vector to swizzle.
- * @param swz The swizzle to use.
- */
- static vfloat4 swz_texel(
- vfloat4 data,
- const astcenc_swizzle& swz
- ) {
- alignas(16) float datas[6];
- storea(data, datas);
- datas[ASTCENC_SWZ_0] = 0.0f;
- datas[ASTCENC_SWZ_1] = 1.0f;
- return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
- }
- /**
- * @brief Encode a texel that is entirely LDR linear.
- *
- * @param data The RGBA data to encode.
- * @param lns_mask The mask for the HDR channels than need LNS encoding.
- */
- static vfloat4 encode_texel_unorm(
- vfloat4 data,
- vmask4 lns_mask
- ) {
- (void)lns_mask;
- return data * 65535.0f;
- }
- /**
- * @brief Encode a texel that includes at least some HDR LNS texels.
- *
- * @param data The RGBA data to encode.
- * @param lns_mask The mask for the HDR channels than need LNS encoding.
- */
- static vfloat4 encode_texel_lns(
- vfloat4 data,
- vmask4 lns_mask
- ) {
- vfloat4 datav_unorm = data * 65535.0f;
- vfloat4 datav_lns = float_to_lns(data);
- return select(datav_unorm, datav_lns, lns_mask);
- }
- /* See header for documentation. */
- void load_image_block(
- astcenc_profile decode_mode,
- const astcenc_image& img,
- image_block& blk,
- const block_size_descriptor& bsd,
- unsigned int xpos,
- unsigned int ypos,
- unsigned int zpos,
- const astcenc_swizzle& swz
- ) {
- unsigned int xsize = img.dim_x;
- unsigned int ysize = img.dim_y;
- unsigned int zsize = img.dim_z;
- blk.xpos = xpos;
- blk.ypos = ypos;
- blk.zpos = zpos;
- // True if any non-identity swizzle
- bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
- (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
- int idx = 0;
- vfloat4 data_min(1e38f);
- vfloat4 data_mean(0.0f);
- vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
- vfloat4 data_max(-1e38f);
- vmask4 grayscalev(true);
- // This works because we impose the same choice everywhere during encode
- uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
- (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
- uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
- vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
- vmask4 lns_mask = use_lns != vint4::zero();
- // Set up the function pointers for loading pipeline as needed
- pixel_loader loader = load_texel_u8;
- if (img.data_type == ASTCENC_TYPE_F16)
- {
- loader = load_texel_f16;
- }
- else if (img.data_type == ASTCENC_TYPE_F32)
- {
- loader = load_texel_f32;
- }
- pixel_swizzler swizzler = swz_texel_skip;
- if (needs_swz)
- {
- swizzler = swz_texel;
- }
- pixel_converter converter = encode_texel_unorm;
- if (any(lns_mask))
- {
- converter = encode_texel_lns;
- }
- for (unsigned int z = 0; z < bsd.zdim; z++)
- {
- unsigned int zi = astc::min(zpos + z, zsize - 1);
- void* plane = img.data[zi];
- for (unsigned int y = 0; y < bsd.ydim; y++)
- {
- unsigned int yi = astc::min(ypos + y, ysize - 1);
- for (unsigned int x = 0; x < bsd.xdim; x++)
- {
- unsigned int xi = astc::min(xpos + x, xsize - 1);
- vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
- datav = swizzler(datav, swz);
- datav = converter(datav, lns_mask);
- // Compute block metadata
- data_min = min(data_min, datav);
- data_mean += datav * data_mean_scale;
- data_max = max(data_max, datav);
- grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
- blk.data_r[idx] = datav.lane<0>();
- blk.data_g[idx] = datav.lane<1>();
- blk.data_b[idx] = datav.lane<2>();
- blk.data_a[idx] = datav.lane<3>();
- blk.rgb_lns[idx] = rgb_lns;
- blk.alpha_lns[idx] = a_lns;
- idx++;
- }
- }
- }
- // Reverse the encoding so we store origin block in the original format
- vfloat4 data_enc = blk.texel(0);
- vfloat4 data_enc_unorm = data_enc / 65535.0f;
- vfloat4 data_enc_lns = vfloat4::zero();
- if (rgb_lns || a_lns)
- {
- data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
- }
- blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
- // Store block metadata
- blk.data_min = data_min;
- blk.data_mean = data_mean;
- blk.data_max = data_max;
- blk.grayscale = all(grayscalev);
- }
- /* See header for documentation. */
- void load_image_block_fast_ldr(
- astcenc_profile decode_mode,
- const astcenc_image& img,
- image_block& blk,
- const block_size_descriptor& bsd,
- unsigned int xpos,
- unsigned int ypos,
- unsigned int zpos,
- const astcenc_swizzle& swz
- ) {
- (void)swz;
- (void)decode_mode;
- unsigned int xsize = img.dim_x;
- unsigned int ysize = img.dim_y;
- blk.xpos = xpos;
- blk.ypos = ypos;
- blk.zpos = zpos;
- vfloat4 data_min(1e38f);
- vfloat4 data_mean = vfloat4::zero();
- vfloat4 data_max(-1e38f);
- vmask4 grayscalev(true);
- int idx = 0;
- const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
- for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
- {
- unsigned int yi = astc::min(y, ysize - 1);
- for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
- {
- unsigned int xi = astc::min(x, xsize - 1);
- vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
- vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
- // Compute block metadata
- data_min = min(data_min, datav);
- data_mean += datav;
- data_max = max(data_max, datav);
- grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
- blk.data_r[idx] = datav.lane<0>();
- blk.data_g[idx] = datav.lane<1>();
- blk.data_b[idx] = datav.lane<2>();
- blk.data_a[idx] = datav.lane<3>();
- idx++;
- }
- }
- // Reverse the encoding so we store origin block in the original format
- blk.origin_texel = blk.texel(0) / 65535.0f;
- // Store block metadata
- blk.rgb_lns[0] = 0;
- blk.alpha_lns[0] = 0;
- blk.data_min = data_min;
- blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
- blk.data_max = data_max;
- blk.grayscale = all(grayscalev);
- }
- /* See header for documentation. */
- void store_image_block(
- astcenc_image& img,
- const image_block& blk,
- const block_size_descriptor& bsd,
- unsigned int xpos,
- unsigned int ypos,
- unsigned int zpos,
- const astcenc_swizzle& swz
- ) {
- unsigned int x_size = img.dim_x;
- unsigned int x_start = xpos;
- unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
- unsigned int x_count = x_end - x_start;
- unsigned int x_nudge = bsd.xdim - x_count;
- unsigned int y_size = img.dim_y;
- unsigned int y_start = ypos;
- unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
- unsigned int y_count = y_end - y_start;
- unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
- unsigned int z_size = img.dim_z;
- unsigned int z_start = zpos;
- unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
- // True if any non-identity swizzle
- bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
- (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
- // True if any swizzle uses Z reconstruct
- bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
- (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
- int idx = 0;
- if (img.data_type == ASTCENC_TYPE_U8)
- {
- for (unsigned int z = z_start; z < z_end; z++)
- {
- // Fetch the image plane
- uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
- for (unsigned int y = y_start; y < y_end; y++)
- {
- uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
- for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
- {
- unsigned int max_texels = ASTCENC_SIMD_WIDTH;
- unsigned int used_texels = astc::min(x_count - x, max_texels);
- // Unaligned load as rows are not always SIMD_WIDTH long
- vfloat data_r(blk.data_r + idx);
- vfloat data_g(blk.data_g + idx);
- vfloat data_b(blk.data_b + idx);
- vfloat data_a(blk.data_a + idx);
- vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
- vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
- vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
- vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
- if (needs_swz)
- {
- vint swizzle_table[7];
- swizzle_table[ASTCENC_SWZ_0] = vint(0);
- swizzle_table[ASTCENC_SWZ_1] = vint(255);
- swizzle_table[ASTCENC_SWZ_R] = data_ri;
- swizzle_table[ASTCENC_SWZ_G] = data_gi;
- swizzle_table[ASTCENC_SWZ_B] = data_bi;
- swizzle_table[ASTCENC_SWZ_A] = data_ai;
- if (needs_z)
- {
- vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
- vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
- vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
- data_z = max(data_z, 0.0f);
- data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
- swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
- }
- data_ri = swizzle_table[swz.r];
- data_gi = swizzle_table[swz.g];
- data_bi = swizzle_table[swz.b];
- data_ai = swizzle_table[swz.a];
- }
- // Errors are NaN encoded - convert to magenta error color
- // Branch is OK here - it is almost never true so predicts well
- vmask nan_mask = data_r != data_r;
- if (any(nan_mask))
- {
- data_ri = select(data_ri, vint(0xFF), nan_mask);
- data_gi = select(data_gi, vint(0x00), nan_mask);
- data_bi = select(data_bi, vint(0xFF), nan_mask);
- data_ai = select(data_ai, vint(0xFF), nan_mask);
- }
- vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
- vmask store_mask = vint::lane_id() < vint(used_texels);
- store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);
- data8_row += ASTCENC_SIMD_WIDTH * 4;
- idx += used_texels;
- }
- idx += x_nudge;
- }
- idx += y_nudge;
- }
- }
- else if (img.data_type == ASTCENC_TYPE_F16)
- {
- for (unsigned int z = z_start; z < z_end; z++)
- {
- // Fetch the image plane
- uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
- for (unsigned int y = y_start; y < y_end; y++)
- {
- uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
- for (unsigned int x = 0; x < x_count; x++)
- {
- vint4 color;
- // NaNs are handled inline - no need to special case
- if (needs_swz)
- {
- float data[7];
- data[ASTCENC_SWZ_0] = 0.0f;
- data[ASTCENC_SWZ_1] = 1.0f;
- data[ASTCENC_SWZ_R] = blk.data_r[idx];
- data[ASTCENC_SWZ_G] = blk.data_g[idx];
- data[ASTCENC_SWZ_B] = blk.data_b[idx];
- data[ASTCENC_SWZ_A] = blk.data_a[idx];
- if (needs_z)
- {
- float xN = (data[0] * 2.0f) - 1.0f;
- float yN = (data[3] * 2.0f) - 1.0f;
- float zN = 1.0f - xN * xN - yN * yN;
- if (zN < 0.0f)
- {
- zN = 0.0f;
- }
- data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
- }
- vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
- color = float_to_float16(colorf);
- }
- else
- {
- vfloat4 colorf = blk.texel(idx);
- color = float_to_float16(colorf);
- }
- // TODO: Vectorize with store N shorts?
- data16_row[0] = static_cast<uint16_t>(color.lane<0>());
- data16_row[1] = static_cast<uint16_t>(color.lane<1>());
- data16_row[2] = static_cast<uint16_t>(color.lane<2>());
- data16_row[3] = static_cast<uint16_t>(color.lane<3>());
- data16_row += 4;
- idx++;
- }
- idx += x_nudge;
- }
- idx += y_nudge;
- }
- }
- else // if (img.data_type == ASTCENC_TYPE_F32)
- {
- assert(img.data_type == ASTCENC_TYPE_F32);
- for (unsigned int z = z_start; z < z_end; z++)
- {
- // Fetch the image plane
- float* data32 = static_cast<float*>(img.data[z]);
- for (unsigned int y = y_start; y < y_end; y++)
- {
- float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
- for (unsigned int x = 0; x < x_count; x++)
- {
- vfloat4 color = blk.texel(idx);
- // NaNs are handled inline - no need to special case
- if (needs_swz)
- {
- float data[7];
- data[ASTCENC_SWZ_0] = 0.0f;
- data[ASTCENC_SWZ_1] = 1.0f;
- data[ASTCENC_SWZ_R] = color.lane<0>();
- data[ASTCENC_SWZ_G] = color.lane<1>();
- data[ASTCENC_SWZ_B] = color.lane<2>();
- data[ASTCENC_SWZ_A] = color.lane<3>();
- if (needs_z)
- {
- float xN = (data[0] * 2.0f) - 1.0f;
- float yN = (data[3] * 2.0f) - 1.0f;
- float zN = 1.0f - xN * xN - yN * yN;
- if (zN < 0.0f)
- {
- zN = 0.0f;
- }
- data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
- }
- color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
- }
- store(color, data32_row);
- data32_row += 4;
- idx++;
- }
- idx += x_nudge;
- }
- idx += y_nudge;
- }
- }
- }
|