astcenc_image.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2022 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. /**
  18. * @brief Functions for creating in-memory ASTC image structures.
  19. */
  #include <cassert>
  #include <cstddef>
  #include <cstring>
  #include "astcenc_internal.h"
  23. /**
  24. * @brief Loader pipeline function type for data fetch from memory.
  25. */
  26. using pixel_loader = vfloat4(*)(const void*, int);
  27. /**
  28. * @brief Loader pipeline function type for swizzling data in a vector.
  29. */
  30. using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
  31. /**
  32. * @brief Loader pipeline function type for converting data in a vector to LNS.
  33. */
  34. using pixel_converter = vfloat4(*)(vfloat4, vmask4);
  35. /**
  36. * @brief Load a 8-bit UNORM texel from a data array.
  37. *
  38. * @param data The data pointer.
  39. * @param base_offset The index offset to the start of the pixel.
  40. */
  41. static vfloat4 load_texel_u8(
  42. const void* data,
  43. int base_offset
  44. ) {
  45. const uint8_t* data8 = static_cast<const uint8_t*>(data);
  46. return int_to_float(vint4(data8 + base_offset)) / 255.0f;
  47. }
  48. /**
  49. * @brief Load a 16-bit fp16 texel from a data array.
  50. *
  51. * @param data The data pointer.
  52. * @param base_offset The index offset to the start of the pixel.
  53. */
  54. static vfloat4 load_texel_f16(
  55. const void* data,
  56. int base_offset
  57. ) {
  58. const uint16_t* data16 = static_cast<const uint16_t*>(data);
  59. int r = data16[base_offset ];
  60. int g = data16[base_offset + 1];
  61. int b = data16[base_offset + 2];
  62. int a = data16[base_offset + 3];
  63. return float16_to_float(vint4(r, g, b, a));
  64. }
  65. /**
  66. * @brief Load a 32-bit float texel from a data array.
  67. *
  68. * @param data The data pointer.
  69. * @param base_offset The index offset to the start of the pixel.
  70. */
  71. static vfloat4 load_texel_f32(
  72. const void* data,
  73. int base_offset
  74. ) {
  75. const float* data32 = static_cast<const float*>(data);
  76. return vfloat4(data32 + base_offset);
  77. }
  78. /**
  79. * @brief Dummy no-op swizzle function.
  80. *
  81. * @param data The source RGBA vector to swizzle.
  82. * @param swz The swizzle to use.
  83. */
  84. static vfloat4 swz_texel_skip(
  85. vfloat4 data,
  86. const astcenc_swizzle& swz
  87. ) {
  88. (void)swz;
  89. return data;
  90. }
  91. /**
  92. * @brief Swizzle a texel into a new arrangement.
  93. *
  94. * @param data The source RGBA vector to swizzle.
  95. * @param swz The swizzle to use.
  96. */
  97. static vfloat4 swz_texel(
  98. vfloat4 data,
  99. const astcenc_swizzle& swz
  100. ) {
  101. alignas(16) float datas[6];
  102. storea(data, datas);
  103. datas[ASTCENC_SWZ_0] = 0.0f;
  104. datas[ASTCENC_SWZ_1] = 1.0f;
  105. return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
  106. }
  107. /**
  108. * @brief Encode a texel that is entirely LDR linear.
  109. *
  110. * @param data The RGBA data to encode.
  111. * @param lns_mask The mask for the HDR channels than need LNS encoding.
  112. */
  113. static vfloat4 encode_texel_unorm(
  114. vfloat4 data,
  115. vmask4 lns_mask
  116. ) {
  117. (void)lns_mask;
  118. return data * 65535.0f;
  119. }
  120. /**
  121. * @brief Encode a texel that includes at least some HDR LNS texels.
  122. *
  123. * @param data The RGBA data to encode.
  124. * @param lns_mask The mask for the HDR channels than need LNS encoding.
  125. */
  126. static vfloat4 encode_texel_lns(
  127. vfloat4 data,
  128. vmask4 lns_mask
  129. ) {
  130. vfloat4 datav_unorm = data * 65535.0f;
  131. vfloat4 datav_lns = float_to_lns(data);
  132. return select(datav_unorm, datav_lns, lns_mask);
  133. }
  134. /* See header for documentation. */
  135. void load_image_block(
  136. astcenc_profile decode_mode,
  137. const astcenc_image& img,
  138. image_block& blk,
  139. const block_size_descriptor& bsd,
  140. unsigned int xpos,
  141. unsigned int ypos,
  142. unsigned int zpos,
  143. const astcenc_swizzle& swz
  144. ) {
  145. unsigned int xsize = img.dim_x;
  146. unsigned int ysize = img.dim_y;
  147. unsigned int zsize = img.dim_z;
  148. blk.xpos = xpos;
  149. blk.ypos = ypos;
  150. blk.zpos = zpos;
  151. // True if any non-identity swizzle
  152. bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
  153. (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
  154. int idx = 0;
  155. vfloat4 data_min(1e38f);
  156. vfloat4 data_mean(0.0f);
  157. vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
  158. vfloat4 data_max(-1e38f);
  159. vmask4 grayscalev(true);
  160. // This works because we impose the same choice everywhere during encode
  161. uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
  162. (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
  163. uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
  164. vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
  165. vmask4 lns_mask = use_lns != vint4::zero();
  166. // Set up the function pointers for loading pipeline as needed
  167. pixel_loader loader = load_texel_u8;
  168. if (img.data_type == ASTCENC_TYPE_F16)
  169. {
  170. loader = load_texel_f16;
  171. }
  172. else if (img.data_type == ASTCENC_TYPE_F32)
  173. {
  174. loader = load_texel_f32;
  175. }
  176. pixel_swizzler swizzler = swz_texel_skip;
  177. if (needs_swz)
  178. {
  179. swizzler = swz_texel;
  180. }
  181. pixel_converter converter = encode_texel_unorm;
  182. if (any(lns_mask))
  183. {
  184. converter = encode_texel_lns;
  185. }
  186. for (unsigned int z = 0; z < bsd.zdim; z++)
  187. {
  188. unsigned int zi = astc::min(zpos + z, zsize - 1);
  189. void* plane = img.data[zi];
  190. for (unsigned int y = 0; y < bsd.ydim; y++)
  191. {
  192. unsigned int yi = astc::min(ypos + y, ysize - 1);
  193. for (unsigned int x = 0; x < bsd.xdim; x++)
  194. {
  195. unsigned int xi = astc::min(xpos + x, xsize - 1);
  196. vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
  197. datav = swizzler(datav, swz);
  198. datav = converter(datav, lns_mask);
  199. // Compute block metadata
  200. data_min = min(data_min, datav);
  201. data_mean += datav * data_mean_scale;
  202. data_max = max(data_max, datav);
  203. grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
  204. blk.data_r[idx] = datav.lane<0>();
  205. blk.data_g[idx] = datav.lane<1>();
  206. blk.data_b[idx] = datav.lane<2>();
  207. blk.data_a[idx] = datav.lane<3>();
  208. blk.rgb_lns[idx] = rgb_lns;
  209. blk.alpha_lns[idx] = a_lns;
  210. idx++;
  211. }
  212. }
  213. }
  214. // Reverse the encoding so we store origin block in the original format
  215. vfloat4 data_enc = blk.texel(0);
  216. vfloat4 data_enc_unorm = data_enc / 65535.0f;
  217. vfloat4 data_enc_lns = vfloat4::zero();
  218. if (rgb_lns || a_lns)
  219. {
  220. data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
  221. }
  222. blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
  223. // Store block metadata
  224. blk.data_min = data_min;
  225. blk.data_mean = data_mean;
  226. blk.data_max = data_max;
  227. blk.grayscale = all(grayscalev);
  228. }
  229. /* See header for documentation. */
  230. void load_image_block_fast_ldr(
  231. astcenc_profile decode_mode,
  232. const astcenc_image& img,
  233. image_block& blk,
  234. const block_size_descriptor& bsd,
  235. unsigned int xpos,
  236. unsigned int ypos,
  237. unsigned int zpos,
  238. const astcenc_swizzle& swz
  239. ) {
  240. (void)swz;
  241. (void)decode_mode;
  242. unsigned int xsize = img.dim_x;
  243. unsigned int ysize = img.dim_y;
  244. blk.xpos = xpos;
  245. blk.ypos = ypos;
  246. blk.zpos = zpos;
  247. vfloat4 data_min(1e38f);
  248. vfloat4 data_mean = vfloat4::zero();
  249. vfloat4 data_max(-1e38f);
  250. vmask4 grayscalev(true);
  251. int idx = 0;
  252. const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
  253. for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
  254. {
  255. unsigned int yi = astc::min(y, ysize - 1);
  256. for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
  257. {
  258. unsigned int xi = astc::min(x, xsize - 1);
  259. vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
  260. vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
  261. // Compute block metadata
  262. data_min = min(data_min, datav);
  263. data_mean += datav;
  264. data_max = max(data_max, datav);
  265. grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
  266. blk.data_r[idx] = datav.lane<0>();
  267. blk.data_g[idx] = datav.lane<1>();
  268. blk.data_b[idx] = datav.lane<2>();
  269. blk.data_a[idx] = datav.lane<3>();
  270. idx++;
  271. }
  272. }
  273. // Reverse the encoding so we store origin block in the original format
  274. blk.origin_texel = blk.texel(0) / 65535.0f;
  275. // Store block metadata
  276. blk.rgb_lns[0] = 0;
  277. blk.alpha_lns[0] = 0;
  278. blk.data_min = data_min;
  279. blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
  280. blk.data_max = data_max;
  281. blk.grayscale = all(grayscalev);
  282. }
  283. /* See header for documentation. */
  284. void store_image_block(
  285. astcenc_image& img,
  286. const image_block& blk,
  287. const block_size_descriptor& bsd,
  288. unsigned int xpos,
  289. unsigned int ypos,
  290. unsigned int zpos,
  291. const astcenc_swizzle& swz
  292. ) {
  293. unsigned int x_size = img.dim_x;
  294. unsigned int x_start = xpos;
  295. unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
  296. unsigned int x_count = x_end - x_start;
  297. unsigned int x_nudge = bsd.xdim - x_count;
  298. unsigned int y_size = img.dim_y;
  299. unsigned int y_start = ypos;
  300. unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
  301. unsigned int y_count = y_end - y_start;
  302. unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
  303. unsigned int z_size = img.dim_z;
  304. unsigned int z_start = zpos;
  305. unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
  306. // True if any non-identity swizzle
  307. bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
  308. (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
  309. // True if any swizzle uses Z reconstruct
  310. bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
  311. (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
  312. int idx = 0;
  313. if (img.data_type == ASTCENC_TYPE_U8)
  314. {
  315. for (unsigned int z = z_start; z < z_end; z++)
  316. {
  317. // Fetch the image plane
  318. uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
  319. for (unsigned int y = y_start; y < y_end; y++)
  320. {
  321. uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
  322. for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
  323. {
  324. unsigned int max_texels = ASTCENC_SIMD_WIDTH;
  325. unsigned int used_texels = astc::min(x_count - x, max_texels);
  326. // Unaligned load as rows are not always SIMD_WIDTH long
  327. vfloat data_r(blk.data_r + idx);
  328. vfloat data_g(blk.data_g + idx);
  329. vfloat data_b(blk.data_b + idx);
  330. vfloat data_a(blk.data_a + idx);
  331. vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
  332. vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
  333. vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
  334. vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
  335. if (needs_swz)
  336. {
  337. vint swizzle_table[7];
  338. swizzle_table[ASTCENC_SWZ_0] = vint(0);
  339. swizzle_table[ASTCENC_SWZ_1] = vint(255);
  340. swizzle_table[ASTCENC_SWZ_R] = data_ri;
  341. swizzle_table[ASTCENC_SWZ_G] = data_gi;
  342. swizzle_table[ASTCENC_SWZ_B] = data_bi;
  343. swizzle_table[ASTCENC_SWZ_A] = data_ai;
  344. if (needs_z)
  345. {
  346. vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
  347. vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
  348. vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
  349. data_z = max(data_z, 0.0f);
  350. data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
  351. swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
  352. }
  353. data_ri = swizzle_table[swz.r];
  354. data_gi = swizzle_table[swz.g];
  355. data_bi = swizzle_table[swz.b];
  356. data_ai = swizzle_table[swz.a];
  357. }
  358. // Errors are NaN encoded - convert to magenta error color
  359. // Branch is OK here - it is almost never true so predicts well
  360. vmask nan_mask = data_r != data_r;
  361. if (any(nan_mask))
  362. {
  363. data_ri = select(data_ri, vint(0xFF), nan_mask);
  364. data_gi = select(data_gi, vint(0x00), nan_mask);
  365. data_bi = select(data_bi, vint(0xFF), nan_mask);
  366. data_ai = select(data_ai, vint(0xFF), nan_mask);
  367. }
  368. vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
  369. vmask store_mask = vint::lane_id() < vint(used_texels);
  370. store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);
  371. data8_row += ASTCENC_SIMD_WIDTH * 4;
  372. idx += used_texels;
  373. }
  374. idx += x_nudge;
  375. }
  376. idx += y_nudge;
  377. }
  378. }
  379. else if (img.data_type == ASTCENC_TYPE_F16)
  380. {
  381. for (unsigned int z = z_start; z < z_end; z++)
  382. {
  383. // Fetch the image plane
  384. uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
  385. for (unsigned int y = y_start; y < y_end; y++)
  386. {
  387. uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
  388. for (unsigned int x = 0; x < x_count; x++)
  389. {
  390. vint4 color;
  391. // NaNs are handled inline - no need to special case
  392. if (needs_swz)
  393. {
  394. float data[7];
  395. data[ASTCENC_SWZ_0] = 0.0f;
  396. data[ASTCENC_SWZ_1] = 1.0f;
  397. data[ASTCENC_SWZ_R] = blk.data_r[idx];
  398. data[ASTCENC_SWZ_G] = blk.data_g[idx];
  399. data[ASTCENC_SWZ_B] = blk.data_b[idx];
  400. data[ASTCENC_SWZ_A] = blk.data_a[idx];
  401. if (needs_z)
  402. {
  403. float xN = (data[0] * 2.0f) - 1.0f;
  404. float yN = (data[3] * 2.0f) - 1.0f;
  405. float zN = 1.0f - xN * xN - yN * yN;
  406. if (zN < 0.0f)
  407. {
  408. zN = 0.0f;
  409. }
  410. data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
  411. }
  412. vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
  413. color = float_to_float16(colorf);
  414. }
  415. else
  416. {
  417. vfloat4 colorf = blk.texel(idx);
  418. color = float_to_float16(colorf);
  419. }
  420. // TODO: Vectorize with store N shorts?
  421. data16_row[0] = static_cast<uint16_t>(color.lane<0>());
  422. data16_row[1] = static_cast<uint16_t>(color.lane<1>());
  423. data16_row[2] = static_cast<uint16_t>(color.lane<2>());
  424. data16_row[3] = static_cast<uint16_t>(color.lane<3>());
  425. data16_row += 4;
  426. idx++;
  427. }
  428. idx += x_nudge;
  429. }
  430. idx += y_nudge;
  431. }
  432. }
  433. else // if (img.data_type == ASTCENC_TYPE_F32)
  434. {
  435. assert(img.data_type == ASTCENC_TYPE_F32);
  436. for (unsigned int z = z_start; z < z_end; z++)
  437. {
  438. // Fetch the image plane
  439. float* data32 = static_cast<float*>(img.data[z]);
  440. for (unsigned int y = y_start; y < y_end; y++)
  441. {
  442. float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
  443. for (unsigned int x = 0; x < x_count; x++)
  444. {
  445. vfloat4 color = blk.texel(idx);
  446. // NaNs are handled inline - no need to special case
  447. if (needs_swz)
  448. {
  449. float data[7];
  450. data[ASTCENC_SWZ_0] = 0.0f;
  451. data[ASTCENC_SWZ_1] = 1.0f;
  452. data[ASTCENC_SWZ_R] = color.lane<0>();
  453. data[ASTCENC_SWZ_G] = color.lane<1>();
  454. data[ASTCENC_SWZ_B] = color.lane<2>();
  455. data[ASTCENC_SWZ_A] = color.lane<3>();
  456. if (needs_z)
  457. {
  458. float xN = (data[0] * 2.0f) - 1.0f;
  459. float yN = (data[3] * 2.0f) - 1.0f;
  460. float zN = 1.0f - xN * xN - yN * yN;
  461. if (zN < 0.0f)
  462. {
  463. zN = 0.0f;
  464. }
  465. data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
  466. }
  467. color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
  468. }
  469. store(color, data32_row);
  470. data32_row += 4;
  471. idx++;
  472. }
  473. idx += x_nudge;
  474. }
  475. idx += y_nudge;
  476. }
  477. }
  478. }