astcenc_averages_and_directions.cpp 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2023 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. /**
  18. * @brief Functions for finding dominant direction of a set of colors.
  19. */
  20. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  21. #include "astcenc_internal.h"
  22. #include <cassert>
/**
 * @brief Compute the average RGB color of each partition.
 *
 * The algorithm here uses a vectorized sequential scan and per-partition
 * color accumulators, using select() to mask texel lanes in other partitions.
 *
 * We only accumulate sums for N-1 partitions during the scan; the value for
 * the last partition can be computed given that we know the block-wide average
 * already.
 *
 * Because of this we could reduce the loop iteration count so it "just" spans
 * the max texel index needed for the N-1 partitions, which could need fewer
 * iterations than the full block texel count. However, this makes the loop
 * count erratic and causes more branch mispredictions so is a net loss.
 *
 * @param      pi        The partitioning to use.
 * @param      blk       The block data to process.
 * @param[out] averages  The output averages. Unused partition indices will
 *                       not be initialized, and lane<3> will be zero.
 */
static void compute_partition_averages_rgb(
	const partition_info& pi,
	const image_block& blk,
	vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
	unsigned int partition_count = pi.partition_count;
	unsigned int texel_count = blk.texel_count;
	promise(texel_count > 0);

	// For 1 partition just use the precomputed mean
	if (partition_count == 1)
	{
		averages[0] = blk.data_mean.swz<0, 1, 2>();
	}
	// For 2 partitions scan results for partition 0, compute partition 1
	else if (partition_count == 2)
	{
		// One accumulator per color channel for partition 0
		vfloatacc pp_avg_rgb[3] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			// Mask off lanes that over-shoot the end of the block
			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgb[0], data_r, p0_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgb[1], data_g, p0_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgb[2], data_b, p0_mask);
		}

		// Recover the partition 1 total as (block total - partition 0 total)
		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
		                           hadd_s(pp_avg_rgb[1]),
		                           hadd_s(pp_avg_rgb[2]));

		vfloat4 p1_total = block_total - p0_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
	}
	// For 3 partitions scan results for partition 0/1, compute partition 2
	else if (partition_count == 3)
	{
		// Accumulators indexed as [partition][channel]
		vfloatacc pp_avg_rgb[2][3] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			// Mask off lanes that over-shoot the end of the block
			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));
			vmask p1_mask = lane_mask & (texel_partition == vint(1));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
		}

		// Recover the partition 2 total from the block total
		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
		                           hadd_s(pp_avg_rgb[0][1]),
		                           hadd_s(pp_avg_rgb[0][2]));

		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
		                           hadd_s(pp_avg_rgb[1][1]),
		                           hadd_s(pp_avg_rgb[1][2]));

		vfloat4 p2_total = block_total - p0_total - p1_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
	}
	else
	{
		// For 4 partitions scan results for partition 0/1/2, compute partition 3
		// Accumulators indexed as [partition][channel]
		vfloatacc pp_avg_rgb[3][3] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			// Mask off lanes that over-shoot the end of the block
			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));
			vmask p1_mask = lane_mask & (texel_partition == vint(1));
			vmask p2_mask = lane_mask & (texel_partition == vint(2));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
			haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
			haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
			haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
		}

		// Recover the partition 3 total from the block total
		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
		                           hadd_s(pp_avg_rgb[0][1]),
		                           hadd_s(pp_avg_rgb[0][2]));

		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
		                           hadd_s(pp_avg_rgb[1][1]),
		                           hadd_s(pp_avg_rgb[1][2]));

		vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
		                           hadd_s(pp_avg_rgb[2][1]),
		                           hadd_s(pp_avg_rgb[2][2]));

		vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
	}
}
/**
 * @brief Compute the average RGBA color of each partition.
 *
 * The algorithm here uses a vectorized sequential scan and per-partition
 * color accumulators, using select() to mask texel lanes in other partitions.
 *
 * We only accumulate sums for N-1 partitions during the scan; the value for
 * the last partition can be computed given that we know the block-wide average
 * already.
 *
 * Because of this we could reduce the loop iteration count so it "just" spans
 * the max texel index needed for the N-1 partitions, which could need fewer
 * iterations than the full block texel count. However, this makes the loop
 * count erratic and causes more branch mispredictions so is a net loss.
 *
 * @param      pi        The partitioning to use.
 * @param      blk       The block data to process.
 * @param[out] averages  The output averages. Unused partition indices will
 *                       not be initialized.
 */
static void compute_partition_averages_rgba(
	const partition_info& pi,
	const image_block& blk,
	vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
	unsigned int partition_count = pi.partition_count;
	unsigned int texel_count = blk.texel_count;
	promise(texel_count > 0);

	// For 1 partition just use the precomputed mean
	if (partition_count == 1)
	{
		averages[0] = blk.data_mean;
	}
	// For 2 partitions scan results for partition 0, compute partition 1
	else if (partition_count == 2)
	{
		// One accumulator per color channel for partition 0
		vfloat4 pp_avg_rgba[4] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			// Mask off lanes that over-shoot the end of the block
			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgba[0], data_r, p0_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgba[1], data_g, p0_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgba[2], data_b, p0_mask);

			vfloat data_a = loada(blk.data_a + i);
			haccumulate(pp_avg_rgba[3], data_a, p0_mask);
		}

		// Recover the partition 1 total as (block total - partition 0 total)
		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
		                           hadd_s(pp_avg_rgba[1]),
		                           hadd_s(pp_avg_rgba[2]),
		                           hadd_s(pp_avg_rgba[3]));

		vfloat4 p1_total = block_total - p0_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
	}
	// For 3 partitions scan results for partition 0/1, compute partition 2
	else if (partition_count == 3)
	{
		// Accumulators indexed as [partition][channel]
		vfloat4 pp_avg_rgba[2][4] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			// Mask off lanes that over-shoot the end of the block
			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));
			vmask p1_mask = lane_mask & (texel_partition == vint(1));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);

			vfloat data_a = loada(blk.data_a + i);
			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
		}

		// Recover the partition 2 total from the block total
		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
		                           hadd_s(pp_avg_rgba[0][1]),
		                           hadd_s(pp_avg_rgba[0][2]),
		                           hadd_s(pp_avg_rgba[0][3]));

		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
		                           hadd_s(pp_avg_rgba[1][1]),
		                           hadd_s(pp_avg_rgba[1][2]),
		                           hadd_s(pp_avg_rgba[1][3]));

		vfloat4 p2_total = block_total - p0_total - p1_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
	}
	else
	{
		// For 4 partitions scan results for partition 0/1/2, compute partition 3
		// Accumulators indexed as [partition][channel]
		vfloat4 pp_avg_rgba[3][4] {};

		vint lane_id = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vint texel_partition(pi.partition_of_texel + i);

			// Mask off lanes that over-shoot the end of the block
			vmask lane_mask = lane_id < vint(texel_count);
			lane_id += vint(ASTCENC_SIMD_WIDTH);

			vmask p0_mask = lane_mask & (texel_partition == vint(0));
			vmask p1_mask = lane_mask & (texel_partition == vint(1));
			vmask p2_mask = lane_mask & (texel_partition == vint(2));

			vfloat data_r = loada(blk.data_r + i);
			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
			haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);

			vfloat data_g = loada(blk.data_g + i);
			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
			haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);

			vfloat data_b = loada(blk.data_b + i);
			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
			haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);

			vfloat data_a = loada(blk.data_a + i);
			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
			haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
		}

		// Recover the partition 3 total from the block total
		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);

		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
		                           hadd_s(pp_avg_rgba[0][1]),
		                           hadd_s(pp_avg_rgba[0][2]),
		                           hadd_s(pp_avg_rgba[0][3]));

		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
		                           hadd_s(pp_avg_rgba[1][1]),
		                           hadd_s(pp_avg_rgba[1][2]),
		                           hadd_s(pp_avg_rgba[1][3]));

		vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
		                           hadd_s(pp_avg_rgba[2][1]),
		                           hadd_s(pp_avg_rgba[2][2]),
		                           hadd_s(pp_avg_rgba[2][3]));

		vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;

		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
	}
}
  310. /* See header for documentation. */
  311. void compute_avgs_and_dirs_4_comp(
  312. const partition_info& pi,
  313. const image_block& blk,
  314. partition_metrics pm[BLOCK_MAX_PARTITIONS]
  315. ) {
  316. int partition_count = pi.partition_count;
  317. promise(partition_count > 0);
  318. // Pre-compute partition_averages
  319. vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
  320. compute_partition_averages_rgba(pi, blk, partition_averages);
  321. for (int partition = 0; partition < partition_count; partition++)
  322. {
  323. const uint8_t *texel_indexes = pi.texels_of_partition[partition];
  324. unsigned int texel_count = pi.partition_texel_count[partition];
  325. promise(texel_count > 0);
  326. vfloat4 average = partition_averages[partition];
  327. pm[partition].avg = average;
  328. vfloat4 sum_xp = vfloat4::zero();
  329. vfloat4 sum_yp = vfloat4::zero();
  330. vfloat4 sum_zp = vfloat4::zero();
  331. vfloat4 sum_wp = vfloat4::zero();
  332. for (unsigned int i = 0; i < texel_count; i++)
  333. {
  334. unsigned int iwt = texel_indexes[i];
  335. vfloat4 texel_datum = blk.texel(iwt);
  336. texel_datum = texel_datum - average;
  337. vfloat4 zero = vfloat4::zero();
  338. vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
  339. sum_xp += select(zero, texel_datum, tdm0);
  340. vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
  341. sum_yp += select(zero, texel_datum, tdm1);
  342. vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
  343. sum_zp += select(zero, texel_datum, tdm2);
  344. vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
  345. sum_wp += select(zero, texel_datum, tdm3);
  346. }
  347. vfloat4 prod_xp = dot(sum_xp, sum_xp);
  348. vfloat4 prod_yp = dot(sum_yp, sum_yp);
  349. vfloat4 prod_zp = dot(sum_zp, sum_zp);
  350. vfloat4 prod_wp = dot(sum_wp, sum_wp);
  351. vfloat4 best_vector = sum_xp;
  352. vfloat4 best_sum = prod_xp;
  353. vmask4 mask = prod_yp > best_sum;
  354. best_vector = select(best_vector, sum_yp, mask);
  355. best_sum = select(best_sum, prod_yp, mask);
  356. mask = prod_zp > best_sum;
  357. best_vector = select(best_vector, sum_zp, mask);
  358. best_sum = select(best_sum, prod_zp, mask);
  359. mask = prod_wp > best_sum;
  360. best_vector = select(best_vector, sum_wp, mask);
  361. pm[partition].dir = best_vector;
  362. }
  363. }
/* See header for documentation. */
void compute_avgs_and_dirs_3_comp(
	const partition_info& pi,
	const image_block& blk,
	unsigned int omitted_component,
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
	// Pre-compute partition_averages
	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
	compute_partition_averages_rgba(pi, blk, partition_averages);

	const float* data_vr = blk.data_r;
	const float* data_vg = blk.data_g;
	const float* data_vb = blk.data_b;

	// Swizzle the three active components into lanes 0-2 of the averages,
	// and select the three matching channel data arrays to scan
	// TODO: Data-driven permute would be useful to avoid this ...
	if (omitted_component == 0)
	{
		partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
		partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
		partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
		partition_averages[3] = partition_averages[3].swz<1, 2, 3>();

		data_vr = blk.data_g;
		data_vg = blk.data_b;
		data_vb = blk.data_a;
	}
	else if (omitted_component == 1)
	{
		partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
		partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
		partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
		partition_averages[3] = partition_averages[3].swz<0, 2, 3>();

		data_vg = blk.data_b;
		data_vb = blk.data_a;
	}
	else if (omitted_component == 2)
	{
		partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
		partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
		partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
		partition_averages[3] = partition_averages[3].swz<0, 1, 3>();

		data_vb = blk.data_a;
	}
	else
	{
		// Omitting alpha: RGB data pointers are already correct
		partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
		partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
		partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
	}

	unsigned int partition_count = pi.partition_count;
	promise(partition_count > 0);

	for (unsigned int partition = 0; partition < partition_count; partition++)
	{
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
		unsigned int texel_count = pi.partition_texel_count[partition];
		promise(texel_count > 0);

		vfloat4 average = partition_averages[partition];
		pm[partition].avg = average;

		// Accumulate mean-relative texel deltas into three sums, one keyed
		// on the sign of each active component of the delta
		vfloat4 sum_xp = vfloat4::zero();
		vfloat4 sum_yp = vfloat4::zero();
		vfloat4 sum_zp = vfloat4::zero();

		for (unsigned int i = 0; i < texel_count; i++)
		{
			unsigned int iwt = texel_indexes[i];

			vfloat4 texel_datum = vfloat3(data_vr[iwt],
			                              data_vg[iwt],
			                              data_vb[iwt]);
			texel_datum = texel_datum - average;

			vfloat4 zero = vfloat4::zero();

			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
			sum_xp += select(zero, texel_datum, tdm0);

			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
			sum_yp += select(zero, texel_datum, tdm1);

			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
			sum_zp += select(zero, texel_datum, tdm2);
		}

		// Pick the accumulated sum with the largest squared length as the
		// dominant direction estimate for this partition
		vfloat4 prod_xp = dot(sum_xp, sum_xp);
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
		vfloat4 prod_zp = dot(sum_zp, sum_zp);

		vfloat4 best_vector = sum_xp;
		vfloat4 best_sum = prod_xp;

		vmask4 mask = prod_yp > best_sum;
		best_vector = select(best_vector, sum_yp, mask);
		best_sum = select(best_sum, prod_yp, mask);

		mask = prod_zp > best_sum;
		best_vector = select(best_vector, sum_zp, mask);

		pm[partition].dir = best_vector;
	}
}
  452. /* See header for documentation. */
  453. void compute_avgs_and_dirs_3_comp_rgb(
  454. const partition_info& pi,
  455. const image_block& blk,
  456. partition_metrics pm[BLOCK_MAX_PARTITIONS]
  457. ) {
  458. unsigned int partition_count = pi.partition_count;
  459. promise(partition_count > 0);
  460. // Pre-compute partition_averages
  461. vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
  462. compute_partition_averages_rgb(pi, blk, partition_averages);
  463. for (unsigned int partition = 0; partition < partition_count; partition++)
  464. {
  465. const uint8_t *texel_indexes = pi.texels_of_partition[partition];
  466. unsigned int texel_count = pi.partition_texel_count[partition];
  467. promise(texel_count > 0);
  468. vfloat4 average = partition_averages[partition];
  469. pm[partition].avg = average;
  470. vfloat4 sum_xp = vfloat4::zero();
  471. vfloat4 sum_yp = vfloat4::zero();
  472. vfloat4 sum_zp = vfloat4::zero();
  473. for (unsigned int i = 0; i < texel_count; i++)
  474. {
  475. unsigned int iwt = texel_indexes[i];
  476. vfloat4 texel_datum = blk.texel3(iwt);
  477. texel_datum = texel_datum - average;
  478. vfloat4 zero = vfloat4::zero();
  479. vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
  480. sum_xp += select(zero, texel_datum, tdm0);
  481. vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
  482. sum_yp += select(zero, texel_datum, tdm1);
  483. vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
  484. sum_zp += select(zero, texel_datum, tdm2);
  485. }
  486. vfloat4 prod_xp = dot(sum_xp, sum_xp);
  487. vfloat4 prod_yp = dot(sum_yp, sum_yp);
  488. vfloat4 prod_zp = dot(sum_zp, sum_zp);
  489. vfloat4 best_vector = sum_xp;
  490. vfloat4 best_sum = prod_xp;
  491. vmask4 mask = prod_yp > best_sum;
  492. best_vector = select(best_vector, sum_yp, mask);
  493. best_sum = select(best_sum, prod_yp, mask);
  494. mask = prod_zp > best_sum;
  495. best_vector = select(best_vector, sum_zp, mask);
  496. pm[partition].dir = best_vector;
  497. }
  498. }
  499. /* See header for documentation. */
  500. void compute_avgs_and_dirs_2_comp(
  501. const partition_info& pt,
  502. const image_block& blk,
  503. unsigned int component1,
  504. unsigned int component2,
  505. partition_metrics pm[BLOCK_MAX_PARTITIONS]
  506. ) {
  507. vfloat4 average;
  508. const float* data_vr = nullptr;
  509. const float* data_vg = nullptr;
  510. if (component1 == 0 && component2 == 1)
  511. {
  512. average = blk.data_mean.swz<0, 1>();
  513. data_vr = blk.data_r;
  514. data_vg = blk.data_g;
  515. }
  516. else if (component1 == 0 && component2 == 2)
  517. {
  518. average = blk.data_mean.swz<0, 2>();
  519. data_vr = blk.data_r;
  520. data_vg = blk.data_b;
  521. }
  522. else // (component1 == 1 && component2 == 2)
  523. {
  524. assert(component1 == 1 && component2 == 2);
  525. average = blk.data_mean.swz<1, 2>();
  526. data_vr = blk.data_g;
  527. data_vg = blk.data_b;
  528. }
  529. unsigned int partition_count = pt.partition_count;
  530. promise(partition_count > 0);
  531. for (unsigned int partition = 0; partition < partition_count; partition++)
  532. {
  533. const uint8_t *texel_indexes = pt.texels_of_partition[partition];
  534. unsigned int texel_count = pt.partition_texel_count[partition];
  535. promise(texel_count > 0);
  536. // Only compute a partition mean if more than one partition
  537. if (partition_count > 1)
  538. {
  539. average = vfloat4::zero();
  540. for (unsigned int i = 0; i < texel_count; i++)
  541. {
  542. unsigned int iwt = texel_indexes[i];
  543. average += vfloat2(data_vr[iwt], data_vg[iwt]);
  544. }
  545. average = average / static_cast<float>(texel_count);
  546. }
  547. pm[partition].avg = average;
  548. vfloat4 sum_xp = vfloat4::zero();
  549. vfloat4 sum_yp = vfloat4::zero();
  550. for (unsigned int i = 0; i < texel_count; i++)
  551. {
  552. unsigned int iwt = texel_indexes[i];
  553. vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
  554. texel_datum = texel_datum - average;
  555. vfloat4 zero = vfloat4::zero();
  556. vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
  557. sum_xp += select(zero, texel_datum, tdm0);
  558. vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
  559. sum_yp += select(zero, texel_datum, tdm1);
  560. }
  561. vfloat4 prod_xp = dot(sum_xp, sum_xp);
  562. vfloat4 prod_yp = dot(sum_yp, sum_yp);
  563. vfloat4 best_vector = sum_xp;
  564. vfloat4 best_sum = prod_xp;
  565. vmask4 mask = prod_yp > best_sum;
  566. best_vector = select(best_vector, sum_yp, mask);
  567. pm[partition].dir = best_vector;
  568. }
  569. }
/* See header for documentation. */
void compute_error_squared_rgba(
	const partition_info& pi,
	const image_block& blk,
	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
	float line_lengths[BLOCK_MAX_PARTITIONS],
	float& uncor_error,
	float& samec_error
) {
	unsigned int partition_count = pi.partition_count;
	promise(partition_count > 0);

	// Error accumulators are shared across all partitions; only the line
	// lengths are tracked per-partition
	vfloatacc uncor_errorsumv = vfloatacc::zero();
	vfloatacc samec_errorsumv = vfloatacc::zero();

	for (unsigned int partition = 0; partition < partition_count; partition++)
	{
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];

		processed_line4 l_uncor = uncor_plines[partition];
		processed_line4 l_samec = samec_plines[partition];

		unsigned int texel_count = pi.partition_texel_count[partition];
		promise(texel_count > 0);

		// Vectorize some useful scalar inputs
		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
		vfloat l_uncor_bs3(l_uncor.bs.lane<3>());

		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
		vfloat l_uncor_amod3(l_uncor.amod.lane<3>());

		vfloat l_samec_bs0(l_samec.bs.lane<0>());
		vfloat l_samec_bs1(l_samec.bs.lane<1>());
		vfloat l_samec_bs2(l_samec.bs.lane<2>());
		vfloat l_samec_bs3(l_samec.bs.lane<3>());

		// The samechroma line is assumed to pass through the origin, so its
		// amod term can be dropped from the distance computation below
		assert(all(l_samec.amod == vfloat4(0.0f)));

		// Running min/max of the uncorrelated line parameter, used to derive
		// the line length for this partition
		vfloat uncor_loparamv(1e10f);
		vfloat uncor_hiparamv(-1e10f);

		vfloat ew_r(blk.channel_weight.lane<0>());
		vfloat ew_g(blk.channel_weight.lane<1>());
		vfloat ew_b(blk.channel_weight.lane<2>());
		vfloat ew_a(blk.channel_weight.lane<3>());

		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
		// array to extend the last value. This means min/max are not impacted, but we need to mask
		// out the dummy values when we compute the line weighting.
		vint lane_ids = vint::lane_id();
		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
		{
			vmask mask = lane_ids < vint(texel_count);
			vint texel_idxs(texel_indexes + i);

			vfloat data_r = gatherf(blk.data_r, texel_idxs);
			vfloat data_g = gatherf(blk.data_g, texel_idxs);
			vfloat data_b = gatherf(blk.data_b, texel_idxs);
			vfloat data_a = gatherf(blk.data_a, texel_idxs);

			// Project each texel onto the uncorrelated line
			vfloat uncor_param = (data_r * l_uncor_bs0)
			                   + (data_g * l_uncor_bs1)
			                   + (data_b * l_uncor_bs2)
			                   + (data_a * l_uncor_bs3);

			uncor_loparamv = min(uncor_param, uncor_loparamv);
			uncor_hiparamv = max(uncor_param, uncor_hiparamv);

			// Per-channel distance from the texel to its line projection
			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
			                   + (uncor_param * l_uncor_bs0);
			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
			                   + (uncor_param * l_uncor_bs1);
			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
			                   + (uncor_param * l_uncor_bs2);
			vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
			                   + (uncor_param * l_uncor_bs3);

			// Channel-weighted squared error
			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
			                 + (ew_g * uncor_dist1 * uncor_dist1)
			                 + (ew_b * uncor_dist2 * uncor_dist2)
			                 + (ew_a * uncor_dist3 * uncor_dist3);

			haccumulate(uncor_errorsumv, uncor_err, mask);

			// Process samechroma data
			vfloat samec_param = (data_r * l_samec_bs0)
			                   + (data_g * l_samec_bs1)
			                   + (data_b * l_samec_bs2)
			                   + (data_a * l_samec_bs3);

			// No amod term needed here; see the assert above
			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
			vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;

			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
			                 + (ew_g * samec_dist1 * samec_dist1)
			                 + (ew_b * samec_dist2 * samec_dist2)
			                 + (ew_a * samec_dist3 * samec_dist3);

			haccumulate(samec_errorsumv, samec_err, mask);

			lane_ids += vint(ASTCENC_SIMD_WIDTH);
		}

		// Turn very small numbers and NaNs into a small number
		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
		line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
	}

	uncor_error = hadd_s(uncor_errorsumv);
	samec_error = hadd_s(samec_errorsumv);
}
  665. /* See header for documentation. */
  666. void compute_error_squared_rgb(
  667. const partition_info& pi,
  668. const image_block& blk,
  669. partition_lines3 plines[BLOCK_MAX_PARTITIONS],
  670. float& uncor_error,
  671. float& samec_error
  672. ) {
  673. unsigned int partition_count = pi.partition_count;
  674. promise(partition_count > 0);
  675. vfloatacc uncor_errorsumv = vfloatacc::zero();
  676. vfloatacc samec_errorsumv = vfloatacc::zero();
  677. for (unsigned int partition = 0; partition < partition_count; partition++)
  678. {
  679. partition_lines3& pl = plines[partition];
  680. const uint8_t *texel_indexes = pi.texels_of_partition[partition];
  681. unsigned int texel_count = pi.partition_texel_count[partition];
  682. promise(texel_count > 0);
  683. processed_line3 l_uncor = pl.uncor_pline;
  684. processed_line3 l_samec = pl.samec_pline;
  685. // Vectorize some useful scalar inputs
  686. vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
  687. vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
  688. vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
  689. vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
  690. vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
  691. vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
  692. vfloat l_samec_bs0(l_samec.bs.lane<0>());
  693. vfloat l_samec_bs1(l_samec.bs.lane<1>());
  694. vfloat l_samec_bs2(l_samec.bs.lane<2>());
  695. assert(all(l_samec.amod == vfloat4(0.0f)));
  696. vfloat uncor_loparamv(1e10f);
  697. vfloat uncor_hiparamv(-1e10f);
  698. vfloat ew_r(blk.channel_weight.lane<0>());
  699. vfloat ew_g(blk.channel_weight.lane<1>());
  700. vfloat ew_b(blk.channel_weight.lane<2>());
  701. // This implementation over-shoots, but this is safe as we initialize the weights array
  702. // to extend the last value. This means min/max are not impacted, but we need to mask
  703. // out the dummy values when we compute the line weighting.
  704. vint lane_ids = vint::lane_id();
  705. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  706. {
  707. vmask mask = lane_ids < vint(texel_count);
  708. vint texel_idxs(texel_indexes + i);
  709. vfloat data_r = gatherf(blk.data_r, texel_idxs);
  710. vfloat data_g = gatherf(blk.data_g, texel_idxs);
  711. vfloat data_b = gatherf(blk.data_b, texel_idxs);
  712. vfloat uncor_param = (data_r * l_uncor_bs0)
  713. + (data_g * l_uncor_bs1)
  714. + (data_b * l_uncor_bs2);
  715. uncor_loparamv = min(uncor_param, uncor_loparamv);
  716. uncor_hiparamv = max(uncor_param, uncor_hiparamv);
  717. vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
  718. + (uncor_param * l_uncor_bs0);
  719. vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
  720. + (uncor_param * l_uncor_bs1);
  721. vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
  722. + (uncor_param * l_uncor_bs2);
  723. vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
  724. + (ew_g * uncor_dist1 * uncor_dist1)
  725. + (ew_b * uncor_dist2 * uncor_dist2);
  726. haccumulate(uncor_errorsumv, uncor_err, mask);
  727. // Process samechroma data
  728. vfloat samec_param = (data_r * l_samec_bs0)
  729. + (data_g * l_samec_bs1)
  730. + (data_b * l_samec_bs2);
  731. vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
  732. vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
  733. vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
  734. vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
  735. + (ew_g * samec_dist1 * samec_dist1)
  736. + (ew_b * samec_dist2 * samec_dist2);
  737. haccumulate(samec_errorsumv, samec_err, mask);
  738. lane_ids += vint(ASTCENC_SIMD_WIDTH);
  739. }
  740. // Turn very small numbers and NaNs into a small number
  741. float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
  742. pl.line_length = astc::max(uncor_linelen, 1e-7f);
  743. }
  744. uncor_error = hadd_s(uncor_errorsumv);
  745. samec_error = hadd_s(samec_errorsumv);
  746. }
  747. #endif