astcenc_ideal_endpoints_and_weights.cpp 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2023 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  18. /**
  19. * @brief Functions for computing color endpoints and texel weights.
  20. */
  21. #include <cassert>
  22. #include "astcenc_internal.h"
  23. #include "astcenc_vecmathlib.h"
  24. /**
  25. * @brief Compute the infilled weight for N texel indices in a decimated grid.
  26. *
  27. * @param di The weight grid decimation to use.
  28. * @param weights The decimated weight values to use.
  29. * @param index The first texel index to interpolate.
  30. *
  31. * @return The interpolated weight for the given set of SIMD_WIDTH texels.
  32. */
  33. static vfloat bilinear_infill_vla(
  34. const decimation_info& di,
  35. const float* weights,
  36. unsigned int index
  37. ) {
  38. // Load the bilinear filter texel weight indexes in the decimated grid
  39. vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
  40. vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
  41. vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
  42. vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
  43. // Load the bilinear filter weights from the decimated grid
  44. vfloat weight_val0 = gatherf(weights, weight_idx0);
  45. vfloat weight_val1 = gatherf(weights, weight_idx1);
  46. vfloat weight_val2 = gatherf(weights, weight_idx2);
  47. vfloat weight_val3 = gatherf(weights, weight_idx3);
  48. // Load the weight contribution factors for each decimated weight
  49. vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
  50. vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
  51. vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index);
  52. vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index);
  53. // Compute the bilinear interpolation to generate the per-texel weight
  54. return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
  55. (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
  56. }
  57. /**
  58. * @brief Compute the infilled weight for N texel indices in a decimated grid.
  59. *
  60. * This is specialized version which computes only two weights per texel for
  61. * encodings that are only decimated in a single axis.
  62. *
  63. * @param di The weight grid decimation to use.
  64. * @param weights The decimated weight values to use.
  65. * @param index The first texel index to interpolate.
  66. *
  67. * @return The interpolated weight for the given set of SIMD_WIDTH texels.
  68. */
  69. static vfloat bilinear_infill_vla_2(
  70. const decimation_info& di,
  71. const float* weights,
  72. unsigned int index
  73. ) {
  74. // Load the bilinear filter texel weight indexes in the decimated grid
  75. vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
  76. vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
  77. // Load the bilinear filter weights from the decimated grid
  78. vfloat weight_val0 = gatherf(weights, weight_idx0);
  79. vfloat weight_val1 = gatherf(weights, weight_idx1);
  80. // Load the weight contribution factors for each decimated weight
  81. vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
  82. vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
  83. // Compute the bilinear interpolation to generate the per-texel weight
  84. return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
  85. }
  86. /**
  87. * @brief Compute the ideal endpoints and weights for 1 color component.
  88. *
  89. * @param blk The image block color data to compress.
  90. * @param pi The partition info for the current trial.
  91. * @param[out] ei The computed ideal endpoints and weights.
  92. * @param component The color component to compute.
  93. */
  94. static void compute_ideal_colors_and_weights_1_comp(
  95. const image_block& blk,
  96. const partition_info& pi,
  97. endpoints_and_weights& ei,
  98. unsigned int component
  99. ) {
  100. unsigned int partition_count = pi.partition_count;
  101. ei.ep.partition_count = partition_count;
  102. promise(partition_count > 0);
  103. unsigned int texel_count = blk.texel_count;
  104. promise(texel_count > 0);
  105. float error_weight;
  106. const float* data_vr = nullptr;
  107. assert(component < BLOCK_MAX_COMPONENTS);
  108. switch (component)
  109. {
  110. case 0:
  111. error_weight = blk.channel_weight.lane<0>();
  112. data_vr = blk.data_r;
  113. break;
  114. case 1:
  115. error_weight = blk.channel_weight.lane<1>();
  116. data_vr = blk.data_g;
  117. break;
  118. case 2:
  119. error_weight = blk.channel_weight.lane<2>();
  120. data_vr = blk.data_b;
  121. break;
  122. default:
  123. assert(component == 3);
  124. error_weight = blk.channel_weight.lane<3>();
  125. data_vr = blk.data_a;
  126. break;
  127. }
  128. vmask4 sep_mask = vint4::lane_id() == vint4(component);
  129. bool is_constant_wes { true };
  130. float partition0_len_sq { 0.0f };
  131. for (unsigned int i = 0; i < partition_count; i++)
  132. {
  133. float lowvalue { 1e10f };
  134. float highvalue { -1e10f };
  135. unsigned int partition_texel_count = pi.partition_texel_count[i];
  136. for (unsigned int j = 0; j < partition_texel_count; j++)
  137. {
  138. unsigned int tix = pi.texels_of_partition[i][j];
  139. float value = data_vr[tix];
  140. lowvalue = astc::min(value, lowvalue);
  141. highvalue = astc::max(value, highvalue);
  142. }
  143. if (highvalue <= lowvalue)
  144. {
  145. lowvalue = 0.0f;
  146. highvalue = 1e-7f;
  147. }
  148. float length = highvalue - lowvalue;
  149. float length_squared = length * length;
  150. float scale = 1.0f / length;
  151. if (i == 0)
  152. {
  153. partition0_len_sq = length_squared;
  154. }
  155. else
  156. {
  157. is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
  158. }
  159. for (unsigned int j = 0; j < partition_texel_count; j++)
  160. {
  161. unsigned int tix = pi.texels_of_partition[i][j];
  162. float value = (data_vr[tix] - lowvalue) * scale;
  163. value = astc::clamp1f(value);
  164. ei.weights[tix] = value;
  165. ei.weight_error_scale[tix] = length_squared * error_weight;
  166. assert(!astc::isnan(ei.weight_error_scale[tix]));
  167. }
  168. ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
  169. ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
  170. }
  171. // Zero initialize any SIMD over-fetch
  172. unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
  173. for (unsigned int i = texel_count; i < texel_count_simd; i++)
  174. {
  175. ei.weights[i] = 0.0f;
  176. ei.weight_error_scale[i] = 0.0f;
  177. }
  178. ei.is_constant_weight_error_scale = is_constant_wes;
  179. }
  180. /**
  181. * @brief Compute the ideal endpoints and weights for 2 color components.
  182. *
  183. * @param blk The image block color data to compress.
  184. * @param pi The partition info for the current trial.
  185. * @param[out] ei The computed ideal endpoints and weights.
  186. * @param component1 The first color component to compute.
  187. * @param component2 The second color component to compute.
  188. */
  189. static void compute_ideal_colors_and_weights_2_comp(
  190. const image_block& blk,
  191. const partition_info& pi,
  192. endpoints_and_weights& ei,
  193. int component1,
  194. int component2
  195. ) {
  196. unsigned int partition_count = pi.partition_count;
  197. ei.ep.partition_count = partition_count;
  198. promise(partition_count > 0);
  199. unsigned int texel_count = blk.texel_count;
  200. promise(texel_count > 0);
  201. partition_metrics pms[BLOCK_MAX_PARTITIONS];
  202. float error_weight;
  203. const float* data_vr = nullptr;
  204. const float* data_vg = nullptr;
  205. if (component1 == 0 && component2 == 1)
  206. {
  207. error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
  208. data_vr = blk.data_r;
  209. data_vg = blk.data_g;
  210. }
  211. else if (component1 == 0 && component2 == 2)
  212. {
  213. error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
  214. data_vr = blk.data_r;
  215. data_vg = blk.data_b;
  216. }
  217. else // (component1 == 1 && component2 == 2)
  218. {
  219. assert(component1 == 1 && component2 == 2);
  220. error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
  221. data_vr = blk.data_g;
  222. data_vg = blk.data_b;
  223. }
  224. compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
  225. bool is_constant_wes { true };
  226. float partition0_len_sq { 0.0f };
  227. vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
  228. vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
  229. for (unsigned int i = 0; i < partition_count; i++)
  230. {
  231. vfloat4 dir = pms[i].dir;
  232. if (hadd_s(dir) < 0.0f)
  233. {
  234. dir = vfloat4::zero() - dir;
  235. }
  236. line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
  237. float lowparam { 1e10f };
  238. float highparam { -1e10f };
  239. unsigned int partition_texel_count = pi.partition_texel_count[i];
  240. for (unsigned int j = 0; j < partition_texel_count; j++)
  241. {
  242. unsigned int tix = pi.texels_of_partition[i][j];
  243. vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
  244. float param = dot_s(point - line.a, line.b);
  245. ei.weights[tix] = param;
  246. lowparam = astc::min(param, lowparam);
  247. highparam = astc::max(param, highparam);
  248. }
  249. // It is possible for a uniform-color partition to produce length=0;
  250. // this causes NaN issues so set to small value to avoid this problem
  251. if (highparam <= lowparam)
  252. {
  253. lowparam = 0.0f;
  254. highparam = 1e-7f;
  255. }
  256. float length = highparam - lowparam;
  257. float length_squared = length * length;
  258. float scale = 1.0f / length;
  259. if (i == 0)
  260. {
  261. partition0_len_sq = length_squared;
  262. }
  263. else
  264. {
  265. is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
  266. }
  267. for (unsigned int j = 0; j < partition_texel_count; j++)
  268. {
  269. unsigned int tix = pi.texels_of_partition[i][j];
  270. float idx = (ei.weights[tix] - lowparam) * scale;
  271. idx = astc::clamp1f(idx);
  272. ei.weights[tix] = idx;
  273. ei.weight_error_scale[tix] = length_squared * error_weight;
  274. assert(!astc::isnan(ei.weight_error_scale[tix]));
  275. }
  276. vfloat4 lowvalue = line.a + line.b * lowparam;
  277. vfloat4 highvalue = line.a + line.b * highparam;
  278. vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
  279. vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
  280. ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
  281. ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
  282. }
  283. // Zero initialize any SIMD over-fetch
  284. unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
  285. for (unsigned int i = texel_count; i < texel_count_simd; i++)
  286. {
  287. ei.weights[i] = 0.0f;
  288. ei.weight_error_scale[i] = 0.0f;
  289. }
  290. ei.is_constant_weight_error_scale = is_constant_wes;
  291. }
  292. /**
  293. * @brief Compute the ideal endpoints and weights for 3 color components.
  294. *
  295. * @param blk The image block color data to compress.
  296. * @param pi The partition info for the current trial.
  297. * @param[out] ei The computed ideal endpoints and weights.
  298. * @param omitted_component The color component excluded from the calculation.
  299. */
  300. static void compute_ideal_colors_and_weights_3_comp(
  301. const image_block& blk,
  302. const partition_info& pi,
  303. endpoints_and_weights& ei,
  304. unsigned int omitted_component
  305. ) {
  306. unsigned int partition_count = pi.partition_count;
  307. ei.ep.partition_count = partition_count;
  308. promise(partition_count > 0);
  309. unsigned int texel_count = blk.texel_count;
  310. promise(texel_count > 0);
  311. partition_metrics pms[BLOCK_MAX_PARTITIONS];
  312. float error_weight;
  313. const float* data_vr = nullptr;
  314. const float* data_vg = nullptr;
  315. const float* data_vb = nullptr;
  316. if (omitted_component == 0)
  317. {
  318. error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
  319. data_vr = blk.data_g;
  320. data_vg = blk.data_b;
  321. data_vb = blk.data_a;
  322. }
  323. else if (omitted_component == 1)
  324. {
  325. error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
  326. data_vr = blk.data_r;
  327. data_vg = blk.data_b;
  328. data_vb = blk.data_a;
  329. }
  330. else if (omitted_component == 2)
  331. {
  332. error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
  333. data_vr = blk.data_r;
  334. data_vg = blk.data_g;
  335. data_vb = blk.data_a;
  336. }
  337. else
  338. {
  339. assert(omitted_component == 3);
  340. error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
  341. data_vr = blk.data_r;
  342. data_vg = blk.data_g;
  343. data_vb = blk.data_b;
  344. }
  345. error_weight = error_weight * (1.0f / 3.0f);
  346. if (omitted_component == 3)
  347. {
  348. compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
  349. }
  350. else
  351. {
  352. compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
  353. }
  354. bool is_constant_wes { true };
  355. float partition0_len_sq { 0.0f };
  356. for (unsigned int i = 0; i < partition_count; i++)
  357. {
  358. vfloat4 dir = pms[i].dir;
  359. if (hadd_rgb_s(dir) < 0.0f)
  360. {
  361. dir = vfloat4::zero() - dir;
  362. }
  363. line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
  364. float lowparam { 1e10f };
  365. float highparam { -1e10f };
  366. unsigned int partition_texel_count = pi.partition_texel_count[i];
  367. for (unsigned int j = 0; j < partition_texel_count; j++)
  368. {
  369. unsigned int tix = pi.texels_of_partition[i][j];
  370. vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
  371. float param = dot3_s(point - line.a, line.b);
  372. ei.weights[tix] = param;
  373. lowparam = astc::min(param, lowparam);
  374. highparam = astc::max(param, highparam);
  375. }
  376. // It is possible for a uniform-color partition to produce length=0;
  377. // this causes NaN issues so set to small value to avoid this problem
  378. if (highparam <= lowparam)
  379. {
  380. lowparam = 0.0f;
  381. highparam = 1e-7f;
  382. }
  383. float length = highparam - lowparam;
  384. float length_squared = length * length;
  385. float scale = 1.0f / length;
  386. if (i == 0)
  387. {
  388. partition0_len_sq = length_squared;
  389. }
  390. else
  391. {
  392. is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
  393. }
  394. for (unsigned int j = 0; j < partition_texel_count; j++)
  395. {
  396. unsigned int tix = pi.texels_of_partition[i][j];
  397. float idx = (ei.weights[tix] - lowparam) * scale;
  398. idx = astc::clamp1f(idx);
  399. ei.weights[tix] = idx;
  400. ei.weight_error_scale[tix] = length_squared * error_weight;
  401. assert(!astc::isnan(ei.weight_error_scale[tix]));
  402. }
  403. vfloat4 ep0 = line.a + line.b * lowparam;
  404. vfloat4 ep1 = line.a + line.b * highparam;
  405. vfloat4 bmin = blk.data_min;
  406. vfloat4 bmax = blk.data_max;
  407. assert(omitted_component < BLOCK_MAX_COMPONENTS);
  408. switch (omitted_component)
  409. {
  410. case 0:
  411. ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
  412. ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
  413. break;
  414. case 1:
  415. ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
  416. ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
  417. break;
  418. case 2:
  419. ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
  420. ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
  421. break;
  422. default:
  423. ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
  424. ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
  425. break;
  426. }
  427. }
  428. // Zero initialize any SIMD over-fetch
  429. unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
  430. for (unsigned int i = texel_count; i < texel_count_simd; i++)
  431. {
  432. ei.weights[i] = 0.0f;
  433. ei.weight_error_scale[i] = 0.0f;
  434. }
  435. ei.is_constant_weight_error_scale = is_constant_wes;
  436. }
  437. /**
  438. * @brief Compute the ideal endpoints and weights for 4 color components.
  439. *
  440. * @param blk The image block color data to compress.
  441. * @param pi The partition info for the current trial.
  442. * @param[out] ei The computed ideal endpoints and weights.
  443. */
  444. static void compute_ideal_colors_and_weights_4_comp(
  445. const image_block& blk,
  446. const partition_info& pi,
  447. endpoints_and_weights& ei
  448. ) {
  449. const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
  450. unsigned int partition_count = pi.partition_count;
  451. unsigned int texel_count = blk.texel_count;
  452. promise(texel_count > 0);
  453. promise(partition_count > 0);
  454. partition_metrics pms[BLOCK_MAX_PARTITIONS];
  455. compute_avgs_and_dirs_4_comp(pi, blk, pms);
  456. bool is_constant_wes { true };
  457. float partition0_len_sq { 0.0f };
  458. for (unsigned int i = 0; i < partition_count; i++)
  459. {
  460. vfloat4 dir = pms[i].dir;
  461. if (hadd_rgb_s(dir) < 0.0f)
  462. {
  463. dir = vfloat4::zero() - dir;
  464. }
  465. line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
  466. float lowparam { 1e10f };
  467. float highparam { -1e10f };
  468. unsigned int partition_texel_count = pi.partition_texel_count[i];
  469. for (unsigned int j = 0; j < partition_texel_count; j++)
  470. {
  471. unsigned int tix = pi.texels_of_partition[i][j];
  472. vfloat4 point = blk.texel(tix);
  473. float param = dot_s(point - line.a, line.b);
  474. ei.weights[tix] = param;
  475. lowparam = astc::min(param, lowparam);
  476. highparam = astc::max(param, highparam);
  477. }
  478. // It is possible for a uniform-color partition to produce length=0;
  479. // this causes NaN issues so set to small value to avoid this problem
  480. if (highparam <= lowparam)
  481. {
  482. lowparam = 0.0f;
  483. highparam = 1e-7f;
  484. }
  485. float length = highparam - lowparam;
  486. float length_squared = length * length;
  487. float scale = 1.0f / length;
  488. if (i == 0)
  489. {
  490. partition0_len_sq = length_squared;
  491. }
  492. else
  493. {
  494. is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
  495. }
  496. ei.ep.endpt0[i] = line.a + line.b * lowparam;
  497. ei.ep.endpt1[i] = line.a + line.b * highparam;
  498. for (unsigned int j = 0; j < partition_texel_count; j++)
  499. {
  500. unsigned int tix = pi.texels_of_partition[i][j];
  501. float idx = (ei.weights[tix] - lowparam) * scale;
  502. idx = astc::clamp1f(idx);
  503. ei.weights[tix] = idx;
  504. ei.weight_error_scale[tix] = length_squared * error_weight;
  505. assert(!astc::isnan(ei.weight_error_scale[tix]));
  506. }
  507. }
  508. // Zero initialize any SIMD over-fetch
  509. unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
  510. for (unsigned int i = texel_count; i < texel_count_simd; i++)
  511. {
  512. ei.weights[i] = 0.0f;
  513. ei.weight_error_scale[i] = 0.0f;
  514. }
  515. ei.is_constant_weight_error_scale = is_constant_wes;
  516. }
  517. /* See header for documentation. */
  518. void compute_ideal_colors_and_weights_1plane(
  519. const image_block& blk,
  520. const partition_info& pi,
  521. endpoints_and_weights& ei
  522. ) {
  523. bool uses_alpha = !blk.is_constant_channel(3);
  524. if (uses_alpha)
  525. {
  526. compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
  527. }
  528. else
  529. {
  530. compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
  531. }
  532. }
  533. /* See header for documentation. */
  534. void compute_ideal_colors_and_weights_2planes(
  535. const block_size_descriptor& bsd,
  536. const image_block& blk,
  537. unsigned int plane2_component,
  538. endpoints_and_weights& ei1,
  539. endpoints_and_weights& ei2
  540. ) {
  541. const auto& pi = bsd.get_partition_info(1, 0);
  542. bool uses_alpha = !blk.is_constant_channel(3);
  543. assert(plane2_component < BLOCK_MAX_COMPONENTS);
  544. switch (plane2_component)
  545. {
  546. case 0: // Separate weights for red
  547. if (uses_alpha)
  548. {
  549. compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0);
  550. }
  551. else
  552. {
  553. compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2);
  554. }
  555. compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0);
  556. break;
  557. case 1: // Separate weights for green
  558. if (uses_alpha)
  559. {
  560. compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1);
  561. }
  562. else
  563. {
  564. compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2);
  565. }
  566. compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1);
  567. break;
  568. case 2: // Separate weights for blue
  569. if (uses_alpha)
  570. {
  571. compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2);
  572. }
  573. else
  574. {
  575. compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1);
  576. }
  577. compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2);
  578. break;
  579. default: // Separate weights for alpha
  580. assert(uses_alpha);
  581. compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
  582. compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3);
  583. break;
  584. }
  585. }
  586. /* See header for documentation. */
  587. float compute_error_of_weight_set_1plane(
  588. const endpoints_and_weights& eai,
  589. const decimation_info& di,
  590. const float* dec_weight_quant_uvalue
  591. ) {
  592. vfloatacc error_summav = vfloatacc::zero();
  593. unsigned int texel_count = di.texel_count;
  594. promise(texel_count > 0);
  595. // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
  596. if (di.max_texel_weight_count > 2)
  597. {
  598. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  599. {
  600. // Compute the bilinear interpolation of the decimated weight grid
  601. vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
  602. // Compute the error between the computed value and the ideal weight
  603. vfloat actual_values = loada(eai.weights + i);
  604. vfloat diff = current_values - actual_values;
  605. vfloat significance = loada(eai.weight_error_scale + i);
  606. vfloat error = diff * diff * significance;
  607. haccumulate(error_summav, error);
  608. }
  609. }
  610. else if (di.max_texel_weight_count > 1)
  611. {
  612. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  613. {
  614. // Compute the bilinear interpolation of the decimated weight grid
  615. vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
  616. // Compute the error between the computed value and the ideal weight
  617. vfloat actual_values = loada(eai.weights + i);
  618. vfloat diff = current_values - actual_values;
  619. vfloat significance = loada(eai.weight_error_scale + i);
  620. vfloat error = diff * diff * significance;
  621. haccumulate(error_summav, error);
  622. }
  623. }
  624. else
  625. {
  626. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  627. {
  628. // Load the weight set directly, without interpolation
  629. vfloat current_values = loada(dec_weight_quant_uvalue + i);
  630. // Compute the error between the computed value and the ideal weight
  631. vfloat actual_values = loada(eai.weights + i);
  632. vfloat diff = current_values - actual_values;
  633. vfloat significance = loada(eai.weight_error_scale + i);
  634. vfloat error = diff * diff * significance;
  635. haccumulate(error_summav, error);
  636. }
  637. }
  638. // Resolve the final scalar accumulator sum
  639. return hadd_s(error_summav);
  640. }
  641. /* See header for documentation. */
  642. float compute_error_of_weight_set_2planes(
  643. const endpoints_and_weights& eai1,
  644. const endpoints_and_weights& eai2,
  645. const decimation_info& di,
  646. const float* dec_weight_quant_uvalue_plane1,
  647. const float* dec_weight_quant_uvalue_plane2
  648. ) {
  649. vfloatacc error_summav = vfloatacc::zero();
  650. unsigned int texel_count = di.texel_count;
  651. promise(texel_count > 0);
  652. // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
  653. if (di.max_texel_weight_count > 2)
  654. {
  655. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  656. {
  657. // Plane 1
  658. // Compute the bilinear interpolation of the decimated weight grid
  659. vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
  660. // Compute the error between the computed value and the ideal weight
  661. vfloat actual_values1 = loada(eai1.weights + i);
  662. vfloat diff = current_values1 - actual_values1;
  663. vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
  664. // Plane 2
  665. // Compute the bilinear interpolation of the decimated weight grid
  666. vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
  667. // Compute the error between the computed value and the ideal weight
  668. vfloat actual_values2 = loada(eai2.weights + i);
  669. diff = current_values2 - actual_values2;
  670. vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
  671. haccumulate(error_summav, error1 + error2);
  672. }
  673. }
  674. else if (di.max_texel_weight_count > 1)
  675. {
  676. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  677. {
  678. // Plane 1
  679. // Compute the bilinear interpolation of the decimated weight grid
  680. vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
  681. // Compute the error between the computed value and the ideal weight
  682. vfloat actual_values1 = loada(eai1.weights + i);
  683. vfloat diff = current_values1 - actual_values1;
  684. vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
  685. // Plane 2
  686. // Compute the bilinear interpolation of the decimated weight grid
  687. vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
  688. // Compute the error between the computed value and the ideal weight
  689. vfloat actual_values2 = loada(eai2.weights + i);
  690. diff = current_values2 - actual_values2;
  691. vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
  692. haccumulate(error_summav, error1 + error2);
  693. }
  694. }
  695. else
  696. {
  697. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  698. {
  699. // Plane 1
  700. // Load the weight set directly, without interpolation
  701. vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
  702. // Compute the error between the computed value and the ideal weight
  703. vfloat actual_values1 = loada(eai1.weights + i);
  704. vfloat diff = current_values1 - actual_values1;
  705. vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
  706. // Plane 2
  707. // Load the weight set directly, without interpolation
  708. vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
  709. // Compute the error between the computed value and the ideal weight
  710. vfloat actual_values2 = loada(eai2.weights + i);
  711. diff = current_values2 - actual_values2;
  712. vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
  713. haccumulate(error_summav, error1 + error2);
  714. }
  715. }
  716. // Resolve the final scalar accumulator sum
  717. return hadd_s(error_summav);
  718. }
  719. /* See header for documentation. */
  720. void compute_ideal_weights_for_decimation(
  721. const endpoints_and_weights& ei,
  722. const decimation_info& di,
  723. float* dec_weight_ideal_value
  724. ) {
  725. unsigned int texel_count = di.texel_count;
  726. unsigned int weight_count = di.weight_count;
  727. bool is_direct = texel_count == weight_count;
  728. promise(texel_count > 0);
  729. promise(weight_count > 0);
  730. // Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
  731. // can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
  732. // arrays always contain space for 64 elements
  733. unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
  734. storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
  735. // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
  736. // zero-initialized SIMD over-fetch region
  737. if (is_direct)
  738. {
  739. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  740. {
  741. vfloat weight(ei.weights + i);
  742. storea(weight, dec_weight_ideal_value + i);
  743. }
  744. return;
  745. }
  746. // Otherwise compute an estimate and perform single refinement iteration
  747. alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
  748. // Compute an initial average for each decimated weight
  749. bool constant_wes = ei.is_constant_weight_error_scale;
  750. vfloat weight_error_scale(ei.weight_error_scale[0]);
  751. // This overshoots - this is OK as we initialize the array tails in the
  752. // decimation table structures to safe values ...
  753. for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  754. {
  755. // Start with a small value to avoid div-by-zero later
  756. vfloat weight_weight(1e-10f);
  757. vfloat initial_weight = vfloat::zero();
  758. // Accumulate error weighting of all the texels using this weight
  759. vint weight_texel_count(di.weight_texel_count + i);
  760. unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
  761. promise(max_texel_count > 0);
  762. for (unsigned int j = 0; j < max_texel_count; j++)
  763. {
  764. vint texel(di.weight_texels_tr[j] + i);
  765. vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
  766. if (!constant_wes)
  767. {
  768. weight_error_scale = gatherf(ei.weight_error_scale, texel);
  769. }
  770. vfloat contrib_weight = weight * weight_error_scale;
  771. weight_weight += contrib_weight;
  772. initial_weight += gatherf(ei.weights, texel) * contrib_weight;
  773. }
  774. storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
  775. }
  776. // Populate the interpolated weight grid based on the initial average
  777. // Process SIMD-width texel coordinates at at time while we can. Safe to
  778. // over-process full SIMD vectors - the tail is zeroed.
  779. if (di.max_texel_weight_count <= 2)
  780. {
  781. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  782. {
  783. vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
  784. storea(weight, infilled_weights + i);
  785. }
  786. }
  787. else
  788. {
  789. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  790. {
  791. vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
  792. storea(weight, infilled_weights + i);
  793. }
  794. }
  795. // Perform a single iteration of refinement
  796. // Empirically determined step size; larger values don't help but smaller drops image quality
  797. constexpr float stepsize = 0.25f;
  798. constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
  799. for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  800. {
  801. vfloat weight_val = loada(dec_weight_ideal_value + i);
  802. // Accumulate error weighting of all the texels using this weight
  803. // Start with a small value to avoid div-by-zero later
  804. vfloat error_change0(1e-10f);
  805. vfloat error_change1(0.0f);
  806. // Accumulate error weighting of all the texels using this weight
  807. vint weight_texel_count(di.weight_texel_count + i);
  808. unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
  809. promise(max_texel_count > 0);
  810. for (unsigned int j = 0; j < max_texel_count; j++)
  811. {
  812. vint texel(di.weight_texels_tr[j] + i);
  813. vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
  814. if (!constant_wes)
  815. {
  816. weight_error_scale = gatherf(ei.weight_error_scale, texel);
  817. }
  818. vfloat scale = weight_error_scale * contrib_weight;
  819. vfloat old_weight = gatherf(infilled_weights, texel);
  820. vfloat ideal_weight = gatherf(ei.weights, texel);
  821. error_change0 += contrib_weight * scale;
  822. error_change1 += (old_weight - ideal_weight) * scale;
  823. }
  824. vfloat step = (error_change1 * chd_scale) / error_change0;
  825. step = clamp(-stepsize, stepsize, step);
  826. // Update the weight; note this can store negative values
  827. storea(weight_val + step, dec_weight_ideal_value + i);
  828. }
  829. }
  830. /* See header for documentation. */
  831. void compute_quantized_weights_for_decimation(
  832. const decimation_info& di,
  833. float low_bound,
  834. float high_bound,
  835. const float* dec_weight_ideal_value,
  836. float* weight_set_out,
  837. uint8_t* quantized_weight_set,
  838. quant_method quant_level
  839. ) {
  840. int weight_count = di.weight_count;
  841. promise(weight_count > 0);
  842. const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
  843. // The available quant levels, stored with a minus 1 bias
  844. static const float quant_levels_m1[12] {
  845. 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
  846. };
  847. vint steps_m1(get_quant_level(quant_level) - 1);
  848. float quant_level_m1 = quant_levels_m1[quant_level];
  849. // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
  850. // TODO: Oddity to investigate; triggered by test in issue #265.
  851. if (high_bound <= low_bound)
  852. {
  853. low_bound = 0.0f;
  854. high_bound = 1.0f;
  855. }
  856. float rscale = high_bound - low_bound;
  857. float scale = 1.0f / rscale;
  858. float scaled_low_bound = low_bound * scale;
  859. rscale *= 1.0f / 64.0f;
  860. vfloat scalev(scale);
  861. vfloat scaled_low_boundv(scaled_low_bound);
  862. vfloat quant_level_m1v(quant_level_m1);
  863. vfloat rscalev(rscale);
  864. vfloat low_boundv(low_bound);
  865. // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
  866. // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
  867. if (get_quant_level(quant_level) <= 16)
  868. {
  869. vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
  870. vint tab0p;
  871. vtable_prepare(tab0, tab0p);
  872. for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  873. {
  874. vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
  875. ix = clampzo(ix);
  876. // Look up the two closest indexes and return the one that was closest
  877. vfloat ix1 = ix * quant_level_m1v;
  878. vint weightl = float_to_int(ix1);
  879. vint weighth = min(weightl + vint(1), steps_m1);
  880. vint ixli = vtable_8bt_32bi(tab0p, weightl);
  881. vint ixhi = vtable_8bt_32bi(tab0p, weighth);
  882. vfloat ixl = int_to_float(ixli);
  883. vfloat ixh = int_to_float(ixhi);
  884. vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
  885. vint weight = select(ixli, ixhi, mask);
  886. ixl = select(ixl, ixh, mask);
  887. // Invert the weight-scaling that was done initially
  888. storea(ixl * rscalev + low_boundv, weight_set_out + i);
  889. vint scn = pack_low_bytes(weight);
  890. store_nbytes(scn, quantized_weight_set + i);
  891. }
  892. }
  893. else
  894. {
  895. vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant));
  896. vint4 tab1(reinterpret_cast<const int*>(qat.quant_to_unquant + 16));
  897. vint tab0p, tab1p;
  898. vtable_prepare(tab0, tab1, tab0p, tab1p);
  899. for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  900. {
  901. vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
  902. ix = clampzo(ix);
  903. // Look up the two closest indexes and return the one that was closest
  904. vfloat ix1 = ix * quant_level_m1v;
  905. vint weightl = float_to_int(ix1);
  906. vint weighth = min(weightl + vint(1), steps_m1);
  907. vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
  908. vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
  909. vfloat ixl = int_to_float(ixli);
  910. vfloat ixh = int_to_float(ixhi);
  911. vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
  912. vint weight = select(ixli, ixhi, mask);
  913. ixl = select(ixl, ixh, mask);
  914. // Invert the weight-scaling that was done initially
  915. storea(ixl * rscalev + low_boundv, weight_set_out + i);
  916. vint scn = pack_low_bytes(weight);
  917. store_nbytes(scn, quantized_weight_set + i);
  918. }
  919. }
  920. }
  921. /**
  922. * @brief Compute the RGB + offset for a HDR endpoint mode #7.
  923. *
  924. * Since the matrix needed has a regular structure we can simplify the inverse calculation. This
  925. * gives us ~24 multiplications vs. 96 for a generic inverse.
  926. *
  927. * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x);
  928. * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y);
  929. * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z);
  930. * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum);
  931. * mat = invert(mat);
  932. *
  933. * @param rgba_weight_sum Sum of partition component error weights.
  934. * @param weight_weight_sum Sum of partition component error weights * texel weight.
  935. * @param rgbq_sum Sum of partition component error weights * texel weight * color data.
  936. * @param psum Sum of RGB color weights * texel weight^2.
  937. */
  938. static inline vfloat4 compute_rgbo_vector(
  939. vfloat4 rgba_weight_sum,
  940. vfloat4 weight_weight_sum,
  941. vfloat4 rgbq_sum,
  942. float psum
  943. ) {
  944. float X = rgba_weight_sum.lane<0>();
  945. float Y = rgba_weight_sum.lane<1>();
  946. float Z = rgba_weight_sum.lane<2>();
  947. float P = weight_weight_sum.lane<0>();
  948. float Q = weight_weight_sum.lane<1>();
  949. float R = weight_weight_sum.lane<2>();
  950. float S = psum;
  951. float PP = P * P;
  952. float QQ = Q * Q;
  953. float RR = R * R;
  954. float SZmRR = S * Z - RR;
  955. float DT = SZmRR * Y - Z * QQ;
  956. float YP = Y * P;
  957. float QX = Q * X;
  958. float YX = Y * X;
  959. float mZYP = -Z * YP;
  960. float mZQX = -Z * QX;
  961. float mRYX = -R * YX;
  962. float ZQP = Z * Q * P;
  963. float RYP = R * YP;
  964. float RQX = R * QX;
  965. // Compute the reciprocal of matrix determinant
  966. float rdet = 1.0f / (DT * X + mZYP * P);
  967. // Actually compute the adjugate, and then apply 1/det separately
  968. vfloat4 mat0(DT, ZQP, RYP, mZYP);
  969. vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
  970. vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
  971. vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
  972. vfloat4 vect = rgbq_sum * rdet;
  973. return vfloat4(dot_s(mat0, vect),
  974. dot_s(mat1, vect),
  975. dot_s(mat2, vect),
  976. dot_s(mat3, vect));
  977. }
  978. /* See header for documentation. */
  979. void recompute_ideal_colors_1plane(
  980. const image_block& blk,
  981. const partition_info& pi,
  982. const decimation_info& di,
  983. const uint8_t* dec_weights_uquant,
  984. endpoints& ep,
  985. vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
  986. vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
  987. ) {
  988. unsigned int weight_count = di.weight_count;
  989. unsigned int total_texel_count = blk.texel_count;
  990. unsigned int partition_count = pi.partition_count;
  991. promise(weight_count > 0);
  992. promise(total_texel_count > 0);
  993. promise(partition_count > 0);
  994. alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS];
  995. for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  996. {
  997. vint unquant_value(dec_weights_uquant + i);
  998. vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
  999. storea(unquant_valuef, dec_weight + i);
  1000. }
  1001. alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
  1002. float* undec_weight_ref;
  1003. if (di.max_texel_weight_count == 1)
  1004. {
  1005. undec_weight_ref = dec_weight;
  1006. }
  1007. else if (di.max_texel_weight_count <= 2)
  1008. {
  1009. for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
  1010. {
  1011. vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
  1012. storea(weight, undec_weight + i);
  1013. }
  1014. undec_weight_ref = undec_weight;
  1015. }
  1016. else
  1017. {
  1018. for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
  1019. {
  1020. vfloat weight = bilinear_infill_vla(di, dec_weight, i);
  1021. storea(weight, undec_weight + i);
  1022. }
  1023. undec_weight_ref = undec_weight;
  1024. }
  1025. vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
  1026. for (unsigned int i = 0; i < partition_count; i++)
  1027. {
  1028. unsigned int texel_count = pi.partition_texel_count[i];
  1029. const uint8_t *texel_indexes = pi.texels_of_partition[i];
  1030. // Only compute a partition mean if more than one partition
  1031. if (partition_count > 1)
  1032. {
  1033. rgba_sum = vfloat4::zero();
  1034. promise(texel_count > 0);
  1035. for (unsigned int j = 0; j < texel_count; j++)
  1036. {
  1037. unsigned int tix = texel_indexes[j];
  1038. rgba_sum += blk.texel(tix);
  1039. }
  1040. }
  1041. rgba_sum = rgba_sum * blk.channel_weight;
  1042. vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
  1043. vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
  1044. float scale_max = 0.0f;
  1045. float scale_min = 1e10f;
  1046. float wmin1 = 1.0f;
  1047. float wmax1 = 0.0f;
  1048. float left_sum_s = 0.0f;
  1049. float middle_sum_s = 0.0f;
  1050. float right_sum_s = 0.0f;
  1051. vfloat4 color_vec_x = vfloat4::zero();
  1052. vfloat4 color_vec_y = vfloat4::zero();
  1053. vfloat4 scale_vec = vfloat4::zero();
  1054. float weight_weight_sum_s = 1e-17f;
  1055. vfloat4 color_weight = blk.channel_weight;
  1056. float ls_weight = hadd_rgb_s(color_weight);
  1057. for (unsigned int j = 0; j < texel_count; j++)
  1058. {
  1059. unsigned int tix = texel_indexes[j];
  1060. vfloat4 rgba = blk.texel(tix);
  1061. float idx0 = undec_weight_ref[tix];
  1062. float om_idx0 = 1.0f - idx0;
  1063. wmin1 = astc::min(idx0, wmin1);
  1064. wmax1 = astc::max(idx0, wmax1);
  1065. float scale = dot3_s(scale_dir, rgba);
  1066. scale_min = astc::min(scale, scale_min);
  1067. scale_max = astc::max(scale, scale_max);
  1068. left_sum_s += om_idx0 * om_idx0;
  1069. middle_sum_s += om_idx0 * idx0;
  1070. right_sum_s += idx0 * idx0;
  1071. weight_weight_sum_s += idx0;
  1072. vfloat4 color_idx(idx0);
  1073. vfloat4 cwprod = rgba;
  1074. vfloat4 cwiprod = cwprod * color_idx;
  1075. color_vec_y += cwiprod;
  1076. color_vec_x += cwprod - cwiprod;
  1077. scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
  1078. }
  1079. vfloat4 left_sum = vfloat4(left_sum_s) * color_weight;
  1080. vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
  1081. vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
  1082. vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
  1083. color_vec_x = color_vec_x * color_weight;
  1084. color_vec_y = color_vec_y * color_weight;
  1085. // Initialize the luminance and scale vectors with a reasonable default
  1086. float scalediv = scale_min / astc::max(scale_max, 1e-10f);
  1087. scalediv = astc::clamp1f(scalediv);
  1088. vfloat4 sds = scale_dir * scale_max;
  1089. rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
  1090. if (wmin1 >= wmax1 * 0.999f)
  1091. {
  1092. // If all weights in the partition were equal, then just take average of all colors in
  1093. // the partition and use that as both endpoint colors
  1094. vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
  1095. vmask4 notnan_mask = avg == avg;
  1096. ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
  1097. ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
  1098. rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
  1099. }
  1100. else
  1101. {
  1102. // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
  1103. // set of texel weights and pixel colors
  1104. vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
  1105. vfloat4 color_rdet1 = 1.0f / color_det1;
  1106. float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
  1107. float ls_rdet1 = 1.0f / ls_det1;
  1108. vfloat4 color_mss1 = (left_sum * left_sum)
  1109. + (2.0f * middle_sum * middle_sum)
  1110. + (right_sum * right_sum);
  1111. float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
  1112. + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
  1113. + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
  1114. vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
  1115. vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
  1116. vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
  1117. vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
  1118. vmask4 full_mask = det_mask & notnan_mask;
  1119. ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
  1120. ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
  1121. float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
  1122. float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
  1123. if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
  1124. {
  1125. float scalediv2 = scale_ep0 / scale_ep1;
  1126. vfloat4 sdsm = scale_dir * scale_ep1;
  1127. rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
  1128. }
  1129. }
  1130. // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
  1131. if (blk.rgb_lns[0] || blk.alpha_lns[0])
  1132. {
  1133. vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
  1134. float psum = right_sum_s * hadd_rgb_s(color_weight);
  1135. vfloat4 rgbq_sum = color_vec_x + color_vec_y;
  1136. rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
  1137. vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
  1138. rgbo_vectors[i] = rgbovec;
  1139. // We can get a failure due to the use of a singular (non-invertible) matrix
  1140. // If it failed, compute rgbo_vectors[] with a different method ...
  1141. if (astc::isnan(dot_s(rgbovec, rgbovec)))
  1142. {
  1143. vfloat4 v0 = ep.endpt0[i];
  1144. vfloat4 v1 = ep.endpt1[i];
  1145. float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
  1146. avgdif = astc::max(avgdif, 0.0f);
  1147. vfloat4 avg = (v0 + v1) * 0.5f;
  1148. vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
  1149. rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
  1150. }
  1151. }
  1152. }
  1153. }
  1154. /* See header for documentation. */
  1155. void recompute_ideal_colors_2planes(
  1156. const image_block& blk,
  1157. const block_size_descriptor& bsd,
  1158. const decimation_info& di,
  1159. const uint8_t* dec_weights_uquant_plane1,
  1160. const uint8_t* dec_weights_uquant_plane2,
  1161. endpoints& ep,
  1162. vfloat4& rgbs_vector,
  1163. vfloat4& rgbo_vector,
  1164. int plane2_component
  1165. ) {
  1166. unsigned int weight_count = di.weight_count;
  1167. unsigned int total_texel_count = blk.texel_count;
  1168. promise(total_texel_count > 0);
  1169. promise(weight_count > 0);
  1170. alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
  1171. alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
  1172. assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
  1173. for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  1174. {
  1175. vint unquant_value1(dec_weights_uquant_plane1 + i);
  1176. vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
  1177. storea(unquant_value1f, dec_weight_plane1 + i);
  1178. vint unquant_value2(dec_weights_uquant_plane2 + i);
  1179. vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
  1180. storea(unquant_value2f, dec_weight_plane2 + i);
  1181. }
  1182. alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
  1183. alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
  1184. float* undec_weight_plane1_ref;
  1185. float* undec_weight_plane2_ref;
  1186. if (di.max_texel_weight_count == 1)
  1187. {
  1188. undec_weight_plane1_ref = dec_weight_plane1;
  1189. undec_weight_plane2_ref = dec_weight_plane2;
  1190. }
  1191. else if (di.max_texel_weight_count <= 2)
  1192. {
  1193. for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
  1194. {
  1195. vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
  1196. storea(weight, undec_weight_plane1 + i);
  1197. weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
  1198. storea(weight, undec_weight_plane2 + i);
  1199. }
  1200. undec_weight_plane1_ref = undec_weight_plane1;
  1201. undec_weight_plane2_ref = undec_weight_plane2;
  1202. }
  1203. else
  1204. {
  1205. for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
  1206. {
  1207. vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
  1208. storea(weight, undec_weight_plane1 + i);
  1209. weight = bilinear_infill_vla(di, dec_weight_plane2, i);
  1210. storea(weight, undec_weight_plane2 + i);
  1211. }
  1212. undec_weight_plane1_ref = undec_weight_plane1;
  1213. undec_weight_plane2_ref = undec_weight_plane2;
  1214. }
  1215. unsigned int texel_count = bsd.texel_count;
  1216. vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
  1217. vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
  1218. float scale_max = 0.0f;
  1219. float scale_min = 1e10f;
  1220. float wmin1 = 1.0f;
  1221. float wmax1 = 0.0f;
  1222. float wmin2 = 1.0f;
  1223. float wmax2 = 0.0f;
  1224. float left1_sum_s = 0.0f;
  1225. float middle1_sum_s = 0.0f;
  1226. float right1_sum_s = 0.0f;
  1227. float left2_sum_s = 0.0f;
  1228. float middle2_sum_s = 0.0f;
  1229. float right2_sum_s = 0.0f;
  1230. vfloat4 color_vec_x = vfloat4::zero();
  1231. vfloat4 color_vec_y = vfloat4::zero();
  1232. vfloat4 scale_vec = vfloat4::zero();
  1233. vfloat4 weight_weight_sum = vfloat4(1e-17f);
  1234. vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
  1235. vfloat4 color_weight = blk.channel_weight;
  1236. float ls_weight = hadd_rgb_s(color_weight);
  1237. for (unsigned int j = 0; j < texel_count; j++)
  1238. {
  1239. vfloat4 rgba = blk.texel(j);
  1240. float idx0 = undec_weight_plane1_ref[j];
  1241. float om_idx0 = 1.0f - idx0;
  1242. wmin1 = astc::min(idx0, wmin1);
  1243. wmax1 = astc::max(idx0, wmax1);
  1244. float scale = dot3_s(scale_dir, rgba);
  1245. scale_min = astc::min(scale, scale_min);
  1246. scale_max = astc::max(scale, scale_max);
  1247. left1_sum_s += om_idx0 * om_idx0;
  1248. middle1_sum_s += om_idx0 * idx0;
  1249. right1_sum_s += idx0 * idx0;
  1250. float idx1 = undec_weight_plane2_ref[j];
  1251. float om_idx1 = 1.0f - idx1;
  1252. wmin2 = astc::min(idx1, wmin2);
  1253. wmax2 = astc::max(idx1, wmax2);
  1254. left2_sum_s += om_idx1 * om_idx1;
  1255. middle2_sum_s += om_idx1 * idx1;
  1256. right2_sum_s += idx1 * idx1;
  1257. vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
  1258. vfloat4 cwprod = rgba;
  1259. vfloat4 cwiprod = cwprod * color_idx;
  1260. color_vec_y += cwiprod;
  1261. color_vec_x += cwprod - cwiprod;
  1262. scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
  1263. weight_weight_sum += color_idx;
  1264. }
  1265. vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
  1266. vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
  1267. vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight;
  1268. vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
  1269. vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight;
  1270. vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
  1271. vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;
  1272. color_vec_x = color_vec_x * color_weight;
  1273. color_vec_y = color_vec_y * color_weight;
  1274. // Initialize the luminance and scale vectors with a reasonable default
  1275. float scalediv = scale_min / astc::max(scale_max, 1e-10f);
  1276. scalediv = astc::clamp1f(scalediv);
  1277. vfloat4 sds = scale_dir * scale_max;
  1278. rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
  1279. if (wmin1 >= wmax1 * 0.999f)
  1280. {
  1281. // If all weights in the partition were equal, then just take average of all colors in
  1282. // the partition and use that as both endpoint colors
  1283. vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
  1284. vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
  1285. vmask4 notnan_mask = avg == avg;
  1286. vmask4 full_mask = p1_mask & notnan_mask;
  1287. ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
  1288. ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
  1289. rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
  1290. }
  1291. else
  1292. {
  1293. // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
  1294. // set of texel weights and pixel colors
  1295. vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
  1296. vfloat4 color_rdet1 = 1.0f / color_det1;
  1297. float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
  1298. float ls_rdet1 = 1.0f / ls_det1;
  1299. vfloat4 color_mss1 = (left1_sum * left1_sum)
  1300. + (2.0f * middle1_sum * middle1_sum)
  1301. + (right1_sum * right1_sum);
  1302. float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
  1303. + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
  1304. + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
  1305. vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
  1306. vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
  1307. float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
  1308. float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
  1309. vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
  1310. vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
  1311. vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
  1312. vmask4 full_mask = p1_mask & det_mask & notnan_mask;
  1313. ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
  1314. ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
  1315. if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
  1316. {
  1317. float scalediv2 = scale_ep0 / scale_ep1;
  1318. vfloat4 sdsm = scale_dir * scale_ep1;
  1319. rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
  1320. }
  1321. }
  1322. if (wmin2 >= wmax2 * 0.999f)
  1323. {
  1324. // If all weights in the partition were equal, then just take average of all colors in
  1325. // the partition and use that as both endpoint colors
  1326. vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
  1327. vmask4 notnan_mask = avg == avg;
  1328. vmask4 full_mask = p2_mask & notnan_mask;
  1329. ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
  1330. ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
  1331. }
  1332. else
  1333. {
  1334. // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
  1335. // set of texel weights and pixel colors
  1336. vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
  1337. vfloat4 color_rdet2 = 1.0f / color_det2;
  1338. vfloat4 color_mss2 = (left2_sum * left2_sum)
  1339. + (2.0f * middle2_sum * middle2_sum)
  1340. + (right2_sum * right2_sum);
  1341. vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
  1342. vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
  1343. vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
  1344. vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
  1345. vmask4 full_mask = p2_mask & det_mask & notnan_mask;
  1346. ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
  1347. ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
  1348. }
  1349. // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
  1350. if (blk.rgb_lns[0] || blk.alpha_lns[0])
  1351. {
  1352. weight_weight_sum = weight_weight_sum * color_weight;
  1353. float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
  1354. vfloat4 rgbq_sum = color_vec_x + color_vec_y;
  1355. rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
  1356. rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
  1357. // We can get a failure due to the use of a singular (non-invertible) matrix
  1358. // If it failed, compute rgbo_vectors[] with a different method ...
  1359. if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
  1360. {
  1361. vfloat4 v0 = ep.endpt0[0];
  1362. vfloat4 v1 = ep.endpt1[0];
  1363. float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
  1364. avgdif = astc::max(avgdif, 0.0f);
  1365. vfloat4 avg = (v0 + v1) * 0.5f;
  1366. vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
  1367. rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
  1368. }
  1369. }
  1370. }
  1371. #endif