bc6h.glsl 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720
  1. #[versions]
  2. signed = "#define SIGNED";
  3. unsigned = "#define QUALITY"; // The "Quality" preset causes artifacting on signed data, so for now it's exclusive to unsigned.
  4. #[compute]
  5. #version 450
  6. #include "CrossPlatformSettings_piece_all.glsl"
  7. #include "UavCrossPlatform_piece_all.glsl"
  8. #VERSION_DEFINES
  // Converts each component of `value` to a 16-bit half float and returns the raw
  // half bit patterns, each stored as a float (the integer value of the bits, not
  // the number they represent).
  float3 f32tof16(float3 value) {
      // The second packed half is always 0.0, so only the low 16 bits are populated.
      uint bitsX = packHalf2x16(float2(value.x, 0.0));
      uint bitsY = packHalf2x16(float2(value.y, 0.0));
      uint bitsZ = packHalf2x16(float2(value.z, 0.0));
      return float3(bitsX, bitsY, bitsZ);
  }
  // Interprets the low 16 bits of each component of `value` as a half float and
  // returns the three decoded numbers.
  float3 f16tof32(uint3 value) {
      float decodedX = unpackHalf2x16(value.x).x;
      float decodedY = unpackHalf2x16(value.y).x;
      float decodedZ = unpackHalf2x16(value.z).x;
      return float3(decodedX, decodedY, decodedZ);
  }
  // Scalar overload: returns the half-float bit pattern of `value`, stored as a float.
  float f32tof16(float value) {
      uint halfBits = packHalf2x16(float2(value, 0.0));
      return float(halfBits);
  }
  // Scalar overload: decodes the low 16 bits of `value` as a half float.
  float f16tof32(uint value) {
      float2 bothHalves = unpackHalf2x16(value);
      return bothHalves.x;
  }
  // Source texture; main() reads it through the OGRE_Gather* helpers.
  layout(binding = 0) uniform sampler2D srcTexture;

  // Destination image; main() stores one uint4 (one encoded 4x4 block) per invocation.
  layout(binding = 1, rgba32ui) uniform restrict writeonly uimage2D dstTexture;

  layout(push_constant, std430) uniform Params {
      // Used by main() as the UV-space step of one source texel
      // (presumably 1 / texture size - confirm against the caller).
      float2 p_textureSizeRcp;
      uint padding0;
      uint padding1;
  } params;
  // Largest finite half-float value; used to seed the running minimums.
  const float HALF_MAX = 65504.0f;
  // Number of two-subset partition patterns tried by the QUALITY path.
  const uint PATTERN_NUM = 32u;
  // Smallest encodable value; used to seed the running maximums.
  #ifndef SIGNED
  const float HALF_MIN = 0.0f;
  #else
  const float HALF_MIN = -65504.0f;
  #endif
  40. #ifdef SIGNED
  41. // https://github.com/godotengine/godot/pull/96377#issuecomment-2323488254
  42. // https://github.com/godotengine/godot/pull/96377#issuecomment-2323450950
  // True when `a` is strictly below zero (so 0.0 and NaN count as non-negative).
  bool isNegative(float a) {
      bool belowZero = a < 0.0f;
      return belowZero;
  }
  // Squared log2 error between two values: log2((b + 1) / (a + 1))^2.
  // Callers in this file pass non-negative values only.
  float CalcSignlessMSLE(float a, float b) {
      float ratio = (b + 1.0f) / (a + 1.0f);
      float logErr = log2(ratio);
      return logErr * logErr;
  }
  // Error for a pair of values on opposite sides of zero: the squared log
  // error of each magnitude measured against zero, added together.
  float CrossCalcMSLE(float a, float b) {
      float errA = CalcSignlessMSLE(0.0f, abs(a));
      float errB = CalcSignlessMSLE(0.0f, abs(b));
      float result = 0.0f;
      result += errA;
      result += errB;
      return result;
  }
  // Sum over the three channels of the squared log error between `a` and `b`.
  // Channels whose signs differ use CrossCalcMSLE; otherwise the magnitudes are
  // compared with CalcSignlessMSLE.
  float CalcMSLE(float3 a, float3 b) {
      float total = 0.0f;
      for (int c = 0; c < 3; ++c) {
          bool signsDiffer = isNegative(a[c]) != isNegative(b[c]);
          if (signsDiffer) {
              total += CrossCalcMSLE(a[c], b[c]);
          } else {
              total += CalcSignlessMSLE(abs(a[c]), abs(b[c]));
          }
      }
      return total;
  }
  76. #else
  // Sum over the three channels of log2((b + 1) / (a + 1))^2.
  float CalcMSLE(float3 a, float3 b) {
      float3 ratio = (b + 1.0f) / (a + 1.0f);
      float3 logErr = log2(ratio);
      float3 sq = logErr * logErr;
      return sq.x + sq.y + sq.z;
  }
  82. #endif
  // Returns the texel index (2, 8 or 15) reported as the second fix-up texel for
  // partition pattern `i`. Each mask has bit `i` set for the patterns that map to
  // its value; everything else maps to 15. `i` must be below 32.
  uint PatternFixupID(uint i) {
      const uint FIXUP_2_MASK = 0xCD1A0000u; // == 3441033216u
      const uint FIXUP_8_MASK = 0x32640000u; // == 845414400u
      uint fixup = 15u;
      if (((FIXUP_2_MASK >> i) & 0x1u) != 0u) {
          fixup = 2u;
      }
      if (((FIXUP_8_MASK >> i) & 0x1u) != 0u) {
          fixup = 8u;
      }
      return fixup;
  }
  // Returns the subset (0 or 1) that texel `i` (0..15) belongs to in partition
  // pattern `p`. Two 16-bit masks are packed per word: the even pattern in the low
  // half, the odd pattern in the high half. Patterns >= 32 yield 0.
  uint Pattern(uint p, uint i) {
      uint pairIndex = p >> 1u;
      bool isOdd = (p & 1u) != 0u;

      // Select chain rather than an indexed table, mirroring the original form.
      uint packed = 0u;
      packed = pairIndex == 0u ? 0x8888CCCCu : packed;
      packed = pairIndex == 1u ? 0xECC8EEEEu : packed;
      packed = pairIndex == 2u ? 0xFEECC880u : packed;
      packed = pairIndex == 3u ? 0xEC80FEC8u : packed;
      packed = pairIndex == 4u ? 0xFFECC800u : packed;
      packed = pairIndex == 5u ? 0xE800FE80u : packed;
      packed = pairIndex == 6u ? 0xFF00FFE8u : packed;
      packed = pairIndex == 7u ? 0xF000FFF0u : packed;
      packed = pairIndex == 8u ? 0x008EF710u : packed;
      packed = pairIndex == 9u ? 0x08CE7100u : packed;
      packed = pairIndex == 10u ? 0x7310008Cu : packed;
      packed = pairIndex == 11u ? 0x8CCE3100u : packed;
      packed = pairIndex == 12u ? 0x3110088Cu : packed;
      packed = pairIndex == 13u ? 0x366C6666u : packed;
      packed = pairIndex == 14u ? 0x0FF017E8u : packed;
      packed = pairIndex == 15u ? 0x399C718Eu : packed;

      uint mask = isOdd ? (packed >> 16u) : packed;
      return (mask >> i) & 0x1u;
  }
  113. #ifndef SIGNED
  114. //UF
  // Maps each channel's half-float bit pattern onto a 7-bit scale: bits * 128 / 0x7C00.
  // The result is not rounded; callers floor or truncate it.
  float3 Quantize7(float3 x) {
      const float halfRange = 0x7bff + 1.0f; // one past the bits of the largest finite half
      float3 halfBits = f32tof16(x);
      return (halfBits * 128.0f) / halfRange;
  }
  // Maps each channel's half-float bit pattern onto a 9-bit scale: bits * 512 / 0x7C00.
  // The result is not rounded; callers floor or truncate it.
  float3 Quantize9(float3 x) {
      const float halfRange = 0x7bff + 1.0f; // one past the bits of the largest finite half
      float3 halfBits = f32tof16(x);
      return (halfBits * 512.0f) / halfRange;
  }
  // Maps each channel's half-float bit pattern onto a 10-bit scale: bits * 1024 / 0x7C00.
  // The result is not rounded; callers floor or truncate it.
  float3 Quantize10(float3 x) {
      const float halfRange = 0x7bff + 1.0f; // one past the bits of the largest finite half
      float3 halfBits = f32tof16(x);
      return (halfBits * 1024.0f) / halfRange;
  }
  // Expands a 7-bit quantized endpoint to the 16-bit interpolation range:
  // (x * 65536 + 0x8000) / 128, evaluated in floating point.
  float3 Unquantize7(float3 x) {
      float3 scaled = x * 65536.0f + 0x8000;
      return scaled / 128.0f;
  }
  // Expands a 9-bit quantized endpoint to the 16-bit interpolation range:
  // (x * 65536 + 0x8000) / 512, evaluated in floating point.
  float3 Unquantize9(float3 x) {
      float3 scaled = x * 65536.0f + 0x8000;
      return scaled / 512.0f;
  }
  // Expands a 10-bit quantized endpoint to the 16-bit interpolation range:
  // (x * 65536 + 0x8000) / 1024, evaluated in floating point.
  float3 Unquantize10(float3 x) {
      float3 scaled = x * 65536.0f + 0x8000;
      return scaled / 1024.0f;
  }
  // Blends two unquantized endpoints with a 0..64 `weight`, rescales by 31/4096
  // (the /64 of the blend combined with the *31/64 final scale), truncates to an
  // integer half bit pattern and decodes it to float.
  float3 FinishUnquantize(float3 endpoint0Unq, float3 endpoint1Unq, float weight) {
      float3 blended = endpoint0Unq * (64.0f - weight) + endpoint1Unq * weight + 32.0f;
      uint3 halfBits = uint3(blended * (31.0f / 4096.0f));
      return f16tof32(halfBits);
  }
  137. #else
  138. //SF
  // Per-channel sign selector: +1 where the channel is >= 0, otherwise -1.
  // Unlike sign(), zero maps to +1.
  float3 cmpSign(float3 value) {
      return float3(value.x >= 0.0f ? 1.0f : -1.0f,
              value.y >= 0.0f ? 1.0f : -1.0f,
              value.z >= 0.0f ? 1.0f : -1.0f);
  }
  // Signed 7-bit quantization: the magnitude's half bit pattern is scaled to
  // 6 bits (bits * 64 / 0x7C00) and the original sign is re-applied.
  float3 Quantize7(float3 x) {
      const float halfRange = 0x7bff + 1.0f; // one past the bits of the largest finite half
      float3 signVal = cmpSign(x);
      float3 magnitudeBits = f32tof16(abs(x));
      return signVal * (magnitudeBits * 64.0f) / halfRange;
  }
  // Signed 9-bit quantization: the magnitude's half bit pattern is scaled to
  // 8 bits (bits * 256 / 0x7C00) and the original sign is re-applied.
  float3 Quantize9(float3 x) {
      const float halfRange = 0x7bff + 1.0f; // one past the bits of the largest finite half
      float3 signVal = cmpSign(x);
      float3 magnitudeBits = f32tof16(abs(x));
      return signVal * (magnitudeBits * 256.0f) / halfRange;
  }
  // Signed 10-bit quantization: the magnitude's half bit pattern is scaled to
  // 9 bits (bits * 512 / 0x7C00) and the original sign is re-applied.
  float3 Quantize10(float3 x) {
      const float halfRange = 0x7bff + 1.0f; // one past the bits of the largest finite half
      float3 signVal = cmpSign(x);
      float3 magnitudeBits = f32tof16(abs(x));
      return signVal * (magnitudeBits * 512.0f) / halfRange;
  }
  // Expands a signed 7-bit quantized endpoint to the signed 16-bit interpolation
  // range: sign * (|x| * 32768 + 0x4000) / 64, saturating at +/-32767 once the
  // magnitude reaches 64.
  //
  // Fix: the saturated value now keeps the sign of the input. Previously any
  // magnitude >= 64 produced +32767 even for negative endpoints, whereas the
  // BC6H reference unquantizer negates the result after saturating.
  float3 Unquantize7(float3 x) {
      float3 signVal = sign(x);
      float3 mag = abs(x);
      float3 finalVal = signVal * (mag * 32768.0f + 0x4000) / 64.0f;
      finalVal.x = mag.x >= 64.0f ? signVal.x * 32767.0 : finalVal.x;
      finalVal.y = mag.y >= 64.0f ? signVal.y * 32767.0 : finalVal.y;
      finalVal.z = mag.z >= 64.0f ? signVal.z * 32767.0 : finalVal.z;
      return finalVal;
  }
  // Expands a signed 9-bit quantized endpoint to the signed 16-bit interpolation
  // range: sign * (|x| * 32768 + 0x4000) / 256, saturating at +/-32767 once the
  // magnitude reaches 256.
  //
  // Fix: the saturated value now keeps the sign of the input. Previously any
  // magnitude >= 256 produced +32767 even for negative endpoints, whereas the
  // BC6H reference unquantizer negates the result after saturating.
  float3 Unquantize9(float3 x) {
      float3 signVal = sign(x);
      float3 mag = abs(x);
      float3 finalVal = signVal * (mag * 32768.0f + 0x4000) / 256.0f;
      finalVal.x = mag.x >= 256.0f ? signVal.x * 32767.0 : finalVal.x;
      finalVal.y = mag.y >= 256.0f ? signVal.y * 32767.0 : finalVal.y;
      finalVal.z = mag.z >= 256.0f ? signVal.z * 32767.0 : finalVal.z;
      return finalVal;
  }
  // Expands a signed 10-bit quantized endpoint to the signed 16-bit interpolation
  // range: sign * (|x| * 32768 + 0x4000) / 512, saturating at +/-32767 once the
  // magnitude reaches 512.
  //
  // Fix: the saturated value now keeps the sign of the input. Previously any
  // magnitude >= 512 produced +32767 even for negative endpoints, whereas the
  // BC6H reference unquantizer negates the result after saturating.
  float3 Unquantize10(float3 x) {
      float3 signVal = sign(x);
      float3 mag = abs(x);
      float3 finalVal = signVal * (mag * 32768.0f + 0x4000) / 512.0f;
      finalVal.x = mag.x >= 512.0f ? signVal.x * 32767.0 : finalVal.x;
      finalVal.y = mag.y >= 512.0f ? signVal.y * 32767.0 : finalVal.y;
      finalVal.z = mag.z >= 512.0f ? signVal.z * 32767.0 : finalVal.z;
      return finalVal;
  }
  // Blends two signed unquantized endpoints with a 0..64 `weight`, rescales by
  // 31/2048 (the /64 of the blend combined with the *31/32 final scale) and
  // decodes the result as a half float.
  //
  // Fix: a negative blend used to be converted straight to uint, which is
  // undefined for negative floats. The value is now encoded the way a signed
  // half is laid out: truncated magnitude in the low 15 bits plus the 0x8000
  // sign bit for negative channels.
  float3 FinishUnquantize(float3 endpoint0Unq, float3 endpoint1Unq, float weight) {
      float3 comp = (endpoint0Unq * (64.0f - weight) + endpoint1Unq * weight + 32.0f) * (31.0f / 2048.0f);
      uint3 halfBits = uint3(abs(comp));
      halfBits.x |= comp.x < 0.0f ? 0x8000u : 0u;
      halfBits.y |= comp.y < 0.0f ? 0x8000u : 0u;
      halfBits.z |= comp.z < 0.0f ? 0x8000u : 0u;
      return f16tof32(halfBits);
  }
  189. #endif
  // Exchanges the contents of two float3 variables.
  void Swap(inout float3 a, inout float3 b) {
      float3 held = b;
      b = a;
      a = held;
  }
  // Exchanges the contents of two float variables.
  void Swap(inout float a, inout float b) {
      float held = b;
      b = a;
      a = held;
  }
  // Maps a texel's position between two endpoint positions to a 3-bit palette
  // index (0..7). The scale and bias constants are kept from the original.
  //
  // Fix: when both endpoints sit at the same position (a flat block or subset)
  // the division was by zero, feeding NaN/Inf into clamp() and uint(), whose
  // results are then undefined. A zero (or NaN) span now yields index 0.
  uint ComputeIndex3(float texelPos, float endPoint0Pos, float endPoint1Pos) {
      float span = endPoint1Pos - endPoint0Pos;
      // abs(span) > 0 is false for both 0 and NaN, so either takes the safe path.
      float r = abs(span) > 0.0f ? (texelPos - endPoint0Pos) / span : 0.0f;
      return uint(clamp(r * 6.98182f + 0.00909f + 0.5f, 0.0f, 7.0f));
  }
  // Maps a texel's position between two endpoint positions to a 4-bit palette
  // index (0..15). The scale and bias constants are kept from the original.
  //
  // Fix: when both endpoints sit at the same position (a flat block) the
  // division was by zero, feeding NaN/Inf into clamp() and uint(), whose
  // results are then undefined. A zero (or NaN) span now yields index 0.
  uint ComputeIndex4(float texelPos, float endPoint0Pos, float endPoint1Pos) {
      float span = endPoint1Pos - endPoint0Pos;
      // abs(span) > 0 is false for both 0 and NaN, so either takes the safe path.
      float r = abs(span) > 0.0f ? (texelPos - endPoint0Pos) / span : 0.0f;
      return uint(clamp(r * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f));
  }
  // Converts each channel of `v1` to an integer, keeps only the bits selected by
  // `mask`, and ORs in `signFlag` for channels that were negative. The result is
  // written back to `v1` as floats holding those non-negative bit patterns.
  void SignExtend(inout float3 v1, uint mask, uint signFlag) {
      int keepBits = int(mask);
      int negativeBit = int(signFlag);
      int3 q = int3(v1);
      for (int c = 0; c < 3; ++c) {
          int tagged = q[c] & keepBits;
          if (q[c] < 0) {
              tagged |= negativeBit;
          }
          q[c] = tagged;
      }
      v1 = float3(q);
  }
  // Encodes a block with mode 11 (2x 10-bit endpoints, one subset, 4-bit indices).
  //
  // block     - receives the 128-bit encoded block; it is fully overwritten.
  // blockMSLE - receives the summed log-space error of the encoded block.
  // texels    - the 16 source texels in row-major order.
  //
  // Changes relative to the previous version:
  //  * Endpoint refinement now selects per channel. The expression
  //    `texels[i] == blockMin ? a : b` is a component-wise select in the HLSL
  //    this was ported from, but in GLSL `==` on vectors yields one bool, so a
  //    texel was only skipped when all three channels matched and the refinement
  //    was effectively disabled. equal() + mix() restores the per-channel select.
  //  * `block` is reset here instead of relying on the caller having zeroed it
  //    (EncodeP2Pattern already resets it before packing).
  void EncodeP1(inout uint4 block, inout float blockMSLE, float3 texels[16]) {
      // compute endpoints (min/max RGB bbox)
      float3 blockMin = texels[0];
      float3 blockMax = texels[0];
      for (uint i = 1u; i < 16u; ++i) {
          blockMin = min(blockMin, texels[i]);
          blockMax = max(blockMax, texels[i]);
      }

      // Per channel, find the smallest value above the bbox minimum and the
      // largest value below the bbox maximum.
      float3 refinedBlockMin = blockMax;
      float3 refinedBlockMax = blockMin;
      for (uint i = 0u; i < 16u; ++i) {
          float3 minCandidate = mix(texels[i], refinedBlockMin, equal(texels[i], blockMin));
          float3 maxCandidate = mix(texels[i], refinedBlockMax, equal(texels[i], blockMax));
          refinedBlockMin = min(refinedBlockMin, minCandidate);
          refinedBlockMax = max(refinedBlockMax, maxCandidate);
      }

      // refine endpoints in log2 RGB space: pull each bound inwards towards the
      // refined value, by at most 1/32 of the log-space extent.
      float3 logBlockMax = log2(blockMax + 1.0f);
      float3 logBlockMin = log2(blockMin + 1.0f);
      float3 logRefinedBlockMax = log2(refinedBlockMax + 1.0f);
      float3 logRefinedBlockMin = log2(refinedBlockMin + 1.0f);
      float3 logBlockMaxExt = (logBlockMax - logBlockMin) * (1.0f / 32.0f);
      logBlockMin += min(logRefinedBlockMin - logBlockMin, logBlockMaxExt);
      logBlockMax -= min(logBlockMax - logRefinedBlockMax, logBlockMaxExt);
      blockMin = exp2(logBlockMin) - 1.0f;
      blockMax = exp2(logBlockMax) - 1.0f;

      // Projection axis: the bbox diagonal scaled so its components sum to 1.
      float3 blockDir = blockMax - blockMin;
      blockDir = blockDir / (blockDir.x + blockDir.y + blockDir.z);

      float3 endpoint0 = Quantize10(blockMin);
      float3 endpoint1 = Quantize10(blockMax);
      float endPoint0Pos = f32tof16(dot(blockMin, blockDir));
      float endPoint1Pos = f32tof16(dot(blockMax, blockDir));

  #ifdef SIGNED
      int maxVal10 = 0x1FF;
      endpoint0 = clamp(endpoint0, -maxVal10, maxVal10);
      endpoint1 = clamp(endpoint1, -maxVal10, maxVal10);
  #endif

      // check if endpoint swap is required: texel 0 only gets 3 index bits, so
      // its 4-bit index must land in the lower half of the palette.
      float fixupTexelPos = f32tof16(dot(texels[0], blockDir));
      uint fixupIndex = ComputeIndex4(fixupTexelPos, endPoint0Pos, endPoint1Pos);
      if (fixupIndex > 7u) {
          Swap(endPoint0Pos, endPoint1Pos);
          Swap(endpoint0, endpoint1);
      }

      // compute indices
      uint indices[16];
      for (uint i = 0u; i < 16u; ++i) {
          float texelPos = f32tof16(dot(texels[i], blockDir));
          indices[i] = ComputeIndex4(texelPos, endPoint0Pos, endPoint1Pos);
      }

      // compute compression error (MSLE)
      float3 endpoint0Unq = Unquantize10(endpoint0);
      float3 endpoint1Unq = Unquantize10(endpoint1);
      float msle = 0.0f;
      for (uint i = 0u; i < 16u; ++i) {
          float weight = floor((indices[i] * 64.0f) / 15.0f + 0.5f);
          float3 texelUnc = FinishUnquantize(endpoint0Unq, endpoint1Unq, weight);
          msle += CalcMSLE(texels[i], texelUnc);
      }

  #ifdef SIGNED
      SignExtend(endpoint0, 0x1FF, 0x200);
      SignExtend(endpoint1, 0x1FF, 0x200);
  #endif

      // encode block for mode 11
      blockMSLE = msle;
      block = uint4(0x03u, 0u, 0u, 0u);

      // endpoints: six 10-bit fields starting at bit 5
      block.x |= uint(endpoint0.x) << 5u;
      block.x |= uint(endpoint0.y) << 15u;
      block.x |= uint(endpoint0.z) << 25u;
      block.y |= uint(endpoint0.z) >> 7u;
      block.y |= uint(endpoint1.x) << 3u;
      block.y |= uint(endpoint1.y) << 13u;
      block.y |= uint(endpoint1.z) << 23u;
      block.z |= uint(endpoint1.z) >> 9u;

      // indices: texel 0 uses 3 bits at z[1..3], every other texel a 4-bit slot
      block.z |= indices[0] << 1u;
      for (uint i = 1u; i < 8u; ++i) {
          block.z |= indices[i] << (4u * i);
      }
      for (uint i = 8u; i < 16u; ++i) {
          block.w |= indices[i] << (4u * (i - 8u));
      }
  }
  // Squared distance from `Point` to the line through `PointOnLine` along
  // `LineDirection`: the offset minus its projection onto the direction.
  // The result is a true distance only when `LineDirection` has unit length.
  float DistToLineSq(float3 PointOnLine, float3 LineDirection, float3 Point) {
      float3 offset = Point - PointOnLine;
      float along = dot(offset, LineDirection);
      float3 perpendicular = offset - along * LineDirection;
      return dot(perpendicular, perpendicular);
  }
  // Gets the deviation from the source data of a particular pattern (smaller is better).
  //
  // For each of the pattern's two subsets the RGB bounding box is computed, and
  // the score is the sum of squared distances from every texel to its subset's
  // bounding-box diagonal.
  //
  // Fix: when a subset is flat (all its texels equal) the diagonal is the zero
  // vector, and normalize() of it produced NaN. The whole score then became
  // NaN, which can never compare as "less than" the current best, so exactly
  // the patterns that separate two flat regions were rejected. A zero diagonal
  // now yields a zero direction, for which DistToLineSq degrades to the squared
  // distance to the subset minimum (0 for a flat subset).
  float EvaluateP2Pattern(uint pattern, float3 texels[16]) {
      float3 p0BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX);
      float3 p0BlockMax = float3(HALF_MIN, HALF_MIN, HALF_MIN);
      float3 p1BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX);
      float3 p1BlockMax = float3(HALF_MIN, HALF_MIN, HALF_MIN);

      for (uint i = 0u; i < 16u; ++i) {
          uint paletteID = Pattern(pattern, i);
          if (paletteID == 0u) {
              p0BlockMin = min(p0BlockMin, texels[i]);
              p0BlockMax = max(p0BlockMax, texels[i]);
          } else {
              p1BlockMin = min(p1BlockMin, texels[i]);
              p1BlockMax = max(p1BlockMax, texels[i]);
          }
      }

      // Guarded normalization: never divide by a zero length.
      float3 p0Extent = p0BlockMax - p0BlockMin;
      float3 p1Extent = p1BlockMax - p1BlockMin;
      float3 p0BlockDir = dot(p0Extent, p0Extent) > 0.0f ? normalize(p0Extent) : float3(0.0f, 0.0f, 0.0f);
      float3 p1BlockDir = dot(p1Extent, p1Extent) > 0.0f ? normalize(p1Extent) : float3(0.0f, 0.0f, 0.0f);

      float sqDistanceFromLine = 0.0f;
      for (uint i = 0u; i < 16u; ++i) {
          uint paletteID = Pattern(pattern, i);
          if (paletteID == 0u) {
              sqDistanceFromLine += DistToLineSq(p0BlockMin, p0BlockDir, texels[i]);
          } else {
              sqDistanceFromLine += DistToLineSq(p1BlockMin, p1BlockDir, texels[i]);
          }
      }

      return sqDistanceFromLine;
  }
  // Encodes a block with either mode 2 (7-bit base, 3x 6-bit delta), or mode 6
  // (9-bit base, 3x 5-bit delta). Both use pattern encoding.
  //
  // Both candidates are built from the same endpoints and indices; the one with
  // the lower error is kept, and `block` / `blockMSLE` are only overwritten when
  // that error is strictly lower than the incoming `blockMSLE`.
  //
  // block     - in: current best encoding; out: the two-subset encoding if better.
  // blockMSLE - in: error of the current best encoding; out: error of the kept one.
  // pattern   - partition pattern to use (0..31).
  // texels    - the 16 source texels in row-major order.
  void EncodeP2Pattern(inout uint4 block, inout float blockMSLE, uint pattern, float3 texels[16]) {
      // Subset membership is a pure function of (pattern, texel), so look it up
      // once while gathering each subset's RGB bounding box.
      uint subsetOf[16];
      float3 lo0 = float3(HALF_MAX, HALF_MAX, HALF_MAX);
      float3 hi0 = float3(HALF_MIN, HALF_MIN, HALF_MIN);
      float3 lo1 = float3(HALF_MAX, HALF_MAX, HALF_MAX);
      float3 hi1 = float3(HALF_MIN, HALF_MIN, HALF_MIN);
      for (uint t = 0u; t < 16u; ++t) {
          subsetOf[t] = Pattern(pattern, t);
          if (subsetOf[t] == 0u) {
              lo0 = min(lo0, texels[t]);
              hi0 = max(hi0, texels[t]);
          } else {
              lo1 = min(lo1, texels[t]);
              hi1 = max(hi1, texels[t]);
          }
      }

      // Projection axes: each bbox diagonal scaled so its components sum to 1.
      float3 axis0 = hi0 - lo0;
      float3 axis1 = hi1 - lo1;
      axis0 = axis0 / (axis0.x + axis0.y + axis0.z);
      axis1 = axis1 / (axis1.x + axis1.y + axis1.z);

      float posA0 = f32tof16(dot(lo0, axis0));
      float posB0 = f32tof16(dot(hi0, axis0));
      float posA1 = f32tof16(dot(lo1, axis1));
      float posB1 = f32tof16(dot(hi1, axis1));

      // The fix-up texels (texel 0 and texel `fixupID`) are stored with one bit
      // less, so their index must fall in the lower half of the palette; swap
      // the subset's endpoints when it does not.
      uint fixupID = PatternFixupID(pattern);
      uint anchorIndex0 = ComputeIndex3(f32tof16(dot(texels[0], axis0)), posA0, posB0);
      uint anchorIndex1 = ComputeIndex3(f32tof16(dot(texels[fixupID], axis1)), posA1, posB1);
      if (anchorIndex0 > 3u) {
          Swap(posA0, posB0);
          Swap(lo0, hi0);
      }
      if (anchorIndex1 > 3u) {
          Swap(posA1, posB1);
          Swap(lo1, hi1);
      }

      // 3-bit index of every texel along its own subset's axis.
      uint indices[16];
      for (uint t = 0u; t < 16u; ++t) {
          if (subsetOf[t] == 0u) {
              indices[t] = ComputeIndex3(f32tof16(dot(texels[t], axis0)), posA0, posB0);
          } else {
              indices[t] = ComputeIndex3(f32tof16(dot(texels[t], axis1)), posA1, posB1);
          }
      }

      // Quantize the four endpoints at both precisions.
      float3 endpoint760 = floor(Quantize7(lo0));
      float3 endpoint761 = floor(Quantize7(hi0));
      float3 endpoint762 = floor(Quantize7(lo1));
      float3 endpoint763 = floor(Quantize7(hi1));

      float3 endpoint950 = floor(Quantize9(lo0));
      float3 endpoint951 = floor(Quantize9(hi0));
      float3 endpoint952 = floor(Quantize9(lo1));
      float3 endpoint953 = floor(Quantize9(hi1));

      // Endpoints 1..3 are stored as deltas from endpoint 0, clamped to the
      // symmetric range used here for the 6-bit / 5-bit delta fields.
      const float maxDelta76 = 31.0f; // 0x1F
      const float maxDelta95 = 15.0f; // 0xF
      endpoint761 = clamp(endpoint761 - endpoint760, -maxDelta76, maxDelta76);
      endpoint762 = clamp(endpoint762 - endpoint760, -maxDelta76, maxDelta76);
      endpoint763 = clamp(endpoint763 - endpoint760, -maxDelta76, maxDelta76);
      endpoint951 = clamp(endpoint951 - endpoint950, -maxDelta95, maxDelta95);
      endpoint952 = clamp(endpoint952 - endpoint950, -maxDelta95, maxDelta95);
      endpoint953 = clamp(endpoint953 - endpoint950, -maxDelta95, maxDelta95);

  #ifdef SIGNED
      // The deltas above were taken against the unclamped base, as before.
      endpoint760 = clamp(endpoint760, -63.0f, 63.0f);   // 0x3F
      endpoint950 = clamp(endpoint950, -255.0f, 255.0f); // 0xFF
  #endif

      // Reconstruct what a decoder would see for both candidates.
      float3 endpoint760Unq = Unquantize7(endpoint760);
      float3 endpoint761Unq = Unquantize7(endpoint760 + endpoint761);
      float3 endpoint762Unq = Unquantize7(endpoint760 + endpoint762);
      float3 endpoint763Unq = Unquantize7(endpoint760 + endpoint763);
      float3 endpoint950Unq = Unquantize9(endpoint950);
      float3 endpoint951Unq = Unquantize9(endpoint950 + endpoint951);
      float3 endpoint952Unq = Unquantize9(endpoint950 + endpoint952);
      float3 endpoint953Unq = Unquantize9(endpoint950 + endpoint953);

      float msle76 = 0.0f;
      float msle95 = 0.0f;
      for (uint t = 0u; t < 16u; ++t) {
          float weight = floor((indices[t] * 64.0f) / 7.0f + 0.5f);
          float3 texelUnc76;
          float3 texelUnc95;
          if (subsetOf[t] == 0u) {
              texelUnc76 = FinishUnquantize(endpoint760Unq, endpoint761Unq, weight);
              texelUnc95 = FinishUnquantize(endpoint950Unq, endpoint951Unq, weight);
          } else {
              texelUnc76 = FinishUnquantize(endpoint762Unq, endpoint763Unq, weight);
              texelUnc95 = FinishUnquantize(endpoint952Unq, endpoint953Unq, weight);
          }
          msle76 += CalcMSLE(texels[t], texelUnc76);
          msle95 += CalcMSLE(texels[t], texelUnc95);
      }

      // Turn the (possibly negative) deltas into fixed-width bit patterns.
      SignExtend(endpoint761, 0x1F, 0x20);
      SignExtend(endpoint762, 0x1F, 0x20);
      SignExtend(endpoint763, 0x1F, 0x20);
      SignExtend(endpoint951, 0xF, 0x10);
      SignExtend(endpoint952, 0xF, 0x10);
      SignExtend(endpoint953, 0xF, 0x10);

  #ifdef SIGNED
      SignExtend(endpoint760, 0x3F, 0x40);
      SignExtend(endpoint950, 0xFF, 0x100);
  #endif

      // Keep the existing encoding unless this one is strictly better. Written
      // as !(a < b) so that a NaN error also keeps the existing encoding.
      float p2MSLE = min(msle76, msle95);
      if (!(p2MSLE < blockMSLE)) {
          return;
      }

      // encode block
      blockMSLE = p2MSLE;
      block = uint4(0u, 0u, 0u, 0u);

      // w = base endpoint, x / y / z = the three deltas; .x/.y/.z = R/G/B.
      // The per-line comments name the field as <channel><endpoint>[bits].
      bool use76 = p2MSLE == msle76;
      uint3 eW = uint3(use76 ? endpoint760 : endpoint950);
      uint3 eX = uint3(use76 ? endpoint761 : endpoint951);
      uint3 eY = uint3(use76 ? endpoint762 : endpoint952);
      uint3 eZ = uint3(use76 ? endpoint763 : endpoint953);

      if (use76) {
          // 7.6
          block.x = 0x1u;
          block.x |= (eY.y & 0x20u) >> 3u;  // gy[5]
          block.x |= (eZ.y & 0x30u) >> 1u;  // gz[4], gz[5]
          block.x |= eW.x << 5u;            // rw
          block.x |= (eZ.z & 0x03u) << 12u; // bz[0], bz[1]
          block.x |= (eY.z & 0x10u) << 10u; // by[4]
          block.x |= eW.y << 15u;           // gw
          block.x |= (eY.z & 0x20u) << 17u; // by[5]
          block.x |= (eZ.z & 0x04u) << 21u; // bz[2]
          block.x |= (eY.y & 0x10u) << 20u; // gy[4]
          block.x |= eW.z << 25u;           // bw
          block.y |= (eZ.z & 0x08u) >> 3u;  // bz[3]
          block.y |= (eZ.z & 0x20u) >> 4u;  // bz[5]
          block.y |= (eZ.z & 0x10u) >> 2u;  // bz[4]
          block.y |= eX.x << 3u;            // rx
          block.y |= (eY.y & 0x0Fu) << 9u;  // gy[3:0]
          block.y |= eX.y << 13u;           // gx
          block.y |= (eZ.y & 0x0Fu) << 19u; // gz[3:0]
          block.y |= eX.z << 23u;           // bx
          block.y |= (eY.z & 0x07u) << 29u; // by[2:0]
          block.z |= (eY.z & 0x08u) >> 3u;  // by[3]
          block.z |= eY.x << 1u;            // ry
          block.z |= eZ.x << 7u;            // rz
      } else {
          // 9.5
          block.x = 0xEu;
          block.x |= eW.x << 5u;            // rw
          block.x |= (eY.z & 0x10u) << 10u; // by[4]
          block.x |= eW.y << 15u;           // gw
          block.x |= (eY.y & 0x10u) << 20u; // gy[4]
          block.x |= eW.z << 25u;           // bw (low 7 bits)
          block.y |= eW.z >> 7u;            // bw (high 2 bits)
          block.y |= (eZ.z & 0x10u) >> 2u;  // bz[4]
          block.y |= eX.x << 3u;            // rx
          block.y |= (eZ.y & 0x10u) << 4u;  // gz[4]
          block.y |= (eY.y & 0x0Fu) << 9u;  // gy[3:0]
          block.y |= eX.y << 13u;           // gx
          block.y |= (eZ.z & 0x01u) << 18u; // bz[0]
          block.y |= (eZ.y & 0x0Fu) << 19u; // gz[3:0]
          block.y |= eX.z << 23u;           // bx
          block.y |= (eZ.z & 0x02u) << 27u; // bz[1]
          block.y |= eY.z << 29u;           // by[2:0]; higher bits shift out of the word
          block.z |= (eY.z & 0x08u) >> 3u;  // by[3]
          block.z |= eY.x << 1u;            // ry
          block.z |= (eZ.z & 0x04u) << 4u;  // bz[2]
          block.z |= eZ.x << 7u;            // rz
          block.z |= (eZ.z & 0x08u) << 9u;  // bz[3]
      }

      block.z |= pattern << 13u;

      // Indices start at bit 18 of block.z and run on into block.w. Texel 0 and
      // the fix-up texel take 2 bits, every other texel 3. For the fix-up values
      // PatternFixupID can return (2, 8, 15) this reproduces the three
      // hand-unrolled layouts bit for bit.
      uint bitPos = 18u;
      for (uint t = 0u; t < 16u; ++t) {
          uint width = (t == 0u || t == fixupID) ? 2u : 3u;
          if (bitPos < 32u) {
              block.z |= indices[t] << bitPos;
              if (bitPos + width > 32u) {
                  // The field straddles the z/w boundary.
                  block.w |= indices[t] >> (32u - bitPos);
              }
          } else {
              block.w |= indices[t] << (bitPos - 32u);
          }
          bitPos += width;
      }
  }
  // 8x8 invocations per workgroup; main() encodes one 4x4 block per invocation.
  layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
  // Encodes the 4x4 texel block addressed by gl_GlobalInvocationID.xy and stores
  // the resulting uint4 at the same coordinate of dstTexture.
  void main() {
      // gather texels for current 4x4 block
      //  0  1  2  3
      //  4  5  6  7
      //  8  9 10 11
      // 12 13 14 15
      float2 texelStep = params.p_textureSizeRcp;
      float2 uv = gl_GlobalInvocationID.xy * texelStep * 4.0f + texelStep;

      // One gather per 2x2 quad: top-left, top-right, bottom-left, bottom-right.
      float2 quadUV[4];
      quadUV[0] = uv;
      quadUV[1] = uv + float2(2.0f * texelStep.x, 0.0f);
      quadUV[2] = uv + float2(0.0f, 2.0f * texelStep.y);
      quadUV[3] = uv + float2(2.0f * texelStep.x, 2.0f * texelStep.y);

      float3 texels[16];
      for (uint quad = 0u; quad < 4u; ++quad) {
          float4 red = OGRE_GatherRed(srcTexture, pointSampler, quadUV[quad]);
          float4 green = OGRE_GatherGreen(srcTexture, pointSampler, quadUV[quad]);
          float4 blue = OGRE_GatherBlue(srcTexture, pointSampler, quadUV[quad]);

          // Index of the quad's top-left texel inside the 4x4 block.
          uint base = (quad & 1u) * 2u + (quad >> 1u) * 8u;
          // Gather components map to the quad as: w, z on the top row; x, y below.
          texels[base] = float3(red.w, green.w, blue.w);
          texels[base + 1u] = float3(red.z, green.z, blue.z);
          texels[base + 4u] = float3(red.x, green.x, blue.x);
          texels[base + 5u] = float3(red.y, green.y, blue.y);
      }

      uint4 block = uint4(0u, 0u, 0u, 0u);
      float blockMSLE = 0.0f;

      // Single-subset encoding is always produced first.
      EncodeP1(block, blockMSLE, texels);

  #ifdef QUALITY
      // Pick the partition pattern with the lowest line-fit score, then let the
      // two-subset encoder replace the block if its error is lower.
      uint bestPattern = 0;
      float bestScore = EvaluateP2Pattern(0, texels);
      for (uint candidate = 1u; candidate < PATTERN_NUM; ++candidate) {
          float score = EvaluateP2Pattern(candidate, texels);
          if (score < bestScore) {
              bestScore = score;
              bestPattern = candidate;
          }
      }

      EncodeP2Pattern(block, blockMSLE, bestPattern, texels);
  #endif

      imageStore(dstTexture, int2(gl_GlobalInvocationID.xy), block);
  }