bc1.glsl 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. #[versions]
  2. standard = "";
  3. dithered = "#define BC1_DITHER";
  4. #[compute]
  5. #version 450
  6. #include "CrossPlatformSettings_piece_all.glsl"
  // Largest finite 32-bit float. OptimizeColorsBlock() seeds its min/max search with +/- this value.
  #define FLT_MAX 340282346638528859811704183484516925440.0f

  // Source image; main() fetches a 4x4 texel footprint from it per invocation.
  layout(binding = 0) uniform sampler2D srcTex;

  // Destination image; main() stores one rg32ui texel (endpoints in .x, index mask in .y)
  // per invocation.
  layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;

  // Lookup tables indexed by an 8-bit channel value (c_oMatch5 for red/blue, c_oMatch6 for green).
  // Component [0] feeds the "max" endpoint and component [1] the "min" endpoint.
  layout(std430, binding = 2) readonly restrict buffer globalBuffer {
      float2 c_oMatch5[256];
      float2 c_oMatch6[256];
  };

  layout(push_constant, std430) uniform Params {
      // Upper bound on the number of RefineBlock() passes main() runs per block.
      uint p_numRefinements;
      // Not read anywhere in this shader.
      uint p_padding[3];
  } params;

  // 8x8 invocations per work group; each invocation handles one 4x4 texel block.
  layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
  /// Expands a packed RGB565 value (carried in a float) into three 8-bit-range channels.
  ///
  /// The exact 565 -> 888 conversion would be
  ///     rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )
  /// but stb_dxt uses a different one, and this function follows it:
  ///     rb = floor( rb * ( 256 / 32 + 8 / 32 ) );
  ///     g  = floor( g  * ( 256 / 64 + 4 / 64 ) );
  ///
  /// The reason is not certain. It may be how S3TC specifies decoding (see
  /// http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/ ), or it may simply be
  /// that the form is cheap with integer shifts. The two conversions give almost the same result,
  /// differing by 1 or -1 for very few values, and for an 888 -> 565 -> 888 round trip the
  /// difference possibly does not matter at all.
  float3 rgb565to888(float rgb565) {
      // Split the packed value into its 5-, 6- and 5-bit fields.
      const float red5 = floor(rgb565 / 2048.0f);
      const float green6 = floor(mod(rgb565, 2048.0f) / 32.0f);
      const float blue5 = floor(mod(rgb565, 32.0f));

      // Scale each field up to the 0..255 range using stb_dxt's factors.
      return floor(float3(red5, green6, blue5) * float3(8.25f, 4.0625f, 8.25f));
  }
  /// Packs an RGB colour in the 0..255 range into a single RGB565 value stored in a float.
  /// Each channel is rounded to the nearest 5-bit (red, blue) or 6-bit (green) level.
  float rgb888to565(float3 rgbValue) {
      const float red5 = floor(rgbValue.r * 31.0f / 255.0f + 0.5f);
      const float green6 = floor(rgbValue.g * 63.0f / 255.0f + 0.5f);
      const float blue5 = floor(rgbValue.b * 31.0f / 255.0f + 0.5f);

      // red occupies bits 11..15, green bits 5..10, blue bits 0..4
      return red5 * 2048.0f + green6 * 32.0f + blue5;
  }
  /// Returns the colour one third of the way from `a` towards `b`, rounded down to whole numbers.
  /// Defining STB_DXT_USE_ROUNDING_BIAS selects the variant that adds a 0.5 rounding bias.
  float3 lerp13(float3 a, float3 b) {
      float3 oneThird;
  #ifdef STB_DXT_USE_ROUNDING_BIAS
      // biased: round the offset from a to the nearest integer
      oneThird = a + floor((b - a) * (1.0f / 3.0f) + 0.5f);
  #else
      // unbiased: plain truncating weighted average
      oneThird = floor((2.0f * a + b) / 3.0f);
  #endif
      return oneThird;
  }
  /// Builds the 4-entry palette of a block from its two RGB565 endpoints.
  ///   colors[0] = c0 expanded to 888
  ///   colors[1] = c1 expanded to 888
  ///   colors[2] = 1/3 of the way from colors[0] to colors[1]
  ///   colors[3] = 1/3 of the way from colors[1] to colors[0]
  void EvalColors(out float3 colors[4], float c0, float c1) {
      const float3 endpoint0 = rgb565to888(c0);
      const float3 endpoint1 = rgb565to888(c1);

      colors[0] = endpoint0;
      colors[1] = endpoint1;
      colors[2] = lerp13(endpoint0, endpoint1);
      colors[3] = lerp13(endpoint1, endpoint0);
  }
  /** The color optimization function. (Clever code, part 1)
      Estimates the principal axis of the block's colours with a few power-iteration steps on the
      covariance matrix, then picks the two source pixels lying furthest apart along that axis.
  @param srcPixelsBlock
      The 16 pixels of the block, each packed with packUnorm4x8
  @param outMinEndp16 [out]
      Minimum endpoint, in RGB565
  @param outMaxEndp16 [out]
      Maximum endpoint, in RGB565
  */
  void OptimizeColorsBlock(const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16) {
      // --- colour distribution: sum, per-channel minimum and maximum (still in 0..1) ---
      const float3 firstTexel = unpackUnorm4x8(srcPixelsBlock[0]).xyz;
      float3 sumUnorm = firstTexel;
      float3 loUnorm = firstTexel;
      float3 hiUnorm = firstTexel;
      for (int p = 1; p < 16; ++p) {
          const float3 texel = unpackUnorm4x8(srcPixelsBlock[p]).xyz;
          sumUnorm += texel;
          loUnorm = min(loUnorm, texel);
          hiUnorm = max(hiUnorm, texel);
      }

      // Move to the 0..255 range; the mean is rounded to a whole number.
      const float3 mean255 = round(sumUnorm * 255.0f / 16.0f);
      const float3 hi255 = hiUnorm * 255.0f;
      const float3 lo255 = loUnorm * 255.0f;

      // --- covariance matrix (symmetric, so only the upper triangle is kept) ---
      float covRR = 0.0f;
      float covRG = 0.0f;
      float covRB = 0.0f;
      float covGG = 0.0f;
      float covGB = 0.0f;
      float covBB = 0.0f;
      for (int p = 0; p < 16; ++p) {
          const float3 texel255 = unpackUnorm4x8(srcPixelsBlock[p]).xyz * 255.0f;
          float3 delta = texel255 - mean255;
          covRR += delta.r * delta.r;
          covRG += delta.r * delta.g;
          covRB += delta.r * delta.b;
          covGG += delta.g * delta.g;
          covGB += delta.g * delta.b;
          covBB += delta.b * delta.b;
      }

      // Scale the matrix down before iterating.
      covRR /= 255.0f;
      covRG /= 255.0f;
      covRB /= 255.0f;
      covGG /= 255.0f;
      covGB /= 255.0f;
      covBB /= 255.0f;

      // --- principal axis via power iteration, seeded with the bounding-box diagonal ---
      float3 axis = hi255 - lo255;
      const int nIterPower = 4;
      for (int iter = 0; iter < nIterPower; ++iter) {
          // All three components are computed from the previous axis before it is overwritten.
          axis = float3(axis.r * covRR + axis.g * covRG + axis.b * covRB,
                  axis.r * covRG + axis.g * covGG + axis.b * covGB,
                  axis.r * covRB + axis.g * covGB + axis.b * covBB);
      }

      const float magn = max3(abs(axis.r), abs(axis.g), abs(axis.b));

      // If the axis is too small, default to luminance (JPEG YCbCr luma coefs, scaled by 1000).
      // Otherwise rescale so the largest component has magnitude 512 and drop the fraction.
      const float3 v = (magn < 4.0f) ? float3(299.0f, 587.0f, 114.0f) : trunc(axis * (512.0f / magn));

      // --- pick the source colours at the two extremes along v ---
      float3 minEndpoint, maxEndpoint;
      float smallestDot = FLT_MAX;
      float largestDot = -FLT_MAX;
      for (int p = 0; p < 16; ++p) {
          const float3 texel255 = unpackUnorm4x8(srcPixelsBlock[p]).xyz * 255.0f;
          const float projection = dot(texel255, v);

          if (projection < smallestDot) {
              smallestDot = projection;
              minEndpoint = texel255;
          }

          if (projection > largestDot) {
              largestDot = projection;
              maxEndpoint = texel255;
          }
      }

      outMinEndp16 = rgb888to565(minEndpoint);
      outMaxEndp16 = rgb888to565(maxEndpoint);
  }
  // Private helper of MatchColorsBlock: maps a value projected onto the endpoint axis to a palette
  // index (0..3) by comparing it against the three crossover points between neighbouring palette
  // entries.
  uint MatchColorsPickIndex(float projected, float c0Point, float halfPoint, float c3Point) {
      if (projected < halfPoint) {
          return (projected < c0Point) ? 1u : 3u;
      }
      return (projected < c3Point) ? 2u : 0u;
  }

  // The color matching function.
  //
  // Assigns each of the 16 pixels in srcPixelsBlock (packed with packUnorm4x8) one of the four
  // palette entries in color[] and returns the result as a 32-bit mask: two bits per pixel,
  // pixel 0 in the lowest two bits, each pair holding the chosen index into color[].
  //
  // With BC1_DITHER defined, the quantisation error of the projected value is diffused to
  // neighbouring pixels with Floyd-Steinberg weights (7, 3, 5, 1 out of 16).
  uint MatchColorsBlock(const uint srcPixelsBlock[16], float3 color[4]) {
      uint mask = 0u;
      const float3 dir = color[0] - color[1];
      float stops[4];
      for (int i = 0; i < 4; ++i) {
          stops[i] = dot(color[i], dir);
      }

      // think of the colors as arranged on a line; project point onto that line, then choose
      // next color out of available ones. we compute the crossover points for "best color in top
      // half"/"best in bottom half" and then the same inside that subinterval.
      //
      // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
      // but it's very close and a lot faster.
      // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
      float c0Point = trunc((stops[1] + stops[3]) * 0.5f);
      float halfPoint = trunc((stops[3] + stops[2]) * 0.5f);
      float c3Point = trunc((stops[2] + stops[0]) * 0.5f);

  #ifndef BC1_DITHER
      // the version without dithering is straightforward: walk the pixels from last to first so
      // that pixel 0 ends up in the lowest bits of the mask.
      for (uint i = 16u; i-- > 0u;) {
          const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
          const float dotValue = dot(currColor, dir);
          mask <<= 2u;
          mask |= MatchColorsPickIndex(dotValue, c0Point, halfPoint, c3Point);
      }
  #else
      // with floyd-steinberg dithering
      // ep1 collects the errors of the row being processed, ep2 holds those of the previous row.
      float4 ep1 = float4(0, 0, 0, 0);
      float4 ep2 = float4(0, 0, 0, 0);

      // The diffused error is accumulated in 1/16 units, so the thresholds are scaled to match.
      c0Point *= 16.0f;
      halfPoint *= 16.0f;
      c3Point *= 16.0f;

      for (uint y = 0u; y < 4u; ++y) {
          const uint rowBase = y * 4u;
          float3 currColor;
          float dotValue;
          float ditherDot;
          uint idx;
          uint lmask;

          // pixel 0 of the row
          currColor = unpackUnorm4x8(srcPixelsBlock[rowBase + 0u]).xyz * 255.0f;
          dotValue = dot(currColor, dir);
          ditherDot = (dotValue * 16.0f) + (3.0f * ep2[1] + 5.0f * ep2[0]);
          idx = MatchColorsPickIndex(ditherDot, c0Point, halfPoint, c3Point);
          ep1[0] = dotValue - stops[idx];
          lmask = idx;

          // pixel 1 of the row
          currColor = unpackUnorm4x8(srcPixelsBlock[rowBase + 1u]).xyz * 255.0f;
          dotValue = dot(currColor, dir);
          ditherDot = (dotValue * 16.0f) + (7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0]);
          idx = MatchColorsPickIndex(ditherDot, c0Point, halfPoint, c3Point);
          ep1[1] = dotValue - stops[idx];
          lmask |= idx << 2u;

          // pixel 2 of the row
          currColor = unpackUnorm4x8(srcPixelsBlock[rowBase + 2u]).xyz * 255.0f;
          dotValue = dot(currColor, dir);
          ditherDot = (dotValue * 16.0f) + (7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1]);
          idx = MatchColorsPickIndex(ditherDot, c0Point, halfPoint, c3Point);
          ep1[2] = dotValue - stops[idx];
          lmask |= idx << 4u;

          // pixel 3 of the row.
          // FIX: this used to re-read pixel 2 (rowBase + 2), so the last column of every row was
          // matched against the wrong source colour.
          currColor = unpackUnorm4x8(srcPixelsBlock[rowBase + 3u]).xyz * 255.0f;
          dotValue = dot(currColor, dir);
          ditherDot = (dotValue * 16.0f) + (7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2]);
          idx = MatchColorsPickIndex(ditherDot, c0Point, halfPoint, c3Point);
          ep1[3] = dotValue - stops[idx];
          lmask |= idx << 6u;

          // each row contributes 8 bits (4 pixels x 2 bits)
          mask |= lmask << (y * 8u);

          // swap: this row's errors become the "previous row" for the next iteration
          {
              float4 tmp = ep1;
              ep1 = ep2;
              ep2 = tmp;
          }
      }
  #endif

      return mask;
  }
  // The refinement function. (Clever code, part 2)
  // Tries to optimize colors to suit block contents better.
  // (By solving a least squares system via normal equations+Cramer's rule)
  //
  // srcPixelsBlock   the 16 pixels of the block, packed with packUnorm4x8
  // mask             current 2-bit-per-pixel palette indices (pixel 0 in the lowest bits)
  // inOutMinEndp16   [in/out] minimum endpoint in RGB565, replaced by the refined value
  // inOutMaxEndp16   [in/out] maximum endpoint in RGB565, replaced by the refined value
  //
  // Returns true when at least one endpoint changed.
  bool RefineBlock(const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
          inout float inOutMaxEndp16) {
      float newMin16, newMax16;
      const float oldMin = inOutMinEndp16;
      const float oldMax = inOutMaxEndp16;

      if ((mask ^ (mask << 2u)) < 4u) // all pixels have the same index?
      {
          // yes, linear system would be singular; solve using optimal
          // single-color match on average color
          // (the 8/255 start value is the +8 rounding bias of the /16 average below)
          float3 rgbVal = float3(8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f);
          for (int i = 0; i < 16; ++i) {
              rgbVal += unpackUnorm4x8(srcPixelsBlock[i]).xyz;
          }

          rgbVal = floor(rgbVal * (255.0f / 16.0f));

          newMax16 = c_oMatch5[uint(rgbVal.r)][0] * 2048.0f + //
                  c_oMatch6[uint(rgbVal.g)][0] * 32.0f + //
                  c_oMatch5[uint(rgbVal.b)][0];
          newMin16 = c_oMatch5[uint(rgbVal.r)][1] * 2048.0f + //
                  c_oMatch6[uint(rgbVal.g)][1] * 32.0f + //
                  c_oMatch5[uint(rgbVal.b)][1];
      } else {
          // Weight of the "max" endpoint (in thirds) for each palette index.
          const float w1Tab[4] = { 3.0f, 0.0f, 2.0f, 1.0f };
          // Per-index products of the two weights, packed as
          //     w1*w1 * 65536 + w2*w2 * 256 + w1*w2      (w2 = 3 - w1)
          const float prods[4] = { 589824.0f, 2304.0f, 262402.0f, 66562.0f };
          // ^some magic to save a lot of multiplies in the accumulating loop...
          // (precomputed products of weights for least squares system, accumulated inside one 32-bit
          // register)

          float akku = 0.0f;
          uint cm = mask;
          float3 at1 = float3(0, 0, 0);
          float3 at2 = float3(0, 0, 0);
          for (int i = 0; i < 16; ++i, cm >>= 2u) {
              const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;

              const uint idx = cm & 3u;
              const float w1 = w1Tab[idx];
              akku += prods[idx];
              at1 += currColor * w1;
              at2 += currColor;
          }

          at2 = 3.0f * at2 - at1;

          // extract solutions and decide solvability
          // FIX: the three sums are packed at bits 16, 8 and 0, so they must be separated with
          // 65536 (2^16). The previous 65535 only gave the same result because the packed fields
          // stay small.
          const float xx = floor(akku / 65536.0f);
          const float yy = floor(mod(akku, 65536.0f) / 256.0f);
          const float xy = mod(akku, 256.0f);

          // Common factor 3 / 255 / det, pre-multiplied by the 5-bit (x) or 6-bit (y) channel range.
          float2 f_rb_g;
          f_rb_g.x = 3.0f * 31.0f / 255.0f / (xx * yy - xy * xy);
          f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

          // solve.
          const float3 newMaxVal = clamp(floor((at1 * yy - at2 * xy) * f_rb_g.xyx + 0.5f),
                  float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
          newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

          const float3 newMinVal = clamp(floor((at2 * xx - at1 * xy) * f_rb_g.xyx + 0.5f),
                  float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
          newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
      }

      inOutMinEndp16 = newMin16;
      inOutMaxEndp16 = newMax16;

      return oldMin != newMin16 || oldMax != newMax16;
  }
  299. #ifdef BC1_DITHER
  /// Snaps an 888 colour to the nearest value representable in 565.
  /// 'srcValue' is clamped to 0..255, converted to 565 with rounding, and then expanded back
  /// to 888 with the same factors rgb565to888() uses.
  float3 quant(float3 srcValue) {
      const float3 scaleTo565 = float3(31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f);
      const float3 scaleTo888 = float3(8.25f, 4.0625f, 8.25f);

      const float3 clamped888 = clamp(srcValue, 0.0f, 255.0f);
      // 888 -> 565
      const float3 levels565 = floor(clamped888 * scaleTo565 + 0.5f);
      // 565 -> 888
      return floor(levels565 * scaleTo888);
  }
  /// Produces a dithered copy of a 4x4 block.
  /// Every pixel of srcPixBlck is quantised to 565 precision with quant() after adding the
  /// error diffused from its already-processed neighbours (weights 7, 3, 5, 1 out of 16).
  /// The result is written to dthPixBlck, packed with packUnorm4x8 and alpha set to 1.
  void DitherBlock(const uint srcPixBlck[16], out uint dthPixBlck[16]) {
      // rowErr: errors of the row being processed; prevErr: errors of the row above it.
      float3 rowErr[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };
      float3 prevErr[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };

      for (uint row = 0u; row < 4u; ++row) {
          const uint base = row * 4u;
          float3 original;
          float3 diffused;
          float3 dithered;

          // column 0
          original = unpackUnorm4x8(srcPixBlck[base + 0u]).xyz * 255.0f;
          diffused = trunc((3.0f * prevErr[1] + 5.0f * prevErr[0]) * (1.0f / 16.0f));
          dithered = quant(original + diffused);
          rowErr[0] = original - dithered;
          dthPixBlck[base + 0u] = packUnorm4x8(float4(dithered * (1.0f / 255.0f), 1.0f));

          // column 1
          original = unpackUnorm4x8(srcPixBlck[base + 1u]).xyz * 255.0f;
          diffused = trunc(
                  (7.0f * rowErr[0] + 3.0f * prevErr[2] + 5.0f * prevErr[1] + prevErr[0]) * (1.0f / 16.0f));
          dithered = quant(original + diffused);
          rowErr[1] = original - dithered;
          dthPixBlck[base + 1u] = packUnorm4x8(float4(dithered * (1.0f / 255.0f), 1.0f));

          // column 2
          original = unpackUnorm4x8(srcPixBlck[base + 2u]).xyz * 255.0f;
          diffused = trunc(
                  (7.0f * rowErr[1] + 3.0f * prevErr[3] + 5.0f * prevErr[2] + prevErr[1]) * (1.0f / 16.0f));
          dithered = quant(original + diffused);
          rowErr[2] = original - dithered;
          dthPixBlck[base + 2u] = packUnorm4x8(float4(dithered * (1.0f / 255.0f), 1.0f));

          // column 3
          original = unpackUnorm4x8(srcPixBlck[base + 3u]).xyz * 255.0f;
          diffused = trunc((7.0f * rowErr[2] + 5.0f * prevErr[3] + prevErr[2]) * (1.0f / 16.0f));
          dithered = quant(original + diffused);
          rowErr[3] = original - dithered;
          dthPixBlck[base + 3u] = packUnorm4x8(float4(dithered * (1.0f / 255.0f), 1.0f));

          // swap( rowErr, prevErr ): this row's errors feed the next row
          const float3 swapTmp[4] = rowErr;
          rowErr = prevErr;
          prevErr = swapTmp;
      }
  }
  341. #endif
  /// Compute entry point. Each invocation compresses one 4x4 block of srcTex and stores the
  /// result as a single texel of dstTexture: .x holds the two RGB565 endpoints (max in the low
  /// 16 bits, min in the high 16 bits), .y holds the 2-bit-per-pixel index mask.
  void main() {
      // The work-group size is 8x8, so a dispatch covering a block grid whose dimensions are not
      // multiples of 8 launches invocations that lie outside dstTexture. Skip them instead of
      // fetching and storing out of bounds.
      const uint2 blockCoord = gl_GlobalInvocationID.xy;
      const uint2 blockCount = uint2(imageSize(dstTexture));
      if (blockCoord.x >= blockCount.x || blockCoord.y >= blockCount.y) {
          return;
      }

      uint srcPixelsBlock[16];

      bool bAllColorsEqual = true;

      // Load the whole 4x4 block (row-major: i & 3 is the column, i >> 2 the row)
      const uint2 pixelsToLoadBase = blockCoord << 2u;
      for (uint i = 0u; i < 16u; ++i) {
          const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i & 0x03u, i >> 2u);
          const float3 srcPixels0 = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyz;
          srcPixelsBlock[i] = packUnorm4x8(float4(srcPixels0, 1.0f));
          bAllColorsEqual = bAllColorsEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
      }

      float maxEndp16, minEndp16;
      uint mask = 0u;

      if (bAllColorsEqual) {
          // Solid-colour block: take both endpoints from the lookup tables and point every pixel
          // at palette index 2 (0xAAAAAAAA is binary 10 repeated 16 times).
          const uint3 rgbVal = uint3(unpackUnorm4x8(srcPixelsBlock[0]).xyz * 255.0f);
          mask = 0xAAAAAAAAu;
          maxEndp16 =
                  c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
          minEndp16 =
                  c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
      } else {
  #ifdef BC1_DITHER
          uint ditherPixelsBlock[16];
          // first step: compute dithered version for PCA if desired
          DitherBlock(srcPixelsBlock, ditherPixelsBlock);
  #else
          // without dithering the endpoint search works on the source pixels directly
  #define ditherPixelsBlock srcPixelsBlock
  #endif

          // second step: pca+map along principal axis
          OptimizeColorsBlock(ditherPixelsBlock, minEndp16, maxEndp16);
          if (minEndp16 != maxEndp16) {
              float3 colors[4];
              EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
              mask = MatchColorsBlock(srcPixelsBlock, colors);
          }

          // third step: refine (multiple times if requested)
          bool bStopRefinement = false;
          for (uint i = 0u; i < params.p_numRefinements && !bStopRefinement; ++i) {
              const uint lastMask = mask;

              if (RefineBlock(ditherPixelsBlock, mask, minEndp16, maxEndp16)) {
                  if (minEndp16 != maxEndp16) {
                      float3 colors[4];
                      EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
                      mask = MatchColorsBlock(srcPixelsBlock, colors);
                  } else {
                      // endpoints collapsed to a single colour: nothing left to refine
                      mask = 0u;
                      bStopRefinement = true;
                  }
              }

              // also stop as soon as a pass leaves the indices unchanged
              bStopRefinement = mask == lastMask || bStopRefinement;
          }
      }

      // write the color block
      // The larger endpoint value must go in the low 16 bits. When the endpoints are swapped,
      // flipping the low bit of every 2-bit index (0 <-> 1, 2 <-> 3) keeps each pixel on the
      // same colour.
      if (maxEndp16 < minEndp16) {
          const float tmpValue = minEndp16;
          minEndp16 = maxEndp16;
          maxEndp16 = tmpValue;
          mask ^= 0x55555555u;
      }

      uint2 outputBytes;
      outputBytes.x = uint(maxEndp16) | (uint(minEndp16) << 16u);
      outputBytes.y = mask;

      imageStore(dstTexture, int2(blockCoord), uint4(outputBytes.xy, 0u, 0u));
  }