asciiart.glsl 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. /*
  2. [configuration]
  3. [OptionBool]
  4. GUIName = Use target window resolution
  5. OptionName = USE_WINDOW_RES
  6. DefaultValue = true
  7. [OptionBool]
  8. GUIName = Debug: Calculate only one character per subgroup
  9. OptionName = DEBUG_ONLY_ONE_CHAR
  10. DefaultValue = false
  11. [/configuration]
  12. */
  13. const uint MAX_CHARS = 96u; // max 96, must be a multiple of 32
  14. const bool HAVE_FULL_FEATURE_FALLBACK = false; // terrible slow, can easily softlock the GPU
  15. const uint UNROLL_FALLBACK = 4;
  16. const uint UNROLL_SIMD = 3; // max MAX_CHARS / 32
  17. // #undef SUPPORTS_SUBGROUP_REDUCTION
  18. #ifdef API_VULKAN
  19. // By default, subgroupBroadcast only supports compile time constants as index.
  20. // However we need an uniform instead. This is always supported in OpenGL,
  21. // but in Vulkan only in SPIR-V >= 1.5.
  22. // So fall back to subgroupShuffle on Vulkan instead.
  23. #define subgroupBroadcast subgroupShuffle
  24. #endif
  25. /*
  26. The header-only font
  27. We have 96 (ASCII) characters, each of them is 12 pixels high and 8 pixels wide.
  28. To store the boolean value per pixel, 96 bits per character is needed.
  29. So three 32 bit integers are used per character.
  30. This takes in total roughly 1 kB of constant buffer.
  31. The first character must be all-one for the optimized implementation below.
  32. */
  33. const uint char_width = 8;
  34. const uint char_height = 12;
  35. const uint char_count = 96;
  36. const uint char_pixels = char_width * char_height;
  37. const float2 char_dim = float2(char_width, char_height);
  38. const uint rasters[char_count][(char_pixels + 31) / 32] = {
  39. {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}, {0x18181818, 0x00181818, 0x00181800},
  40. {0x6c6c6c6c, 0x00000000, 0x00000000}, {0x66660000, 0xff6666ff, 0x00006666},
  41. {0x1bff7e18, 0xd8f87e1f, 0x00187eff}, {0x6edb1b0e, 0x760c1830, 0x0070d8db},
  42. {0x3333361c, 0x1b0e0e1b, 0x00fe63f3}, {0x18383070, 0x00000000, 0x00000000},
  43. {0x0c0c1830, 0x0c0c0c0c, 0x0030180c}, {0x3030180c, 0x30303030, 0x000c1830},
  44. {0x5a990000, 0x5a3cff3c, 0x00000099}, {0x18180000, 0x18ffff18, 0x00001818},
  45. {0x00000000, 0x38000000, 0x000c1838}, {0x00000000, 0x00ffff00, 0x00000000},
  46. {0x00000000, 0x00000000, 0x00001c1c}, {0x6060c0c0, 0x18183030, 0x06060c0c},
  47. {0xe3c3663c, 0xc7cfdbf3, 0x003c66c3}, {0x181e1c18, 0x18181818, 0x007e1818},
  48. {0x60c0e77e, 0x060c1830, 0x00ff0303}, {0xc0c0e77e, 0xc0e07ee0, 0x007ee7c0},
  49. {0x363c3830, 0x3030ff33, 0x00303030}, {0x030303ff, 0xc0e07f03, 0x007ee7c0},
  50. {0x0303e77e, 0xc3e37f03, 0x007ee7c3}, {0xc0c0c0ff, 0x0c183060, 0x000c0c0c},
  51. {0xc3c3e77e, 0xc3e77ee7, 0x007ee7c3}, {0xc3c3e77e, 0xc0c0fee7, 0x007ee7c0},
  52. {0x00000000, 0x00001c1c, 0x00001c1c}, {0x38000000, 0x38000038, 0x000c1838},
  53. {0x0c183060, 0x0c060306, 0x00603018}, {0x00000000, 0xff00ffff, 0x000000ff},
  54. {0x30180c06, 0x3060c060, 0x00060c18}, {0xc0c3c37e, 0x18183060, 0x00180000},
  55. {0x7e000000, 0xdbcbbbc3, 0x00fc06f3}, {0xc3663c18, 0xc3ffc3c3, 0x00c3c3c3},
  56. {0xc3c3e37f, 0xc3e37fe3, 0x007fe3c3}, {0x0303e77e, 0x03030303, 0x007ee703},
  57. {0xc3e3733f, 0xc3c3c3c3, 0x003f73e3}, {0x030303ff, 0x03033f03, 0x00ff0303},
  58. {0x030303ff, 0x0303033f, 0x00030303}, {0x0303e77e, 0xc3f30303, 0x007ee7c3},
  59. {0xc3c3c3c3, 0xc3c3ffc3, 0x00c3c3c3}, {0x1818187e, 0x18181818, 0x007e1818},
  60. {0x60606060, 0x60606060, 0x003e7763}, {0x1b3363c3, 0x1b0f070f, 0x00c36333},
  61. {0x03030303, 0x03030303, 0x00ff0303}, {0xffffe7c3, 0xc3c3c3db, 0x00c3c3c3},
  62. {0xcfcfc7c7, 0xf3fbdbdf, 0x00e3e3f3}, {0xc3c3e77e, 0xc3c3c3c3, 0x007ee7c3},
  63. {0xc3c3e37f, 0x03037fe3, 0x00030303}, {0xc3c3663c, 0xdbc3c3c3, 0x00fc76fb},
  64. {0xc3c3e37f, 0x1b0f7fe3, 0x00c36333}, {0x0303e77e, 0xc0e07e07, 0x007ee7c0},
  65. {0x181818ff, 0x18181818, 0x00181818}, {0xc3c3c3c3, 0xc3c3c3c3, 0x007ee7c3},
  66. {0xc3c3c3c3, 0x6666c3c3, 0x00183c3c}, {0xc3c3c3c3, 0xffdbdbc3, 0x00c3e7ff},
  67. {0x3c6666c3, 0x3c3c183c, 0x00c36666}, {0x3c6666c3, 0x1818183c, 0x00181818},
  68. {0x60c0c0ff, 0x060c7e30, 0x00ff0303}, {0x0c0c0c3c, 0x0c0c0c0c, 0x003c0c0c},
  69. {0x0c0c0606, 0x30301818, 0xc0c06060}, {0x3030303c, 0x30303030, 0x003c3030},
  70. {0xc3663c18, 0x00000000, 0x00000000}, {0x00000000, 0x00000000, 0xff000000},
  71. {0x181c0c0e, 0x00000000, 0x00000000}, {0x00000000, 0xfec0c37e, 0x00fec3c3},
  72. {0x03030303, 0xc3c37f03, 0x007fc3c3}, {0x00000000, 0x0303c37e, 0x007ec303},
  73. {0xc0c0c0c0, 0xc3c3fec0, 0x00fec3c3}, {0x00000000, 0x7fc3c37e, 0x00fe0303},
  74. {0x0c0ccc78, 0x0c0c3f0c, 0x000c0c0c}, {0x00000000, 0xc3c3c37e, 0xc3c0c0fe},
  75. {0x03030303, 0xc3c3c37f, 0x00c3c3c3}, {0x00001800, 0x18181818, 0x00181818},
  76. {0x00003000, 0x30303030, 0x36303030}, {0x03030303, 0x0f1b3363, 0x0063331f},
  77. {0x1818181e, 0x18181818, 0x007e1818}, {0x00000000, 0xdbdbdb7f, 0x00dbdbdb},
  78. {0x00000000, 0x6363633f, 0x00636363}, {0x00000000, 0x6363633e, 0x003e6363},
  79. {0x00000000, 0xc3c3c37f, 0x03037fc3}, {0x00000000, 0xc3c3c3fe, 0xc0c0fec3},
  80. {0x00000000, 0x0303077f, 0x00030303}, {0x00000000, 0x7e0303fe, 0x007fc0c0},
  81. {0x0c0c0c00, 0x0c0c0c3f, 0x00386c0c}, {0x00000000, 0x63636363, 0x007e6363},
  82. {0x00000000, 0x6666c3c3, 0x00183c3c}, {0x00000000, 0xdbc3c3c3, 0x00c3e7ff},
  83. {0x00000000, 0x183c66c3, 0x00c3663c}, {0x00000000, 0x3c6666c3, 0x06060c18},
  84. {0x00000000, 0x183060ff, 0x00ff060c}, {0x181818f0, 0x181c0f1c, 0x00f01818},
  85. {0x18181818, 0x18181818, 0x18181818}, {0x1818180f, 0x1838f038, 0x000f1818},
  86. {0x06000000, 0x0060f18f, 0x00000000}, {0x00000000, 0x00000000, 0x00000000}};
  87. // Precalculated sum of all pixels per character
  88. const uint raster_active_pixels[char_count] = {
  89. 96, 18, 16, 40, 56, 42, 46, 10, 22, 22, 32, 28, 10, 16, 6, 24, 52, 29, 36, 44, 35, 42, 50, 28,
  90. 58, 51, 12, 16, 22, 32, 22, 26, 41, 46, 57, 38, 52, 38, 32, 46, 48, 30, 31, 43, 28, 56, 64, 52,
  91. 42, 52, 52, 44, 28, 48, 42, 58, 42, 32, 38, 26, 24, 26, 14, 8, 10, 34, 40, 26, 40, 32, 30, 33,
  92. 39, 16, 20, 37, 28, 43, 30, 30, 34, 34, 20, 28, 27, 30, 26, 36, 26, 24, 26, 30, 24, 30, 14, 0};
  93. // Get one sample of the font: (pixel index, character index)
  94. float SampleFont(uint2 pos)
  95. {
  96. return (rasters[pos.y][pos.x / 32] >> (pos.x % 32)) & uint(1);
  97. }
  98. // Get one sample of the framebuffer: (character position in screen space, pixel index)
  99. float3 SampleTex(uint2 char_pos, uint pixel)
  100. {
  101. float2 inv_resoltion =
  102. OptionEnabled(USE_WINDOW_RES) ? GetInvWindowResolution() : GetInvResolution();
  103. float2 tex_pos = char_pos * char_dim + float2(pixel % char_width, pixel / char_width) + 0.5;
  104. return SampleLocation(tex_pos * inv_resoltion).xyz;
  105. }
  106. struct CharResults
  107. {
  108. float3 fg; // font color
  109. float3 bg; // background color
  110. float err; // MSE of this configuration
  111. uint c; // character index
  112. };
  113. // Calculate the font and background color and the MSE for a given character
  114. CharResults CalcCharRes(uint c, float3 t, float3 ft)
  115. {
  116. CharResults o;
  117. o.c = c;
  118. // Inputs:
  119. // tt: sum of all texture samples squared
  120. // t: sum of all texture samples
  121. // ff: sum of all font samples squared
  122. // f: sum of all font samples
  123. // ft: sum of all font samples * texture samples
  124. // The font is either 1.0 or 0.0, so ff == f
  125. // As the font is constant, this is pre-calculated
  126. float f = raster_active_pixels[c];
  127. float ff = f;
  128. // The calculation isn't stable if the font is all-one. Return max err
  129. // instead.
  130. if (f == char_pixels)
  131. {
  132. o.err = char_pixels * char_pixels;
  133. return o;
  134. }
  135. // tt is only used as constant offset for the error, define it as zero
  136. float3 tt = float3(0.0, 0.0, 0.0);
  137. // The next lines are a bit harder, hf :-)
  138. // The idea is to find the perfect char with the perfect background color
  139. // and the perfect font color. As this is an equation with three unknowns,
  140. // we can't just try all chars and color combinations.
  141. // As criterion how "perfect" the selection is, we compare the "mean
  142. // squared error" of the resulted colors of all chars. So, now the big
  143. // issue: how to calculate the MSE without knowing the two colors ...
  144. // In the next steps, "a" is the font color, "b" is the background color,
  145. // "f" is the font value at this pixel, "t" is the texture value
  146. // So the square error of one pixel is:
  147. // e = ( t - a⋅f - b⋅(1-f) ) ^ 2
  148. // In longer:
  149. // e = a^2⋅f^2 - 2⋅a⋅b⋅f^2 + 2⋅a⋅b⋅f - 2⋅a⋅f⋅t + b^2⋅f^2 - 2⋅b^2⋅f + b^2 +
  150. // 2⋅b⋅f⋅t - 2⋅b⋅t + t^2
  151. // The sum of all errors is: (as shortcut, ff,f,ft,t,tt are now the sums
  152. // like declared above, sum(1) is the count of pixels) sum(e) = a^2⋅ff -
  153. // 2⋅a^2⋅ff + 2⋅a⋅b⋅f - 2⋅a⋅ft + b^2⋅ff - 2⋅b^2⋅f + b^2⋅sum(1) + 2⋅b⋅ft -
  154. // 2⋅b⋅t + tt
  155. // tt is only used as a constant offset, so its value has no effect on a,b or
  156. // on the relative error. So it can be completely dropped.
  157. // To find the minimum, we have to derive this by "a" and "b":
  158. // d/da sum(e) = 2⋅a⋅ff + 2⋅b⋅f - 2⋅b⋅ff - 2⋅ft
  159. // d/db sum(e) = 2⋅a⋅f - 2⋅a⋅ff - 4⋅b⋅f + 2⋅b⋅ff + 2⋅b⋅sum(1) + 2⋅ft - 2⋅t
  160. // So, both equations must be zero at minimum and there is only one
  161. // solution.
  162. float3 a = (ft * (f - float(char_pixels)) + t * (f - ff)) / (f * f - ff * float(char_pixels));
  163. float3 b = (ft * f - t * ff) / (f * f - ff * float(char_pixels));
  164. float3 e = a * a * ff + 2.0 * a * b * (f - ff) - 2.0 * a * ft +
  165. b * b * (-2.0 * f + ff + float(char_pixels)) + 2.0 * b * ft - 2.0 * b * t + tt;
  166. o.err = dot(e, float3(1.0, 1.0, 1.0));
  167. o.fg = a;
  168. o.bg = b;
  169. o.c = c;
  170. return o;
  171. }
  172. // Get the color of the pixel of this invocation based on the character details
  173. float3 GetFinalPixel(CharResults char_out)
  174. {
  175. float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution();
  176. uint2 char_pos = uint2(floor(GetCoordinates() * resolution / char_dim));
  177. uint2 pixel_offset = uint2(floor(GetCoordinates() * resolution) - char_pos * char_dim);
  178. float font = SampleFont(int2(pixel_offset.x + char_width * pixel_offset.y, char_out.c));
  179. return char_out.fg * font + char_out.bg * (1.0 - font);
  180. }
  181. /*
  182. This shader performs some kind of brute force evaluation, which character fits best.
  183. for c in characters:
  184. for p in pixels:
  185. ft += font(c,p) * texture(p)
  186. res = CalcCharRes(ft)
  187. min(res.err)
  188. Terrible in performance, only for reference.
  189. */
  190. CharResults CalcCharTrivial(uint2 char_pos)
  191. {
  192. float3 t;
  193. CharResults char_out;
  194. char_out.err = char_pixels * char_pixels;
  195. for (uint c = 0; c < MAX_CHARS; c += 1)
  196. {
  197. float3 ft = float3(0.0, 0.0, 0.0);
  198. for (uint pixel = 0; pixel < char_pixels; pixel += 1)
  199. {
  200. float3 tex = SampleTex(char_pos, pixel);
  201. float font = SampleFont(uint2(pixel, c));
  202. ft += font * tex;
  203. }
  204. if (c == 0)
  205. t = ft;
  206. CharResults res = CalcCharRes(c, t, ft);
  207. if (res.err < char_out.err)
  208. char_out = res;
  209. }
  210. return char_out;
  211. }
  212. /*
  213. However for better performance, some characters are tested at once. This saves some expensive
  214. texture() calls. Also split the loop over the pixels in groups of 32 for only fetching the uint32
  215. of the font once.
  216. */
  217. CharResults CalcCharFallback(uint2 char_pos)
  218. {
  219. float3 t;
  220. CharResults char_out;
  221. char_out.err = char_pixels * char_pixels;
  222. for (uint c = 0; c < MAX_CHARS; c += UNROLL_FALLBACK)
  223. {
  224. // Declare ft
  225. float3 ft[UNROLL_FALLBACK];
  226. for (uint i = 0; i < UNROLL_FALLBACK; i++)
  227. ft[i] = float3(0.0, 0.0, 0.0);
  228. // Split `for p : pixels` in groups of 32. This makes accessing the texture (bit in uint32)
  229. // easier.
  230. for (uint pixel = 0; pixel < char_pixels; pixel += 32)
  231. {
  232. uint font_i[UNROLL_FALLBACK];
  233. for (uint i = 0; i < UNROLL_FALLBACK; i++)
  234. font_i[i] = rasters[c + i][pixel / 32];
  235. for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += 1)
  236. {
  237. float3 tex = SampleTex(char_pos, pixel + pixel_offset);
  238. // Inner kernel of `ft += font * tex`. Most time is spend in here.
  239. for (uint i = 0; i < UNROLL_FALLBACK; i++)
  240. {
  241. float font = (font_i[i] >> pixel_offset) & uint(1);
  242. ft[i] += font * tex;
  243. }
  244. }
  245. }
  246. if (c == 0)
  247. {
  248. // First char has font := 1, so t = ft. Cache this value for the next iterations.
  249. t = ft[0];
  250. }
  251. // Check if this character fits better than the last one.
  252. for (uint i = 0; i < UNROLL_FALLBACK; i++)
  253. {
  254. CharResults res = CalcCharRes(c + i, t, ft[i]);
  255. if (res.err < char_out.err)
  256. char_out = res;
  257. }
  258. }
  259. return char_out;
  260. }
  261. /*
  262. SIMD optimized version with subgroup intrinsics
  263. - distribute all characters over the lanes and check for them in parallel
  264. - distribute the uniform texture access and broadcast each back to each lane
  265. */
  266. CharResults CalcCharSIMD(uint2 char_pos, uint simd_width)
  267. {
  268. // Font color, bg color, character, error -- of character with minimum error
  269. CharResults char_out;
  270. char_out.err = char_pixels * char_pixels;
  271. float3 t;
  272. #ifdef SUPPORTS_SUBGROUP_REDUCTION
  273. // Hack: Work in hard-codeded fixed SIMD mode
  274. if (gl_SubgroupInvocationID < simd_width)
  275. {
  276. // Loop over all characters
  277. for (uint c = 0; c < MAX_CHARS; c += UNROLL_SIMD * simd_width)
  278. {
  279. // registers for "sum of font * texture"
  280. float3 ft[UNROLL_SIMD];
  281. for (uint i = 0; i < UNROLL_SIMD; i++)
  282. ft[i] = float3(0.0, 0.0, 0.0);
  283. for (uint pixel = 0; pixel < char_pixels; pixel += 32)
  284. {
  285. // Preload the font uint32 for the next 32 pixels
  286. uint font_i[UNROLL_SIMD];
  287. for (uint i = 0; i < UNROLL_SIMD; i++)
  288. font_i[i] = rasters[c + UNROLL_SIMD * gl_SubgroupInvocationID + i][pixel / 32];
  289. for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += simd_width)
  290. {
  291. // Copy one full WRAP of textures into registers and shuffle them around for later usage.
  292. // This avoids one memory transaction per tested pixel & character.
  293. float3 tex_simd = SampleTex(char_pos, pixel + pixel_offset + gl_SubgroupInvocationID);
  294. for (uint k = 0; k < simd_width; k += 1)
  295. {
  296. float3 tex = subgroupBroadcast(tex_simd, k);
  297. // Note: As pixel iterates based on power-of-two gl_SubgroupSize,
  298. // the const memory access to rasters is CSE'd and the inner loop
  299. // after unrolling only contains: testing one bit + shuffle +
  300. // conditional add
  301. for (uint i = 0; i < UNROLL_SIMD; i++)
  302. {
  303. float font = (font_i[i] >> (k + pixel_offset % 32)) & uint(1);
  304. ft[i] += font * tex;
  305. }
  306. }
  307. }
  308. }
  309. if (c == 0)
  310. {
  311. // font[0] is a hardcoded 1 font, so t = ft
  312. t = subgroupBroadcast(ft[0], 0);
  313. }
  314. for (uint i = 0; i < UNROLL_SIMD; i++)
  315. {
  316. CharResults res = CalcCharRes(c + UNROLL_SIMD * gl_SubgroupInvocationID + i, t, ft[i]);
  317. if (res.err < char_out.err)
  318. char_out = res;
  319. }
  320. }
  321. }
  322. // Broadcast to get the best character of all threads
  323. float err_min = subgroupMin(char_out.err);
  324. uint smallest = subgroupBallotFindLSB(subgroupBallot(err_min == char_out.err));
  325. char_out.fg = subgroupBroadcast(char_out.fg, smallest);
  326. char_out.bg = subgroupBroadcast(char_out.bg, smallest);
  327. char_out.c = subgroupBroadcast(char_out.c, smallest);
  328. char_out.err = err_min;
  329. #endif
  330. return char_out;
  331. }
  332. bool supportsSIMD(uint simd_width)
  333. {
  334. #ifdef SUPPORTS_SUBGROUP_REDUCTION
  335. const uint mask = simd_width == 32u ? 0xFFFFFFFFu : (1u << simd_width) - 1;
  336. return (subgroupBallot(true)[0] & mask) == mask;
  337. #else
  338. return false;
  339. #endif
  340. }
  341. // "Error: The AsciiArt shader requires the missing GPU extention KHR_shader_subgroup."
  342. const uint missing_subgroup_warning_len = 82;
  343. const uint missing_subgroup_warning[missing_subgroup_warning_len] = {
  344. 37, 82, 82, 79, 82, 26, 95, 52, 72, 69, 95, 33, 83, 67, 73, 73, 33, 82, 84, 95, 83,
  345. 72, 65, 68, 69, 82, 95, 82, 69, 81, 85, 73, 82, 69, 83, 95, 84, 72, 69, 95, 77, 73,
  346. 83, 83, 73, 78, 71, 95, 39, 48, 53, 95, 69, 88, 84, 69, 78, 84, 73, 79, 78, 95, 43,
  347. 40, 50, 63, 83, 72, 65, 68, 69, 82, 63, 83, 85, 66, 71, 82, 79, 85, 80, 14};
  348. float3 ShowWarning(uint2 char_pos)
  349. {
  350. CharResults char_out;
  351. char_out.fg = float3(1.0, 1.0, 1.0);
  352. char_out.bg = float3(0.0, 0.0, 0.0);
  353. char_out.c = 95u; // just background
  354. if (char_pos.y == 0u && char_pos.x < missing_subgroup_warning_len)
  355. {
  356. char_out.c = missing_subgroup_warning[char_pos.x];
  357. }
  358. return GetFinalPixel(char_out);
  359. }
  360. void main()
  361. {
  362. // Calculate the character position of this pixel
  363. float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution();
  364. uint2 char_pos_self = uint2(floor(GetCoordinates() * resolution / char_dim));
  365. float3 color_out;
  366. #ifdef SUPPORTS_SUBGROUP_REDUCTION
  367. if (supportsSIMD(8))
  368. {
  369. // Loop over all character positions covered by this wave
  370. bool pixel_active = !gl_HelperInvocation;
  371. CharResults char_out;
  372. while (true)
  373. {
  374. // Fetch the next active character position
  375. uint4 active_lanes = subgroupBallot(pixel_active);
  376. if (active_lanes == uint4(0, 0, 0, 0))
  377. {
  378. break;
  379. }
  380. uint2 char_pos = subgroupBroadcast(char_pos_self, subgroupBallotFindLSB(active_lanes));
  381. // And calculate everything for this character position
  382. if (supportsSIMD(32))
  383. {
  384. char_out = CalcCharSIMD(char_pos, 32);
  385. }
  386. else if (supportsSIMD(16))
  387. {
  388. char_out = CalcCharSIMD(char_pos, 16);
  389. }
  390. else if (supportsSIMD(8))
  391. {
  392. char_out = CalcCharSIMD(char_pos, 8);
  393. }
  394. // Draw the character on screen
  395. if (char_pos == char_pos_self)
  396. {
  397. color_out = GetFinalPixel(char_out);
  398. pixel_active = false;
  399. }
  400. if (OptionEnabled(DEBUG_ONLY_ONE_CHAR))
  401. {
  402. break;
  403. }
  404. }
  405. }
  406. else
  407. #else
  408. if (char_pos_self.y <= 1u)
  409. {
  410. color_out = ShowWarning(char_pos_self);
  411. }
  412. else
  413. #endif
  414. if (HAVE_FULL_FEATURE_FALLBACK)
  415. {
  416. color_out = GetFinalPixel(CalcCharFallback(char_pos_self));
  417. }
  418. else
  419. {
  420. color_out = Sample().xyz;
  421. }
  422. SetOutput(float4(color_out, 1.0));
  423. }