PixelShaderGen.cpp 86 KB


  1. // Copyright 2008 Dolphin Emulator Project
  2. // SPDX-License-Identifier: GPL-2.0-or-later
  3. #include "VideoCommon/PixelShaderGen.h"
  4. #include <algorithm>
  5. #include <cmath>
  6. #include <cstdio>
  7. #include <fmt/format.h>
  8. #include "Common/Assert.h"
  9. #include "Common/CommonTypes.h"
  10. #include "Common/EnumMap.h"
  11. #include "Common/Logging/Log.h"
  12. #include "VideoCommon/BPMemory.h"
  13. #include "VideoCommon/BoundingBox.h"
  14. #include "VideoCommon/DriverDetails.h"
  15. #include "VideoCommon/LightingShaderGen.h"
  16. #include "VideoCommon/NativeVertexFormat.h"
  17. #include "VideoCommon/RenderState.h"
  18. #include "VideoCommon/VertexLoaderManager.h"
  19. #include "VideoCommon/VideoCommon.h"
  20. #include "VideoCommon/VideoConfig.h"
  21. #include "VideoCommon/XFMemory.h" // for texture projection mode
  22. // TODO: Get rid of these
  23. enum : u32
  24. {
  25. C_COLORMATRIX = 0, // 0
  26. C_COLORS = 0, // 0
  27. C_KCOLORS = C_COLORS + 4, // 4
  28. C_ALPHA = C_KCOLORS + 4, // 8
  29. C_TEXDIMS = C_ALPHA + 1, // 9
  30. C_ZBIAS = C_TEXDIMS + 8, // 17
  31. C_INDTEXSCALE = C_ZBIAS + 2, // 19
  32. C_INDTEXMTX = C_INDTEXSCALE + 2, // 21
  33. C_FOGCOLOR = C_INDTEXMTX + 6, // 27
  34. C_FOGI = C_FOGCOLOR + 1, // 28
  35. C_FOGF = C_FOGI + 1, // 29
  36. C_ZSLOPE = C_FOGF + 2, // 31
  37. C_EFBSCALE = C_ZSLOPE + 1, // 32
  38. C_PENVCONST_END = C_EFBSCALE + 1
  39. };
  40. constexpr Common::EnumMap<const char*, KonstSel::K3_A> tev_ksel_table_c{
  41. "255,255,255", // 1 = 0x00
  42. "223,223,223", // 7_8 = 0x01
  43. "191,191,191", // 3_4 = 0x02
  44. "159,159,159", // 5_8 = 0x03
  45. "128,128,128", // 1_2 = 0x04
  46. "96,96,96", // 3_8 = 0x05
  47. "64,64,64", // 1_4 = 0x06
  48. "32,32,32", // 1_8 = 0x07
  49. "0,0,0", // INVALID = 0x08
  50. "0,0,0", // INVALID = 0x09
  51. "0,0,0", // INVALID = 0x0a
  52. "0,0,0", // INVALID = 0x0b
  53. I_KCOLORS "[0].rgb", // K0 = 0x0C
  54. I_KCOLORS "[1].rgb", // K1 = 0x0D
  55. I_KCOLORS "[2].rgb", // K2 = 0x0E
  56. I_KCOLORS "[3].rgb", // K3 = 0x0F
  57. I_KCOLORS "[0].rrr", // K0_R = 0x10
  58. I_KCOLORS "[1].rrr", // K1_R = 0x11
  59. I_KCOLORS "[2].rrr", // K2_R = 0x12
  60. I_KCOLORS "[3].rrr", // K3_R = 0x13
  61. I_KCOLORS "[0].ggg", // K0_G = 0x14
  62. I_KCOLORS "[1].ggg", // K1_G = 0x15
  63. I_KCOLORS "[2].ggg", // K2_G = 0x16
  64. I_KCOLORS "[3].ggg", // K3_G = 0x17
  65. I_KCOLORS "[0].bbb", // K0_B = 0x18
  66. I_KCOLORS "[1].bbb", // K1_B = 0x19
  67. I_KCOLORS "[2].bbb", // K2_B = 0x1A
  68. I_KCOLORS "[3].bbb", // K3_B = 0x1B
  69. I_KCOLORS "[0].aaa", // K0_A = 0x1C
  70. I_KCOLORS "[1].aaa", // K1_A = 0x1D
  71. I_KCOLORS "[2].aaa", // K2_A = 0x1E
  72. I_KCOLORS "[3].aaa", // K3_A = 0x1F
  73. };
  74. constexpr Common::EnumMap<const char*, KonstSel::K3_A> tev_ksel_table_a{
  75. "255", // 1 = 0x00
  76. "223", // 7_8 = 0x01
  77. "191", // 3_4 = 0x02
  78. "159", // 5_8 = 0x03
  79. "128", // 1_2 = 0x04
  80. "96", // 3_8 = 0x05
  81. "64", // 1_4 = 0x06
  82. "32", // 1_8 = 0x07
  83. "0", // INVALID = 0x08
  84. "0", // INVALID = 0x09
  85. "0", // INVALID = 0x0a
  86. "0", // INVALID = 0x0b
  87. "0", // INVALID = 0x0c
  88. "0", // INVALID = 0x0d
  89. "0", // INVALID = 0x0e
  90. "0", // INVALID = 0x0f
  91. I_KCOLORS "[0].r", // K0_R = 0x10
  92. I_KCOLORS "[1].r", // K1_R = 0x11
  93. I_KCOLORS "[2].r", // K2_R = 0x12
  94. I_KCOLORS "[3].r", // K3_R = 0x13
  95. I_KCOLORS "[0].g", // K0_G = 0x14
  96. I_KCOLORS "[1].g", // K1_G = 0x15
  97. I_KCOLORS "[2].g", // K2_G = 0x16
  98. I_KCOLORS "[3].g", // K3_G = 0x17
  99. I_KCOLORS "[0].b", // K0_B = 0x18
  100. I_KCOLORS "[1].b", // K1_B = 0x19
  101. I_KCOLORS "[2].b", // K2_B = 0x1A
  102. I_KCOLORS "[3].b", // K3_B = 0x1B
  103. I_KCOLORS "[0].a", // K0_A = 0x1C
  104. I_KCOLORS "[1].a", // K1_A = 0x1D
  105. I_KCOLORS "[2].a", // K2_A = 0x1E
  106. I_KCOLORS "[3].a", // K3_A = 0x1F
  107. };
  108. constexpr Common::EnumMap<const char*, TevColorArg::Zero> tev_c_input_table{
  109. "prev.rgb", // CPREV,
  110. "prev.aaa", // APREV,
  111. "c0.rgb", // C0,
  112. "c0.aaa", // A0,
  113. "c1.rgb", // C1,
  114. "c1.aaa", // A1,
  115. "c2.rgb", // C2,
  116. "c2.aaa", // A2,
  117. "textemp.rgb", // TEXC,
  118. "textemp.aaa", // TEXA,
  119. "rastemp.rgb", // RASC,
  120. "rastemp.aaa", // RASA,
  121. "int3(255,255,255)", // ONE
  122. "int3(128,128,128)", // HALF
  123. "konsttemp.rgb", // KONST
  124. "int3(0,0,0)", // ZERO
  125. };
  126. constexpr Common::EnumMap<const char*, TevColorArg::Zero> tev_c_input_type{
  127. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_PREV", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_PREV",
  128. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR",
  129. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR",
  130. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR",
  131. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_TEX", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_TEX",
  132. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_RAS", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_RAS",
  133. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_NUMERIC", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_NUMERIC",
  134. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_KONST", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_NUMERIC",
  135. };
  136. constexpr Common::EnumMap<const char*, TevAlphaArg::Zero> tev_a_input_table{
  137. "prev.a", // APREV,
  138. "c0.a", // A0,
  139. "c1.a", // A1,
  140. "c2.a", // A2,
  141. "textemp.a", // TEXA,
  142. "rastemp.a", // RASA,
  143. "konsttemp.a", // KONST, (hw1 had quarter)
  144. "0", // ZERO
  145. };
  146. constexpr Common::EnumMap<const char*, TevAlphaArg::Zero> tev_a_input_type{
  147. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_PREV", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR",
  148. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_COLOR",
  149. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_TEX", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_RAS",
  150. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_KONST", "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_NUMERIC",
  151. };
  152. constexpr Common::EnumMap<const char*, RasColorChan::Zero> tev_ras_table{
  153. "iround(col0 * 255.0)",
  154. "iround(col1 * 255.0)",
  155. "ERROR13", // 2
  156. "ERROR14", // 3
  157. "ERROR15", // 4
  158. "(int4(1, 1, 1, 1) * alphabump)", // bump alpha (0..248)
  159. "(int4(1, 1, 1, 1) * (alphabump | (alphabump >> 5)))", // normalized bump alpha (0..255)
  160. "int4(0, 0, 0, 0)", // zero
  161. };
  162. constexpr Common::EnumMap<const char*, TevOutput::Color2> tev_c_output_table{
  163. "prev.rgb",
  164. "c0.rgb",
  165. "c1.rgb",
  166. "c2.rgb",
  167. };
  168. constexpr Common::EnumMap<const char*, TevOutput::Color2> tev_a_output_table{
  169. "prev.a",
  170. "c0.a",
  171. "c1.a",
  172. "c2.a",
  173. };
  174. constexpr Common::EnumMap<char, ColorChannel::Alpha> rgba_swizzle{'r', 'g', 'b', 'a'};
  175. PixelShaderUid GetPixelShaderUid()
  176. {
  177. PixelShaderUid out;
  178. pixel_shader_uid_data* const uid_data = out.GetUidData();
  179. uid_data->useDstAlpha = bpmem.dstalpha.enable && bpmem.blendmode.alphaupdate &&
  180. bpmem.zcontrol.pixel_format == PixelFormat::RGBA6_Z24;
  181. uid_data->genMode_numindstages = bpmem.genMode.numindstages;
  182. uid_data->genMode_numtevstages = bpmem.genMode.numtevstages;
  183. uid_data->genMode_numtexgens = bpmem.genMode.numtexgens;
  184. uid_data->bounding_box = g_ActiveConfig.bBBoxEnable && g_bounding_box->IsEnabled();
  185. uid_data->rgba6_format =
  186. bpmem.zcontrol.pixel_format == PixelFormat::RGBA6_Z24 && !g_ActiveConfig.bForceTrueColor;
  187. uid_data->dither = bpmem.blendmode.dither && uid_data->rgba6_format;
  188. uid_data->uint_output = bpmem.blendmode.UseLogicOp();
  189. u32 numStages = uid_data->genMode_numtevstages + 1;
  190. uid_data->Pretest = bpmem.alpha_test.TestResult();
  191. uid_data->ztest = bpmem.GetEmulatedZ();
  192. if (uid_data->ztest == EmulatedZ::Early &&
  193. (g_ActiveConfig.bFastDepthCalc ||
  194. bpmem.alpha_test.TestResult() == AlphaTestResult::Undetermined)
  195. // We can't allow early_ztest for zfreeze because depth is overridden per-pixel.
  196. // This means it's impossible for zcomploc to be emulated on a zfrozen polygon.
  197. && !bpmem.genMode.zfreeze)
  198. {
  199. uid_data->ztest = EmulatedZ::ForcedEarly;
  200. }
  201. const bool forced_early_z = uid_data->ztest == EmulatedZ::ForcedEarly;
  202. const bool per_pixel_depth =
  203. (bpmem.ztex2.op != ZTexOp::Disabled && uid_data->ztest == EmulatedZ::Late) ||
  204. (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) ||
  205. (bpmem.zmode.testenable && bpmem.genMode.zfreeze);
  206. uid_data->per_pixel_depth = per_pixel_depth;
  207. if (g_ActiveConfig.bEnablePixelLighting)
  208. {
  209. uid_data->numColorChans = xfmem.numChan.numColorChans;
  210. GetLightingShaderUid(uid_data->lighting);
  211. }
  212. if (uid_data->genMode_numtexgens > 0)
  213. {
  214. for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
  215. {
  216. // optional perspective divides
  217. uid_data->texMtxInfo_n_projection |= static_cast<u32>(xfmem.texMtxInfo[i].projection.Value())
  218. << i;
  219. }
  220. }
  221. // indirect texture map lookup
  222. int nIndirectStagesUsed = 0;
  223. for (unsigned int i = 0; i < numStages; ++i)
  224. {
  225. if (bpmem.tevind[i].IsActive())
  226. nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
  227. }
  228. uid_data->nIndirectStagesUsed = nIndirectStagesUsed;
  229. for (u32 i = 0; i < uid_data->genMode_numindstages; ++i)
  230. {
  231. if (uid_data->nIndirectStagesUsed & (1 << i))
  232. uid_data->SetTevindrefValues(i, bpmem.tevindref.getTexCoord(i), bpmem.tevindref.getTexMap(i));
  233. }
  234. for (unsigned int n = 0; n < numStages; n++)
  235. {
  236. uid_data->stagehash[n].tevorders_texcoord = bpmem.tevorders[n / 2].getTexCoord(n & 1);
  237. uid_data->stagehash[n].tevind = bpmem.tevind[n].hex;
  238. TevStageCombiner::ColorCombiner& cc = bpmem.combiners[n].colorC;
  239. TevStageCombiner::AlphaCombiner& ac = bpmem.combiners[n].alphaC;
  240. uid_data->stagehash[n].cc = cc.hex & 0xFFFFFF;
  241. uid_data->stagehash[n].ac = ac.hex & 0xFFFFF0; // Storing rswap and tswap later
  242. if (cc.a == TevColorArg::RasAlpha || cc.a == TevColorArg::RasColor ||
  243. cc.b == TevColorArg::RasAlpha || cc.b == TevColorArg::RasColor ||
  244. cc.c == TevColorArg::RasAlpha || cc.c == TevColorArg::RasColor ||
  245. cc.d == TevColorArg::RasAlpha || cc.d == TevColorArg::RasColor ||
  246. ac.a == TevAlphaArg::RasAlpha || ac.b == TevAlphaArg::RasAlpha ||
  247. ac.c == TevAlphaArg::RasAlpha || ac.d == TevAlphaArg::RasAlpha)
  248. {
  249. const auto ras_swap_table = bpmem.tevksel.GetSwapTable(bpmem.combiners[n].alphaC.rswap);
  250. uid_data->stagehash[n].ras_swap_r = ras_swap_table[ColorChannel::Red];
  251. uid_data->stagehash[n].ras_swap_g = ras_swap_table[ColorChannel::Green];
  252. uid_data->stagehash[n].ras_swap_b = ras_swap_table[ColorChannel::Blue];
  253. uid_data->stagehash[n].ras_swap_a = ras_swap_table[ColorChannel::Alpha];
  254. uid_data->stagehash[n].tevorders_colorchan = bpmem.tevorders[n / 2].getColorChan(n & 1);
  255. }
  256. uid_data->stagehash[n].tevorders_enable = bpmem.tevorders[n / 2].getEnable(n & 1);
  257. if (uid_data->stagehash[n].tevorders_enable)
  258. {
  259. const auto tex_swap_table = bpmem.tevksel.GetSwapTable(bpmem.combiners[n].alphaC.tswap);
  260. uid_data->stagehash[n].tex_swap_r = tex_swap_table[ColorChannel::Red];
  261. uid_data->stagehash[n].tex_swap_g = tex_swap_table[ColorChannel::Green];
  262. uid_data->stagehash[n].tex_swap_b = tex_swap_table[ColorChannel::Blue];
  263. uid_data->stagehash[n].tex_swap_a = tex_swap_table[ColorChannel::Alpha];
  264. uid_data->stagehash[n].tevorders_texmap = bpmem.tevorders[n / 2].getTexMap(n & 1);
  265. }
  266. if (cc.a == TevColorArg::Konst || cc.b == TevColorArg::Konst || cc.c == TevColorArg::Konst ||
  267. cc.d == TevColorArg::Konst || ac.a == TevAlphaArg::Konst || ac.b == TevAlphaArg::Konst ||
  268. ac.c == TevAlphaArg::Konst || ac.d == TevAlphaArg::Konst)
  269. {
  270. uid_data->stagehash[n].tevksel_kc = bpmem.tevksel.GetKonstColor(n);
  271. uid_data->stagehash[n].tevksel_ka = bpmem.tevksel.GetKonstAlpha(n);
  272. }
  273. }
  274. #define MY_STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
  275. uid_data->num_values = (g_ActiveConfig.bEnablePixelLighting) ?
  276. sizeof(*uid_data) :
  277. MY_STRUCT_OFFSET(*uid_data, stagehash[numStages]);
  278. // NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
  279. // (in this case we need to write a depth value if depth test passes regardless of the alpha
  280. // testing result)
  281. if (uid_data->Pretest == AlphaTestResult::Undetermined ||
  282. (uid_data->Pretest == AlphaTestResult::Fail && uid_data->ztest == EmulatedZ::Late))
  283. {
  284. uid_data->alpha_test_comp0 = bpmem.alpha_test.comp0;
  285. uid_data->alpha_test_comp1 = bpmem.alpha_test.comp1;
  286. uid_data->alpha_test_logic = bpmem.alpha_test.logic;
  287. }
  288. uid_data->zfreeze = bpmem.genMode.zfreeze;
  289. uid_data->ztex_op = bpmem.ztex2.op;
  290. uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
  291. uid_data->fog_proj = bpmem.fog.c_proj_fsel.proj;
  292. uid_data->fog_RangeBaseEnabled = bpmem.fogRange.Base.Enabled;
  293. return out;
  294. }
  295. void ClearUnusedPixelShaderUidBits(APIType api_type, const ShaderHostConfig& host_config,
  296. PixelShaderUid* uid)
  297. {
  298. pixel_shader_uid_data* const uid_data = uid->GetUidData();
  299. // OpenGL and Vulkan convert implicitly normalized color outputs to their uint representation.
  300. // Therefore, it is not necessary to use a uint output on these backends. We also disable the
  301. // uint output when logic op is not supported (i.e. driver/device does not support D3D11.1).
  302. if (api_type != APIType::D3D || !host_config.backend_logic_op)
  303. uid_data->uint_output = 0;
  304. // If bounding box is enabled when a UID cache is created, then later disabled, we shouldn't
  305. // emit the bounding box portion of the shader.
  306. uid_data->bounding_box &= host_config.bounding_box && host_config.backend_bbox;
  307. }
  // Appends to `out` the declarations shared by generated pixel shaders: integer helper
  // functions, the sampler array, the PSBlock uniform block and its accessor macros, optional
  // extra uniform blocks, optional bounding box support, and the sampleTexture() function.
  //
  // out:            shader source buffer that is appended to.
  // api_type:       not referenced anywhere in this function body.
  // host_config:    selects optional sections (per_pixel_lighting, manual_texture_sampling,
  //                 manual_texture_sampling_custom_texture_sizes, backend_sampler_lod_bias).
  // bounding_box:   when true, the BBox storage buffer and update functions are emitted.
  // custom_details: if its last shader has a non-empty material_uniform_block, that text is
  //                 emitted inside a CustomShaderBlock uniform block.
  //
  // NOTE(review): out.Write() takes a format string (see the "{}" and fmt::arg uses below), so
  // the doubled braces in the literals are presumably escapes that yield single braces in the
  // generated source - confirm against ShaderCode::Write.
  308. void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
  309. const ShaderHostConfig& host_config, bool bounding_box,
  310. const CustomPixelShaderContents& custom_details)
  311. {
  312. // dot product for integer vectors
  313. out.Write("int idot(int3 x, int3 y)\n"
  314. "{{\n"
  315. "\tint3 tmp = x * y;\n"
  316. "\treturn tmp.x + tmp.y + tmp.z;\n"
  317. "}}\n");
  318. out.Write("int idot(int4 x, int4 y)\n"
  319. "{{\n"
  320. "\tint4 tmp = x * y;\n"
  321. "\treturn tmp.x + tmp.y + tmp.z + tmp.w;\n"
  322. "}}\n\n");
  323. // rounding + casting to integer at once in a single function
  324. out.Write("int iround(float x) {{ return int (round(x)); }}\n"
  325. "int2 iround(float2 x) {{ return int2(round(x)); }}\n"
  326. "int3 iround(float3 x) {{ return int3(round(x)); }}\n"
  327. "int4 iround(float4 x) {{ return int4(round(x)); }}\n\n");
  // Eight array-texture samplers, passed to sampleTexture() by the generated shader code.
  328. out.Write("SAMPLER_BINDING(0) uniform sampler2DArray samp[8];\n");
  329. out.Write("\n");
  // Uniform block with the pixel shader constants, followed by raw register copies
  // (bpmem_*), the packed per-stage arrays, and blend/logic-op state.
  // NOTE(review): the member order presumably has to match a host-side struct - verify before
  // reordering or inserting members.
  330. out.Write("UBO_BINDING(std140, 1) uniform PSBlock {{\n");
  331. out.Write("\tint4 " I_COLORS "[4];\n"
  332. "\tint4 " I_KCOLORS "[4];\n"
  333. "\tint4 " I_ALPHA ";\n"
  334. "\tint4 " I_TEXDIMS "[8];\n"
  335. "\tint4 " I_ZBIAS "[2];\n"
  336. "\tint4 " I_INDTEXSCALE "[2];\n"
  337. "\tint4 " I_INDTEXMTX "[6];\n"
  338. "\tint4 " I_FOGCOLOR ";\n"
  339. "\tint4 " I_FOGI ";\n"
  340. "\tfloat4 " I_FOGF ";\n"
  341. "\tfloat4 " I_FOGRANGE "[3];\n"
  342. "\tfloat4 " I_ZSLOPE ";\n"
  343. "\tfloat2 " I_EFBSCALE ";\n"
  344. "\tuint bpmem_genmode;\n"
  345. "\tuint bpmem_alphaTest;\n"
  346. "\tuint bpmem_fogParam3;\n"
  347. "\tuint bpmem_fogRangeBase;\n"
  348. "\tuint bpmem_dstalpha;\n"
  349. "\tuint bpmem_ztex_op;\n"
  350. "\tbool bpmem_late_ztest;\n"
  351. "\tbool bpmem_rgba6_format;\n"
  352. "\tbool bpmem_dither;\n"
  353. "\tbool bpmem_bounding_box;\n"
  354. "\tuint4 bpmem_pack1[16];\n" // .xy - combiners, .z - tevind
  355. "\tuint4 bpmem_pack2[8];\n" // .x - tevorder, .y - tevksel, .zw - SamplerState tm0/tm1
  356. "\tint4 konstLookup[32];\n"
  357. "\tbool blend_enable;\n"
  358. "\tuint blend_src_factor;\n"
  359. "\tuint blend_src_factor_alpha;\n"
  360. "\tuint blend_dst_factor;\n"
  361. "\tuint blend_dst_factor_alpha;\n"
  362. "\tbool blend_subtract;\n"
  363. "\tbool blend_subtract_alpha;\n"
  364. "\tbool logic_op_enable;\n"
  365. "\tuint logic_op_mode;\n"
  366. "\tuint time_ms;\n"
  367. "}};\n\n");
  // Accessor macros that unpack bpmem_pack1 / bpmem_pack2. Note that bpmem_iref reads the .w
  // component of bpmem_pack1, which the layout comment on that member does not mention.
  368. out.Write("#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)\n"
  369. "#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n"
  370. "#define bpmem_iref(i) (bpmem_pack1[(i)].w)\n"
  371. "#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)\n"
  372. "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n"
  373. "#define samp_texmode0(i) (bpmem_pack2[(i)].z)\n"
  374. "#define samp_texmode1(i) (bpmem_pack2[(i)].w)\n\n");
  // Per-pixel lighting: emit the lighting struct text and wrap the shared uniform list in a
  // VSBlock uniform block at binding 2.
  375. if (host_config.per_pixel_lighting)
  376. {
  377. out.Write("{}", s_lighting_struct);
  378. out.Write("UBO_BINDING(std140, 2) uniform VSBlock {{\n");
  379. out.Write("{}", s_shader_uniforms);
  380. out.Write("}};\n");
  381. }
  // Custom material uniforms: only the last entry of custom_details.shaders is consulted.
  382. if (!custom_details.shaders.empty() &&
  383. !custom_details.shaders.back().material_uniform_block.empty())
  384. {
  385. out.Write("UBO_BINDING(std140, 3) uniform CustomShaderBlock {{\n");
  386. out.Write("{}", custom_details.shaders.back().material_uniform_block);
  387. out.Write("}} custom_uniforms;\n");
  388. }
  // Bounding box: a four-int storage buffer plus the shader functions that grow it.
  // The raw string below is formatted with the named arguments efb_height and efb_scale.
  389. if (bounding_box)
  390. {
  391. out.Write("SSBO_BINDING(0) coherent buffer BBox {{\n"
  392. " int bbox_data[4];\n"
  393. "}};");
  394. out.Write(R"(
  395. #define bbox_left bbox_data[0]
  396. #define bbox_right bbox_data[1]
  397. #define bbox_top bbox_data[2]
  398. #define bbox_bottom bbox_data[3]
  399. void UpdateBoundingBoxBuffer(int2 min_pos, int2 max_pos) {{
  400. if (bbox_left > min_pos.x)
  401. atomicMin(bbox_left, min_pos.x);
  402. if (bbox_right < max_pos.x)
  403. atomicMax(bbox_right, max_pos.x);
  404. if (bbox_top > min_pos.y)
  405. atomicMin(bbox_top, min_pos.y);
  406. if (bbox_bottom < max_pos.y)
  407. atomicMax(bbox_bottom, max_pos.y);
  408. }}
  409. void UpdateBoundingBox(float2 rawpos) {{
  410. // We only want to include coordinates for pixels aligned with the native resolution pixel centers.
  411. // This makes bounding box sizes more accurate (though not perfect) at higher resolutions,
  412. // avoiding EFB copy buffer overflow in affected games.
  413. //
  414. // For a more detailed explanation, see https://dolp.in/pr9801
  415. int2 int_efb_scale = iround(1.0 / {efb_scale}.xy);
  416. if (int(rawpos.x) % int_efb_scale.x != int_efb_scale.x >> 1 ||
  417. int(rawpos.y) % int_efb_scale.y != int_efb_scale.y >> 1) // right shift for fast divide by two
  418. {{
  419. return;
  420. }}
  421. // The rightmost shaded pixel is not included in the right bounding box register,
  422. // such that width = right - left + 1. This has been verified on hardware.
  423. int2 pos = int2(rawpos * {efb_scale}.xy);
  424. #ifdef API_OPENGL
  425. // We need to invert the Y coordinate due to OpenGL's lower-left origin
  426. pos.y = {efb_height} - pos.y - 1;
  427. #endif
  428. // The GC/Wii GPU rasterizes in 2x2 pixel groups, so bounding box values will be rounded to the
  429. // extents of these groups, rather than the exact pixel.
  430. int2 pos_tl = pos & ~1; // round down to even
  431. int2 pos_br = pos | 1; // round up to odd
  432. #if defined(SUPPORTS_SUBGROUP_REDUCTION) && !defined(BROKEN_SUBGROUP_WITH_DISCARD)
  433. if (!IS_HELPER_INVOCATION)
  434. {{
  435. SUBGROUP_MIN(pos_tl);
  436. SUBGROUP_MAX(pos_br);
  437. if (IS_FIRST_ACTIVE_INVOCATION)
  438. UpdateBoundingBoxBuffer(pos_tl, pos_br);
  439. }}
  440. #else
  441. UpdateBoundingBoxBuffer(pos_tl, pos_br);
  442. #endif
  443. }}
  444. )",
  445. fmt::arg("efb_height", EFB_HEIGHT), fmt::arg("efb_scale", I_EFBSCALE));
  446. }
  // Manual texture sampling helpers: readTexture() fetches one texel as 0..255 integers,
  // readTextureLinear() blends four texels with 7-bit fractional weights (hence the >> 14), and
  // WrapCoord() applies the wrap mode to an integer texel coordinate.
  447. if (host_config.manual_texture_sampling)
  448. {
  449. out.Write(R"(
  450. int4 readTexture(in sampler2DArray tex, uint u, uint v, int layer, int lod) {{
  451. return iround(texelFetch(tex, int3(u, v, layer), lod) * 255.0);
  452. }}
  453. int4 readTextureLinear(in sampler2DArray tex, uint2 uv1, uint2 uv2, int layer, int lod, int2 frac_uv) {{)");
  454. out.Write(R"(
  455. int4 result =
  456. readTexture(tex, uv1.x, uv1.y, layer, lod) * (128 - frac_uv.x) * (128 - frac_uv.y) +
  457. readTexture(tex, uv2.x, uv1.y, layer, lod) * ( frac_uv.x) * (128 - frac_uv.y) +
  458. readTexture(tex, uv1.x, uv2.y, layer, lod) * (128 - frac_uv.x) * ( frac_uv.y) +
  459. readTexture(tex, uv2.x, uv2.y, layer, lod) * ( frac_uv.x) * ( frac_uv.y);
  460. return result >> 14;
  461. }}
  462. )");
  // Two WrapCoord() variants: the custom-texture-size one uses a true modulo (via SafeModulo),
  // the other masks with (size - 1). The three {:s} placeholders receive the WrapMode values
  // Clamp, Repeat and Mirror, in that order.
  463. if (host_config.manual_texture_sampling_custom_texture_sizes)
  464. {
  465. // This is slower, and doesn't result in the same odd behavior that happens on console when
  466. // wrapping with non-power-of-2 sizes, but it's fine for custom textures to have non-console
  467. // behavior.
  468. out.Write(R"(
  469. // Both GLSL and HLSL produce undefined values when the modulo operator (%) is used with a negative
  470. // dividend and a positive divisor. We want a positive value such that SafeModulo(-1, 3) is 2.
  471. int SafeModulo(int dividend, int divisor) {{
  472. if (dividend >= 0) {{
  473. return dividend % divisor;
  474. }} else {{
  475. // This works because ~x is the same as -x - 1.
  476. // `~x % 5` over -5 to -1 gives 4, 3, 2, 1, 0. `4 - (~x % 5)` gives 0, 1, 2, 3, 4.
  477. return (divisor - 1) - (~dividend % divisor);
  478. }}
  479. }}
  480. uint WrapCoord(int coord, uint wrap, int size) {{
  481. switch (wrap) {{
  482. case {:s}:
  483. default: // confirmed that clamp is used for invalid (3) via hardware test
  484. return uint(clamp(coord, 0, size - 1));
  485. case {:s}:
  486. return uint(SafeModulo(coord, size)); // coord % size
  487. case {:s}:
  488. if (SafeModulo(coord, 2 * size) >= size) {{ // coord % (2 * size)
  489. coord = ~coord;
  490. }}
  491. return uint(SafeModulo(coord, size)); // coord % size
  492. }}
  493. }}
  494. )",
  495. WrapMode::Clamp, WrapMode::Repeat, WrapMode::Mirror);
  496. }
  497. else
  498. {
  499. out.Write(R"(
  500. uint WrapCoord(int coord, uint wrap, int size) {{
  501. switch (wrap) {{
  502. case {:s}:
  503. default: // confirmed that clamp is used for invalid (3) via hardware test
  504. return uint(clamp(coord, 0, size - 1));
  505. case {:s}:
  506. return uint(coord & (size - 1));
  507. case {:s}:
  508. if ((coord & size) != 0) {{
  509. coord = ~coord;
  510. }}
  511. return uint(coord & (size - 1));
  512. }}
  513. }}
  514. )",
  515. WrapMode::Clamp, WrapMode::Repeat, WrapMode::Mirror);
  516. }
  517. }
  // sampleTexture(): always emitted. Its signature is written here; the body depends on
  // whether sampling is left to the hardware sampler or done manually.
  518. out.Write("\nint4 sampleTexture(uint texmap, in sampler2DArray tex, int2 uv, int layer) {{\n");
  519. if (!host_config.manual_texture_sampling)
  520. {
  // Hardware sampling: convert the integer uv to normalized coordinates by dividing by the
  // texture dimension times 128, then sample with texture().
  521. out.Write(" float size_s = float(" I_TEXDIMS "[texmap].x * 128);\n"
  522. " float size_t = float(" I_TEXDIMS "[texmap].y * 128);\n"
  523. " float3 coords = float3(float(uv.x) / size_s, float(uv.y) / size_t, layer);\n");
  // Without backend LOD-bias support, the bias is extracted from texmode0 and passed to
  // texture() explicitly.
  524. if (!host_config.backend_sampler_lod_bias)
  525. {
  526. out.Write(" uint texmode0 = samp_texmode0(texmap);\n"
  527. " float lod_bias = float({}) / 256.0f;\n"
  528. " return iround(255.0 * texture(tex, coords, lod_bias));\n",
  529. BitfieldExtract<&SamplerState::TM0::lod_bias>("texmode0"));
  530. }
  531. else
  532. {
  533. out.Write(" return iround(255.0 * texture(tex, coords));\n");
  534. }
  535. out.Write("}}\n");
  536. }
  537. else
  538. {
  // Manual sampling: first decode the sampler state fields from texmode0/texmode1. The ten
  // "{}" placeholders are filled, in order, by the BitfieldExtract arguments below.
  539. out.Write(R"(
  540. uint texmode0 = samp_texmode0(texmap);
  541. uint texmode1 = samp_texmode1(texmap);
  542. uint wrap_s = {};
  543. uint wrap_t = {};
  544. bool mag_linear = {} != 0u;
  545. bool mipmap_linear = {} != 0u;
  546. bool min_linear = {} != 0u;
  547. bool diag_lod = {} != 0u;
  548. int lod_bias = {};
  549. // uint max_aniso = TODO;
  550. bool lod_clamp = {} != 0u;
  551. int min_lod = int({});
  552. int max_lod = int({});
  553. )",
  554. BitfieldExtract<&SamplerState::TM0::wrap_u>("texmode0"),
  555. BitfieldExtract<&SamplerState::TM0::wrap_v>("texmode0"),
  556. BitfieldExtract<&SamplerState::TM0::mag_filter>("texmode0"),
  557. BitfieldExtract<&SamplerState::TM0::mipmap_filter>("texmode0"),
  558. BitfieldExtract<&SamplerState::TM0::min_filter>("texmode0"),
  559. BitfieldExtract<&SamplerState::TM0::diag_lod>("texmode0"),
  560. BitfieldExtract<&SamplerState::TM0::lod_bias>("texmode0"),
  561. // BitfieldExtract<&SamplerState::TM0::max_aniso>("texmode0"),
  562. BitfieldExtract<&SamplerState::TM0::lod_clamp>("texmode0"),
  563. BitfieldExtract<&SamplerState::TM1::min_lod>("texmode1"),
  564. BitfieldExtract<&SamplerState::TM1::max_lod>("texmode1"));
  // Texture size: with custom texture sizes the real size is queried from the bound texture
  // and uv is rescaled from the native size; otherwise the size comes from I_TEXDIMS.
  565. if (host_config.manual_texture_sampling_custom_texture_sizes)
  566. {
  567. out.Write(R"(
  568. int native_size_s = )" I_TEXDIMS R"([texmap].x;
  569. int native_size_t = )" I_TEXDIMS R"([texmap].y;
  570. )");
  571. out.Write(R"(
  572. int3 size = textureSize(tex, 0);
  573. int size_s = size.x;
  574. int size_t = size.y;
  575. int num_layers = size.z;
  576. )");
  // Without textureQueryLevels the level count falls back to a fixed 256 and an error is
  // logged each time this code path runs.
  577. if (g_ActiveConfig.backend_info.bSupportsTextureQueryLevels)
  578. {
  579. out.Write(" int number_of_levels = textureQueryLevels(tex);\n");
  580. }
  581. else
  582. {
  583. out.Write(" int number_of_levels = 256; // textureQueryLevels is not supported\n");
  584. ERROR_LOG_FMT(VIDEO, "textureQueryLevels is not supported! Odd graphical results may "
  585. "occur if custom textures are in use!");
  586. }
  587. out.Write(R"(
  588. // Prevent out-of-bounds LOD values when using custom textures
  589. max_lod = min(max_lod, (number_of_levels - 1) << 4);
  590. // Rescale uv to account for the new texture size
  591. uv.x = (uv.x * size_s) / native_size_s;
  592. uv.y = (uv.y * size_t) / native_size_t;
  593. // Clamp layer as well (texture() automatically clamps, but texelFetch() doesn't)
  594. layer = clamp(layer, 0, num_layers - 1);
  595. )");
  596. }
  597. else
  598. {
  599. out.Write(R"(
  600. int size_s = )" I_TEXDIMS R"([texmap].x;
  601. int size_t = )" I_TEXDIMS R"([texmap].y;
  602. )");
  603. }
  // Screen-space derivatives of uv, used for LOD selection in the block that follows.
  604. if (g_ActiveConfig.backend_info.bSupportsCoarseDerivatives)
  605. {
  606. // The software renderer uses the equivalent of coarse derivatives, so use them here for
  607. // consistency. This hasn't been hardware tested.
  608. // Note that bSupportsCoarseDerivatives being false only means dFdxCoarse and dFdxFine don't
  609. // exist. The GPU may still implement dFdx using coarse derivatives; we just don't have the
  610. // ability to specifically require it.
  611. out.Write(R"(
  612. float2 uv_delta_x = abs(dFdxCoarse(float2(uv)));
  613. float2 uv_delta_y = abs(dFdyCoarse(float2(uv)));
  614. )");
  615. }
  616. else
  617. {
  618. out.Write(R"(
  619. float2 uv_delta_x = abs(dFdx(float2(uv)));
  620. float2 uv_delta_y = abs(dFdy(float2(uv)));
  621. )");
  622. }
  // Final part of the manual path: compute the LOD (integer with 4 fractional bits, see the
  // ">> 4" / "& 15" split), pick linear or point filtering, sample one or two mip levels and
  // blend them by frac_lod. This string also closes the sampleTexture() body.
  623. // TODO: LOD bias is normally S2.5 (Dolphin uses S7.8 for arbitrary mipmap detection and higher
  624. // IRs), but (at least per the software renderer) actual LOD is S28.4. How does this work?
  625. // Also, note that we can make some assumptions due to use of a SamplerState version of the BP
  626. // configuration, which tidies things compared to whatever nonsense games can put in.
  627. out.Write(R"(
  628. float2 uv_delta = diag_lod ? uv_delta_x + uv_delta_y : max(uv_delta_x, uv_delta_y);
  629. float max_delta = max(uv_delta.x / 128.0, uv_delta.y / 128.0);
  630. // log2(x) is undefined if x <= 0, but in practice it seems log2(0) is -infinity, which becomes INT_MIN.
  631. // If lod_bias is negative, adding it to INT_MIN causes an underflow, resulting in a large positive value.
  632. // Hardware testing indicates that min_lod should be used when the derivative is 0.
  633. int lod = max_delta == 0.0 ? min_lod : int(floor(log2(max_delta) * 16.0)) + (lod_bias >> 4);
  634. bool is_linear = (lod > 0) ? min_linear : mag_linear;
  635. lod = clamp(lod, min_lod, max_lod);
  636. int base_lod = lod >> 4;
  637. int frac_lod = lod & 15;
  638. if (!mipmap_linear && frac_lod >= 8) {{
  639. // Round to nearest LOD in point mode
  640. base_lod++;
  641. }}
  642. if (is_linear) {{
  643. uint2 texuv1 = uint2(
  644. WrapCoord(((uv.x >> base_lod) - 64) >> 7, wrap_s, size_s >> base_lod),
  645. WrapCoord(((uv.y >> base_lod) - 64) >> 7, wrap_t, size_t >> base_lod));
  646. uint2 texuv2 = uint2(
  647. WrapCoord(((uv.x >> base_lod) + 64) >> 7, wrap_s, size_s >> base_lod),
  648. WrapCoord(((uv.y >> base_lod) + 64) >> 7, wrap_t, size_t >> base_lod));
  649. int2 frac_uv = int2(((uv.x >> base_lod) - 64) & 0x7f, ((uv.y >> base_lod) - 64) & 0x7f);
  650. int4 result = readTextureLinear(tex, texuv1, texuv2, layer, base_lod, frac_uv);
  651. if (frac_lod != 0 && mipmap_linear) {{
  652. texuv1 = uint2(
  653. WrapCoord(((uv.x >> (base_lod + 1)) - 64) >> 7, wrap_s, size_s >> (base_lod + 1)),
  654. WrapCoord(((uv.y >> (base_lod + 1)) - 64) >> 7, wrap_t, size_t >> (base_lod + 1)));
  655. texuv2 = uint2(
  656. WrapCoord(((uv.x >> (base_lod + 1)) + 64) >> 7, wrap_s, size_s >> (base_lod + 1)),
  657. WrapCoord(((uv.y >> (base_lod + 1)) + 64) >> 7, wrap_t, size_t >> (base_lod + 1)));
  658. frac_uv = int2(((uv.x >> (base_lod + 1)) - 64) & 0x7f, ((uv.y >> (base_lod + 1)) - 64) & 0x7f);
  659. result *= 16 - frac_lod;
  660. result += readTextureLinear(tex, texuv1, texuv2, layer, base_lod + 1, frac_uv) * frac_lod;
  661. result >>= 4;
  662. }}
  663. return result;
  664. }} else {{
  665. uint2 texuv = uint2(
  666. WrapCoord(uv.x >> (7 + base_lod), wrap_s, size_s >> base_lod),
  667. WrapCoord(uv.y >> (7 + base_lod), wrap_t, size_t >> base_lod));
  668. int4 result = readTexture(tex, texuv.x, texuv.y, layer, base_lod);
  669. if (frac_lod != 0 && mipmap_linear) {{
  670. texuv = uint2(
  671. WrapCoord(uv.x >> (7 + base_lod + 1), wrap_s, size_s >> (base_lod + 1)),
  672. WrapCoord(uv.y >> (7 + base_lod + 1), wrap_t, size_t >> (base_lod + 1)));
  673. result *= 16 - frac_lod;
  674. result += readTexture(tex, texuv.x, texuv.y, layer, base_lod + 1) * frac_lod;
  675. result >>= 4;
  676. }}
  677. return result;
  678. }}
  679. }}
  680. )");
  681. }
  682. }
  683. void WriteCustomShaderStructImpl(ShaderCode* out, u32 num_stages, bool per_pixel_lighting,
  684. const pixel_shader_uid_data* uid_data)
  685. {
  686. out->Write("\tCustomShaderData custom_data;\n");
  687. if (per_pixel_lighting)
  688. {
  689. out->Write("\tcustom_data.position = WorldPos;\n");
  690. out->Write("\tcustom_data.normal = Normal;\n");
  691. }
  692. else
  693. {
  694. out->Write("\tcustom_data.position = float3(0, 0, 0);\n");
  695. out->Write("\tcustom_data.normal = float3(0, 0, 0);\n");
  696. }
  697. if (uid_data->genMode_numtexgens == 0) [[unlikely]]
  698. {
  699. out->Write("\tcustom_data.texcoord[0] = float3(0, 0, 0);\n");
  700. }
  701. else
  702. {
  703. for (u32 i = 0; i < uid_data->genMode_numtexgens; ++i)
  704. {
  705. out->Write("\tif (tex{0}.z == 0.0)\n", i);
  706. out->Write("\t{{\n");
  707. out->Write("\t\tcustom_data.texcoord[{0}] = tex{0};\n", i);
  708. out->Write("\t}}\n");
  709. out->Write("\telse {{\n");
  710. out->Write("\t\tcustom_data.texcoord[{0}] = float3(tex{0}.xy / tex{0}.z, 0);\n", i);
  711. out->Write("\t}}\n");
  712. }
  713. }
  714. for (u32 i = 0; i < 8; i++)
  715. {
  716. // Shader compilation complains if every index isn't initialized
  717. out->Write("\tcustom_data.texmap_to_texcoord_index[{0}] = 0;\n", i);
  718. }
  719. for (u32 i = 0; i < uid_data->genMode_numindstages; ++i)
  720. {
  721. if ((uid_data->nIndirectStagesUsed & (1U << i)) != 0)
  722. {
  723. u32 texcoord = uid_data->GetTevindirefCoord(i);
  724. const u32 texmap = uid_data->GetTevindirefMap(i);
  725. // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
  726. // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
  727. // This affects the Mario portrait in Luigi's Mansion, where the developers forgot to set
  728. // the number of tex gens to 2 (bug 11462).
  729. if (texcoord >= uid_data->genMode_numtexgens)
  730. texcoord = 0;
  731. out->Write("\tcustom_data.texmap_to_texcoord_index[{}] = {};\n", texmap, texcoord);
  732. }
  733. }
  734. out->Write("\tcustom_data.texcoord_count = {};\n", uid_data->genMode_numtexgens);
  735. // Try and do a best guess on what the texcoord index is
  736. // Note: one issue with this would be textures that are used
  737. // multiple times in the same draw but with different texture coordinates.
  738. // In that scenario, only the last texture coordinate would be defined.
  739. // This issue can be seen in how Rogue Squadron 2 does bump mapping
  740. for (u32 i = 0; i < num_stages; i++)
  741. {
  742. auto& tevstage = uid_data->stagehash[i];
  743. // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
  744. // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
  745. u32 texcoord = tevstage.tevorders_texcoord;
  746. const bool has_tex_coord = texcoord < uid_data->genMode_numtexgens;
  747. if (!has_tex_coord)
  748. texcoord = 0;
  749. out->Write("\tcustom_data.texmap_to_texcoord_index[{}] = {};\n", tevstage.tevorders_texmap,
  750. texcoord);
  751. }
  752. if (per_pixel_lighting)
  753. GenerateCustomLightingImplementation(out, uid_data->lighting, "colors_");
  754. for (u32 i = 0; i < 16; i++)
  755. {
  756. // Shader compilation complains if every struct isn't initialized
  757. // Color Input
  758. for (u32 j = 0; j < 4; j++)
  759. {
  760. out->Write("\tcustom_data.tev_stages[{}].input_color[{}].input_type = "
  761. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_UNUSED;\n",
  762. i, j);
  763. out->Write("\tcustom_data.tev_stages[{}].input_color[{}].value = "
  764. "float3(0, 0, 0);\n",
  765. i, j);
  766. }
  767. // Alpha Input
  768. for (u32 j = 0; j < 4; j++)
  769. {
  770. out->Write("\tcustom_data.tev_stages[{}].input_alpha[{}].input_type = "
  771. "CUSTOM_SHADER_TEV_STAGE_INPUT_TYPE_UNUSED;\n",
  772. i, j);
  773. out->Write("\tcustom_data.tev_stages[{}].input_alpha[{}].value = "
  774. "float(0);\n",
  775. i, j);
  776. }
  777. // Texmap
  778. out->Write("\tcustom_data.tev_stages[{}].texmap = 0u;\n", i);
  779. // Output
  780. out->Write("\tcustom_data.tev_stages[{}].output_color = "
  781. "float4(0, 0, 0, 0);\n",
  782. i);
  783. }
  784. // Actual data will be filled out in the tev stage code, just set the
  785. // stage count for now
  786. out->Write("\tcustom_data.tev_stage_count = {};\n", num_stages);
  787. // Time
  788. out->Write("\tcustom_data.time_ms = time_ms;\n");
  789. }
// Forward declarations of the file-local helpers used by GeneratePixelShaderCode. Each one
// appends a section of the generated pixel shader to `out`.

// Emits the code for TEV stage `n`; GeneratePixelShaderCode calls it once per stage.
static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n,
                       APIType api_type, bool stereo, bool has_custom_shaders);
// NOTE(review): no call site is visible in this part of the file; judging by the name and
// parameters this presumably emits a regular TEV combiner operation - confirm at the definition.
static void WriteTevRegular(ShaderCode& out, std::string_view components, TevBias bias, TevOp op,
                            bool clamp, TevScale scale);
// Emits the alpha test. GeneratePixelShaderCode only calls it when the pretest result is
// undetermined, or when the pretest fails and a late depth test is in use.
static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType api_type,
                           bool per_pixel_depth, bool use_dual_source);
// Emits the fog section; called after the depth and dithering code has been written.
static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data);
// Called when uid_data->logic_op_enable is set.
static void WriteLogicOp(ShaderCode& out, const pixel_shader_uid_data* uid_data);
// Called instead of WriteLogicOp when only uid_data->emulate_logic_op_with_blend is set.
static void WriteLogicOpBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data);
// Writes the color and alpha values to the framebuffer outputs.
static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
                       bool use_dual_source);
// Called after WriteColor when uid_data->blend_enable is set.
static void WriteBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data);
  802. ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& host_config,
  803. const pixel_shader_uid_data* uid_data,
  804. const CustomPixelShaderContents& custom_details)
  805. {
  806. ShaderCode out;
  807. const bool per_pixel_lighting = g_ActiveConfig.bEnablePixelLighting;
  808. const bool msaa = host_config.msaa;
  809. const bool ssaa = host_config.ssaa;
  810. const bool stereo = host_config.stereo;
  811. const u32 numStages = uid_data->genMode_numtevstages + 1;
  812. out.Write("// Pixel Shader for TEV stages\n");
  813. out.Write("// {} TEV stages, {} texgens, {} IND stages\n", numStages,
  814. uid_data->genMode_numtexgens, uid_data->genMode_numindstages);
  815. // Stuff that is shared between ubershaders and pixelgen.
  816. WriteBitfieldExtractHeader(out, api_type, host_config);
  817. WritePixelShaderCommonHeader(out, api_type, host_config, uid_data->bounding_box, custom_details);
  818. // Custom shader details
  819. WriteCustomShaderStructDef(&out, uid_data->genMode_numtexgens);
  820. for (std::size_t i = 0; i < custom_details.shaders.size(); i++)
  821. {
  822. const auto& shader_details = custom_details.shaders[i];
  823. out.Write(fmt::runtime(shader_details.custom_shader), i);
  824. }
  825. out.Write("\n#define sampleTextureWrapper(texmap, uv, layer) "
  826. "sampleTexture(texmap, samp[texmap], uv, layer)\n");
  827. if (uid_data->ztest == EmulatedZ::ForcedEarly)
  828. {
  829. // Zcomploc (aka early_ztest) is a way to control whether depth test is done before
  830. // or after texturing and alpha test. PC graphics APIs used to provide no way to emulate
  831. // this feature properly until 2012: Depth tests were always done after alpha testing.
  832. // Most importantly, it was not possible to write to the depth buffer without also writing
  833. // a color value (unless color writing was disabled altogether).
  834. // OpenGL 4.2 actually provides two extensions which can force an early z test:
  835. // * ARB_image_load_store has 'layout(early_fragment_tests)' which forces the driver to do z
  836. // and stencil tests early.
  837. // * ARB_conservative_depth has 'layout(depth_unchanged) which signals to the driver that it
  838. // can make optimisations
  839. // which assume the pixel shader won't update the depth buffer.
  840. // early_fragment_tests is the best option, as it requires the driver to do early-z and defines
  841. // early-z exactly as
  842. // we expect, with discard causing the shader to exit with only the depth buffer updated.
  843. // Conservative depth's 'depth_unchanged' only hints to the driver that an early-z optimisation
  844. // can be made and
  845. // doesn't define what will happen if we discard the fragment. But the way modern graphics
  846. // hardware is implemented
  847. // means it is not unreasonable to expect the same behaviour as early_fragment_tests.
  848. // We can also assume that if a driver has gone out of its way to support conservative depth and
  849. // not image_load_store
  850. // as required by OpenGL 4.2 that it will be doing the optimisation.
  851. // If the driver doesn't actually do an early z optimisation, ZCompLoc will be broken and depth
  852. // will only be written
  853. // if the alpha test passes.
  854. // We support Conservative as a fallback, because many drivers based on Mesa haven't implemented
  855. // all of the
  856. // ARB_image_load_store extension yet.
  857. // This is a #define which signals whatever early-z method the driver supports.
  858. out.Write("FORCE_EARLY_Z; \n");
  859. }
  860. const bool use_framebuffer_fetch = uid_data->blend_enable || uid_data->logic_op_enable ||
  861. uid_data->ztest == EmulatedZ::EarlyWithFBFetch;
  862. #ifdef __APPLE__
  863. // Framebuffer fetch is only supported by Metal, so ensure that we're running Vulkan (MoltenVK)
  864. // if we want to use it.
  865. if (api_type == APIType::Vulkan || api_type == APIType::Metal)
  866. {
  867. if (!uid_data->no_dual_src)
  868. {
  869. out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 {};\n"
  870. "FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n",
  871. use_framebuffer_fetch ? "real_ocol0" : "ocol0");
  872. }
  873. else
  874. {
  875. // Metal doesn't support a single unified variable for both input and output,
  876. // so when using framebuffer fetch, we declare the input separately below.
  877. out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 {};\n",
  878. use_framebuffer_fetch ? "real_ocol0" : "ocol0");
  879. }
  880. if (use_framebuffer_fetch)
  881. {
  882. // Subpass inputs will be converted to framebuffer fetch by SPIRV-Cross.
  883. out.Write("INPUT_ATTACHMENT_BINDING(0, 0, 0) uniform subpassInput in_ocol0;\n");
  884. }
  885. }
  886. else
  887. #endif
  888. {
  889. if (use_framebuffer_fetch)
  890. {
  891. out.Write("FRAGMENT_OUTPUT_LOCATION(0) FRAGMENT_INOUT vec4 real_ocol0;\n");
  892. }
  893. else
  894. {
  895. out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out {} ocol0;\n",
  896. uid_data->uint_output ? "uvec4" : "vec4");
  897. }
  898. if (!uid_data->no_dual_src)
  899. {
  900. out.Write("{} out {} ocol1;\n", "FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1)",
  901. uid_data->uint_output ? "uvec4" : "vec4");
  902. }
  903. }
  904. if (uid_data->per_pixel_depth)
  905. out.Write("#define depth gl_FragDepth\n");
  906. if (host_config.backend_geometry_shaders)
  907. {
  908. out.Write("VARYING_LOCATION(0) in VertexData {{\n");
  909. GenerateVSOutputMembers(out, api_type, uid_data->genMode_numtexgens, host_config,
  910. GetInterpolationQualifier(msaa, ssaa, true, true), ShaderStage::Pixel);
  911. out.Write("}};\n");
  912. if (stereo && !host_config.backend_gl_layer_in_fs)
  913. out.Write("flat in int layer;");
  914. }
  915. else
  916. {
  917. // Let's set up attributes
  918. u32 counter = 0;
  919. out.Write("VARYING_LOCATION({}) {} in float4 colors_0;\n", counter++,
  920. GetInterpolationQualifier(msaa, ssaa));
  921. out.Write("VARYING_LOCATION({}) {} in float4 colors_1;\n", counter++,
  922. GetInterpolationQualifier(msaa, ssaa));
  923. for (u32 i = 0; i < uid_data->genMode_numtexgens; ++i)
  924. {
  925. out.Write("VARYING_LOCATION({}) {} in float3 tex{};\n", counter++,
  926. GetInterpolationQualifier(msaa, ssaa), i);
  927. }
  928. if (!host_config.fast_depth_calc)
  929. {
  930. out.Write("VARYING_LOCATION({}) {} in float4 clipPos;\n", counter++,
  931. GetInterpolationQualifier(msaa, ssaa));
  932. }
  933. if (per_pixel_lighting)
  934. {
  935. out.Write("VARYING_LOCATION({}) {} in float3 Normal;\n", counter++,
  936. GetInterpolationQualifier(msaa, ssaa));
  937. out.Write("VARYING_LOCATION({}) {} in float3 WorldPos;\n", counter++,
  938. GetInterpolationQualifier(msaa, ssaa));
  939. }
  940. }
  941. out.Write("void main()\n{{\n");
  942. out.Write("\tfloat4 rawpos = gl_FragCoord;\n");
  943. bool has_custom_shaders = false;
  944. if (std::any_of(custom_details.shaders.begin(), custom_details.shaders.end(),
  945. [](const std::optional<CustomPixelShader>& ps) { return ps.has_value(); }))
  946. {
  947. WriteCustomShaderStructImpl(&out, numStages, per_pixel_lighting, uid_data);
  948. has_custom_shaders = true;
  949. }
  950. if (use_framebuffer_fetch)
  951. {
  952. // Store off a copy of the initial framebuffer value.
  953. //
  954. // If FB_FETCH_VALUE isn't defined (i.e. no special keyword for fetching from the
  955. // framebuffer), we read from real_ocol0.
  956. out.Write("#ifdef FB_FETCH_VALUE\n"
  957. "\tfloat4 initial_ocol0 = FB_FETCH_VALUE;\n"
  958. "#else\n"
  959. "\tfloat4 initial_ocol0 = real_ocol0;\n"
  960. "#endif\n");
  961. // QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an
  962. // intermediate value with multiple reads & modifications, so we pull out the "real" output
  963. // value above and use a temporary for calculations, then set the output value once at the
  964. // end of the shader.
  965. out.Write("\tfloat4 ocol0;\n");
  966. }
  967. if (uid_data->blend_enable)
  968. {
  969. out.Write("\tfloat4 ocol1;\n");
  970. }
  971. if (host_config.backend_geometry_shaders && stereo)
  972. {
  973. if (host_config.backend_gl_layer_in_fs)
  974. out.Write("\tint layer = gl_Layer;\n");
  975. }
  976. else
  977. {
  978. out.Write("\tint layer = 0;\n");
  979. }
  980. out.Write("\tint4 c0 = " I_COLORS "[1], c1 = " I_COLORS "[2], c2 = " I_COLORS
  981. "[3], prev = " I_COLORS "[0];\n"
  982. "\tint4 rastemp = int4(0, 0, 0, 0), textemp = int4(0, 0, 0, 0), konsttemp = int4(0, 0, "
  983. "0, 0);\n"
  984. "\tint3 comp16 = int3(1, 256, 0), comp24 = int3(1, 256, 256*256);\n"
  985. "\tint alphabump=0;\n"
  986. "\tint3 tevcoord=int3(0, 0, 0);\n"
  987. "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n"
  988. "\tint4 "
  989. "tevin_a=int4(0,0,0,0),tevin_b=int4(0,0,0,0),tevin_c=int4(0,0,0,0),tevin_d=int4(0,0,0,"
  990. "0);\n\n"); // tev combiner inputs
  991. // On GLSL, input variables must not be assigned to.
  992. // This is why we declare these variables locally instead.
  993. out.Write("\tfloat4 col0 = colors_0;\n"
  994. "\tfloat4 col1 = colors_1;\n");
  995. if (per_pixel_lighting)
  996. {
  997. out.Write("\tfloat3 _normal = normalize(Normal.xyz);\n\n"
  998. "\tfloat3 pos = WorldPos;\n");
  999. out.Write("\tint4 lacc;\n"
  1000. "\tfloat3 ldir, h, cosAttn, distAttn;\n"
  1001. "\tfloat dist, dist2, attn;\n");
  1002. // TODO: Our current constant usage code isn't able to handle more than one buffer.
  1003. // So we can't mark the VS constant as used here. But keep them here as reference.
  1004. // out.SetConstantsUsed(C_PLIGHT_COLORS, C_PLIGHT_COLORS+7); // TODO: Can be optimized further
  1005. // out.SetConstantsUsed(C_PLIGHTS, C_PLIGHTS+31); // TODO: Can be optimized further
  1006. // out.SetConstantsUsed(C_PMATERIALS, C_PMATERIALS+3);
  1007. GenerateLightingShaderCode(out, uid_data->lighting, "colors_", "col");
  1008. // The number of colors available to TEV is determined by numColorChans.
  1009. // Normally this is performed in the vertex shader after lighting, but with per-pixel lighting,
  1010. // we need to perform it here. (It needs to be done after lighting, as what was originally
  1011. // black might become a different color after lighting).
  1012. if (uid_data->numColorChans == 0)
  1013. out.Write("col0 = float4(0.0, 0.0, 0.0, 0.0);\n");
  1014. if (uid_data->numColorChans <= 1)
  1015. out.Write("col1 = float4(0.0, 0.0, 0.0, 0.0);\n");
  1016. }
  1017. if (uid_data->genMode_numtexgens == 0)
  1018. {
  1019. // TODO: This is a hack to ensure that shaders still compile when setting out of bounds tex
  1020. // coord indices to 0. Ideally, it shouldn't exist at all, but the exact behavior hasn't been
  1021. // tested.
  1022. out.Write("\tint2 fixpoint_uv0 = int2(0, 0);\n\n");
  1023. }
  1024. else
  1025. {
  1026. out.SetConstantsUsed(C_TEXDIMS, C_TEXDIMS + uid_data->genMode_numtexgens - 1);
  1027. for (u32 i = 0; i < uid_data->genMode_numtexgens; ++i)
  1028. {
  1029. out.Write("\tint2 fixpoint_uv{} = int2(", i);
  1030. out.Write("(tex{}.z == 0.0 ? tex{}.xy : tex{}.xy / tex{}.z)", i, i, i, i);
  1031. out.Write(" * float2(" I_TEXDIMS "[{}].zw * 128));\n", i);
  1032. // TODO: S24 overflows here?
  1033. }
  1034. }
  1035. for (u32 i = 0; i < uid_data->genMode_numindstages; ++i)
  1036. {
  1037. if ((uid_data->nIndirectStagesUsed & (1U << i)) != 0)
  1038. {
  1039. u32 texcoord = uid_data->GetTevindirefCoord(i);
  1040. const u32 texmap = uid_data->GetTevindirefMap(i);
  1041. // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
  1042. // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
  1043. // This affects the Mario portrait in Luigi's Mansion, where the developers forgot to set
  1044. // the number of tex gens to 2 (bug 11462).
  1045. if (texcoord >= uid_data->genMode_numtexgens)
  1046. texcoord = 0;
  1047. out.SetConstantsUsed(C_INDTEXSCALE + i / 2, C_INDTEXSCALE + i / 2);
  1048. out.Write("\ttempcoord = fixpoint_uv{} >> " I_INDTEXSCALE "[{}].{};\n", texcoord, i / 2,
  1049. (i & 1) ? "zw" : "xy");
  1050. out.Write("\tint3 iindtex{0} = sampleTextureWrapper({1}u, tempcoord, layer).abg;\n", i,
  1051. texmap);
  1052. }
  1053. }
  1054. for (u32 i = 0; i < numStages; i++)
  1055. {
  1056. // Build the equation for this stage
  1057. WriteStage(out, uid_data, i, api_type, stereo, has_custom_shaders);
  1058. }
  1059. {
  1060. // The results of the last texenv stage are put onto the screen,
  1061. // regardless of the used destination register
  1062. TevStageCombiner::ColorCombiner last_cc;
  1063. TevStageCombiner::AlphaCombiner last_ac;
  1064. last_cc.hex = uid_data->stagehash[uid_data->genMode_numtevstages].cc;
  1065. last_ac.hex = uid_data->stagehash[uid_data->genMode_numtevstages].ac;
  1066. if (last_cc.dest != TevOutput::Prev)
  1067. {
  1068. out.Write("\tprev.rgb = {};\n", tev_c_output_table[last_cc.dest]);
  1069. }
  1070. if (last_ac.dest != TevOutput::Prev)
  1071. {
  1072. out.Write("\tprev.a = {};\n", tev_a_output_table[last_ac.dest]);
  1073. }
  1074. }
  1075. out.Write("\tprev = prev & 255;\n");
  1076. // NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
  1077. // (in this case we need to write a depth value if depth test passes regardless of the alpha
  1078. // testing result)
  1079. if (uid_data->Pretest == AlphaTestResult::Undetermined ||
  1080. (uid_data->Pretest == AlphaTestResult::Fail && uid_data->ztest == EmulatedZ::Late))
  1081. {
  1082. WriteAlphaTest(out, uid_data, api_type, uid_data->per_pixel_depth,
  1083. !uid_data->no_dual_src || uid_data->blend_enable);
  1084. }
  1085. // This situation is important for Mario Kart Wii's menus (they will render incorrectly if the
  1086. // alpha test for the FMV in the background fails, since they depend on depth for drawing a yellow
  1087. // border) and Fortune Street's gameplay (where a rectangle with an alpha value of 1 is drawn over
  1088. // the center of the screen several times, but those rectangles shouldn't be visible).
  1089. // Blending seems to result in no changes to the output with an alpha of 1, even if the input
  1090. // color is white.
  1091. // TODO: Investigate this further: we might be handling blending incorrectly in general (though
  1092. // there might not be any good way of changing blending behavior)
  1093. out.Write("\t// Hardware testing indicates that an alpha of 1 can pass an alpha test,\n"
  1094. "\t// but doesn't do anything in blending\n"
  1095. "\tif (prev.a == 1) prev.a = 0;\n");
  1096. if (uid_data->zfreeze)
  1097. {
  1098. out.SetConstantsUsed(C_ZSLOPE, C_ZSLOPE);
  1099. out.SetConstantsUsed(C_EFBSCALE, C_EFBSCALE);
  1100. out.Write("\tfloat2 screenpos = rawpos.xy * " I_EFBSCALE ".xy;\n");
  1101. // Opengl has reversed vertical screenspace coordinates
  1102. if (api_type == APIType::OpenGL)
  1103. out.Write("\tscreenpos.y = {}.0 - screenpos.y;\n", EFB_HEIGHT);
  1104. out.Write("\tint zCoord = int(" I_ZSLOPE ".z + " I_ZSLOPE ".x * screenpos.x + " I_ZSLOPE
  1105. ".y * screenpos.y);\n");
  1106. }
  1107. else if (!host_config.fast_depth_calc)
  1108. {
  1109. // FastDepth means to trust the depth generated in perspective division.
  1110. // It should be correct, but it seems not to be as accurate as required. TODO: Find out why!
  1111. // For disabled FastDepth we just calculate the depth value again.
  1112. // The performance impact of this additional calculation doesn't matter, but it prevents
  1113. // the host GPU driver from performing any early depth test optimizations.
  1114. out.SetConstantsUsed(C_ZBIAS + 1, C_ZBIAS + 1);
  1115. // the screen space depth value = far z + (clip z / clip w) * z range
  1116. out.Write("\tint zCoord = " I_ZBIAS "[1].x + int((clipPos.z / clipPos.w) * float(" I_ZBIAS
  1117. "[1].y));\n");
  1118. }
  1119. else
  1120. {
  1121. if (!host_config.backend_reversed_depth_range)
  1122. out.Write("\tint zCoord = int((1.0 - rawpos.z) * 16777216.0);\n");
  1123. else
  1124. out.Write("\tint zCoord = int(rawpos.z * 16777216.0);\n");
  1125. }
  1126. out.Write("\tzCoord = clamp(zCoord, 0, 0xFFFFFF);\n");
  1127. // depth texture can safely be ignored if the result won't be written to the depth buffer
  1128. // (early_ztest) and isn't used for fog either
  1129. const bool skip_ztexture = !uid_data->per_pixel_depth && uid_data->fog_fsel == FogType::Off;
  1130. // Note: z-textures are not written to depth buffer if early depth test is used
  1131. const bool early_ztest = uid_data->ztest == EmulatedZ::Early ||
  1132. uid_data->ztest == EmulatedZ::EarlyWithFBFetch ||
  1133. uid_data->ztest == EmulatedZ::EarlyWithZComplocHack;
  1134. if (uid_data->per_pixel_depth && early_ztest)
  1135. {
  1136. if (!host_config.backend_reversed_depth_range)
  1137. out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
  1138. else
  1139. out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
  1140. }
  1141. // Note: depth texture output is only written to depth buffer if late depth test is used
  1142. // theoretical final depth value is used for fog calculation, though, so we have to emulate
  1143. // ztextures anyway
  1144. if (uid_data->ztex_op != ZTexOp::Disabled && !skip_ztexture)
  1145. {
  1146. // use the texture input of the last texture stage (textemp), hopefully this has been read and
  1147. // is in correct format...
  1148. out.SetConstantsUsed(C_ZBIAS, C_ZBIAS + 1);
  1149. out.Write("\tzCoord = idot(" I_ZBIAS "[0].xyzw, textemp.xyzw) + " I_ZBIAS "[1].w {};\n",
  1150. (uid_data->ztex_op == ZTexOp::Add) ? "+ zCoord" : "");
  1151. out.Write("\tzCoord = zCoord & 0xFFFFFF;\n");
  1152. }
  1153. if (uid_data->per_pixel_depth && uid_data->ztest == EmulatedZ::Late)
  1154. {
  1155. if (!host_config.backend_reversed_depth_range)
  1156. out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
  1157. else
  1158. out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
  1159. }
  1160. // No dithering for RGB8 mode
  1161. if (uid_data->dither)
  1162. {
  1163. // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering
  1164. // Here the matrix is encoded into the two factor constants
  1165. out.Write("\tint2 dither = int2(rawpos.xy) & 1;\n");
  1166. out.Write("\tprev.rgb = (prev.rgb - (prev.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2);\n");
  1167. }
  1168. WriteFog(out, uid_data);
  1169. for (std::size_t i = 0; i < custom_details.shaders.size(); i++)
  1170. {
  1171. const auto& shader_details = custom_details.shaders[i];
  1172. if (!shader_details.custom_shader.empty())
  1173. {
  1174. out.Write("\t{{\n");
  1175. out.Write("\t\tcustom_data.final_color = float4(prev.r / 255.0, prev.g / 255.0, prev.b "
  1176. "/ 255.0, prev.a / 255.0);\n");
  1177. out.Write("\t\tCustomShaderOutput custom_output = {}_{}(custom_data);\n",
  1178. CUSTOM_PIXELSHADER_COLOR_FUNC, i);
  1179. out.Write("\t\tprev = int4(custom_output.main_rt.r * 255, custom_output.main_rt.g * 255, "
  1180. "custom_output.main_rt.b * 255, custom_output.main_rt.a * 255);\n");
  1181. out.Write("\t}}\n\n");
  1182. }
  1183. }
  1184. if (uid_data->logic_op_enable)
  1185. WriteLogicOp(out, uid_data);
  1186. else if (uid_data->emulate_logic_op_with_blend)
  1187. WriteLogicOpBlend(out, uid_data);
  1188. // Write the color and alpha values to the framebuffer
  1189. // If using shader blend, we still use the separate alpha
  1190. const bool use_dual_source = !uid_data->no_dual_src || uid_data->blend_enable;
  1191. WriteColor(out, api_type, uid_data, use_dual_source);
  1192. if (uid_data->blend_enable)
  1193. WriteBlend(out, uid_data);
  1194. else if (use_framebuffer_fetch)
  1195. out.Write("\treal_ocol0 = ocol0;\n");
  1196. if (uid_data->bounding_box)
  1197. out.Write("\tUpdateBoundingBox(rawpos.xy);\n");
  1198. out.Write("}}\n");
  1199. return out;
  1200. }
  1201. static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n,
  1202. APIType api_type, bool stereo, bool has_custom_shaders)
  1203. {
  1204. using Common::EnumMap;
  1205. const auto& stage = uid_data->stagehash[n];
  1206. out.Write("\n\t// TEV stage {}\n", n);
  1207. // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does not
  1208. // exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
  1209. u32 texcoord = stage.tevorders_texcoord;
  1210. const bool has_tex_coord = texcoord < uid_data->genMode_numtexgens;
  1211. if (!has_tex_coord)
  1212. texcoord = 0;
  1213. {
  1214. const TevStageIndirect tevind{.hex = stage.tevind};
  1215. out.Write("\t// indirect op\n");
  1216. // Quirk: Referencing a stage above the number of ind stages is undefined behavior,
  1217. // and on console produces a noise pattern (details unknown).
  1218. // Instead, just skip applying the indirect operation, which is close enough.
  1219. // We need to do *something*, as there won't be an iindtex variable otherwise.
  1220. // Viewtiful Joe hits this case (bug 12525).
  1221. // Wrapping and add to previous still apply in this case (and when the stage is disabled).
  1222. const bool has_ind_stage = tevind.bt < uid_data->genMode_numindstages;
  1223. // Perform the indirect op on the incoming regular coordinates
  1224. // using iindtex{} as the offset coords
  1225. if (has_ind_stage && tevind.bs != IndTexBumpAlpha::Off)
  1226. {
  1227. static constexpr EnumMap<const char*, IndTexBumpAlpha::U> tev_ind_alpha_sel{
  1228. "",
  1229. "x",
  1230. "y",
  1231. "z",
  1232. };
  1233. // According to libogc, the bump alpha value is 5 bits, and comes from the bottom bits of the
  1234. // component byte, except in the case of ITF_8, which presumably uses the top bits with a
  1235. // mask.
  1236. // https://github.com/devkitPro/libogc/blob/bd24a9b3f59502f9b30d6bac0ae35fc485045f78/gc/ogc/gx.h#L3038-L3041
  1237. // https://github.com/devkitPro/libogc/blob/bd24a9b3f59502f9b30d6bac0ae35fc485045f78/gc/ogc/gx.h#L790-L800
  1238. static constexpr EnumMap<char, IndTexFormat::ITF_3> tev_ind_alpha_shift{
  1239. '0', // ITF_8: 0bXXXXXYYY -> 0bXXXXX000? No shift?
  1240. '5', // ITF_5: 0bIIIIIAAA -> 0bAAA00000, shift of 5
  1241. '4', // ITF_4: 0bIIIIAAAA -> 0bAAAA0000, shift of 4
  1242. '3', // ITF_3: 0bIIIAAAAA -> 0bAAAAA000, shift of 3
  1243. };
  1244. out.Write("\talphabump = (iindtex{}.{} << {}) & 248;\n", tevind.bt,
  1245. tev_ind_alpha_sel[tevind.bs], tev_ind_alpha_shift[tevind.fmt]);
  1246. }
  1247. else
  1248. {
  1249. // TODO: Should we reset alphabump to 0 here?
  1250. }
  1251. if (has_ind_stage && tevind.matrix_index != IndMtxIndex::Off)
  1252. {
  1253. // format
  1254. static constexpr EnumMap<char, IndTexFormat::ITF_3> tev_ind_fmt_shift{
  1255. '0', // ITF_8: 0bXXXXXXXX -> 0bXXXXXXXX, no shift
  1256. '3', // ITF_5: 0bIIIIIAAA -> 0b000IIIII, shift of 3
  1257. '4', // ITF_4: 0bIIIIAAAA -> 0b0000IIII, shift of 4
  1258. '5', // ITF_3: 0bIIIAAAAA -> 0b00000III, shift of 5
  1259. };
  1260. out.Write("\tint3 iindtevcrd{} = iindtex{} >> {};\n", n, tevind.bt,
  1261. tev_ind_fmt_shift[tevind.fmt]);
  1262. // bias - TODO: Check if this needs to be this complicated...
  1263. // indexed by bias
  1264. static constexpr EnumMap<const char*, IndTexBias::STU> tev_ind_bias_field{
  1265. "", "x", "y", "xy", "z", "xz", "yz", "xyz",
  1266. };
  1267. // indexed by fmt
  1268. static constexpr EnumMap<const char*, IndTexFormat::ITF_3> tev_ind_bias_add{
  1269. "-128",
  1270. "1",
  1271. "1",
  1272. "1",
  1273. };
  1274. if (tevind.bias == IndTexBias::S || tevind.bias == IndTexBias::T ||
  1275. tevind.bias == IndTexBias::U)
  1276. {
  1277. out.Write("\tiindtevcrd{}.{} += int({});\n", n, tev_ind_bias_field[tevind.bias],
  1278. tev_ind_bias_add[tevind.fmt]);
  1279. }
  1280. else if (tevind.bias == IndTexBias::ST || tevind.bias == IndTexBias::SU ||
  1281. tevind.bias == IndTexBias::TU_)
  1282. {
  1283. out.Write("\tiindtevcrd{0}.{1} += int2({2}, {2});\n", n, tev_ind_bias_field[tevind.bias],
  1284. tev_ind_bias_add[tevind.fmt]);
  1285. }
  1286. else if (tevind.bias == IndTexBias::STU)
  1287. {
  1288. out.Write("\tiindtevcrd{0}.{1} += int3({2}, {2}, {2});\n", n,
  1289. tev_ind_bias_field[tevind.bias], tev_ind_bias_add[tevind.fmt]);
  1290. }
  1291. // Multiplied by 2 because each matrix has two rows.
  1292. // Note also that the 4th column of the matrix contains the scale factor.
  1293. const u32 mtxidx = 2 * (static_cast<u32>(tevind.matrix_index.Value()) - 1);
  1294. // multiply by offset matrix and scale - calculations are likely to overflow badly,
  1295. // yet it works out since we only care about the lower 23 bits (+1 sign bit) of the result
  1296. if (tevind.matrix_id == IndMtxId::Indirect)
  1297. {
  1298. out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
  1299. out.Write("\tint2 indtevtrans{} = int2(idot(" I_INDTEXMTX
  1300. "[{}].xyz, iindtevcrd{}), idot(" I_INDTEXMTX "[{}].xyz, iindtevcrd{})) >> 3;\n",
  1301. n, mtxidx, n, mtxidx + 1, n);
  1302. // TODO: should use a shader uid branch for this for better performance
  1303. if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
  1304. {
  1305. out.Write("\tint indtexmtx_w_inverse_{} = -" I_INDTEXMTX "[{}].w;\n", n, mtxidx);
  1306. out.Write("\tif (" I_INDTEXMTX "[{}].w >= 0) indtevtrans{} >>= " I_INDTEXMTX "[{}].w;\n",
  1307. mtxidx, n, mtxidx);
  1308. out.Write("\telse indtevtrans{} <<= indtexmtx_w_inverse_{};\n", n, n);
  1309. }
  1310. else
  1311. {
  1312. out.Write("\tif (" I_INDTEXMTX "[{}].w >= 0) indtevtrans{} >>= " I_INDTEXMTX "[{}].w;\n",
  1313. mtxidx, n, mtxidx);
  1314. out.Write("\telse indtevtrans{} <<= (-" I_INDTEXMTX "[{}].w);\n", n, mtxidx);
  1315. }
  1316. }
  1317. else if (tevind.matrix_id == IndMtxId::S)
  1318. {
  1319. ASSERT(has_tex_coord);
  1320. out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
  1321. out.Write("\tint2 indtevtrans{} = int2(fixpoint_uv{} * iindtevcrd{}.xx) >> 8;\n", n,
  1322. texcoord, n);
  1323. if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
  1324. {
  1325. out.Write("\tint indtexmtx_w_inverse_{} = -" I_INDTEXMTX "[{}].w;\n", n, mtxidx);
  1326. out.Write("\tif (" I_INDTEXMTX "[{}].w >= 0) indtevtrans{} >>= " I_INDTEXMTX "[{}].w;\n",
  1327. mtxidx, n, mtxidx);
  1328. out.Write("\telse indtevtrans{} <<= (indtexmtx_w_inverse_{});\n", n, n);
  1329. }
  1330. else
  1331. {
  1332. out.Write("\tif (" I_INDTEXMTX "[{}].w >= 0) indtevtrans{} >>= " I_INDTEXMTX "[{}].w;\n",
  1333. mtxidx, n, mtxidx);
  1334. out.Write("\telse indtevtrans{} <<= (-" I_INDTEXMTX "[{}].w);\n", n, mtxidx);
  1335. }
  1336. }
  1337. else if (tevind.matrix_id == IndMtxId::T)
  1338. {
  1339. ASSERT(has_tex_coord);
  1340. out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
  1341. out.Write("\tint2 indtevtrans{} = int2(fixpoint_uv{} * iindtevcrd{}.yy) >> 8;\n", n,
  1342. texcoord, n);
  1343. if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
  1344. {
  1345. out.Write("\tint indtexmtx_w_inverse_{} = -" I_INDTEXMTX "[{}].w;\n", n, mtxidx);
  1346. out.Write("\tif (" I_INDTEXMTX "[{}].w >= 0) indtevtrans{} >>= " I_INDTEXMTX "[{}].w;\n",
  1347. mtxidx, n, mtxidx);
  1348. out.Write("\telse indtevtrans{} <<= (indtexmtx_w_inverse_{});\n", n, n);
  1349. }
  1350. else
  1351. {
  1352. out.Write("\tif (" I_INDTEXMTX "[{}].w >= 0) indtevtrans{} >>= " I_INDTEXMTX "[{}].w;\n",
  1353. mtxidx, n, mtxidx);
  1354. out.Write("\telse indtevtrans{} <<= (-" I_INDTEXMTX "[{}].w);\n", n, mtxidx);
  1355. }
  1356. }
  1357. else
  1358. {
  1359. out.Write("\tint2 indtevtrans{} = int2(0, 0);\n", n);
  1360. ASSERT(false); // Unknown value for matrix_id
  1361. }
  1362. }
  1363. else
  1364. {
  1365. out.Write("\tint2 indtevtrans{} = int2(0, 0);\n", n);
  1366. if (tevind.matrix_index == IndMtxIndex::Off)
  1367. {
  1368. // If matrix_index is Off (0), matrix_id should be Indirect (0)
  1369. ASSERT(tevind.matrix_id == IndMtxId::Indirect);
  1370. }
  1371. }
  1372. // ---------
  1373. // Wrapping
  1374. // ---------
  1375. static constexpr std::array<const char*, 5> tev_ind_wrap_start{
  1376. "(256<<7)", "(128<<7)", "(64<<7)", "(32<<7)", "(16<<7)",
  1377. };
  1378. // wrap S
  1379. if (tevind.sw == IndTexWrap::ITW_OFF)
  1380. {
  1381. out.Write("\twrappedcoord.x = fixpoint_uv{}.x;\n", texcoord);
  1382. }
  1383. else if (tevind.sw >= IndTexWrap::ITW_0) // 7 (Invalid) appears to behave the same as 6 (ITW_0)
  1384. {
  1385. out.Write("\twrappedcoord.x = 0;\n");
  1386. }
  1387. else
  1388. {
  1389. out.Write("\twrappedcoord.x = fixpoint_uv{}.x & ({} - 1);\n", texcoord,
  1390. tev_ind_wrap_start[u32(tevind.sw.Value()) - u32(IndTexWrap::ITW_256)]);
  1391. }
  1392. // wrap T
  1393. if (tevind.tw == IndTexWrap::ITW_OFF)
  1394. {
  1395. out.Write("\twrappedcoord.y = fixpoint_uv{}.y;\n", texcoord);
  1396. }
  1397. else if (tevind.tw >= IndTexWrap::ITW_0) // 7 (Invalid) appears to behave the same as 6 (ITW_0)
  1398. {
  1399. out.Write("\twrappedcoord.y = 0;\n");
  1400. }
  1401. else
  1402. {
  1403. out.Write("\twrappedcoord.y = fixpoint_uv{}.y & ({} - 1);\n", texcoord,
  1404. tev_ind_wrap_start[u32(tevind.tw.Value()) - u32(IndTexWrap::ITW_256)]);
  1405. }
  1406. if (tevind.fb_addprev) // add previous tevcoord
  1407. out.Write("\ttevcoord.xy += wrappedcoord + indtevtrans{};\n", n);
  1408. else
  1409. out.Write("\ttevcoord.xy = wrappedcoord + indtevtrans{};\n", n);
  1410. // Emulate s24 overflows
  1411. out.Write("\ttevcoord.xy = (tevcoord.xy << 8) >> 8;\n");
  1412. }
  1413. TevStageCombiner::ColorCombiner cc;
  1414. TevStageCombiner::AlphaCombiner ac;
  1415. cc.hex = stage.cc;
  1416. ac.hex = stage.ac;
  1417. if (cc.a == TevColorArg::RasAlpha || cc.a == TevColorArg::RasColor ||
  1418. cc.b == TevColorArg::RasAlpha || cc.b == TevColorArg::RasColor ||
  1419. cc.c == TevColorArg::RasAlpha || cc.c == TevColorArg::RasColor ||
  1420. cc.d == TevColorArg::RasAlpha || cc.d == TevColorArg::RasColor ||
  1421. ac.a == TevAlphaArg::RasAlpha || ac.b == TevAlphaArg::RasAlpha ||
  1422. ac.c == TevAlphaArg::RasAlpha || ac.d == TevAlphaArg::RasAlpha)
  1423. {
  1424. // Generate swizzle string to represent the Ras color channel swapping
  1425. out.Write("\trastemp = {}.{}{}{}{};\n", tev_ras_table[stage.tevorders_colorchan],
  1426. rgba_swizzle[stage.ras_swap_r], rgba_swizzle[stage.ras_swap_g],
  1427. rgba_swizzle[stage.ras_swap_b], rgba_swizzle[stage.ras_swap_a]);
  1428. }
  1429. if (stage.tevorders_enable && uid_data->genMode_numtexgens > 0)
  1430. {
  1431. // Generate swizzle string to represent the texture color channel swapping
  1432. out.Write("\ttextemp = sampleTextureWrapper({}u, tevcoord.xy, layer).{}{}{}{};\n",
  1433. stage.tevorders_texmap, rgba_swizzle[stage.tex_swap_r],
  1434. rgba_swizzle[stage.tex_swap_g], rgba_swizzle[stage.tex_swap_b],
  1435. rgba_swizzle[stage.tex_swap_a]);
  1436. }
  1437. else if (uid_data->genMode_numtexgens == 0)
  1438. {
  1439. // It seems like the result is always black when no tex coords are enabled, but further testing
  1440. // is needed.
  1441. out.Write("\ttextemp = int4(0, 0, 0, 0);\n");
  1442. }
  1443. else
  1444. {
  1445. out.Write("\ttextemp = int4(255, 255, 255, 255);\n");
  1446. }
  1447. if (cc.a == TevColorArg::Konst || cc.b == TevColorArg::Konst || cc.c == TevColorArg::Konst ||
  1448. cc.d == TevColorArg::Konst || ac.a == TevAlphaArg::Konst || ac.b == TevAlphaArg::Konst ||
  1449. ac.c == TevAlphaArg::Konst || ac.d == TevAlphaArg::Konst)
  1450. {
  1451. out.Write("\tkonsttemp = int4({}, {});\n", tev_ksel_table_c[stage.tevksel_kc],
  1452. tev_ksel_table_a[stage.tevksel_ka]);
  1453. if (u32(stage.tevksel_kc) > 7)
  1454. {
  1455. out.SetConstantsUsed(C_KCOLORS + ((u32(stage.tevksel_kc) - 0xc) % 4),
  1456. C_KCOLORS + ((u32(stage.tevksel_kc) - 0xc) % 4));
  1457. }
  1458. if (u32(stage.tevksel_ka) > 7)
  1459. {
  1460. out.SetConstantsUsed(C_KCOLORS + ((u32(stage.tevksel_ka) - 0xc) % 4),
  1461. C_KCOLORS + ((u32(stage.tevksel_ka) - 0xc) % 4));
  1462. }
  1463. }
  1464. if (cc.d == TevColorArg::Color0 || cc.d == TevColorArg::Alpha0 || ac.d == TevAlphaArg::Alpha0)
  1465. out.SetConstantsUsed(C_COLORS + 1, C_COLORS + 1);
  1466. if (cc.d == TevColorArg::Color1 || cc.d == TevColorArg::Alpha1 || ac.d == TevAlphaArg::Alpha1)
  1467. out.SetConstantsUsed(C_COLORS + 2, C_COLORS + 2);
  1468. if (cc.d == TevColorArg::Color2 || cc.d == TevColorArg::Alpha2 || ac.d == TevAlphaArg::Alpha2)
  1469. out.SetConstantsUsed(C_COLORS + 3, C_COLORS + 3);
  1470. if (cc.dest >= TevOutput::Color0)
  1471. out.SetConstantsUsed(C_COLORS + u32(cc.dest.Value()), C_COLORS + u32(cc.dest.Value()));
  1472. if (ac.dest >= TevOutput::Color0)
  1473. out.SetConstantsUsed(C_COLORS + u32(ac.dest.Value()), C_COLORS + u32(ac.dest.Value()));
  1474. if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_VECTOR_BITWISE_AND))
  1475. {
  1476. out.Write("\ttevin_a = int4({} & 255, {} & 255);\n", tev_c_input_table[cc.a],
  1477. tev_a_input_table[ac.a]);
  1478. out.Write("\ttevin_b = int4({} & 255, {} & 255);\n", tev_c_input_table[cc.b],
  1479. tev_a_input_table[ac.b]);
  1480. out.Write("\ttevin_c = int4({} & 255, {} & 255);\n", tev_c_input_table[cc.c],
  1481. tev_a_input_table[ac.c]);
  1482. }
  1483. else
  1484. {
  1485. out.Write("\ttevin_a = int4({}, {})&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.a],
  1486. tev_a_input_table[ac.a]);
  1487. out.Write("\ttevin_b = int4({}, {})&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.b],
  1488. tev_a_input_table[ac.b]);
  1489. out.Write("\ttevin_c = int4({}, {})&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.c],
  1490. tev_a_input_table[ac.c]);
  1491. }
  1492. out.Write("\ttevin_d = int4({}, {});\n", tev_c_input_table[cc.d], tev_a_input_table[ac.d]);
  1493. out.Write("\t// color combine\n");
  1494. out.Write("\t{} = clamp(", tev_c_output_table[cc.dest]);
  1495. if (cc.bias != TevBias::Compare)
  1496. {
  1497. WriteTevRegular(out, "rgb", cc.bias, cc.op, cc.clamp, cc.scale);
  1498. }
  1499. else
  1500. {
  1501. static constexpr EnumMap<const char*, TevCompareMode::RGB8> tev_rgb_comparison_gt{
  1502. "((tevin_a.r > tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TevCompareMode::R8
  1503. "((idot(tevin_a.rgb, comp16) > idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // GR16
  1504. "((idot(tevin_a.rgb, comp24) > idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // BGR24
  1505. "(max(sign(tevin_a.rgb - tevin_b.rgb), int3(0,0,0)) * tevin_c.rgb)", // RGB8
  1506. };
  1507. static constexpr EnumMap<const char*, TevCompareMode::RGB8> tev_rgb_comparison_eq{
  1508. "((tevin_a.r == tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TevCompareMode::R8
  1509. "((idot(tevin_a.rgb,comp16) == idot(tevin_b.rgb,comp16)) ? tevin_c.rgb : int3(0,0,0))", // GR16
  1510. "((idot(tevin_a.rgb,comp24) == idot(tevin_b.rgb,comp24)) ? tevin_c.rgb : int3(0,0,0))", // BGR24
  1511. "((int3(1,1,1) - sign(abs(tevin_a.rgb - tevin_b.rgb))) * tevin_c.rgb)" // RGB8
  1512. };
  1513. if (cc.comparison == TevComparison::EQ)
  1514. out.Write(" tevin_d.rgb + {}", tev_rgb_comparison_eq[cc.compare_mode]);
  1515. else
  1516. out.Write(" tevin_d.rgb + {}", tev_rgb_comparison_gt[cc.compare_mode]);
  1517. }
  1518. if (cc.clamp)
  1519. out.Write(", int3(0,0,0), int3(255,255,255))");
  1520. else
  1521. out.Write(", int3(-1024,-1024,-1024), int3(1023,1023,1023))");
  1522. out.Write(";\n");
  1523. out.Write("\t// alpha combine\n");
  1524. out.Write("\t{} = clamp(", tev_a_output_table[ac.dest]);
  1525. if (ac.bias != TevBias::Compare)
  1526. {
  1527. WriteTevRegular(out, "a", ac.bias, ac.op, ac.clamp, ac.scale);
  1528. }
  1529. else
  1530. {
  1531. static constexpr EnumMap<const char*, TevCompareMode::A8> tev_a_comparison_gt{
  1532. "((tevin_a.r > tevin_b.r) ? tevin_c.a : 0)", // TevCompareMode::R8
  1533. "((idot(tevin_a.rgb, comp16) > idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // GR16
  1534. "((idot(tevin_a.rgb, comp24) > idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // BGR24
  1535. "((tevin_a.a > tevin_b.a) ? tevin_c.a : 0)", // A8
  1536. };
  1537. static constexpr EnumMap<const char*, TevCompareMode::A8> tev_a_comparison_eq{
  1538. "((tevin_a.r == tevin_b.r) ? tevin_c.a : 0)", // TevCompareMode::R8
  1539. "((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // GR16,
  1540. "((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // BGR24,
  1541. "((tevin_a.a == tevin_b.a) ? tevin_c.a : 0)", // A8
  1542. };
  1543. if (ac.comparison == TevComparison::EQ)
  1544. out.Write(" tevin_d.a + {}", tev_a_comparison_eq[ac.compare_mode]);
  1545. else
  1546. out.Write(" tevin_d.a + {}", tev_a_comparison_gt[ac.compare_mode]);
  1547. }
  1548. if (ac.clamp)
  1549. out.Write(", 0, 255)");
  1550. else
  1551. out.Write(", -1024, 1023)");
  1552. out.Write(";\n");
  1553. if (has_custom_shaders)
  1554. {
  1555. // Color input
  1556. out.Write(
  1557. "\tcustom_data.tev_stages[{}].input_color[0].value = {} / float3(255.0, 255.0, 255.0);\n",
  1558. n, tev_c_input_table[cc.a]);
  1559. out.Write("\tcustom_data.tev_stages[{}].input_color[0].input_type = {};\n", n,
  1560. tev_c_input_type[cc.a]);
  1561. out.Write(
  1562. "\tcustom_data.tev_stages[{}].input_color[1].value = {} / float3(255.0, 255.0, 255.0);\n",
  1563. n, tev_c_input_table[cc.b]);
  1564. out.Write("\tcustom_data.tev_stages[{}].input_color[1].input_type = {};\n", n,
  1565. tev_c_input_type[cc.b]);
  1566. out.Write(
  1567. "\tcustom_data.tev_stages[{}].input_color[2].value = {} / float3(255.0, 255.0, 255.0);\n",
  1568. n, tev_c_input_table[cc.c]);
  1569. out.Write("\tcustom_data.tev_stages[{}].input_color[2].input_type = {};\n", n,
  1570. tev_c_input_type[cc.c]);
  1571. out.Write(
  1572. "\tcustom_data.tev_stages[{}].input_color[3].value = {} / float3(255.0, 255.0, 255.0);\n",
  1573. n, tev_c_input_table[cc.d]);
  1574. out.Write("\tcustom_data.tev_stages[{}].input_color[3].input_type = {};\n", n,
  1575. tev_c_input_type[cc.d]);
  1576. // Alpha input
  1577. out.Write("\tcustom_data.tev_stages[{}].input_alpha[0].value = {} / float(255.0);\n", n,
  1578. tev_a_input_table[ac.a]);
  1579. out.Write("\tcustom_data.tev_stages[{}].input_alpha[0].input_type = {};\n", n,
  1580. tev_a_input_type[ac.a]);
  1581. out.Write("\tcustom_data.tev_stages[{}].input_alpha[1].value = {} / float(255.0);\n", n,
  1582. tev_a_input_table[ac.b]);
  1583. out.Write("\tcustom_data.tev_stages[{}].input_alpha[1].input_type = {};\n", n,
  1584. tev_a_input_type[ac.b]);
  1585. out.Write("\tcustom_data.tev_stages[{}].input_alpha[2].value = {} / float(255.0);\n", n,
  1586. tev_a_input_table[ac.c]);
  1587. out.Write("\tcustom_data.tev_stages[{}].input_alpha[2].input_type = {};\n", n,
  1588. tev_a_input_type[ac.c]);
  1589. out.Write("\tcustom_data.tev_stages[{}].input_alpha[3].value = {} / float(255.0);\n", n,
  1590. tev_a_input_table[ac.d]);
  1591. out.Write("\tcustom_data.tev_stages[{}].input_alpha[3].input_type = {};\n", n,
  1592. tev_a_input_type[ac.d]);
  1593. // Texmap
  1594. out.Write("\tcustom_data.tev_stages[{}].texmap = {}u;\n", n, stage.tevorders_texmap);
  1595. // Output
  1596. out.Write("\tcustom_data.tev_stages[{}].output_color.rgb = {} / float3(255.0, 255.0, 255.0);\n",
  1597. n, tev_c_output_table[cc.dest]);
  1598. out.Write("\tcustom_data.tev_stages[{}].output_color.a = {} / float(255.0);\n", n,
  1599. tev_a_output_table[ac.dest]);
  1600. }
  1601. }
  1602. static void WriteTevRegular(ShaderCode& out, std::string_view components, TevBias bias, TevOp op,
  1603. bool clamp, TevScale scale)
  1604. {
  1605. static constexpr Common::EnumMap<const char*, TevScale::Divide2> tev_scale_table_left{
  1606. "", // Scale1
  1607. " << 1", // Scale2
  1608. " << 2", // Scale4
  1609. "", // Divide2
  1610. };
  1611. static constexpr Common::EnumMap<const char*, TevScale::Divide2> tev_scale_table_right{
  1612. "", // Scale1
  1613. "", // Scale2
  1614. "", // Scale4
  1615. " >> 1", // Divide2
  1616. };
  1617. static constexpr Common::EnumMap<const char*, TevOp::Sub> tev_lerp_bias{
  1618. " + 128",
  1619. " + 127",
  1620. };
  1621. static constexpr Common::EnumMap<const char*, TevBias::Compare> tev_bias_table{
  1622. "", // Zero,
  1623. " + 128", // AddHalf,
  1624. " - 128", // SubHalf,
  1625. "",
  1626. };
  1627. static constexpr Common::EnumMap<char, TevOp::Sub> tev_op_table{
  1628. '+', // TevOp::Add = 0,
  1629. '-', // TevOp::Sub = 1,
  1630. };
  1631. // Regular TEV stage: (d + bias + lerp(a,b,c)) * scale
  1632. // The GameCube/Wii GPU uses a very sophisticated algorithm for scale-lerping:
  1633. // - c is scaled from 0..255 to 0..256, which allows dividing the result by 256 instead of 255
  1634. // - if scale is bigger than one, it is moved inside the lerp calculation for increased accuracy
  1635. // - a rounding bias is added before dividing by 256
  1636. // TODO: Is the rounding bias still added when the scale is divide by 2? Currently we do not
  1637. // apply it.
  1638. out.Write("(((tevin_d.{}{}){})", components, tev_bias_table[bias], tev_scale_table_left[scale]);
  1639. out.Write(" {} ", tev_op_table[op]);
  1640. out.Write("(((((tevin_a.{0}<<8) + "
  1641. "(tevin_b.{0}-tevin_a.{0})*(tevin_c.{0}+(tevin_c.{0}>>7))){1}){2})>>8)",
  1642. components, tev_scale_table_left[scale],
  1643. (scale != TevScale::Divide2) ? tev_lerp_bias[op] : "");
  1644. out.Write("){}", tev_scale_table_right[scale]);
  1645. }
  1646. constexpr Common::EnumMap<const char*, CompareMode::Always> tev_alpha_funcs_table{
  1647. "(false)", // CompareMode::Never
  1648. "(prev.a < {})", // CompareMode::Less
  1649. "(prev.a == {})", // CompareMode::Equal
  1650. "(prev.a <= {})", // CompareMode::LEqual
  1651. "(prev.a > {})", // CompareMode::Greater
  1652. "(prev.a != {})", // CompareMode::NEqual
  1653. "(prev.a >= {})", // CompareMode::GEqual
  1654. "(true)" // CompareMode::Always
  1655. };
  1656. constexpr Common::EnumMap<const char*, AlphaTestOp::Xnor> tev_alpha_funclogic_table{
  1657. " && ", // and
  1658. " || ", // or
  1659. " != ", // xor
  1660. " == " // xnor
  1661. };
  1662. static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType api_type,
  1663. bool per_pixel_depth, bool use_dual_source)
  1664. {
  1665. static constexpr std::array<std::string_view, 2> alpha_ref{
  1666. I_ALPHA ".r",
  1667. I_ALPHA ".g",
  1668. };
  1669. const auto write_alpha_func = [&out](CompareMode mode, std::string_view ref) {
  1670. const bool has_no_arguments = mode == CompareMode::Never || mode == CompareMode::Always;
  1671. if (has_no_arguments)
  1672. out.Write("{}", tev_alpha_funcs_table[mode]);
  1673. else
  1674. out.Write(fmt::runtime(tev_alpha_funcs_table[mode]), ref);
  1675. };
  1676. out.SetConstantsUsed(C_ALPHA, C_ALPHA);
  1677. if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_NEGATED_BOOLEAN))
  1678. out.Write("\tif(( ");
  1679. else
  1680. out.Write("\tif(!( ");
  1681. // Lookup the first component from the alpha function table
  1682. write_alpha_func(uid_data->alpha_test_comp0, alpha_ref[0]);
  1683. // Lookup the logic op
  1684. out.Write("{}", tev_alpha_funclogic_table[uid_data->alpha_test_logic]);
  1685. // Lookup the second component from the alpha function table
  1686. write_alpha_func(uid_data->alpha_test_comp1, alpha_ref[1]);
  1687. if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_NEGATED_BOOLEAN))
  1688. out.Write(") == false) {{\n");
  1689. else
  1690. out.Write(")) {{\n");
  1691. if (uid_data->uint_output)
  1692. out.Write("\t\tocol0 = uint4(0, 0, 0, 0);\n");
  1693. else
  1694. out.Write("\t\tocol0 = float4(0.0, 0.0, 0.0, 0.0);\n");
  1695. if (use_dual_source)
  1696. {
  1697. if (uid_data->uint_output)
  1698. out.Write("\t\tocol1 = uint4(0, 0, 0, 0);\n");
  1699. else
  1700. out.Write("\t\tocol1 = float4(0.0, 0.0, 0.0, 0.0);\n");
  1701. }
  1702. if (per_pixel_depth)
  1703. {
  1704. out.Write("\t\tdepth = {};\n",
  1705. !g_ActiveConfig.backend_info.bSupportsReversedDepthRange ? "0.0" : "1.0");
  1706. }
  1707. // ZCOMPLOC HACK:
  1708. if (uid_data->ztest != EmulatedZ::EarlyWithZComplocHack)
  1709. {
  1710. #ifdef __APPLE__
  1711. if (uid_data->ztest == EmulatedZ::EarlyWithFBFetch)
  1712. {
  1713. // Instead of using discard, fetch the framebuffer's color value and use it as the output
  1714. // for this fragment.
  1715. out.Write("\t\t{} = float4(initial_ocol0.xyz, 1.0);\n",
  1716. use_dual_source ? "real_ocol0" : "ocol0");
  1717. out.Write("\t\treturn;\n");
  1718. }
  1719. else
  1720. #endif
  1721. {
  1722. out.Write("\t\tdiscard;\n");
  1723. if (api_type == APIType::D3D)
  1724. out.Write("\t\treturn;\n");
  1725. }
  1726. }
  1727. out.Write("\t}}\n");
  1728. }
  1729. constexpr Common::EnumMap<const char*, FogType::BackwardsExpSq> tev_fog_funcs_table{
  1730. "", // No Fog
  1731. "", // ?
  1732. "", // Linear
  1733. "", // ?
  1734. "\tfog = 1.0 - exp2(-8.0 * fog);\n", // exp
  1735. "\tfog = 1.0 - exp2(-8.0 * fog * fog);\n", // exp2
  1736. "\tfog = exp2(-8.0 * (1.0 - fog));\n", // backward exp
  1737. "\tfog = 1.0 - fog;\n fog = exp2(-8.0 * fog * fog);\n" // backward exp2
  1738. };
  1739. static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data)
  1740. {
  1741. if (uid_data->fog_fsel == FogType::Off)
  1742. return; // no Fog
  1743. out.SetConstantsUsed(C_FOGCOLOR, C_FOGCOLOR);
  1744. out.SetConstantsUsed(C_FOGI, C_FOGI);
  1745. out.SetConstantsUsed(C_FOGF, C_FOGF + 1);
  1746. if (uid_data->fog_proj == FogProjection::Perspective)
  1747. {
  1748. // perspective
  1749. // ze = A/(B - (Zs >> B_SHF)
  1750. // TODO: Verify that we want to drop lower bits here! (currently taken over from software
  1751. // renderer)
  1752. // Maybe we want to use "ze = (A << B_SHF)/((B << B_SHF) - Zs)" instead?
  1753. // That's equivalent, but keeps the lower bits of Zs.
  1754. out.Write("\tfloat ze = (" I_FOGF ".x * 16777216.0) / float(" I_FOGI ".y - (zCoord >> " I_FOGI
  1755. ".w));\n");
  1756. }
  1757. else
  1758. {
  1759. // orthographic
  1760. // ze = a*Zs (here, no B_SHF)
  1761. out.Write("\tfloat ze = " I_FOGF ".x * float(zCoord) / 16777216.0;\n");
  1762. }
  1763. // x_adjust = sqrt((x-center)^2 + k^2)/k
  1764. // ze *= x_adjust
  1765. if (uid_data->fog_RangeBaseEnabled)
  1766. {
  1767. out.SetConstantsUsed(C_FOGF, C_FOGF);
  1768. out.Write("\tfloat offset = (2.0 * (rawpos.x / " I_FOGF ".w)) - 1.0 - " I_FOGF ".z;\n"
  1769. "\tfloat floatindex = clamp(9.0 - abs(offset) * 9.0, 0.0, 9.0);\n"
  1770. "\tuint indexlower = uint(floatindex);\n"
  1771. "\tuint indexupper = indexlower + 1u;\n"
  1772. "\tfloat klower = " I_FOGRANGE "[indexlower >> 2u][indexlower & 3u];\n"
  1773. "\tfloat kupper = " I_FOGRANGE "[indexupper >> 2u][indexupper & 3u];\n"
  1774. "\tfloat k = lerp(klower, kupper, frac(floatindex));\n"
  1775. "\tfloat x_adjust = sqrt(offset * offset + k * k) / k;\n"
  1776. "\tze *= x_adjust;\n");
  1777. }
  1778. out.Write("\tfloat fog = clamp(ze - " I_FOGF ".y, 0.0, 1.0);\n");
  1779. if (uid_data->fog_fsel >= FogType::Exp)
  1780. {
  1781. out.Write("{}", tev_fog_funcs_table[uid_data->fog_fsel]);
  1782. }
  1783. else
  1784. {
  1785. if (uid_data->fog_fsel != FogType::Linear)
  1786. WARN_LOG_FMT(VIDEO, "Unknown Fog Type! {}", uid_data->fog_fsel);
  1787. }
  1788. out.Write("\tint ifog = iround(fog * 256.0);\n");
  1789. out.Write("\tprev.rgb = (prev.rgb * (256 - ifog) + " I_FOGCOLOR ".rgb * ifog) >> 8;\n");
  1790. }
  1791. static void WriteLogicOp(ShaderCode& out, const pixel_shader_uid_data* uid_data)
  1792. {
  1793. static constexpr std::array<const char*, 16> logic_op_mode{
  1794. "int4(0, 0, 0, 0)", // CLEAR
  1795. "prev & fb_value", // AND
  1796. "prev & ~fb_value", // AND_REVERSE
  1797. "prev", // COPY
  1798. "~prev & fb_value", // AND_INVERTED
  1799. "fb_value", // NOOP
  1800. "prev ^ fb_value", // XOR
  1801. "prev | fb_value", // OR
  1802. "~(prev | fb_value)", // NOR
  1803. "~(prev ^ fb_value)", // EQUIV
  1804. "~fb_value", // INVERT
  1805. "prev | ~fb_value", // OR_REVERSE
  1806. "~prev", // COPY_INVERTED
  1807. "~prev | fb_value", // OR_INVERTED
  1808. "~(prev & fb_value)", // NAND
  1809. "int4(255, 255, 255, 255)", // SET
  1810. };
  1811. out.Write("\tint4 fb_value = iround(initial_ocol0 * 255.0);\n");
  1812. out.Write("\tprev = ({}) & 0xff;\n", logic_op_mode[uid_data->logic_op_mode]);
  1813. }
  1814. static void WriteLogicOpBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data)
  1815. {
  1816. switch (static_cast<LogicOp>(uid_data->logic_op_mode))
  1817. {
  1818. case LogicOp::Clear:
  1819. case LogicOp::NoOp:
  1820. out.Write("\tprev = int4(0, 0, 0, 0);\n");
  1821. break;
  1822. case LogicOp::Copy:
  1823. // Do nothing!
  1824. break;
  1825. case LogicOp::CopyInverted:
  1826. out.Write("\tprev ^= 255;\n");
  1827. break;
  1828. case LogicOp::Set:
  1829. case LogicOp::Invert: // In cooperation with blend
  1830. out.Write("\tprev = int4(255, 255, 255, 255);\n");
  1831. break;
  1832. default:
  1833. break;
  1834. }
  1835. }
  1836. static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
  1837. bool use_dual_source)
  1838. {
  1839. // Some backends require the shader outputs be uint when writing to a uint render target for logic
  1840. // op.
  1841. if (uid_data->uint_output)
  1842. {
  1843. if (uid_data->rgba6_format)
  1844. out.Write("\tocol0 = uint4(prev & 0xFC);\n");
  1845. else
  1846. out.Write("\tocol0 = uint4(prev);\n");
  1847. return;
  1848. }
  1849. if (uid_data->rgba6_format)
  1850. out.Write("\tocol0.rgb = float3(prev.rgb >> 2) / 63.0;\n");
  1851. else
  1852. out.Write("\tocol0.rgb = float3(prev.rgb) / 255.0;\n");
  1853. // Colors will be blended against the 8-bit alpha from ocol1 and
  1854. // the 6-bit alpha from ocol0 will be written to the framebuffer
  1855. if (uid_data->useDstAlpha)
  1856. {
  1857. out.SetConstantsUsed(C_ALPHA, C_ALPHA);
  1858. out.Write("\tocol0.a = float(" I_ALPHA ".a >> 2) / 63.0;\n");
  1859. // Use dual-source color blending to perform dst alpha in a single pass
  1860. if (use_dual_source)
  1861. out.Write("\tocol1 = float4(0.0, 0.0, 0.0, float(prev.a) / 255.0);\n");
  1862. }
  1863. else
  1864. {
  1865. out.Write("\tocol0.a = float(prev.a >> 2) / 63.0;\n");
  1866. if (use_dual_source)
  1867. out.Write("\tocol1 = float4(0.0, 0.0, 0.0, float(prev.a) / 255.0);\n");
  1868. }
  1869. }
  1870. static void WriteBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data)
  1871. {
  1872. if (uid_data->blend_enable)
  1873. {
  1874. using Common::EnumMap;
  1875. static constexpr EnumMap<const char*, SrcBlendFactor::InvDstAlpha> blend_src_factor{
  1876. "float3(0,0,0);", // ZERO
  1877. "float3(1,1,1);", // ONE
  1878. "initial_ocol0.rgb;", // DSTCLR
  1879. "float3(1,1,1) - initial_ocol0.rgb;", // INVDSTCLR
  1880. "src_color.aaa;", // SRCALPHA
  1881. "float3(1,1,1) - src_color.aaa;", // INVSRCALPHA
  1882. "initial_ocol0.aaa;", // DSTALPHA
  1883. "float3(1,1,1) - initial_ocol0.aaa;", // INVDSTALPHA
  1884. };
  1885. static constexpr EnumMap<const char*, SrcBlendFactor::InvDstAlpha> blend_src_factor_alpha{
  1886. "0.0;", // ZERO
  1887. "1.0;", // ONE
  1888. "initial_ocol0.a;", // DSTCLR
  1889. "1.0 - initial_ocol0.a;", // INVDSTCLR
  1890. "src_color.a;", // SRCALPHA
  1891. "1.0 - src_color.a;", // INVSRCALPHA
  1892. "initial_ocol0.a;", // DSTALPHA
  1893. "1.0 - initial_ocol0.a;", // INVDSTALPHA
  1894. };
  1895. static constexpr EnumMap<const char*, DstBlendFactor::InvDstAlpha> blend_dst_factor{
  1896. "float3(0,0,0);", // ZERO
  1897. "float3(1,1,1);", // ONE
  1898. "ocol0.rgb;", // SRCCLR
  1899. "float3(1,1,1) - ocol0.rgb;", // INVSRCCLR
  1900. "src_color.aaa;", // SRCALHA
  1901. "float3(1,1,1) - src_color.aaa;", // INVSRCALPHA
  1902. "initial_ocol0.aaa;", // DSTALPHA
  1903. "float3(1,1,1) - initial_ocol0.aaa;", // INVDSTALPHA
  1904. };
  1905. static constexpr EnumMap<const char*, DstBlendFactor::InvDstAlpha> blend_dst_factor_alpha{
  1906. "0.0;", // ZERO
  1907. "1.0;", // ONE
  1908. "ocol0.a;", // SRCCLR
  1909. "1.0 - ocol0.a;", // INVSRCCLR
  1910. "src_color.a;", // SRCALPHA
  1911. "1.0 - src_color.a;", // INVSRCALPHA
  1912. "initial_ocol0.a;", // DSTALPHA
  1913. "1.0 - initial_ocol0.a;", // INVDSTALPHA
  1914. };
  1915. out.Write("\tfloat4 src_color = {};\n"
  1916. "\tfloat4 blend_src;",
  1917. uid_data->useDstAlpha ? "ocol1" : "ocol0");
  1918. out.Write("\tblend_src.rgb = {}\n", blend_src_factor[uid_data->blend_src_factor]);
  1919. out.Write("\tblend_src.a = {}\n", blend_src_factor_alpha[uid_data->blend_src_factor_alpha]);
  1920. out.Write("\tfloat4 blend_dst;\n");
  1921. out.Write("\tblend_dst.rgb = {}\n", blend_dst_factor[uid_data->blend_dst_factor]);
  1922. out.Write("\tblend_dst.a = {}\n", blend_dst_factor_alpha[uid_data->blend_dst_factor_alpha]);
  1923. out.Write("\tfloat4 blend_result;\n");
  1924. if (uid_data->blend_subtract)
  1925. {
  1926. out.Write("\tblend_result.rgb = initial_ocol0.rgb * blend_dst.rgb - ocol0.rgb * "
  1927. "blend_src.rgb;\n");
  1928. }
  1929. else
  1930. {
  1931. out.Write(
  1932. "\tblend_result.rgb = initial_ocol0.rgb * blend_dst.rgb + ocol0.rgb * blend_src.rgb;\n");
  1933. }
  1934. if (uid_data->blend_subtract_alpha)
  1935. out.Write("\tblend_result.a = initial_ocol0.a * blend_dst.a - ocol0.a * blend_src.a;\n");
  1936. else
  1937. out.Write("\tblend_result.a = initial_ocol0.a * blend_dst.a + ocol0.a * blend_src.a;\n");
  1938. }
  1939. else
  1940. {
  1941. out.Write("\tfloat4 blend_result = ocol0;\n");
  1942. }
  1943. out.Write("\treal_ocol0 = blend_result;\n");
  1944. }