// Copyright 2009 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "VideoBackends/Software/Rasterizer.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <utility>
#include <vector>

#include "Common/Assert.h"
#include "Common/CommonTypes.h"

#include "VideoBackends/Software/EfbInterface.h"
#include "VideoBackends/Software/NativeVertexFormat.h"
#include "VideoBackends/Software/Tev.h"
#include "VideoCommon/BPFunctions.h"
#include "VideoCommon/BPMemory.h"
#include "VideoCommon/PerfQueryBase.h"
#include "VideoCommon/Statistics.h"
#include "VideoCommon/VideoCommon.h"
#include "VideoCommon/VideoConfig.h"
#include "VideoCommon/XFMemory.h"
  19. namespace Rasterizer
  20. {
  // Edge length, in pixels, of the square pixel blocks the rasterizer walks. It has to remain a
  // power of two: block origins are aligned with `& ~(BLOCK_SIZE - 1)`.
  static constexpr int BLOCK_SIZE{2};
  22. struct SlopeContext
  23. {
  24. SlopeContext(const OutputVertexData* v0, const OutputVertexData* v1, const OutputVertexData* v2,
  25. s32 x0_, s32 y0_, s32 x_off, s32 y_off)
  26. : x0(x0_), y0(y0_)
  27. {
  28. // adjust a little less than 0.5
  29. const float adjust = 0.495f;
  30. xOff = ((float)x0_ - (v0->screenPosition.x - x_off)) + adjust;
  31. yOff = ((float)y0_ - (v0->screenPosition.y - y_off)) + adjust;
  32. dx10 = v1->screenPosition.x - v0->screenPosition.x;
  33. dx20 = v2->screenPosition.x - v0->screenPosition.x;
  34. dy10 = v1->screenPosition.y - v0->screenPosition.y;
  35. dy20 = v2->screenPosition.y - v0->screenPosition.y;
  36. }
  37. s32 x0;
  38. s32 y0;
  39. float xOff;
  40. float yOff;
  41. float dx10;
  42. float dx20;
  43. float dy10;
  44. float dy20;
  45. };
  46. struct Slope
  47. {
  48. Slope() = default;
  49. Slope(float f0_, float f1, float f2, const SlopeContext& ctx) : f0(f0_)
  50. {
  51. float delta_20 = f2 - f0_;
  52. float delta_10 = f1 - f0_;
  53. // x2 - x0 y1 - y0 x1 - x0 y2 - y0
  54. float a = delta_20 * ctx.dy10 - delta_10 * ctx.dy20;
  55. float b = ctx.dx20 * delta_10 - ctx.dx10 * delta_20;
  56. float c = ctx.dx20 * ctx.dy10 - ctx.dx10 * ctx.dy20;
  57. dfdx = a / c;
  58. dfdy = b / c;
  59. x0 = ctx.x0;
  60. y0 = ctx.y0;
  61. xOff = ctx.xOff;
  62. yOff = ctx.yOff;
  63. }
  64. // These default values are used in the unlikely case that zfreeze is enabled when drawing the
  65. // first primitive.
  66. // TODO: This is just a guess!
  67. float dfdx = 0.0f;
  68. float dfdy = 0.0f;
  69. float f0 = 1.0f;
  70. // Both an s32 value and a float value are used to minimize rounding error
  71. // TODO: is this really needed?
  72. s32 x0 = 0;
  73. s32 y0 = 0;
  74. float xOff = 0.0f;
  75. float yOff = 0.0f;
  76. float GetValue(s32 x, s32 y) const
  77. {
  78. float dx = xOff + (float)(x - x0);
  79. float dy = yOff + (float)(y - y0);
  80. return f0 + (dfdx * dx) + (dfdy * dy);
  81. }
  82. };
  83. static Slope ZSlope;
  84. static Slope WSlope;
  85. static Slope ColorSlopes[2][4];
  86. static Slope TexSlopes[8][3];
  87. static Tev tev;
  88. static RasterBlock rasterBlock;
  89. static std::vector<BPFunctions::ScissorRect> scissors;
  90. void Init()
  91. {
  92. // The other slopes are set each for each primitive drawn, but zfreeze means that the z slope
  93. // needs to be set to an (untested) default value.
  94. ZSlope = Slope();
  95. }
  96. void ScissorChanged()
  97. {
  98. scissors = std::move(BPFunctions::ComputeScissorRects().m_result);
  99. }
  100. // Returns approximation of log2(f) in s28.4
  101. // results are close enough to use for LOD
  102. static s32 FixedLog2(float f)
  103. {
  104. u32 x;
  105. std::memcpy(&x, &f, sizeof(u32));
  106. s32 logInt = ((x & 0x7F800000) >> 19) - 2032; // integer part
  107. s32 logFract = (x & 0x007fffff) >> 19; // approximate fractional part
  108. return logInt + logFract;
  109. }
  // Converts x to an int, rounding halves up for non-negative input. For negative input the cast
  // truncates toward zero and the remainder is never >= 0.5, so the value is simply truncated.
  static inline int iround(float x)
  {
    const int truncated = static_cast<int>(x);
    const float remainder = x - truncated;
    return (remainder >= 0.5) ? truncated + 1 : truncated;
  }
  117. void SetTevKonstColors()
  118. {
  119. tev.SetKonstColors();
  120. }
  121. static void Draw(s32 x, s32 y, s32 xi, s32 yi)
  122. {
  123. INCSTAT(g_stats.this_frame.rasterized_pixels);
  124. s32 z = (s32)std::clamp<float>(ZSlope.GetValue(x, y), 0.0f, 16777215.0f);
  125. if (bpmem.GetEmulatedZ() == EmulatedZ::Early)
  126. {
  127. // TODO: Test if perf regs are incremented even if test is disabled
  128. EfbInterface::IncPerfCounterQuadCount(PQ_ZCOMP_INPUT_ZCOMPLOC);
  129. if (bpmem.zmode.testenable)
  130. {
  131. // early z
  132. if (!EfbInterface::ZCompare(x, y, z))
  133. return;
  134. }
  135. EfbInterface::IncPerfCounterQuadCount(PQ_ZCOMP_OUTPUT_ZCOMPLOC);
  136. }
  137. RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi];
  138. tev.Position[0] = x;
  139. tev.Position[1] = y;
  140. tev.Position[2] = z;
  141. // colors
  142. for (unsigned int i = 0; i < bpmem.genMode.numcolchans; i++)
  143. {
  144. for (int comp = 0; comp < 4; comp++)
  145. {
  146. u16 color = (u16)ColorSlopes[i][comp].GetValue(x, y);
  147. // clamp color value to 0
  148. u16 mask = ~(color >> 8);
  149. tev.Color[i][comp] = color & mask;
  150. }
  151. }
  152. // tex coords
  153. for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
  154. {
  155. // multiply by 128 because TEV stores UVs as s17.7
  156. tev.Uv[i].s = (s32)(pixel.Uv[i][0] * 128);
  157. tev.Uv[i].t = (s32)(pixel.Uv[i][1] * 128);
  158. }
  159. for (unsigned int i = 0; i < bpmem.genMode.numindstages; i++)
  160. {
  161. tev.IndirectLod[i] = rasterBlock.IndirectLod[i];
  162. tev.IndirectLinear[i] = rasterBlock.IndirectLinear[i];
  163. }
  164. for (unsigned int i = 0; i <= bpmem.genMode.numtevstages; i++)
  165. {
  166. tev.TextureLod[i] = rasterBlock.TextureLod[i];
  167. tev.TextureLinear[i] = rasterBlock.TextureLinear[i];
  168. }
  169. tev.Draw();
  170. }
  171. static inline void CalculateLOD(s32* lodp, bool* linear, u32 texmap, u32 texcoord)
  172. {
  173. auto texUnit = bpmem.tex.GetUnit(texmap);
  174. // LOD calculation requires data from the texture mode for bias, etc.
  175. // it does not seem to use the actual texture size
  176. const TexMode0& tm0 = texUnit.texMode0;
  177. const TexMode1& tm1 = texUnit.texMode1;
  178. float sDelta, tDelta;
  179. float* uv00 = rasterBlock.Pixel[0][0].Uv[texcoord];
  180. float* uv10 = rasterBlock.Pixel[1][0].Uv[texcoord];
  181. float* uv01 = rasterBlock.Pixel[0][1].Uv[texcoord];
  182. float dudx = fabsf(uv00[0] - uv10[0]);
  183. float dvdx = fabsf(uv00[1] - uv10[1]);
  184. float dudy = fabsf(uv00[0] - uv01[0]);
  185. float dvdy = fabsf(uv00[1] - uv01[1]);
  186. if (tm0.diag_lod == LODType::Diagonal)
  187. {
  188. sDelta = dudx + dudy;
  189. tDelta = dvdx + dvdy;
  190. }
  191. else
  192. {
  193. sDelta = std::max(dudx, dudy);
  194. tDelta = std::max(dvdx, dvdy);
  195. }
  196. // get LOD in s28.4
  197. s32 lod = FixedLog2(std::max(sDelta, tDelta));
  198. // bias is s2.5
  199. int bias = tm0.lod_bias;
  200. bias >>= 1;
  201. lod += bias;
  202. *linear = ((lod > 0 && tm0.min_filter == FilterMode::Linear) ||
  203. (lod <= 0 && tm0.mag_filter == FilterMode::Linear));
  204. // NOTE: The order of comparisons for this clamp check matters.
  205. if (lod > static_cast<s32>(tm1.max_lod))
  206. lod = static_cast<s32>(tm1.max_lod);
  207. else if (lod < static_cast<s32>(tm1.min_lod))
  208. lod = static_cast<s32>(tm1.min_lod);
  209. *lodp = lod;
  210. }
  211. static void BuildBlock(s32 blockX, s32 blockY)
  212. {
  213. for (s32 yi = 0; yi < BLOCK_SIZE; yi++)
  214. {
  215. for (s32 xi = 0; xi < BLOCK_SIZE; xi++)
  216. {
  217. RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi];
  218. s32 x = xi + blockX;
  219. s32 y = yi + blockY;
  220. float invW = 1.0f / WSlope.GetValue(x, y);
  221. pixel.InvW = invW;
  222. // tex coords
  223. for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
  224. {
  225. float projection = invW;
  226. float q = TexSlopes[i][2].GetValue(x, y) * invW;
  227. if (q != 0.0f)
  228. projection = invW / q;
  229. pixel.Uv[i][0] = TexSlopes[i][0].GetValue(x, y) * projection;
  230. pixel.Uv[i][1] = TexSlopes[i][1].GetValue(x, y) * projection;
  231. }
  232. }
  233. }
  234. for (unsigned int i = 0; i < bpmem.genMode.numindstages; i++)
  235. {
  236. u32 texmap = bpmem.tevindref.getTexMap(i);
  237. u32 texcoord = bpmem.tevindref.getTexCoord(i);
  238. CalculateLOD(&rasterBlock.IndirectLod[i], &rasterBlock.IndirectLinear[i], texmap, texcoord);
  239. }
  240. for (unsigned int i = 0; i <= bpmem.genMode.numtevstages; i++)
  241. {
  242. int stageOdd = i & 1;
  243. const TwoTevStageOrders& order = bpmem.tevorders[i >> 1];
  244. if (order.getEnable(stageOdd))
  245. {
  246. u32 texmap = order.getTexMap(stageOdd);
  247. u32 texcoord = order.getTexCoord(stageOdd);
  248. CalculateLOD(&rasterBlock.TextureLod[i], &rasterBlock.TextureLinear[i], texmap, texcoord);
  249. }
  250. }
  251. }
  252. void UpdateZSlope(const OutputVertexData* v0, const OutputVertexData* v1,
  253. const OutputVertexData* v2, s32 x_off, s32 y_off)
  254. {
  255. if (!bpmem.genMode.zfreeze)
  256. {
  257. const s32 X1 = iround(16.0f * (v0->screenPosition.x - x_off)) - 9;
  258. const s32 Y1 = iround(16.0f * (v0->screenPosition.y - y_off)) - 9;
  259. const SlopeContext ctx(v0, v1, v2, (X1 + 0xF) >> 4, (Y1 + 0xF) >> 4, x_off, y_off);
  260. ZSlope = Slope(v0->screenPosition.z, v1->screenPosition.z, v2->screenPosition.z, ctx);
  261. }
  262. }
  263. static void DrawTriangleFrontFace(const OutputVertexData* v0, const OutputVertexData* v1,
  264. const OutputVertexData* v2,
  265. const BPFunctions::ScissorRect& scissor)
  266. {
  267. // The zslope should be updated now, even if the triangle is rejected by the scissor test, as
  268. // zfreeze depends on it
  269. UpdateZSlope(v0, v1, v2, scissor.x_off, scissor.y_off);
  270. // adapted from http://devmaster.net/posts/6145/advanced-rasterization
  271. // 28.4 fixed-point coordinates. rounded to nearest and adjusted to match hardware output
  272. // could also take floor and adjust -8
  273. const s32 Y1 = iround(16.0f * (v0->screenPosition.y - scissor.y_off)) - 9;
  274. const s32 Y2 = iround(16.0f * (v1->screenPosition.y - scissor.y_off)) - 9;
  275. const s32 Y3 = iround(16.0f * (v2->screenPosition.y - scissor.y_off)) - 9;
  276. const s32 X1 = iround(16.0f * (v0->screenPosition.x - scissor.x_off)) - 9;
  277. const s32 X2 = iround(16.0f * (v1->screenPosition.x - scissor.x_off)) - 9;
  278. const s32 X3 = iround(16.0f * (v2->screenPosition.x - scissor.x_off)) - 9;
  279. // Deltas
  280. const s32 DX12 = X1 - X2;
  281. const s32 DX23 = X2 - X3;
  282. const s32 DX31 = X3 - X1;
  283. const s32 DY12 = Y1 - Y2;
  284. const s32 DY23 = Y2 - Y3;
  285. const s32 DY31 = Y3 - Y1;
  286. // Fixed-point deltas
  287. const s32 FDX12 = DX12 * 16;
  288. const s32 FDX23 = DX23 * 16;
  289. const s32 FDX31 = DX31 * 16;
  290. const s32 FDY12 = DY12 * 16;
  291. const s32 FDY23 = DY23 * 16;
  292. const s32 FDY31 = DY31 * 16;
  293. // Bounding rectangle
  294. s32 minx = (std::min(std::min(X1, X2), X3) + 0xF) >> 4;
  295. s32 maxx = (std::max(std::max(X1, X2), X3) + 0xF) >> 4;
  296. s32 miny = (std::min(std::min(Y1, Y2), Y3) + 0xF) >> 4;
  297. s32 maxy = (std::max(std::max(Y1, Y2), Y3) + 0xF) >> 4;
  298. // scissor
  299. ASSERT(scissor.rect.left >= 0);
  300. ASSERT(scissor.rect.right <= static_cast<int>(EFB_WIDTH));
  301. ASSERT(scissor.rect.top >= 0);
  302. ASSERT(scissor.rect.bottom <= static_cast<int>(EFB_HEIGHT));
  303. minx = std::max(minx, scissor.rect.left);
  304. maxx = std::min(maxx, scissor.rect.right);
  305. miny = std::max(miny, scissor.rect.top);
  306. maxy = std::min(maxy, scissor.rect.bottom);
  307. if (minx >= maxx || miny >= maxy)
  308. return;
  309. // Set up the remaining slopes
  310. const SlopeContext ctx(v0, v1, v2, (X1 + 0xF) >> 4, (Y1 + 0xF) >> 4, scissor.x_off,
  311. scissor.y_off);
  312. float w[3] = {1.0f / v0->projectedPosition.w, 1.0f / v1->projectedPosition.w,
  313. 1.0f / v2->projectedPosition.w};
  314. WSlope = Slope(w[0], w[1], w[2], ctx);
  315. for (unsigned int i = 0; i < bpmem.genMode.numcolchans; i++)
  316. {
  317. for (int comp = 0; comp < 4; comp++)
  318. ColorSlopes[i][comp] = Slope(v0->color[i][comp], v1->color[i][comp], v2->color[i][comp], ctx);
  319. }
  320. for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
  321. {
  322. for (int comp = 0; comp < 3; comp++)
  323. {
  324. TexSlopes[i][comp] = Slope(v0->texCoords[i][comp] * w[0], v1->texCoords[i][comp] * w[1],
  325. v2->texCoords[i][comp] * w[2], ctx);
  326. }
  327. }
  328. // Half-edge constants
  329. s32 C1 = DY12 * X1 - DX12 * Y1;
  330. s32 C2 = DY23 * X2 - DX23 * Y2;
  331. s32 C3 = DY31 * X3 - DX31 * Y3;
  332. // Correct for fill convention
  333. if (DY12 < 0 || (DY12 == 0 && DX12 > 0))
  334. C1++;
  335. if (DY23 < 0 || (DY23 == 0 && DX23 > 0))
  336. C2++;
  337. if (DY31 < 0 || (DY31 == 0 && DX31 > 0))
  338. C3++;
  339. // Start in corner of 2x2 block
  340. s32 block_minx = minx & ~(BLOCK_SIZE - 1);
  341. s32 block_miny = miny & ~(BLOCK_SIZE - 1);
  342. // Loop through blocks
  343. for (s32 y = block_miny & ~(BLOCK_SIZE - 1); y < maxy; y += BLOCK_SIZE)
  344. {
  345. for (s32 x = block_minx; x < maxx; x += BLOCK_SIZE)
  346. {
  347. s32 x1_ = (x + BLOCK_SIZE - 1);
  348. s32 y1_ = (y + BLOCK_SIZE - 1);
  349. // Corners of block
  350. s32 x0 = x << 4;
  351. s32 x1 = x1_ << 4;
  352. s32 y0 = y << 4;
  353. s32 y1 = y1_ << 4;
  354. // Evaluate half-space functions
  355. bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0;
  356. bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0;
  357. bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0;
  358. bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0;
  359. int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3);
  360. bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0;
  361. bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0;
  362. bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0;
  363. bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0;
  364. int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3);
  365. bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0;
  366. bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0;
  367. bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0;
  368. bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0;
  369. int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3);
  370. // Skip block when outside an edge
  371. if (a == 0x0 || b == 0x0 || c == 0x0)
  372. continue;
  373. BuildBlock(x, y);
  374. // Accept whole block when totally covered
  375. // We still need to check min/max x/y because of the scissor
  376. if (a == 0xF && b == 0xF && c == 0xF && x >= minx && x1_ < maxx && y >= miny && y1_ < maxy)
  377. {
  378. for (s32 iy = 0; iy < BLOCK_SIZE; iy++)
  379. {
  380. for (s32 ix = 0; ix < BLOCK_SIZE; ix++)
  381. {
  382. Draw(x + ix, y + iy, ix, iy);
  383. }
  384. }
  385. }
  386. else // Partially covered block
  387. {
  388. s32 CY1 = C1 + DX12 * y0 - DY12 * x0;
  389. s32 CY2 = C2 + DX23 * y0 - DY23 * x0;
  390. s32 CY3 = C3 + DX31 * y0 - DY31 * x0;
  391. for (s32 iy = 0; iy < BLOCK_SIZE; iy++)
  392. {
  393. s32 CX1 = CY1;
  394. s32 CX2 = CY2;
  395. s32 CX3 = CY3;
  396. for (s32 ix = 0; ix < BLOCK_SIZE; ix++)
  397. {
  398. if (CX1 > 0 && CX2 > 0 && CX3 > 0)
  399. {
  400. // This check enforces the scissor rectangle, since it might not be aligned with the
  401. // blocks
  402. if (x + ix >= minx && x + ix < maxx && y + iy >= miny && y + iy < maxy)
  403. Draw(x + ix, y + iy, ix, iy);
  404. }
  405. CX1 -= FDY12;
  406. CX2 -= FDY23;
  407. CX3 -= FDY31;
  408. }
  409. CY1 += FDX12;
  410. CY2 += FDX23;
  411. CY3 += FDX31;
  412. }
  413. }
  414. }
  415. }
  416. }
  417. void DrawTriangleFrontFace(const OutputVertexData* v0, const OutputVertexData* v1,
  418. const OutputVertexData* v2)
  419. {
  420. INCSTAT(g_stats.this_frame.num_triangles_drawn);
  421. for (const auto& scissor : scissors)
  422. DrawTriangleFrontFace(v0, v1, v2, scissor);
  423. }
  424. } // namespace Rasterizer