// MaskedOcclusionCullingCommon.inl
////////////////////////////////////////////////////////////////////////////////
// Copyright 2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
////////////////////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common SIMD math utility functions
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T> FORCE_INLINE T max(const T &a, const T &b) { return a > b ? a : b; }
template<typename T> FORCE_INLINE T min(const T &a, const T &b) { return a < b ? a : b; }

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common defines and constants
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define SIMD_ALL_LANES_MASK ((1 << SIMD_LANES) - 1)

// Tile dimensions are 32xN pixels. These values are not tweakable: the code must also be modified
// to support different tile sizes, as it is tightly coupled with the SSE/AVX register size
#define TILE_WIDTH_SHIFT 5
#define TILE_WIDTH (1 << TILE_WIDTH_SHIFT)
#define TILE_HEIGHT (1 << TILE_HEIGHT_SHIFT)

// Sub-tiles (used for updating the masked HiZ buffer) are 8x4 pixels, so there are 4x2 sub-tiles in a tile
#define SUB_TILE_WIDTH 8
#define SUB_TILE_HEIGHT 4

// The number of fixed point bits used to represent vertex coordinates / edge slopes.
#if PRECISE_COVERAGE != 0
#define FP_BITS 8
#define FP_HALF_PIXEL (1 << (FP_BITS - 1))
#define FP_INV (1.0f / (float)(1 << FP_BITS))
#else
// Note that too low precision, without precise coverage, may cause overshoots / false coverage during rasterization.
// This is configured for 14 bits for AVX512 and 16 bits for SSE. Max tile slope delta is roughly
// (screenWidth + 2*(GUARD_BAND_PIXEL_SIZE + 1)) * (2^FP_BITS * (TILE_HEIGHT + GUARD_BAND_PIXEL_SIZE + 1))
// and must fit in 31 bits. With this configuration, the max image width is ~3272 pixels, so stay well clear of this limit.
#define FP_BITS (19 - TILE_HEIGHT_SHIFT)
#endif
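
// Worked example of the limit above (informal, assuming TILE_HEIGHT_SHIFT == 3 so that FP_BITS = 16):
// the slope term is 2^16 * (8 + 1 + 1) = 655360, and requiring (screenWidth + 4) * 655360 < 2^31
// gives screenWidth < 2^31 / 655360 - 4, which is roughly 3272, matching the note above.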

// Tile dimensions in fixed point coordinates
#define FP_TILE_HEIGHT_SHIFT (FP_BITS + TILE_HEIGHT_SHIFT)
#define FP_TILE_HEIGHT (1 << FP_TILE_HEIGHT_SHIFT)

// Maximum number of triangles that may be generated during clipping. We process SIMD_LANES triangles
// at a time and clip against 5 planes, so the max is 5 extra triangles per lane (we immediately draw
// the first clipped triangle), e.g. 5*8 = 40 for 8 lanes. This number must be a power of two.
#define MAX_CLIPPED (8*SIMD_LANES)
#define MAX_CLIPPED_WRAP (MAX_CLIPPED - 1)
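
// Sizing example (informal, assuming SIMD_LANES == 8): each clipped triangle can spill at most
// 5 extra triangles into the ring buffer, so a batch needs at most 40 slots; 8 * SIMD_LANES = 64
// is the next power of two, which lets the write index wrap with a simple & MAX_CLIPPED_WRAP.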

// Size of the guard band in pixels. Clipping doesn't seem to be very expensive, so we use a small guard band
// to improve rasterization performance. It's not recommended to set the guard band to zero, as this may
// cause leakage along the screen border due to precision/rounding.
#define GUARD_BAND_PIXEL_SIZE 1.0f

// We classify triangles as big if the bounding box is wider than this given threshold and use a tighter
// but slightly more expensive traversal algorithm. This improves performance greatly for sliver triangles
#define BIG_TRIANGLE 3

// Only gather statistics if enabled.
#if ENABLE_STATS != 0
#define STATS_ADD(var, val) _InterlockedExchangeAdd64(&var, val)
#else
#define STATS_ADD(var, val)
#endif

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMD common defines (constant values)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define SIMD_BITS_ONE       _mmw_set1_epi32(~0)
#define SIMD_BITS_ZERO      _mmw_setzero_epi32()
#define SIMD_TILE_WIDTH     _mmw_set1_epi32(TILE_WIDTH)

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Vertex fetch utility function, needs to be in the global namespace due to template specialization
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

template<int N> FORCE_INLINE void VtxFetch4(__mw *v, const unsigned int *inTrisPtr, int triVtx, const float *inVtx, int numLanes)
{
    // Fetch 4 vectors (matching 1 SSE part of the SIMD register), and continue to the next
    const int ssePart = (SIMD_LANES / 4) - N;
    for (int k = 0; k < 4; k++)
    {
        int lane = 4 * ssePart + k;
        if (numLanes > lane)
            v[k] = _mmw_insertf32x4_ps(v[k], _mm_loadu_ps(&inVtx[inTrisPtr[lane * 3 + triVtx] << 2]), ssePart);
    }
    VtxFetch4<N - 1>(v, inTrisPtr, triVtx, inVtx, numLanes);
}

template<> FORCE_INLINE void VtxFetch4<0>(__mw *v, const unsigned int *inTrisPtr, int triVtx, const float *inVtx, int numLanes)
{
    // Base case of the recursion: workaround for unused parameter warnings
    (void)v; (void)inTrisPtr; (void)triVtx; (void)inVtx; (void)numLanes;
}
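
// Usage sketch (illustrative, assuming an 8-lane build where SIMD_LANES == 8, so the recursion
// unrolls into VtxFetch4<2> followed by VtxFetch4<1>, filling 128-bit parts 0 and 1):
//
//     __mw v[4];
//     VtxFetch4<SIMD_LANES / 4>(v, inTrisPtr, 0, inVtx, numLanes);
//
// Afterwards, each v[k] holds one (x, y, z, w) vertex per 128-bit part; GatherVerticesFast()
// below transposes these into separate per-coordinate registers.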

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Private class containing the implementation
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

class MaskedOcclusionCullingPrivate : public MaskedOcclusionCulling
{
public:
    struct ZTile
    {
        __mw        mZMin[2];
        __mwi       mMask;
    };

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Member variables
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    __mw            mHalfWidth;
    __mw            mHalfHeight;
    __mw            mCenterX;
    __mw            mCenterY;
    __m128          mCSFrustumPlanes[5];
    __m128          mIHalfSize;
    __m128          mICenter;
    __m128i         mIScreenSize;

    float           mNearDist;
    int             mWidth;
    int             mHeight;
    int             mTilesWidth;
    int             mTilesHeight;

    ZTile           *mMaskedHiZBuffer;
    ScissorRect     mFullscreenScissor;

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Constructors and state handling
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    MaskedOcclusionCullingPrivate(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree) : mFullscreenScissor(0, 0, 0, 0)
    {
        mMaskedHiZBuffer = nullptr;
        mAlignedAllocCallback = alignedAlloc;
        mAlignedFreeCallback = alignedFree;
#if MOC_RECORDER_ENABLE
        mRecorder = nullptr;
#endif
        SetNearClipPlane(0.0f);
        mCSFrustumPlanes[0] = _mm_setr_ps( 0.0f,  0.0f, 1.0f, 0.0f);
        mCSFrustumPlanes[1] = _mm_setr_ps( 1.0f,  0.0f, 1.0f, 0.0f);
        mCSFrustumPlanes[2] = _mm_setr_ps(-1.0f,  0.0f, 1.0f, 0.0f);
        mCSFrustumPlanes[3] = _mm_setr_ps( 0.0f,  1.0f, 1.0f, 0.0f);
        mCSFrustumPlanes[4] = _mm_setr_ps( 0.0f, -1.0f, 1.0f, 0.0f);
        memset(&mStats, 0, sizeof(OcclusionCullingStatistics));
        SetResolution(0, 0);
    }

    ~MaskedOcclusionCullingPrivate() override
    {
        if (mMaskedHiZBuffer != nullptr)
            mAlignedFreeCallback(mMaskedHiZBuffer);
        mMaskedHiZBuffer = nullptr;
#if MOC_RECORDER_ENABLE
        assert(mRecorder == nullptr); // forgot to call StopRecording()?
#endif
    }

    void SetResolution(unsigned int width, unsigned int height) override
    {
        // Resolution must be a multiple of the subtile size
        assert(width % SUB_TILE_WIDTH == 0 && height % SUB_TILE_HEIGHT == 0);
#if PRECISE_COVERAGE == 0
        // Test if the combination of resolution & FP_BITS may cause 32-bit overflow. Note that the maximum resolution
        // is only an estimate (not conservative). It's advisable to stay well below the limit.
        assert(width < ((1U << 31) - 1U) / ((1U << FP_BITS) * (TILE_HEIGHT + (unsigned int)(GUARD_BAND_PIXEL_SIZE + 1.0f))) - (2U * (unsigned int)(GUARD_BAND_PIXEL_SIZE + 1.0f)));
#endif

        // Delete the current masked hierarchical Z buffer
        if (mMaskedHiZBuffer != nullptr)
            mAlignedFreeCallback(mMaskedHiZBuffer);
        mMaskedHiZBuffer = nullptr;

        // Setup various resolution dependent constant values
        mWidth = (int)width;
        mHeight = (int)height;
        mTilesWidth = (int)(width + TILE_WIDTH - 1) >> TILE_WIDTH_SHIFT;
        mTilesHeight = (int)(height + TILE_HEIGHT - 1) >> TILE_HEIGHT_SHIFT;
        mCenterX = _mmw_set1_ps((float)mWidth * 0.5f);
        mCenterY = _mmw_set1_ps((float)mHeight * 0.5f);
        mICenter = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)mHeight * 0.5f, (float)mHeight * 0.5f);
        mHalfWidth = _mmw_set1_ps((float)mWidth * 0.5f);
#if USE_D3D != 0
        mHalfHeight = _mmw_set1_ps((float)-mHeight * 0.5f);
        mIHalfSize = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)-mHeight * 0.5f, (float)-mHeight * 0.5f);
#else
        mHalfHeight = _mmw_set1_ps((float)mHeight * 0.5f);
        mIHalfSize = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)mHeight * 0.5f, (float)mHeight * 0.5f);
#endif
        mIScreenSize = _mm_setr_epi32(mWidth - 1, mWidth - 1, mHeight - 1, mHeight - 1);

        // Setup a full screen scissor rectangle
        mFullscreenScissor.mMinX = 0;
        mFullscreenScissor.mMinY = 0;
        mFullscreenScissor.mMaxX = mTilesWidth << TILE_WIDTH_SHIFT;
        mFullscreenScissor.mMaxY = mTilesHeight << TILE_HEIGHT_SHIFT;

        // Adjust clip planes to include a small guard band to avoid clipping leaks
        if (mWidth > 0 && mHeight > 0)
        {
            float guardBandWidth = (2.0f / (float)mWidth) * GUARD_BAND_PIXEL_SIZE;
            float guardBandHeight = (2.0f / (float)mHeight) * GUARD_BAND_PIXEL_SIZE;
            mCSFrustumPlanes[1] = _mm_setr_ps( 1.0f - guardBandWidth,  0.0f, 1.0f, 0.0f);
            mCSFrustumPlanes[2] = _mm_setr_ps(-1.0f + guardBandWidth,  0.0f, 1.0f, 0.0f);
            mCSFrustumPlanes[3] = _mm_setr_ps( 0.0f,  1.0f - guardBandHeight, 1.0f, 0.0f);
            mCSFrustumPlanes[4] = _mm_setr_ps( 0.0f, -1.0f + guardBandHeight, 1.0f, 0.0f);
        }

        // Allocate the masked hierarchical Z buffer (if zero size, leave it at nullptr)
        if (mTilesWidth * mTilesHeight > 0)
            mMaskedHiZBuffer = (ZTile *)mAlignedAllocCallback(64, sizeof(ZTile) * mTilesWidth * mTilesHeight);
    }

    void GetResolution(unsigned int &width, unsigned int &height) const override
    {
        width = mWidth;
        height = mHeight;
    }

    void ComputeBinWidthHeight(unsigned int nBinsW, unsigned int nBinsH, unsigned int &outBinWidth, unsigned int &outBinHeight) override
    {
        outBinWidth = (mWidth / nBinsW) - ((mWidth / nBinsW) % TILE_WIDTH);
        outBinHeight = (mHeight / nBinsH) - ((mHeight / nBinsH) % TILE_HEIGHT);
    }

    void SetNearClipPlane(float nearDist) override
    {
        // Setup the near frustum plane
        mNearDist = nearDist;
        mCSFrustumPlanes[0] = _mm_setr_ps(0.0f, 0.0f, 1.0f, -nearDist);
    }

    float GetNearClipPlane() const override
    {
        return mNearDist;
    }

    void ClearBuffer() override
    {
        assert(mMaskedHiZBuffer != nullptr);

        // Iterate through all depth tiles and clear to default values
        for (int i = 0; i < mTilesWidth * mTilesHeight; i++)
        {
            mMaskedHiZBuffer[i].mMask = _mmw_setzero_epi32();

            // Clear z0 to beyond infinity to ensure we never merge with clear data
            mMaskedHiZBuffer[i].mZMin[0] = _mmw_set1_ps(-1.0f);
#if QUICK_MASK != 0
            // Clear z1 to the nearest depth value, as it is pushed back on each update
            mMaskedHiZBuffer[i].mZMin[1] = _mmw_set1_ps(FLT_MAX);
#else
            mMaskedHiZBuffer[i].mZMin[1] = _mmw_setzero_ps();
#endif
        }

#if ENABLE_STATS != 0
        memset(&mStats, 0, sizeof(OcclusionCullingStatistics));
#endif

#if MOC_RECORDER_ENABLE != 0
        {
            std::lock_guard<std::mutex> lock( mRecorderMutex );
            if( mRecorder != nullptr ) mRecorder->RecordClearBuffer();
        }
#endif
    }
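
    // Note on the clear values (informal; the rasterizer stores depth as 1/w, so every valid value
    // is non-negative): z0 = -1.0f keeps the IEEE sign bit set, which lets MergeBuffer() detect
    // still-cleared tiles with a single arithmetic shift:
    //
    //     __mwi cleared = _mmw_srai_epi32(simd_cast<__mwi>(mZMin[0]), 31); // ~0 where the tile is untouched
    //
    // In QUICK_MASK mode, z1 starts at FLT_MAX because UpdateTileQuick() only ever pushes it back
    // with min-style updates.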

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // MergeBuffer
    // Utility function that merges another MOC buffer into the existing one
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    void MergeBuffer(MaskedOcclusionCulling *BufferB) override
    {
        assert(mMaskedHiZBuffer != nullptr);

        // Iterate through all depth tiles and merge the two tiles
        for (int i = 0; i < mTilesWidth * mTilesHeight; i++)
        {
            __mw *zMinB = ((MaskedOcclusionCullingPrivate *)BufferB)->mMaskedHiZBuffer[i].mZMin;
            __mw *zMinA = mMaskedHiZBuffer[i].mZMin;
            __mwi RastMaskB = ((MaskedOcclusionCullingPrivate *)BufferB)->mMaskedHiZBuffer[i].mMask;

#if QUICK_MASK != 0
            // z0 is cleared to beyond infinity (negative), so the sign bit tells us if a tile is still in the clear state
            __mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[0]), 31);
            // Only merge tiles that have data in zMinB[0]
            sign0 = _mmw_cmpeq_epi32(sign0, SIMD_BITS_ZERO);
            if (!_mmw_testz_epi32(sign0, sign0))
            {
                STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
                zMinA[0] = _mmw_max_ps(zMinA[0], zMinB[0]);

                __mwi rastMask = mMaskedHiZBuffer[i].mMask;
                __mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);
                // Mask out all subtiles failing the depth test (don't update these subtiles)
                deadLane = _mmw_or_epi32(deadLane, _mmw_srai_epi32(simd_cast<__mwi>(_mmw_sub_ps(zMinA[1], zMinA[0])), 31));
                mMaskedHiZBuffer[i].mMask = _mmw_andnot_epi32(deadLane, rastMask);
            }

            // Set the 32-bit lane to ~0 if no pixels are set inside the coverage mask for a subtile
            __mwi LiveTile = _mmw_cmpeq_epi32(RastMaskB, SIMD_BITS_ZERO);
            // Invert to have bits set for subtiles with coverage
            __mwi t0inv = _mmw_not_epi32(LiveTile);
            // VPTEST sets the ZF flag if all the resulting bits are 0 (i.e. if all subtiles are clear)
            if (!_mmw_testz_epi32(t0inv, t0inv))
            {
                STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
                UpdateTileQuick(i, RastMaskB, zMinB[1]);
            }
#else
            // z0 is cleared to beyond infinity (negative), so the sign bit tells us if tile A is still in the clear state
            __mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(mMaskedHiZBuffer[i].mZMin[0]), 31);
            sign1 = _mmw_cmpeq_epi32(sign1, SIMD_BITS_ZERO);

            // Set the 32-bit lane to ~0 if no pixels are set inside tile A's coverage mask for a subtile
            __mwi LiveTile1 = _mmw_cmpeq_epi32(mMaskedHiZBuffer[i].mMask, SIMD_BITS_ZERO);
            // Invert to have bits set for subtiles with coverage
            __mwi t1inv = _mmw_not_epi32(LiveTile1);
            // VPTEST sets the ZF flag if all the resulting bits are 0 (i.e. if all subtiles are clear)
            if (_mmw_testz_epi32(sign1, sign1) && _mmw_testz_epi32(t1inv, t1inv))
            {
                // Tile A is completely clear: copy tile B wholesale
                mMaskedHiZBuffer[i].mMask = ((MaskedOcclusionCullingPrivate *)BufferB)->mMaskedHiZBuffer[i].mMask;
                mMaskedHiZBuffer[i].mZMin[0] = zMinB[0];
                mMaskedHiZBuffer[i].mZMin[1] = zMinB[1];
            }
            else
            {
                // Only merge tiles that have data in zMinB[0]; use the sign bit to determine if they are still in a clear state
                __mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[0]), 31);
                sign0 = _mmw_cmpeq_epi32(sign0, SIMD_BITS_ZERO);
                if (!_mmw_testz_epi32(sign0, sign0))
                {
                    // Build a mask for zMin[0]: full if the layer has been completed, or partial if the tile is still partly filled.
                    // We can't just use the complement of the mask, as tiles might not get updated by the merge
                    __mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[1]), 31);
                    __mwi LayerMask0 = _mmw_not_epi32(sign1);
                    __mwi LayerMask1 = _mmw_not_epi32(((MaskedOcclusionCullingPrivate *)BufferB)->mMaskedHiZBuffer[i].mMask);
                    __mwi rastMask = _mmw_or_epi32(LayerMask0, LayerMask1);
                    UpdateTileAccurate(i, rastMask, zMinB[0]);
                }

                // Set the 32-bit lane to ~0 if no pixels are set inside tile B's coverage mask for a subtile
                __mwi LiveTile = _mmw_cmpeq_epi32(((MaskedOcclusionCullingPrivate *)BufferB)->mMaskedHiZBuffer[i].mMask, SIMD_BITS_ZERO);
                // Invert to have bits set for subtiles with coverage
                __mwi t0inv = _mmw_not_epi32(LiveTile);
                // VPTEST sets the ZF flag if all the resulting bits are 0 (i.e. if all subtiles are clear)
                if (!_mmw_testz_epi32(t0inv, t0inv))
                {
                    UpdateTileAccurate(i, ((MaskedOcclusionCullingPrivate *)BufferB)->mMaskedHiZBuffer[i].mMask, zMinB[1]);
                }

                //if (_mmw_testz_epi32(sign0, sign0) && _mmw_testz_epi32(t0inv, t0inv))
                //    STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1);
            }
#endif
        }
    }
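
    // Usage sketch for MergeBuffer (hypothetical caller code; the buffer names are assumptions):
    //
    //     MaskedOcclusionCulling *mocA = ..., *mocB = ...; // created with identical resolutions
    //     // ... each thread rasterizes occluders into its own buffer ...
    //     mocA->MergeBuffer(mocB); // mocA now holds the combined depth information
    //
    // The per-tile loop above assumes both buffers share the same resolution and tile layout.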

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Polygon clipping functions
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    FORCE_INLINE int ClipPolygon(__m128 *outVtx, __m128 *inVtx, const __m128 &plane, int n) const
    {
        __m128 p0 = inVtx[n - 1];
        __m128 dist0 = _mmx_dp4_ps(p0, plane);

        // Loop over all polygon edges and compute intersection with the clip plane (if any)
        int nout = 0;
        for (int k = 0; k < n; k++)
        {
            __m128 p1 = inVtx[k];
            __m128 dist1 = _mmx_dp4_ps(p1, plane);
            int dist0Neg = _mm_movemask_ps(dist0);
            if (!dist0Neg) // dist0 >= 0.0f
                outVtx[nout++] = p0;

            // The edge intersects the clip plane if dist0 and dist1 have opposing signs
            if (_mm_movemask_ps(_mm_xor_ps(dist0, dist1)))
            {
                // Always clip from the positive side to avoid T-junctions
                if (!dist0Neg)
                {
                    __m128 t = _mm_div_ps(dist0, _mm_sub_ps(dist0, dist1));
                    outVtx[nout++] = _mmx_fmadd_ps(_mm_sub_ps(p1, p0), t, p0);
                }
                else
                {
                    __m128 t = _mm_div_ps(dist1, _mm_sub_ps(dist1, dist0));
                    outVtx[nout++] = _mmx_fmadd_ps(_mm_sub_ps(p0, p1), t, p1);
                }
            }
            dist0 = dist1;
            p0 = p1;
        }
        return nout;
    }
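
    // Worked example (illustrative): clipping a triangle against one plane where exactly one vertex
    // is outside yields 4 vertices (Sutherland-Hodgman style). Each plane can add at most one vertex,
    // so a triangle clipped against all 5 frustum planes produces at most 3 + 5 = 8 vertices, which is
    // why the vtxBuf[2][8] scratch buffers in ClipTriangleAndAddToBuffer() below are sized to 8.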

    template<ClipPlanes CLIP_PLANE> void TestClipPlane(__mw *vtxX, __mw *vtxY, __mw *vtxW, unsigned int &straddleMask, unsigned int &triMask, ClipPlanes clipPlaneMask)
    {
        straddleMask = 0;
        // Skip masked clip planes
        if (!(clipPlaneMask & CLIP_PLANE))
            return;

        // Evaluate all 3 vertices against the frustum plane
        __mw planeDp[3];
        for (int i = 0; i < 3; ++i)
        {
            switch (CLIP_PLANE)
            {
            case ClipPlanes::CLIP_PLANE_LEFT:   planeDp[i] = _mmw_add_ps(vtxW[i], vtxX[i]); break;
            case ClipPlanes::CLIP_PLANE_RIGHT:  planeDp[i] = _mmw_sub_ps(vtxW[i], vtxX[i]); break;
            case ClipPlanes::CLIP_PLANE_BOTTOM: planeDp[i] = _mmw_add_ps(vtxW[i], vtxY[i]); break;
            case ClipPlanes::CLIP_PLANE_TOP:    planeDp[i] = _mmw_sub_ps(vtxW[i], vtxY[i]); break;
            case ClipPlanes::CLIP_PLANE_NEAR:   planeDp[i] = _mmw_sub_ps(vtxW[i], _mmw_set1_ps(mNearDist)); break;
            }
        }

        // Look at the FP sign and determine if the tri is inside, outside, or straddles the frustum plane
        __mw inside = _mmw_andnot_ps(planeDp[0], _mmw_andnot_ps(planeDp[1], _mmw_not_ps(planeDp[2])));
        __mw outside = _mmw_and_ps(planeDp[0], _mmw_and_ps(planeDp[1], planeDp[2]));
        unsigned int inMask = (unsigned int)_mmw_movemask_ps(inside);
        unsigned int outMask = (unsigned int)_mmw_movemask_ps(outside);
        straddleMask = (~outMask) & (~inMask);
        triMask &= ~outMask;
    }
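
    // The sign-bit logic above, spelled out (informal note): a vertex is outside the plane iff its
    // plane dot product is negative, i.e. its IEEE sign bit is set. For one SIMD lane (one triangle):
    //
    //     inside  = ~s0 & ~s1 & ~s2     // all three dot products non-negative
    //     outside =  s0 &  s1 &  s2     // all three dot products negative
    //
    // where sN is the sign bit of planeDp[N]. Anything in neither set straddles the plane and is
    // queued for geometric clipping.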

    FORCE_INLINE void ClipTriangleAndAddToBuffer(__mw *vtxX, __mw *vtxY, __mw *vtxW, __m128 *clippedTrisBuffer, int &clipWriteIdx, unsigned int &triMask, unsigned int triClipMask, ClipPlanes clipPlaneMask)
    {
        if (!triClipMask)
            return;

        // Inside test all 3 triangle vertices against all active frustum planes
        unsigned int straddleMask[5];
        TestClipPlane<ClipPlanes::CLIP_PLANE_NEAR>(vtxX, vtxY, vtxW, straddleMask[0], triMask, clipPlaneMask);
        TestClipPlane<ClipPlanes::CLIP_PLANE_LEFT>(vtxX, vtxY, vtxW, straddleMask[1], triMask, clipPlaneMask);
        TestClipPlane<ClipPlanes::CLIP_PLANE_RIGHT>(vtxX, vtxY, vtxW, straddleMask[2], triMask, clipPlaneMask);
        TestClipPlane<ClipPlanes::CLIP_PLANE_BOTTOM>(vtxX, vtxY, vtxW, straddleMask[3], triMask, clipPlaneMask);
        TestClipPlane<ClipPlanes::CLIP_PLANE_TOP>(vtxX, vtxY, vtxW, straddleMask[4], triMask, clipPlaneMask);

        // Clip the triangle against straddling planes and add to the clipped triangle buffer
        __m128 vtxBuf[2][8];

#if CLIPPING_PRESERVES_ORDER != 0
        unsigned int clipMask = triClipMask & triMask;
        unsigned int clipAndStraddleMask = (straddleMask[0] | straddleMask[1] | straddleMask[2] | straddleMask[3] | straddleMask[4]) & clipMask;
        // No clipping needed after all - early out
        if (clipAndStraddleMask == 0)
            return;

        while (clipMask)
        {
            // Find and setup the next triangle to clip
            unsigned int triIdx = find_clear_lsb(&clipMask);
            unsigned int triBit = (1U << triIdx);
            assert(triIdx < SIMD_LANES);

            int bufIdx = 0;
            int nClippedVerts = 3;
            for (int i = 0; i < 3; i++)
                vtxBuf[0][i] = _mm_setr_ps(simd_f32(vtxX[i])[triIdx], simd_f32(vtxY[i])[triIdx], simd_f32(vtxW[i])[triIdx], 1.0f);

            // Clip the triangle with straddling planes.
            for (int i = 0; i < 5; ++i)
            {
                if ((straddleMask[i] & triBit) && (clipPlaneMask & (1 << i))) // <- second part maybe not needed?
                {
                    nClippedVerts = ClipPolygon(vtxBuf[bufIdx ^ 1], vtxBuf[bufIdx], mCSFrustumPlanes[i], nClippedVerts);
                    bufIdx ^= 1;
                }
            }

            if (nClippedVerts >= 3)
            {
                // Write all triangles into the clip buffer and process them in the next loop iteration
                clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
                clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][1];
                clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][2];
                clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
                for (int i = 2; i < nClippedVerts - 1; i++)
                {
                    clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
                    clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][i];
                    clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][i + 1];
                    clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
                }
            }
        }

        // Since all triangles were copied to the clip buffer for the next iteration, skip further processing
        triMask = 0;
#else
        unsigned int clipMask = (straddleMask[0] | straddleMask[1] | straddleMask[2] | straddleMask[3] | straddleMask[4]) & (triClipMask & triMask);
        while (clipMask)
        {
            // Find and setup the next triangle to clip
            unsigned int triIdx = find_clear_lsb(&clipMask);
            unsigned int triBit = (1U << triIdx);
            assert(triIdx < SIMD_LANES);

            int bufIdx = 0;
            int nClippedVerts = 3;
            for (int i = 0; i < 3; i++)
                vtxBuf[0][i] = _mm_setr_ps(simd_f32(vtxX[i])[triIdx], simd_f32(vtxY[i])[triIdx], simd_f32(vtxW[i])[triIdx], 1.0f);

            // Clip the triangle with straddling planes.
            for (int i = 0; i < 5; ++i)
            {
                if ((straddleMask[i] & triBit) && (clipPlaneMask & (1 << i)))
                {
                    nClippedVerts = ClipPolygon(vtxBuf[bufIdx ^ 1], vtxBuf[bufIdx], mCSFrustumPlanes[i], nClippedVerts);
                    bufIdx ^= 1;
                }
            }

            if (nClippedVerts >= 3)
            {
                // Write the first triangle back into the list of currently processed triangles
                for (int i = 0; i < 3; i++)
                {
                    simd_f32(vtxX[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[0];
                    simd_f32(vtxY[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[1];
                    simd_f32(vtxW[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[2];
                }
                // Write the remaining triangles into the clip buffer and process them in the next loop iteration
                for (int i = 2; i < nClippedVerts - 1; i++)
                {
                    clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0];
                    clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][i];
                    clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][i + 1];
                    clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1);
                }
            }
            else // Kill triangles that were removed by clipping
                triMask &= ~triBit;
        }
#endif
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Vertex transform & projection
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    FORCE_INLINE void TransformVerts(__mw *vtxX, __mw *vtxY, __mw *vtxW, const float *modelToClipMatrix)
    {
        if (modelToClipMatrix != nullptr)
        {
            for (int i = 0; i < 3; ++i)
            {
                __mw tmpX, tmpY, tmpW;
                tmpX = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[0]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[4]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[8]), _mmw_set1_ps(modelToClipMatrix[12]))));
                tmpY = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[1]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[5]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[9]), _mmw_set1_ps(modelToClipMatrix[13]))));
                tmpW = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[3]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[7]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[11]), _mmw_set1_ps(modelToClipMatrix[15]))));
                vtxX[i] = tmpX; vtxY[i] = tmpY; vtxW[i] = tmpW;
            }
        }
    }

#if PRECISE_COVERAGE != 0
    FORCE_INLINE void ProjectVertices(__mwi *ipVtxX, __mwi *ipVtxY, __mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw *vtxX, const __mw *vtxY, const __mw *vtxW)
    {
#if USE_D3D != 0
        static const int vertexOrder[] = { 2, 1, 0 };
#else
        static const int vertexOrder[] = { 0, 1, 2 };
#endif
        // Project vertices and transform to screen space. Snap to sub-pixel coordinates with FP_BITS precision.
        for (int i = 0; i < 3; i++)
        {
            int idx = vertexOrder[i];
            __mw rcpW = _mmw_div_ps(_mmw_set1_ps(1.0f), vtxW[i]);
            __mw screenX = _mmw_fmadd_ps(_mmw_mul_ps(vtxX[i], mHalfWidth), rcpW, mCenterX);
            __mw screenY = _mmw_fmadd_ps(_mmw_mul_ps(vtxY[i], mHalfHeight), rcpW, mCenterY);
            ipVtxX[idx] = _mmw_cvtps_epi32(_mmw_mul_ps(screenX, _mmw_set1_ps(float(1 << FP_BITS))));
            ipVtxY[idx] = _mmw_cvtps_epi32(_mmw_mul_ps(screenY, _mmw_set1_ps(float(1 << FP_BITS))));
            pVtxX[idx] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxX[idx]), _mmw_set1_ps(FP_INV));
            pVtxY[idx] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxY[idx]), _mmw_set1_ps(FP_INV));
            pVtxZ[idx] = rcpW;
        }
    }
#else
    FORCE_INLINE void ProjectVertices(__mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw *vtxX, const __mw *vtxY, const __mw *vtxW)
    {
#if USE_D3D != 0
        static const int vertexOrder[] = { 2, 1, 0 };
#else
        static const int vertexOrder[] = { 0, 1, 2 };
#endif
        // Project vertices and transform to screen space. Round to the nearest integer pixel coordinate.
        for (int i = 0; i < 3; i++)
        {
            int idx = vertexOrder[i];
            __mw rcpW = _mmw_div_ps(_mmw_set1_ps(1.0f), vtxW[i]);

            // The rounding modes are set to match HW rasterization with OpenGL. In practice our samples are placed
            // in the (1,0) corner of each pixel, while the HW rasterizer uses (0.5, 0.5). We get (1,0) because of the
            // floor used when interpolating along triangle edges. The rounding modes match an offset of (0.5, -0.5).
            pVtxX[idx] = _mmw_ceil_ps(_mmw_fmadd_ps(_mmw_mul_ps(vtxX[i], mHalfWidth), rcpW, mCenterX));
            pVtxY[idx] = _mmw_floor_ps(_mmw_fmadd_ps(_mmw_mul_ps(vtxY[i], mHalfHeight), rcpW, mCenterY));
            pVtxZ[idx] = rcpW;
        }
    }
#endif
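
    // Worked projection example (informal, assuming a 1920x1080 target and the OpenGL path):
    // a clip-space vertex (x, y, w) = (0.5, -0.25, 2.0) maps to
    //
    //     screenX = (0.5  * 960) / 2.0 + 960 = 1200
    //     screenY = (-0.25 * 540) / 2.0 + 540 = 472.5  -> floor -> 472
    //
    // and pVtxZ = 1/w = 0.5. With PRECISE_COVERAGE, the same coordinates are instead snapped to
    // FP_BITS (8) sub-pixel bits, i.e. stored as round(1200 * 256) and round(472.5 * 256).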

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Common SSE/AVX input assembly functions, note that there are specialized gathers for the general case in the SSE/AVX specific files
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    FORCE_INLINE void GatherVerticesFast(__mw *vtxX, __mw *vtxY, __mw *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes)
    {
        // This function assumes that the vertex layout is four packed x, y, z, w-values.
        // Since the layout is known, we can get some additional performance by using a
        // more optimized gather strategy.
        assert(numLanes >= 1);

        // Gather vertices
        __mw v[4], swz[4];
        for (int i = 0; i < 3; i++)
        {
            // Load 4 (x,y,z,w) vectors per SSE part of the SIMD register (so 4 vectors for SSE, 8 vectors for AVX)
            // This fetch uses templates to unroll the loop
            VtxFetch4<SIMD_LANES / 4>(v, inTrisPtr, i, inVtx, numLanes);

            // Transpose each individual SSE part of the SSE/AVX register (similar to _MM_TRANSPOSE4_PS)
            swz[0] = _mmw_shuffle_ps(v[0], v[1], 0x44);
            swz[2] = _mmw_shuffle_ps(v[0], v[1], 0xEE);
            swz[1] = _mmw_shuffle_ps(v[2], v[3], 0x44);
            swz[3] = _mmw_shuffle_ps(v[2], v[3], 0xEE);

            vtxX[i] = _mmw_shuffle_ps(swz[0], swz[1], 0x88);
            vtxY[i] = _mmw_shuffle_ps(swz[0], swz[1], 0xDD);
            vtxW[i] = _mmw_shuffle_ps(swz[2], swz[3], 0xDD);
        }
    }
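
    // Transpose illustration (informal; one 128-bit part, rows are the four fetched vertices):
    //
    //     v[0] = x0 y0 z0 w0        swz[0] = x0 y0 x1 y1        vtxX[i] = x0 x1 x2 x3
    //     v[1] = x1 y1 z1 w1   ->   swz[1] = x2 y2 x3 y3   ->   vtxY[i] = y0 y1 y2 y3
    //     v[2] = x2 y2 z2 w2        swz[2] = z0 w0 z1 w1        vtxW[i] = w0 w1 w2 w3
    //     v[3] = x3 y3 z3 w3        swz[3] = z2 w2 z3 w3
    //
    // The z row is never needed (depth is reconstructed from w during projection), so only three
    // of the four output registers are produced.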

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Rasterization functions
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    FORCE_INLINE void ComputeBoundingBox(__mwi &bbminX, __mwi &bbminY, __mwi &bbmaxX, __mwi &bbmaxY, const __mw *vX, const __mw *vY, const ScissorRect *scissor)
    {
        static const __mwi SIMD_PAD_W_MASK = _mmw_set1_epi32(~(TILE_WIDTH - 1));
        static const __mwi SIMD_PAD_H_MASK = _mmw_set1_epi32(~(TILE_HEIGHT - 1));

        // Find Min/Max vertices
        bbminX = _mmw_cvttps_epi32(_mmw_min_ps(vX[0], _mmw_min_ps(vX[1], vX[2])));
        bbminY = _mmw_cvttps_epi32(_mmw_min_ps(vY[0], _mmw_min_ps(vY[1], vY[2])));
        bbmaxX = _mmw_cvttps_epi32(_mmw_max_ps(vX[0], _mmw_max_ps(vX[1], vX[2])));
        bbmaxY = _mmw_cvttps_epi32(_mmw_max_ps(vY[0], _mmw_max_ps(vY[1], vY[2])));

        // Clamp to tile boundaries
        bbminX = _mmw_and_epi32(bbminX, SIMD_PAD_W_MASK);
        bbmaxX = _mmw_and_epi32(_mmw_add_epi32(bbmaxX, _mmw_set1_epi32(TILE_WIDTH)), SIMD_PAD_W_MASK);
        bbminY = _mmw_and_epi32(bbminY, SIMD_PAD_H_MASK);
        bbmaxY = _mmw_and_epi32(_mmw_add_epi32(bbmaxY, _mmw_set1_epi32(TILE_HEIGHT)), SIMD_PAD_H_MASK);

        // Clip to scissor
        bbminX = _mmw_max_epi32(bbminX, _mmw_set1_epi32(scissor->mMinX));
        bbmaxX = _mmw_min_epi32(bbmaxX, _mmw_set1_epi32(scissor->mMaxX));
        bbminY = _mmw_max_epi32(bbminY, _mmw_set1_epi32(scissor->mMinY));
        bbmaxY = _mmw_min_epi32(bbmaxY, _mmw_set1_epi32(scissor->mMaxY));
    }
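
    // Padding arithmetic, worked through (informal): with TILE_WIDTH = 32, a triangle spanning
    // x = [35, 93] is expanded to tile-aligned bounds as
    //
    //     bbminX = 35 & ~31        = 32
    //     bbmaxX = (93 + 32) & ~31 = 96
    //
    // i.e. the min is rounded down and the max is rounded up to the next tile edge, so the
    // traversal loops always cover whole tiles.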

#if PRECISE_COVERAGE != 0
    FORCE_INLINE void SortVertices(__mwi *vX, __mwi *vY)
    {
        // Rotate the triangle in the winding order until v0 is the vertex with the lowest Y value
        for (int i = 0; i < 2; i++)
        {
            __mwi ey1 = _mmw_sub_epi32(vY[1], vY[0]);
            __mwi ey2 = _mmw_sub_epi32(vY[2], vY[0]);
            __mwi swapMask = _mmw_or_epi32(_mmw_or_epi32(ey1, ey2), _mmw_cmpeq_epi32(simd_cast<__mwi>(ey2), SIMD_BITS_ZERO));
            __mwi sX, sY;
            sX = _mmw_blendv_epi32(vX[2], vX[0], swapMask);
            vX[0] = _mmw_blendv_epi32(vX[0], vX[1], swapMask);
            vX[1] = _mmw_blendv_epi32(vX[1], vX[2], swapMask);
            vX[2] = sX;
            sY = _mmw_blendv_epi32(vY[2], vY[0], swapMask);
            vY[0] = _mmw_blendv_epi32(vY[0], vY[1], swapMask);
            vY[1] = _mmw_blendv_epi32(vY[1], vY[2], swapMask);
            vY[2] = sY;
        }
    }

    FORCE_INLINE int CullBackfaces(__mwi *ipVtxX, __mwi *ipVtxY, __mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw &ccwMask, BackfaceWinding bfWinding)
    {
        // Reverse the vertex order if non-CW faces are considered front-facing (the rasterizer code requires CCW order)
        if (!(bfWinding & BACKFACE_CW))
        {
            __mw tmpX, tmpY, tmpZ;
            __mwi itmpX, itmpY;
            itmpX = _mmw_blendv_epi32(ipVtxX[2], ipVtxX[0], simd_cast<__mwi>(ccwMask));
            itmpY = _mmw_blendv_epi32(ipVtxY[2], ipVtxY[0], simd_cast<__mwi>(ccwMask));
            tmpX = _mmw_blendv_ps(pVtxX[2], pVtxX[0], ccwMask);
            tmpY = _mmw_blendv_ps(pVtxY[2], pVtxY[0], ccwMask);
            tmpZ = _mmw_blendv_ps(pVtxZ[2], pVtxZ[0], ccwMask);
            ipVtxX[2] = _mmw_blendv_epi32(ipVtxX[0], ipVtxX[2], simd_cast<__mwi>(ccwMask));
            ipVtxY[2] = _mmw_blendv_epi32(ipVtxY[0], ipVtxY[2], simd_cast<__mwi>(ccwMask));
            pVtxX[2] = _mmw_blendv_ps(pVtxX[0], pVtxX[2], ccwMask);
            pVtxY[2] = _mmw_blendv_ps(pVtxY[0], pVtxY[2], ccwMask);
            pVtxZ[2] = _mmw_blendv_ps(pVtxZ[0], pVtxZ[2], ccwMask);
            ipVtxX[0] = itmpX;
            ipVtxY[0] = itmpY;
            pVtxX[0] = tmpX;
            pVtxY[0] = tmpY;
            pVtxZ[0] = tmpZ;
        }

        // Return a lane mask with all front faces set
        return ((bfWinding & BACKFACE_CCW) ? 0 : _mmw_movemask_ps(ccwMask)) | ((bfWinding & BACKFACE_CW) ? 0 : ~_mmw_movemask_ps(ccwMask));
    }
#else
    FORCE_INLINE void SortVertices(__mw *vX, __mw *vY)
    {
        // Rotate the triangle in the winding order until v0 is the vertex with the lowest Y value
        for (int i = 0; i < 2; i++)
        {
            __mw ey1 = _mmw_sub_ps(vY[1], vY[0]);
            __mw ey2 = _mmw_sub_ps(vY[2], vY[0]);
            __mw swapMask = _mmw_or_ps(_mmw_or_ps(ey1, ey2), simd_cast<__mw>(_mmw_cmpeq_epi32(simd_cast<__mwi>(ey2), SIMD_BITS_ZERO)));
            __mw sX, sY;
            sX = _mmw_blendv_ps(vX[2], vX[0], swapMask);
            vX[0] = _mmw_blendv_ps(vX[0], vX[1], swapMask);
            vX[1] = _mmw_blendv_ps(vX[1], vX[2], swapMask);
            vX[2] = sX;
            sY = _mmw_blendv_ps(vY[2], vY[0], swapMask);
            vY[0] = _mmw_blendv_ps(vY[0], vY[1], swapMask);
            vY[1] = _mmw_blendv_ps(vY[1], vY[2], swapMask);
            vY[2] = sY;
        }
    }

    FORCE_INLINE int CullBackfaces(__mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw &ccwMask, BackfaceWinding bfWinding)
    {
        // Reverse the vertex order if non-CW faces are considered front-facing (the rasterizer code requires CCW order)
        if (!(bfWinding & BACKFACE_CW))
        {
            __mw tmpX, tmpY, tmpZ;
            tmpX = _mmw_blendv_ps(pVtxX[2], pVtxX[0], ccwMask);
            tmpY = _mmw_blendv_ps(pVtxY[2], pVtxY[0], ccwMask);
            tmpZ = _mmw_blendv_ps(pVtxZ[2], pVtxZ[0], ccwMask);
            pVtxX[2] = _mmw_blendv_ps(pVtxX[0], pVtxX[2], ccwMask);
            pVtxY[2] = _mmw_blendv_ps(pVtxY[0], pVtxY[2], ccwMask);
            pVtxZ[2] = _mmw_blendv_ps(pVtxZ[0], pVtxZ[2], ccwMask);
            pVtxX[0] = tmpX;
            pVtxY[0] = tmpY;
            pVtxZ[0] = tmpZ;
        }

        // Return a lane mask with all front faces set
        return ((bfWinding & BACKFACE_CCW) ? 0 : _mmw_movemask_ps(ccwMask)) | ((bfWinding & BACKFACE_CW) ? 0 : ~_mmw_movemask_ps(ccwMask));
    }
#endif

    FORCE_INLINE void ComputeDepthPlane(const __mw *pVtxX, const __mw *pVtxY, const __mw *pVtxZ, __mw &zPixelDx, __mw &zPixelDy) const
    {
        // Setup the z(x,y) = z0 + dx*x + dy*y screen space depth plane equation
        __mw x2 = _mmw_sub_ps(pVtxX[2], pVtxX[0]);
        __mw x1 = _mmw_sub_ps(pVtxX[1], pVtxX[0]);
        __mw y1 = _mmw_sub_ps(pVtxY[1], pVtxY[0]);
        __mw y2 = _mmw_sub_ps(pVtxY[2], pVtxY[0]);
        __mw z1 = _mmw_sub_ps(pVtxZ[1], pVtxZ[0]);
        __mw z2 = _mmw_sub_ps(pVtxZ[2], pVtxZ[0]);
        __mw d = _mmw_div_ps(_mmw_set1_ps(1.0f), _mmw_fmsub_ps(x1, y2, _mmw_mul_ps(y1, x2)));
        zPixelDx = _mmw_mul_ps(_mmw_fmsub_ps(z1, y2, _mmw_mul_ps(y1, z2)), d);
        zPixelDy = _mmw_mul_ps(_mmw_fmsub_ps(x1, z2, _mmw_mul_ps(z1, x2)), d);
    }
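
    // Derivation note (standard plane fit, not extra logic): dx and dy solve the 2x2 system
    //
    //     x1*dx + y1*dy = z1
    //     x2*dx + y2*dy = z2
    //
    // by Cramer's rule, with determinant det = x1*y2 - y1*x2 (twice the signed triangle area):
    //
    //     dx = (z1*y2 - y1*z2) / det
    //     dy = (x1*z2 - z1*x2) / det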

    FORCE_INLINE void UpdateTileQuick(int tileIdx, const __mwi &coverage, const __mw &zTriv)
    {
        // Update heuristic used in the paper "Masked Software Occlusion Culling",
        // a good balance between performance and accuracy
        STATS_ADD(mStats.mOccluders.mNumTilesUpdated, 1);
        assert(tileIdx >= 0 && tileIdx < mTilesWidth * mTilesHeight);

        __mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
        __mw *zMin = mMaskedHiZBuffer[tileIdx].mZMin;

        // Swizzle the coverage mask to 8x4 subtiles and test if any subtiles are not covered at all
        __mwi rastMask = coverage;
        __mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);

        // Mask out all subtiles failing the depth test (don't update these subtiles)
        deadLane = _mmw_or_epi32(deadLane, _mmw_srai_epi32(simd_cast<__mwi>(_mmw_sub_ps(zTriv, zMin[0])), 31));
        rastMask = _mmw_andnot_epi32(deadLane, rastMask);

        // Use the distance heuristic to discard layer 1 if the incoming triangle is significantly nearer to the observer
        // than the buffer contents. See Section 3.2 in "Masked Software Occlusion Culling"
        __mwi coveredLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ONE);
        __mw diff = _mmw_fmsub_ps(zMin[1], _mmw_set1_ps(2.0f), _mmw_add_ps(zTriv, zMin[0]));
        __mwi discardLayerMask = _mmw_andnot_epi32(deadLane, _mmw_or_epi32(_mmw_srai_epi32(simd_cast<__mwi>(diff), 31), coveredLane));

        // Update the mask with the incoming triangle coverage
        mask = _mmw_or_epi32(_mmw_andnot_epi32(discardLayerMask, mask), rastMask);

        __mwi maskFull = _mmw_cmpeq_epi32(mask, SIMD_BITS_ONE);

        // Compute the new value for zMin[1]. This has one of four outcomes: zMin[1] = min(zMin[1], zTriv), zMin[1] = zTriv,
        // zMin[1] = FLT_MAX, or unchanged, depending on whether the layer is updated, discarded, fully covered, or not updated
        __mw opA = _mmw_blendv_ps(zTriv, zMin[1], simd_cast<__mw>(deadLane));
        __mw opB = _mmw_blendv_ps(zMin[1], zTriv, simd_cast<__mw>(discardLayerMask));
        __mw z1min = _mmw_min_ps(opA, opB);
        zMin[1] = _mmw_blendv_ps(z1min, _mmw_set1_ps(FLT_MAX), simd_cast<__mw>(maskFull));

        // Propagate zMin[1] back to zMin[0] if the tile was fully covered, and update the mask
        zMin[0] = _mmw_blendv_ps(zMin[0], z1min, simd_cast<__mw>(maskFull));
        mMaskedHiZBuffer[tileIdx].mMask = _mmw_andnot_epi32(maskFull, mask);
    }
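
    // The distance heuristic worked through with made-up depths (1/w convention, larger = nearer):
    // with zMin[0] = 0.1 and zMin[1] = 0.75, a triangle at zTriv = 0.8 partially covering the
    // subtile gives
    //
    //     diff = 2*0.75 - (0.8 + 0.1) = 0.6 > 0   -> keep layer 1 and merge: zMin[1] = min(0.75, 0.8) = 0.75
    //
    // whereas with zMin[1] = 0.15 (working layer close to the reference layer, triangle far in front)
    //
    //     diff = 2*0.15 - (0.8 + 0.1) = -0.6 < 0  -> sign bit set, layer 1 is discarded and restarted at 0.8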

    FORCE_INLINE void UpdateTileAccurate(int tileIdx, const __mwi &coverage, const __mw &zTriv)
    {
        assert(tileIdx >= 0 && tileIdx < mTilesWidth * mTilesHeight);

        __mw *zMin = mMaskedHiZBuffer[tileIdx].mZMin;
        __mwi &mask = mMaskedHiZBuffer[tileIdx].mMask;

        // Swizzle the coverage mask to 8x4 subtiles
        __mwi rastMask = coverage;

        // Perform individual depth tests with layer 0 & 1 and mask out all failing pixels
        __mw sdist0 = _mmw_sub_ps(zMin[0], zTriv);
        __mw sdist1 = _mmw_sub_ps(zMin[1], zTriv);
        __mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(sdist0), 31);
        __mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(sdist1), 31);
        __mwi triMask = _mmw_and_epi32(rastMask, _mmw_or_epi32(_mmw_andnot_epi32(mask, sign0), _mmw_and_epi32(mask, sign1)));

        // Early out if no pixels survived the depth test (this test is more accurate than
        // the early culling test in TraverseScanline())
        __mwi t0 = _mmw_cmpeq_epi32(triMask, SIMD_BITS_ZERO);
        __mwi t0inv = _mmw_not_epi32(t0);
        if (_mmw_testz_epi32(t0inv, t0inv))
            return;

        STATS_ADD(mStats.mOccluders.mNumTilesUpdated, 1);

        __mw zTri = _mmw_blendv_ps(zTriv, zMin[0], simd_cast<__mw>(t0));

        // Test if the incoming triangle completely overwrites layer 0 or 1
        __mwi layerMask0 = _mmw_andnot_epi32(triMask, _mmw_not_epi32(mask));
        __mwi layerMask1 = _mmw_andnot_epi32(triMask, mask);
        __mwi lm0 = _mmw_cmpeq_epi32(layerMask0, SIMD_BITS_ZERO);
        __mwi lm1 = _mmw_cmpeq_epi32(layerMask1, SIMD_BITS_ZERO);
        __mw z0 = _mmw_blendv_ps(zMin[0], zTri, simd_cast<__mw>(lm0));
        __mw z1 = _mmw_blendv_ps(zMin[1], zTri, simd_cast<__mw>(lm1));

        // Compute distances used for the merging heuristic
        __mw d0 = _mmw_abs_ps(sdist0);
        __mw d1 = _mmw_abs_ps(sdist1);
        __mw d2 = _mmw_abs_ps(_mmw_sub_ps(z0, z1));

        // Find the minimum distance
        __mwi c01 = simd_cast<__mwi>(_mmw_sub_ps(d0, d1));
        __mwi c02 = simd_cast<__mwi>(_mmw_sub_ps(d0, d2));
        __mwi c12 = simd_cast<__mwi>(_mmw_sub_ps(d1, d2));
        // Two tests indicating which layer the incoming triangle will merge with or
        // overwrite. d0min indicates that the triangle will overwrite layer 0, and
        // d1min flags that the triangle will overwrite layer 1.
        __mwi d0min = _mmw_or_epi32(_mmw_and_epi32(c01, c02), _mmw_or_epi32(lm0, t0));
        __mwi d1min = _mmw_andnot_epi32(d0min, _mmw_or_epi32(c12, lm1));

        ///////////////////////////////////////////////////////////////////////////////
        // Update depth buffer entry. NOTE: we always merge into layer 0, so if the
        // triangle should be merged with layer 1, we first swap layer 0 & 1 and then
        // merge into layer 0.
        ///////////////////////////////////////////////////////////////////////////////

        // Update the mask based on which layer the triangle overwrites or is merged into
        __mw inner = _mmw_blendv_ps(simd_cast<__mw>(triMask), simd_cast<__mw>(layerMask1), simd_cast<__mw>(d0min));
        mask = simd_cast<__mwi>(_mmw_blendv_ps(inner, simd_cast<__mw>(layerMask0), simd_cast<__mw>(d1min)));

        // Update the zMin[0] value. There are four outcomes: overwrite with layer 1,
        // merge with layer 1, merge with zTri, or overwrite with layer 1 and then merge
        // with zTri.
        __mw e0 = _mmw_blendv_ps(z0, z1, simd_cast<__mw>(d1min));
        __mw e1 = _mmw_blendv_ps(z1, zTri, simd_cast<__mw>(_mmw_or_epi32(d1min, d0min)));
        zMin[0] = _mmw_min_ps(e0, e1);

        // Update the zMin[1] value. There are three outcomes: keep the current value,
        // overwrite with zTri, or overwrite with z1
        __mw z1t = _mmw_blendv_ps(zTri, z1, simd_cast<__mw>(d0min));
        zMin[1] = _mmw_blendv_ps(z1t, z0, simd_cast<__mw>(d1min));
    }

    template<int TEST_Z, int NRIGHT, int NLEFT>
    FORCE_INLINE int TraverseScanline(int leftOffset, int rightOffset, int tileIdx, int rightEvent, int leftEvent, const __mwi *events, const __mw &zTriMin, const __mw &zTriMax, const __mw &iz0, float zx)
    {
        // Floor edge events to integer pixel coordinates (shift out fixed point bits)
        int eventOffset = leftOffset << TILE_WIDTH_SHIFT;
        __mwi right[NRIGHT], left[NLEFT];
        for (int i = 0; i < NRIGHT; ++i)
            right[i] = _mmw_max_epi32(_mmw_sub_epi32(_mmw_srai_epi32(events[rightEvent + i], FP_BITS), _mmw_set1_epi32(eventOffset)), SIMD_BITS_ZERO);
        for (int i = 0; i < NLEFT; ++i)
            left[i] = _mmw_max_epi32(_mmw_sub_epi32(_mmw_srai_epi32(events[leftEvent - i], FP_BITS), _mmw_set1_epi32(eventOffset)), SIMD_BITS_ZERO);

        __mw z0 = _mmw_add_ps(iz0, _mmw_set1_ps(zx * leftOffset));
        int tileIdxEnd = tileIdx + rightOffset;
        tileIdx += leftOffset;
        for (;;)
        {
            if (TEST_Z)
                STATS_ADD(mStats.mOccludees.mNumTilesTraversed, 1);
            else
                STATS_ADD(mStats.mOccluders.mNumTilesTraversed, 1);

            // Perform a coarse test to quickly discard occluded tiles
#if QUICK_MASK != 0
            // Only use the reference layer (layer 0) to cull as it is always conservative
            __mw zMinBuf = mMaskedHiZBuffer[tileIdx].mZMin[0];
#else
            // Compute zMin for the overlapped layers
            __mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
            __mw zMin0 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[0], mMaskedHiZBuffer[tileIdx].mZMin[1], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_set1_epi32(~0))));
            __mw zMin1 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[1], mMaskedHiZBuffer[tileIdx].mZMin[0], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_setzero_epi32())));
            __mw zMinBuf = _mmw_min_ps(zMin0, zMin1);
#endif
            __mw dist0 = _mmw_sub_ps(zTriMax, zMinBuf);
            if (_mmw_movemask_ps(dist0) != SIMD_ALL_LANES_MASK)
            {
                // Compute the coverage mask for the entire 32xN tile using shift operations
                __mwi accumulatedMask = _mmw_sllv_ones(left[0]);
                for (int i = 1; i < NLEFT; ++i)
                    accumulatedMask = _mmw_and_epi32(accumulatedMask, _mmw_sllv_ones(left[i]));
                for (int i = 0; i < NRIGHT; ++i)
                    accumulatedMask = _mmw_andnot_epi32(_mmw_sllv_ones(right[i]), accumulatedMask);

                if (TEST_Z)
                {
                    // Perform a conservative visibility test (test zMax against the buffer for each covered 8x4 subtile)
                    __mw zSubTileMax = _mmw_min_ps(z0, zTriMax);
                    __mwi zPass = simd_cast<__mwi>(_mmw_cmpge_ps(zSubTileMax, zMinBuf));

                    __mwi rastMask = _mmw_transpose_epi8(accumulatedMask);
                    __mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO);
                    zPass = _mmw_andnot_epi32(deadLane, zPass);

                    if (!_mmw_testz_epi32(zPass, zPass))
                        return CullingResult::VISIBLE;
                }
                else
                {
                    // Compute the interpolated min for each 8x4 subtile and update the masked hierarchical z buffer entry
                    __mw zSubTileMin = _mmw_max_ps(z0, zTriMin);
#if QUICK_MASK != 0
                    UpdateTileQuick(tileIdx, _mmw_transpose_epi8(accumulatedMask), zSubTileMin);
#else
                    UpdateTileAccurate(tileIdx, _mmw_transpose_epi8(accumulatedMask), zSubTileMin);
#endif
                }
            }

            // Update the buffer address, interpolate z and edge events
            tileIdx++;
            if (tileIdx >= tileIdxEnd)
                break;
            z0 = _mmw_add_ps(z0, _mmw_set1_ps(zx));
            for (int i = 0; i < NRIGHT; ++i)
                right[i] = _mmw_subs_epu16(right[i], SIMD_TILE_WIDTH); // Trick: use saturated subtract to avoid checking against < 0 for the shift (values should fit in 16 bits)
            for (int i = 0; i < NLEFT; ++i)
                left[i] = _mmw_subs_epu16(left[i], SIMD_TILE_WIDTH);
        }

        return TEST_Z ? CullingResult::OCCLUDED : CullingResult::VISIBLE;
    }
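
    // Coverage-mask trick used above (informal illustration for one 32-bit lane, i.e. one scanline
    // of the tile): _mmw_sllv_ones(e) computes ~0u << e per lane, so with a left event at x = 5 and
    // a right event at x = 20 relative to the tile:
    //
    //     leftMask  = ~0u << 5
    //     rightMask = ~0u << 20
    //     row       = leftMask & ~rightMask    -> bits 5..19 set, exactly the covered pixels
    //
    // _mmw_transpose_epi8() then reorders the row-major bits into the 8x4 subtile order expected
    // by UpdateTileQuick()/UpdateTileAccurate().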
	template<int TEST_Z, int TIGHT_TRAVERSAL, int MID_VTX_RIGHT>
#if PRECISE_COVERAGE != 0
	FORCE_INLINE int RasterizeTriangle(unsigned int triIdx, int bbWidth, int tileRowIdx, int tileMidRowIdx, int tileEndRowIdx, const __mwi *eventStart, const __mw *slope, const __mwi *slopeTileDelta, const __mw &zTriMin, const __mw &zTriMax, __mw &z0, float zx, float zy, const __mwi *edgeY, const __mwi *absEdgeX, const __mwi *slopeSign, const __mwi *eventStartRemainder, const __mwi *slopeTileRemainder)
#else
	FORCE_INLINE int RasterizeTriangle(unsigned int triIdx, int bbWidth, int tileRowIdx, int tileMidRowIdx, int tileEndRowIdx, const __mwi *eventStart, const __mwi *slope, const __mwi *slopeTileDelta, const __mw &zTriMin, const __mw &zTriMax, __mw &z0, float zx, float zy)
#endif
	{
		if (TEST_Z)
			STATS_ADD(mStats.mOccludees.mNumRasterizedTriangles, 1);
		else
			STATS_ADD(mStats.mOccluders.mNumRasterizedTriangles, 1);

		int cullResult;

#if PRECISE_COVERAGE != 0
#define LEFT_EDGE_BIAS -1
#define RIGHT_EDGE_BIAS 1
#define UPDATE_TILE_EVENTS_Y(i) \
		triEventRemainder[i] = _mmw_sub_epi32(triEventRemainder[i], triSlopeTileRemainder[i]); \
		__mwi overflow##i = _mmw_srai_epi32(triEventRemainder[i], 31); \
		triEventRemainder[i] = _mmw_add_epi32(triEventRemainder[i], _mmw_and_epi32(overflow##i, triEdgeY[i])); \
		triEvent[i] = _mmw_add_epi32(triEvent[i], _mmw_add_epi32(triSlopeTileDelta[i], _mmw_and_epi32(overflow##i, triSlopeSign[i])))

		__mwi triEvent[3], triSlopeSign[3], triSlopeTileDelta[3], triEdgeY[3], triSlopeTileRemainder[3], triEventRemainder[3];
		for (int i = 0; i < 3; ++i)
		{
			triSlopeSign[i] = _mmw_set1_epi32(simd_i32(slopeSign[i])[triIdx]);
			triSlopeTileDelta[i] = _mmw_set1_epi32(simd_i32(slopeTileDelta[i])[triIdx]);
			triEdgeY[i] = _mmw_set1_epi32(simd_i32(edgeY[i])[triIdx]);
			triSlopeTileRemainder[i] = _mmw_set1_epi32(simd_i32(slopeTileRemainder[i])[triIdx]);

			__mw triSlope = _mmw_set1_ps(simd_f32(slope[i])[triIdx]);
			__mwi triAbsEdgeX = _mmw_set1_epi32(simd_i32(absEdgeX[i])[triIdx]);
			__mwi triStartRemainder = _mmw_set1_epi32(simd_i32(eventStartRemainder[i])[triIdx]);
			__mwi triEventStart = _mmw_set1_epi32(simd_i32(eventStart[i])[triIdx]);

			__mwi scanlineDelta = _mmw_cvttps_epi32(_mmw_mul_ps(triSlope, SIMD_LANE_YCOORD_F));
			__mwi scanlineSlopeRemainder = _mmw_sub_epi32(_mmw_mullo_epi32(triAbsEdgeX, SIMD_LANE_YCOORD_I), _mmw_mullo_epi32(_mmw_abs_epi32(scanlineDelta), triEdgeY[i]));

			triEventRemainder[i] = _mmw_sub_epi32(triStartRemainder, scanlineSlopeRemainder);
			__mwi overflow = _mmw_srai_epi32(triEventRemainder[i], 31);
			triEventRemainder[i] = _mmw_add_epi32(triEventRemainder[i], _mmw_and_epi32(overflow, triEdgeY[i]));
			triEvent[i] = _mmw_add_epi32(_mmw_add_epi32(triEventStart, scanlineDelta), _mmw_and_epi32(overflow, triSlopeSign[i]));
		}
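		// The remainder bookkeeping above is plain integer (Bresenham-style) edge
		// stepping: with q = trunc(edgeX / edgeY) and r = |edgeX| - |q| * edgeY, each
		// step adds q and subtracts r from the remainder; on underflow (sign bit set)
		// edgeY is added back and the event is nudged one extra unit in the slope
		// direction. E.g. edgeX = 7, edgeY = 3 gives q = 2, r = 1, so successive
		// steps advance 2, 2, 3, 2, 2, 3, ... matching 7/3 exactly on average, with
		// no drift regardless of triangle height.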
#else
#define LEFT_EDGE_BIAS 0
#define RIGHT_EDGE_BIAS 0
#define UPDATE_TILE_EVENTS_Y(i) triEvent[i] = _mmw_add_epi32(triEvent[i], triSlopeTileDelta[i]);

		// Get deltas used to increment edge events each time we traverse one scanline of tiles
		__mwi triSlopeTileDelta[3];
		triSlopeTileDelta[0] = _mmw_set1_epi32(simd_i32(slopeTileDelta[0])[triIdx]);
		triSlopeTileDelta[1] = _mmw_set1_epi32(simd_i32(slopeTileDelta[1])[triIdx]);
		triSlopeTileDelta[2] = _mmw_set1_epi32(simd_i32(slopeTileDelta[2])[triIdx]);

		// Setup edge events for first batch of SIMD_LANES scanlines
		__mwi triEvent[3];
		triEvent[0] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[0])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[0])[triIdx])));
		triEvent[1] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[1])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[1])[triIdx])));
		triEvent[2] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[2])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[2])[triIdx])));
#endif
		// For big triangles track start & end tile for each scanline and only traverse the valid region
		int startDelta, endDelta, topDelta, startEvent, endEvent, topEvent;
		if (TIGHT_TRAVERSAL)
		{
			startDelta = simd_i32(slopeTileDelta[2])[triIdx] + LEFT_EDGE_BIAS;
			endDelta = simd_i32(slopeTileDelta[0])[triIdx] + RIGHT_EDGE_BIAS;
			topDelta = simd_i32(slopeTileDelta[1])[triIdx] + (MID_VTX_RIGHT ? RIGHT_EDGE_BIAS : LEFT_EDGE_BIAS);

			// Compute conservative bounds for the edge events over a 32xN tile
			startEvent = simd_i32(eventStart[2])[triIdx] + min(0, startDelta);
			endEvent = simd_i32(eventStart[0])[triIdx] + max(0, endDelta) + (TILE_WIDTH << FP_BITS);
			if (MID_VTX_RIGHT)
				topEvent = simd_i32(eventStart[1])[triIdx] + max(0, topDelta) + (TILE_WIDTH << FP_BITS);
			else
				topEvent = simd_i32(eventStart[1])[triIdx] + min(0, topDelta);
		}
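		// These bounds are conservative by construction: the left event adds
		// min(0, delta) so an edge leaning further left over the next tile row is
		// still enclosed, and the right event adds max(0, delta) plus one full tile
		// (TILE_WIDTH << FP_BITS) so the computed end index always includes the tile
		// containing the rightmost covered pixel. The range may over-include empty
		// tiles but can never skip a covered one.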
		if (tileRowIdx <= tileMidRowIdx)
		{
			int tileStopIdx = min(tileEndRowIdx, tileMidRowIdx);
			// Traverse the bottom half of the triangle
			while (tileRowIdx < tileStopIdx)
			{
				int start = 0, end = bbWidth;
				if (TIGHT_TRAVERSAL)
				{
					// Compute tighter start and endpoints to avoid traversing empty space
					start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
					end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
					startEvent += startDelta;
					endEvent += endDelta;
				}

				// Traverse the scanline and update the masked hierarchical z buffer
				cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);

				if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
					return CullingResult::VISIBLE;

				// Move to the next scanline of tiles, update edge events and interpolate z
				tileRowIdx += mTilesWidth;
				z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
				UPDATE_TILE_EVENTS_Y(0);
				UPDATE_TILE_EVENTS_Y(2);
			}
			// Traverse the middle scanline of tiles. This is the only region where all three edges must be considered
			if (tileRowIdx < tileEndRowIdx)
			{
				int start = 0, end = bbWidth;
				if (TIGHT_TRAVERSAL)
				{
					// Compute tighter start and endpoints to avoid traversing lots of empty space
					start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
					end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));

					// Switch the traversal start / end to account for the upper side edge
					endEvent = MID_VTX_RIGHT ? topEvent : endEvent;
					endDelta = MID_VTX_RIGHT ? topDelta : endDelta;
					startEvent = MID_VTX_RIGHT ? startEvent : topEvent;
					startDelta = MID_VTX_RIGHT ? startDelta : topDelta;

					startEvent += startDelta;
					endEvent += endDelta;
				}

				// Traverse the scanline and update the masked hierarchical z buffer
				if (MID_VTX_RIGHT)
					cullResult = TraverseScanline<TEST_Z, 2, 1>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);
				else
					cullResult = TraverseScanline<TEST_Z, 1, 2>(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx);

				if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
					return CullingResult::VISIBLE;

				tileRowIdx += mTilesWidth;
			}
			// Traverse the top half of the triangle
			if (tileRowIdx < tileEndRowIdx)
			{
				// Move to the next scanline of tiles, update edge events and interpolate z
				z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
				int i0 = MID_VTX_RIGHT + 0;
				int i1 = MID_VTX_RIGHT + 1;
				UPDATE_TILE_EVENTS_Y(i0);
				UPDATE_TILE_EVENTS_Y(i1);
				for (;;)
				{
					int start = 0, end = bbWidth;
					if (TIGHT_TRAVERSAL)
					{
						// Compute tighter start and endpoints to avoid traversing lots of empty space
						start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
						end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
						startEvent += startDelta;
						endEvent += endDelta;
					}

					// Traverse the scanline and update the masked hierarchical z buffer
					cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, MID_VTX_RIGHT + 0, MID_VTX_RIGHT + 1, triEvent, zTriMin, zTriMax, z0, zx);

					if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
						return CullingResult::VISIBLE;

					// Move to the next scanline of tiles, update edge events and interpolate z
					tileRowIdx += mTilesWidth;
					if (tileRowIdx >= tileEndRowIdx)
						break;
					z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
					UPDATE_TILE_EVENTS_Y(i0);
					UPDATE_TILE_EVENTS_Y(i1);
				}
			}
		}
		else
		{
			if (TIGHT_TRAVERSAL)
			{
				// For large triangles, switch the traversal start / end to account for the upper side edge
				endEvent = MID_VTX_RIGHT ? topEvent : endEvent;
				endDelta = MID_VTX_RIGHT ? topDelta : endDelta;
				startEvent = MID_VTX_RIGHT ? startEvent : topEvent;
				startDelta = MID_VTX_RIGHT ? startDelta : topDelta;
			}

			// Traverse the top half of the triangle
			if (tileRowIdx < tileEndRowIdx)
			{
				int i0 = MID_VTX_RIGHT + 0;
				int i1 = MID_VTX_RIGHT + 1;
				for (;;)
				{
					int start = 0, end = bbWidth;
					if (TIGHT_TRAVERSAL)
					{
						// Compute tighter start and endpoints to avoid traversing lots of empty space
						start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
						end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS)));
						startEvent += startDelta;
						endEvent += endDelta;
					}

					// Traverse the scanline and update the masked hierarchical z buffer
					cullResult = TraverseScanline<TEST_Z, 1, 1>(start, end, tileRowIdx, MID_VTX_RIGHT + 0, MID_VTX_RIGHT + 1, triEvent, zTriMin, zTriMax, z0, zx);

					if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query
						return CullingResult::VISIBLE;

					// Move to the next scanline of tiles, update edge events and interpolate z
					tileRowIdx += mTilesWidth;
					if (tileRowIdx >= tileEndRowIdx)
						break;
					z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy));
					UPDATE_TILE_EVENTS_Y(i0);
					UPDATE_TILE_EVENTS_Y(i1);
				}
			}
		}

		return TEST_Z ? CullingResult::OCCLUDED : CullingResult::VISIBLE;
	}
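	// Sets up and rasterizes a full SIMD batch of triangles: bounding boxes, the
	// screen space depth plane (z interpolation), vertex sorting and edge events
	// are computed for all lanes at once, after which each surviving triangle is
	// dispatched to RasterizeTriangle with the traversal variant chosen by its
	// bounding box size.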
	template<bool TEST_Z>
#if PRECISE_COVERAGE != 0
	FORCE_INLINE int RasterizeTriangleBatch(__mwi ipVtxX[3], __mwi ipVtxY[3], __mw pVtxX[3], __mw pVtxY[3], __mw pVtxZ[3], unsigned int triMask, const ScissorRect *scissor)
#else
	FORCE_INLINE int RasterizeTriangleBatch(__mw pVtxX[3], __mw pVtxY[3], __mw pVtxZ[3], unsigned int triMask, const ScissorRect *scissor)
#endif
	{
		int cullResult = CullingResult::VIEW_CULLED;

		//////////////////////////////////////////////////////////////////////////////
		// Compute bounding box and clamp to tile coordinates
		//////////////////////////////////////////////////////////////////////////////
		__mwi bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY;
		ComputeBoundingBox(bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY, pVtxX, pVtxY, scissor);

		// Clamp bounding box to tiles (it's already padded in ComputeBoundingBox)
		__mwi bbTileMinX = _mmw_srai_epi32(bbPixelMinX, TILE_WIDTH_SHIFT);
		__mwi bbTileMinY = _mmw_srai_epi32(bbPixelMinY, TILE_HEIGHT_SHIFT);
		__mwi bbTileMaxX = _mmw_srai_epi32(bbPixelMaxX, TILE_WIDTH_SHIFT);
		__mwi bbTileMaxY = _mmw_srai_epi32(bbPixelMaxY, TILE_HEIGHT_SHIFT);
		__mwi bbTileSizeX = _mmw_sub_epi32(bbTileMaxX, bbTileMinX);
		__mwi bbTileSizeY = _mmw_sub_epi32(bbTileMaxY, bbTileMinY);
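		// Sign-bit trick, illustrated: a lane with an empty box has
		// bbTileSizeX <= 0 or bbTileSizeY <= 0, so subtracting 1 sets the sign bit.
		// OR-ing the two sizes and extracting sign bits with movemask flags exactly
		// the degenerate lanes, which are then cleared from triMask below.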
		// Cull triangles with zero bounding box
		__mwi bboxSign = _mmw_or_epi32(_mmw_sub_epi32(bbTileSizeX, _mmw_set1_epi32(1)), _mmw_sub_epi32(bbTileSizeY, _mmw_set1_epi32(1)));
		triMask &= ~_mmw_movemask_ps(simd_cast<__mw>(bboxSign)) & SIMD_ALL_LANES_MASK;
		if (triMask == 0x0)
			return cullResult;

		if (!TEST_Z)
			cullResult = CullingResult::VISIBLE;
		//////////////////////////////////////////////////////////////////////////////
		// Set up screen space depth plane
		//////////////////////////////////////////////////////////////////////////////
		__mw zPixelDx, zPixelDy;
		ComputeDepthPlane(pVtxX, pVtxY, pVtxZ, zPixelDx, zPixelDy);

		// Compute z value at min corner of bounding box. Offset to make sure z is conservative for all 8x4 subtiles
		__mw bbMinXV0 = _mmw_sub_ps(_mmw_cvtepi32_ps(bbPixelMinX), pVtxX[0]);
		__mw bbMinYV0 = _mmw_sub_ps(_mmw_cvtepi32_ps(bbPixelMinY), pVtxY[0]);
		__mw zPlaneOffset = _mmw_fmadd_ps(zPixelDx, bbMinXV0, _mmw_fmadd_ps(zPixelDy, bbMinYV0, pVtxZ[0]));
		__mw zTileDx = _mmw_mul_ps(zPixelDx, _mmw_set1_ps((float)TILE_WIDTH));
		__mw zTileDy = _mmw_mul_ps(zPixelDy, _mmw_set1_ps((float)TILE_HEIGHT));
		if (TEST_Z)
		{
			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_max_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDx, _mmw_set1_ps(SUB_TILE_WIDTH))));
			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_max_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDy, _mmw_set1_ps(SUB_TILE_HEIGHT))));
		}
		else
		{
			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_min_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDx, _mmw_set1_ps(SUB_TILE_WIDTH))));
			zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_min_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDy, _mmw_set1_ps(SUB_TILE_HEIGHT))));
		}
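		// Illustration of the conservative offsets above: depth is reversed
		// (1 = near, 0 = far), so an occlusion test (TEST_Z) biases the plane offset
		// towards the near corner of an 8x4 subtile with max(0, slope * extent) per
		// axis, while occluder rendering biases towards the far corner with
		// min(0, slope * extent). Either way, rounding errs on the safe side.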
		// Compute Zmin and Zmax for the triangle (used to narrow the range for difficult tiles)
		__mw zMin = _mmw_min_ps(pVtxZ[0], _mmw_min_ps(pVtxZ[1], pVtxZ[2]));
		__mw zMax = _mmw_max_ps(pVtxZ[0], _mmw_max_ps(pVtxZ[1], pVtxZ[2]));

		//////////////////////////////////////////////////////////////////////////////
		// Sort vertices (v0 has lowest Y, and the rest is in winding order) and
		// compute edges. Also find the middle vertex and compute its tile row
		//////////////////////////////////////////////////////////////////////////////
#if PRECISE_COVERAGE != 0
		// Rotate the triangle in the winding order until v0 is the vertex with lowest Y value
		SortVertices(ipVtxX, ipVtxY);

		// Compute edges
		__mwi edgeX[3] = { _mmw_sub_epi32(ipVtxX[1], ipVtxX[0]), _mmw_sub_epi32(ipVtxX[2], ipVtxX[1]), _mmw_sub_epi32(ipVtxX[2], ipVtxX[0]) };
		__mwi edgeY[3] = { _mmw_sub_epi32(ipVtxY[1], ipVtxY[0]), _mmw_sub_epi32(ipVtxY[2], ipVtxY[1]), _mmw_sub_epi32(ipVtxY[2], ipVtxY[0]) };

		// Classify if the middle vertex is on the left or right and compute its position
		int midVtxRight = ~_mmw_movemask_ps(simd_cast<__mw>(edgeY[1]));
		__mwi midPixelX = _mmw_blendv_epi32(ipVtxX[1], ipVtxX[2], edgeY[1]);
		__mwi midPixelY = _mmw_blendv_epi32(ipVtxY[1], ipVtxY[2], edgeY[1]);
		__mwi midTileY = _mmw_srai_epi32(_mmw_max_epi32(midPixelY, SIMD_BITS_ZERO), TILE_HEIGHT_SHIFT + FP_BITS);
		__mwi bbMidTileY = _mmw_max_epi32(bbTileMinY, _mmw_min_epi32(bbTileMaxY, midTileY));

		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of
		// the edge originating from the middle vertex.
		__mwi xDiffi[2], yDiffi[2];
		xDiffi[0] = _mmw_sub_epi32(ipVtxX[0], _mmw_slli_epi32(bbPixelMinX, FP_BITS));
		xDiffi[1] = _mmw_sub_epi32(midPixelX, _mmw_slli_epi32(bbPixelMinX, FP_BITS));
		yDiffi[0] = _mmw_sub_epi32(ipVtxY[0], _mmw_slli_epi32(bbPixelMinY, FP_BITS));
		yDiffi[1] = _mmw_sub_epi32(midPixelY, _mmw_slli_epi32(bbMidTileY, FP_BITS + TILE_HEIGHT_SHIFT));

		//////////////////////////////////////////////////////////////////////////////
		// Edge slope setup - Note we do not conform to DX/GL rasterization rules
		//////////////////////////////////////////////////////////////////////////////

		// Potentially flip edge to ensure that all edges have positive Y slope.
		edgeX[1] = _mmw_blendv_epi32(edgeX[1], _mmw_neg_epi32(edgeX[1]), edgeY[1]);
		edgeY[1] = _mmw_abs_epi32(edgeY[1]);

		// Compute floating point slopes
		__mw slope[3];
		slope[0] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[0]), _mmw_cvtepi32_ps(edgeY[0]));
		slope[1] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[1]), _mmw_cvtepi32_ps(edgeY[1]));
		slope[2] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[2]), _mmw_cvtepi32_ps(edgeY[2]));

		// Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen
		// width to mask out all pixels above or below the horizontal edge. We must also add a small bias to account for that
		// vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0
		__mw horizontalSlopeDelta = _mmw_set1_ps(2.0f * ((float)mWidth + 2.0f * (GUARD_BAND_PIXEL_SIZE + 1.0f)));
		__mwi horizontalSlope0 = _mmw_cmpeq_epi32(edgeY[0], _mmw_setzero_epi32());
		__mwi horizontalSlope1 = _mmw_cmpeq_epi32(edgeY[1], _mmw_setzero_epi32());
		slope[0] = _mmw_blendv_ps(slope[0], horizontalSlopeDelta, simd_cast<__mw>(horizontalSlope0));
		slope[1] = _mmw_blendv_ps(slope[1], _mmw_neg_ps(horizontalSlopeDelta), simd_cast<__mw>(horizontalSlope1));

		__mwi vy[3] = { yDiffi[0], yDiffi[1], yDiffi[0] };
		__mwi offset0 = _mmw_and_epi32(_mmw_add_epi32(yDiffi[0], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), _mmw_set1_epi32((int)((~0u) << FP_BITS)));
		__mwi offset1 = _mmw_and_epi32(_mmw_add_epi32(yDiffi[1], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), _mmw_set1_epi32((int)((~0u) << FP_BITS)));
		vy[0] = _mmw_blendv_epi32(yDiffi[0], offset0, horizontalSlope0);
		vy[1] = _mmw_blendv_epi32(yDiffi[1], offset1, horizontalSlope1);

		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of
		// the edge originating from the middle vertex.
		__mwi slopeSign[3], absEdgeX[3];
		__mwi slopeTileDelta[3], eventStartRemainder[3], slopeTileRemainder[3], eventStart[3];
		for (int i = 0; i < 3; i++)
		{
			// Compute the slope sign (used to propagate the remainder term when overflowing), i.e. whether the
			// edge leans in the positive or negative x-direction
			slopeSign[i] = _mmw_blendv_epi32(_mmw_set1_epi32(1), _mmw_set1_epi32(-1), edgeX[i]);
			absEdgeX[i] = _mmw_abs_epi32(edgeX[i]);

			// Delta and error term for one vertical tile step. The exact delta is exactDelta = edgeX / edgeY, due to limited precision we
			// represent the delta as delta = quotient + remainder / edgeY, where quotient = int(edgeX / edgeY). In this case, since we step
			// one tile of scanlines at a time, the slope is computed for a tile-sized step.
			slopeTileDelta[i] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[i], _mmw_set1_ps(FP_TILE_HEIGHT)));
			slopeTileRemainder[i] = _mmw_sub_epi32(_mmw_slli_epi32(absEdgeX[i], FP_TILE_HEIGHT_SHIFT), _mmw_mullo_epi32(_mmw_abs_epi32(slopeTileDelta[i]), edgeY[i]));

			// Jump to bottom scanline of tile row, this is the bottom of the bounding box, or the middle vertex of the triangle.
			// The jump can be in both positive and negative y-direction due to clipping / offscreen vertices.
			__mwi tileStartDir = _mmw_blendv_epi32(slopeSign[i], _mmw_neg_epi32(slopeSign[i]), vy[i]);
			__mwi tieBreaker = _mmw_blendv_epi32(_mmw_set1_epi32(0), _mmw_set1_epi32(1), tileStartDir);
			__mwi tileStartSlope = _mmw_cvttps_epi32(_mmw_mul_ps(slope[i], _mmw_cvtepi32_ps(_mmw_neg_epi32(vy[i]))));
			__mwi tileStartRemainder = _mmw_sub_epi32(_mmw_mullo_epi32(absEdgeX[i], _mmw_abs_epi32(vy[i])), _mmw_mullo_epi32(_mmw_abs_epi32(tileStartSlope), edgeY[i]));

			eventStartRemainder[i] = _mmw_sub_epi32(tileStartRemainder, tieBreaker);
			__mwi overflow = _mmw_srai_epi32(eventStartRemainder[i], 31);
			eventStartRemainder[i] = _mmw_add_epi32(eventStartRemainder[i], _mmw_and_epi32(overflow, edgeY[i]));
			eventStartRemainder[i] = _mmw_blendv_epi32(eventStartRemainder[i], _mmw_sub_epi32(_mmw_sub_epi32(edgeY[i], eventStartRemainder[i]), _mmw_set1_epi32(1)), vy[i]);

			//eventStart[i] = xDiffi[i & 1] + tileStartSlope + (overflow & tileStartDir) + _mmw_set1_epi32(FP_HALF_PIXEL - 1) + tieBreaker;
			eventStart[i] = _mmw_add_epi32(_mmw_add_epi32(xDiffi[i & 1], tileStartSlope), _mmw_and_epi32(overflow, tileStartDir));
			eventStart[i] = _mmw_add_epi32(_mmw_add_epi32(eventStart[i], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), tieBreaker);
		}
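		// Since the start events and per-tile-row steps above are kept in exact
		// quotient/remainder form, the edge events never drift no matter how many
		// scanlines a triangle spans. That is what PRECISE_COVERAGE buys over the
		// path below, which truncates slopes to FP_BITS of subpixel precision and
		// is cheaper but only approximate on long edges.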
#else // PRECISE_COVERAGE
		SortVertices(pVtxX, pVtxY);

		// Compute edges
		__mw edgeX[3] = { _mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxX[2], pVtxX[1]), _mmw_sub_ps(pVtxX[2], pVtxX[0]) };
		__mw edgeY[3] = { _mmw_sub_ps(pVtxY[1], pVtxY[0]), _mmw_sub_ps(pVtxY[2], pVtxY[1]), _mmw_sub_ps(pVtxY[2], pVtxY[0]) };

		// Classify if the middle vertex is on the left or right and compute its position
		int midVtxRight = ~_mmw_movemask_ps(edgeY[1]);
		__mw midPixelX = _mmw_blendv_ps(pVtxX[1], pVtxX[2], edgeY[1]);
		__mw midPixelY = _mmw_blendv_ps(pVtxY[1], pVtxY[2], edgeY[1]);
		__mwi midTileY = _mmw_srai_epi32(_mmw_max_epi32(_mmw_cvttps_epi32(midPixelY), SIMD_BITS_ZERO), TILE_HEIGHT_SHIFT);
		__mwi bbMidTileY = _mmw_max_epi32(bbTileMinY, _mmw_min_epi32(bbTileMaxY, midTileY));

		//////////////////////////////////////////////////////////////////////////////
		// Edge slope setup - Note we do not conform to DX/GL rasterization rules
		//////////////////////////////////////////////////////////////////////////////

		// Compute floating point slopes
		__mw slope[3];
		slope[0] = _mmw_div_ps(edgeX[0], edgeY[0]);
		slope[1] = _mmw_div_ps(edgeX[1], edgeY[1]);
		slope[2] = _mmw_div_ps(edgeX[2], edgeY[2]);

		// Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen
		// width to mask out all pixels above or below the horizontal edge. We must also add a small bias to account for that
		// vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0
		__mw horizontalSlopeDelta = _mmw_set1_ps((float)mWidth + 2.0f * (GUARD_BAND_PIXEL_SIZE + 1.0f));
		slope[0] = _mmw_blendv_ps(slope[0], horizontalSlopeDelta, _mmw_cmpeq_ps(edgeY[0], _mmw_setzero_ps()));
		slope[1] = _mmw_blendv_ps(slope[1], _mmw_neg_ps(horizontalSlopeDelta), _mmw_cmpeq_ps(edgeY[1], _mmw_setzero_ps()));

		// Convert floating point slopes to fixed point
		__mwi slopeFP[3];
		slopeFP[0] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[0], _mmw_set1_ps(1 << FP_BITS)));
		slopeFP[1] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[1], _mmw_set1_ps(1 << FP_BITS)));
		slopeFP[2] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[2], _mmw_set1_ps(1 << FP_BITS)));

		// Fan out edge slopes to avoid (rare) cracks at vertices. We increase right facing slopes
		// by 1 LSB, which results in overshooting vertices slightly, increasing triangle coverage.
		// e0 is always right facing, e1 depends on if the middle vertex is on the left or right
		slopeFP[0] = _mmw_add_epi32(slopeFP[0], _mmw_set1_epi32(1));
		slopeFP[1] = _mmw_add_epi32(slopeFP[1], _mmw_srli_epi32(_mmw_not_epi32(simd_cast<__mwi>(edgeY[1])), 31));

		// Compute slope deltas for a SIMD_LANES scanline step (tile height)
		__mwi slopeTileDelta[3];
		slopeTileDelta[0] = _mmw_slli_epi32(slopeFP[0], TILE_HEIGHT_SHIFT);
		slopeTileDelta[1] = _mmw_slli_epi32(slopeFP[1], TILE_HEIGHT_SHIFT);
		slopeTileDelta[2] = _mmw_slli_epi32(slopeFP[2], TILE_HEIGHT_SHIFT);

		// Compute edge events for the bottom of the bounding box, or for the middle tile in case of
		// the edge originating from the middle vertex.
		__mwi xDiffi[2], yDiffi[2];
		xDiffi[0] = _mmw_slli_epi32(_mmw_sub_epi32(_mmw_cvttps_epi32(pVtxX[0]), bbPixelMinX), FP_BITS);
		xDiffi[1] = _mmw_slli_epi32(_mmw_sub_epi32(_mmw_cvttps_epi32(midPixelX), bbPixelMinX), FP_BITS);
		yDiffi[0] = _mmw_sub_epi32(_mmw_cvttps_epi32(pVtxY[0]), bbPixelMinY);
		yDiffi[1] = _mmw_sub_epi32(_mmw_cvttps_epi32(midPixelY), _mmw_slli_epi32(bbMidTileY, TILE_HEIGHT_SHIFT));

		__mwi eventStart[3];
		eventStart[0] = _mmw_sub_epi32(xDiffi[0], _mmw_mullo_epi32(slopeFP[0], yDiffi[0]));
		eventStart[1] = _mmw_sub_epi32(xDiffi[1], _mmw_mullo_epi32(slopeFP[1], yDiffi[1]));
		eventStart[2] = _mmw_sub_epi32(xDiffi[0], _mmw_mullo_epi32(slopeFP[2], yDiffi[0]));
#endif
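		// Fixed point events, illustrated (assuming FP_BITS = 8 purely for the
		// example): a slope of 0.5 pixels per scanline becomes slopeFP = 128, an
		// event of 320 decodes to pixel 320 >> 8 = 1, and two scanline steps of
		// +128 move the event to 576, i.e. pixel 2, matching where a 0.5-slope
		// edge actually crosses.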
		//////////////////////////////////////////////////////////////////////////////
		// Split bounding box into bottom - middle - top region.
		//////////////////////////////////////////////////////////////////////////////
		__mwi bbBottomIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(bbTileMinY, _mmw_set1_epi32(mTilesWidth)));
		__mwi bbTopIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(_mmw_add_epi32(bbTileMinY, bbTileSizeY), _mmw_set1_epi32(mTilesWidth)));
		__mwi bbMidIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(midTileY, _mmw_set1_epi32(mTilesWidth)));

		//////////////////////////////////////////////////////////////////////////////
		// Loop over non-culled triangles and change SIMD axis to per-pixel
		//////////////////////////////////////////////////////////////////////////////
		while (triMask)
		{
			unsigned int triIdx = find_clear_lsb(&triMask);
			int triMidVtxRight = (midVtxRight >> triIdx) & 1;

			// Get triangle zMin and zMax
			__mw zTriMax = _mmw_set1_ps(simd_f32(zMax)[triIdx]);
			__mw zTriMin = _mmw_set1_ps(simd_f32(zMin)[triIdx]);

			// Setup Zmin value for first set of 8x4 subtiles
			__mw z0 = _mmw_fmadd_ps(_mmw_set1_ps(simd_f32(zPixelDx)[triIdx]), SIMD_SUB_TILE_COL_OFFSET_F,
				_mmw_fmadd_ps(_mmw_set1_ps(simd_f32(zPixelDy)[triIdx]), SIMD_SUB_TILE_ROW_OFFSET_F, _mmw_set1_ps(simd_f32(zPlaneOffset)[triIdx])));
			float zx = simd_f32(zTileDx)[triIdx];
			float zy = simd_f32(zTileDy)[triIdx];

			// Get dimension of bounding box bottom, mid & top segments
			int bbWidth = simd_i32(bbTileSizeX)[triIdx];
			int bbHeight = simd_i32(bbTileSizeY)[triIdx];
			int tileRowIdx = simd_i32(bbBottomIdx)[triIdx];
			int tileMidRowIdx = simd_i32(bbMidIdx)[triIdx];
			int tileEndRowIdx = simd_i32(bbTopIdx)[triIdx];

			if (bbWidth > BIG_TRIANGLE && bbHeight > BIG_TRIANGLE) // For big triangles we use a more expensive but tighter traversal algorithm
			{
#if PRECISE_COVERAGE != 0
				if (triMidVtxRight)
					cullResult &= RasterizeTriangle<TEST_Z, 1, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
				else
					cullResult &= RasterizeTriangle<TEST_Z, 1, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
#else
				if (triMidVtxRight)
					cullResult &= RasterizeTriangle<TEST_Z, 1, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
				else
					cullResult &= RasterizeTriangle<TEST_Z, 1, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
#endif
			}
			else
			{
#if PRECISE_COVERAGE != 0
				if (triMidVtxRight)
					cullResult &= RasterizeTriangle<TEST_Z, 0, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
				else
					cullResult &= RasterizeTriangle<TEST_Z, 0, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder);
#else
				if (triMidVtxRight)
					cullResult &= RasterizeTriangle<TEST_Z, 0, 1>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
				else
					cullResult &= RasterizeTriangle<TEST_Z, 0, 0>(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy);
#endif
			}

			if (TEST_Z && cullResult == CullingResult::VISIBLE)
				return CullingResult::VISIBLE;
		}

		return cullResult;
	}
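	// Main occluder rendering / occlusion query loop: gathers and transforms a
	// SIMD batch of triangles, clips against the selected planes, projects to
	// screen space, backface culls, and rasterizes the survivors. TEST_Z selects
	// between querying (early out on the first visible triangle) and rendering
	// occluders into the masked hierarchical z buffer.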
	template<int TEST_Z, int FAST_GATHER>
	FORCE_INLINE CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout)
	{
		assert(mMaskedHiZBuffer != nullptr);

		if (TEST_Z)
			STATS_ADD(mStats.mOccludees.mNumProcessedTriangles, nTris);
		else
			STATS_ADD(mStats.mOccluders.mNumProcessedTriangles, nTris);

#if PRECISE_COVERAGE != 0
		int originalRoundingMode = _MM_GET_ROUNDING_MODE();
		_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
#endif

		int clipHead = 0;
		int clipTail = 0;
		__m128 clipTriBuffer[MAX_CLIPPED * 3];
		int cullResult = CullingResult::VIEW_CULLED;

		const unsigned int *inTrisPtr = inTris;
		int numLanes = SIMD_LANES;
		int triIndex = 0;
		while (triIndex < nTris || clipHead != clipTail)
		{
			__mw vtxX[3], vtxY[3], vtxW[3];
			unsigned int triMask = SIMD_ALL_LANES_MASK;

			GatherTransformClip<FAST_GATHER>(clipHead, clipTail, numLanes, nTris, triIndex, vtxX, vtxY, vtxW, inVtx, inTrisPtr, vtxLayout, modelToClipMatrix, clipTriBuffer, triMask, clipPlaneMask);

			if (triMask == 0x0)
				continue;

			//////////////////////////////////////////////////////////////////////////////
			// Project, transform to screen space and perform backface culling. Note
			// that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and
			// z = 1 is near. We must also use a greater than depth test, and in effect
			// everything is reversed compared to regular z implementations.
			//////////////////////////////////////////////////////////////////////////////
			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
#if PRECISE_COVERAGE != 0
			__mwi ipVtxX[3], ipVtxY[3];
			ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
#else
			ProjectVertices(pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
#endif

			// Perform backface test.
			__mw triArea1 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxY[2], pVtxY[0]));
			__mw triArea2 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[0], pVtxX[2]), _mmw_sub_ps(pVtxY[0], pVtxY[1]));
			__mw triArea = _mmw_sub_ps(triArea1, triArea2);
			__mw ccwMask = _mmw_cmpgt_ps(triArea, _mmw_setzero_ps());

#if PRECISE_COVERAGE != 0
			triMask &= CullBackfaces(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
#else
			triMask &= CullBackfaces(pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
#endif

			if (triMask == 0x0)
				continue;

			//////////////////////////////////////////////////////////////////////////////
			// Setup and rasterize a SIMD batch of triangles
			//////////////////////////////////////////////////////////////////////////////
#if PRECISE_COVERAGE != 0
			cullResult &= RasterizeTriangleBatch<TEST_Z>(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, &mFullscreenScissor);
#else
			cullResult &= RasterizeTriangleBatch<TEST_Z>(pVtxX, pVtxY, pVtxZ, triMask, &mFullscreenScissor);
#endif

			if (TEST_Z && cullResult == CullingResult::VISIBLE)
			{
#if PRECISE_COVERAGE != 0
				_MM_SET_ROUNDING_MODE(originalRoundingMode);
#endif
				return CullingResult::VISIBLE;
			}
		}

#if PRECISE_COVERAGE != 0
		_MM_SET_ROUNDING_MODE(originalRoundingMode);
#endif
		return (CullingResult)cullResult;
	}
	CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
	{
		CullingResult retVal;
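		// A stride of 16 with y at offset 4 and w at offset 12 corresponds to a
		// tightly packed (x, y, z, w) float4 vertex, which permits the specialized
		// fast gather path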
		if (vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12)
			retVal = (CullingResult)RenderTriangles<0, 1>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
		else
			retVal = (CullingResult)RenderTriangles<0, 0>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);

#if MOC_RECORDER_ENABLE
		RecordRenderTriangles(inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout, retVal);
#endif
		return retVal;
	}
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Occlusion query functions
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	CullingResult TestTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
	{
		CullingResult retVal;

		if (vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12)
			retVal = (CullingResult)RenderTriangles<1, 1>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
		else
			retVal = (CullingResult)RenderTriangles<1, 0>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);

#if MOC_RECORDER_ENABLE
		{
			std::lock_guard<std::mutex> lock(mRecorderMutex);
			if (mRecorder != nullptr) mRecorder->RecordTestTriangles(retVal, inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout);
		}
#endif
		return retVal;
	}
	CullingResult TestRect(float xmin, float ymin, float xmax, float ymax, float wmin) const override
	{
		STATS_ADD(mStats.mOccludees.mNumProcessedRectangles, 1);
		assert(mMaskedHiZBuffer != nullptr);

		static const __m128i SIMD_TILE_PAD = _mm_setr_epi32(0, TILE_WIDTH, 0, TILE_HEIGHT);
		static const __m128i SIMD_TILE_PAD_MASK = _mm_setr_epi32(~(TILE_WIDTH - 1), ~(TILE_WIDTH - 1), ~(TILE_HEIGHT - 1), ~(TILE_HEIGHT - 1));
		static const __m128i SIMD_SUB_TILE_PAD = _mm_setr_epi32(0, SUB_TILE_WIDTH, 0, SUB_TILE_HEIGHT);
		static const __m128i SIMD_SUB_TILE_PAD_MASK = _mm_setr_epi32(~(SUB_TILE_WIDTH - 1), ~(SUB_TILE_WIDTH - 1), ~(SUB_TILE_HEIGHT - 1), ~(SUB_TILE_HEIGHT - 1));

		//////////////////////////////////////////////////////////////////////////////
		// Compute screen space bounding box and guard for out of bounds
		//////////////////////////////////////////////////////////////////////////////
#if USE_D3D != 0
		__m128 pixelBBox = _mmx_fmadd_ps(_mm_setr_ps(xmin, xmax, ymax, ymin), mIHalfSize, mICenter);
#else
		__m128 pixelBBox = _mmx_fmadd_ps(_mm_setr_ps(xmin, xmax, ymin, ymax), mIHalfSize, mICenter);
#endif
		__m128i pixelBBoxi = _mm_cvttps_epi32(pixelBBox);
		pixelBBoxi = _mmx_max_epi32(_mm_setzero_si128(), _mmx_min_epi32(mIScreenSize, pixelBBoxi));

		//////////////////////////////////////////////////////////////////////////////
		// Pad bounding box to (32xN) tiles. Tile BB is used for looping / traversal
		//////////////////////////////////////////////////////////////////////////////
		__m128i tileBBoxi = _mm_and_si128(_mm_add_epi32(pixelBBoxi, SIMD_TILE_PAD), SIMD_TILE_PAD_MASK);
		int txMin = simd_i32(tileBBoxi)[0] >> TILE_WIDTH_SHIFT;
		int txMax = simd_i32(tileBBoxi)[1] >> TILE_WIDTH_SHIFT;
		int tileRowIdx = (simd_i32(tileBBoxi)[2] >> TILE_HEIGHT_SHIFT) * mTilesWidth;
		int tileRowIdxEnd = (simd_i32(tileBBoxi)[3] >> TILE_HEIGHT_SHIFT) * mTilesWidth;

		if (simd_i32(tileBBoxi)[0] == simd_i32(tileBBoxi)[1] || simd_i32(tileBBoxi)[2] == simd_i32(tileBBoxi)[3])
		{
#if MOC_RECORDER_ENABLE
			{
				std::lock_guard<std::mutex> lock(mRecorderMutex);
				if (mRecorder != nullptr) mRecorder->RecordTestRect(CullingResult::VIEW_CULLED, xmin, ymin, xmax, ymax, wmin);
			}
#endif
			return CullingResult::VIEW_CULLED;
		}

		///////////////////////////////////////////////////////////////////////////////
		// Pad bounding box to (8x4) subtiles. Skip SIMD lanes outside the subtile BB
		///////////////////////////////////////////////////////////////////////////////
		__m128i subTileBBoxi = _mm_and_si128(_mm_add_epi32(pixelBBoxi, SIMD_SUB_TILE_PAD), SIMD_SUB_TILE_PAD_MASK);
		__mwi stxmin = _mmw_set1_epi32(simd_i32(subTileBBoxi)[0] - 1); // - 1 to be able to use GT test
		__mwi stymin = _mmw_set1_epi32(simd_i32(subTileBBoxi)[2] - 1); // - 1 to be able to use GT test
		__mwi stxmax = _mmw_set1_epi32(simd_i32(subTileBBoxi)[1]);
		__mwi stymax = _mmw_set1_epi32(simd_i32(subTileBBoxi)[3]);

		// Setup pixel coordinates used to discard lanes outside subtile BB
		__mwi startPixelX = _mmw_add_epi32(SIMD_SUB_TILE_COL_OFFSET, _mmw_set1_epi32(simd_i32(tileBBoxi)[0]));
		__mwi pixelY = _mmw_add_epi32(SIMD_SUB_TILE_ROW_OFFSET, _mmw_set1_epi32(simd_i32(tileBBoxi)[2]));

		//////////////////////////////////////////////////////////////////////////////
		// Compute z from w. Note that z is reversed order, 0 = far, 1 = near, which
		// means we use a greater than test, so zMax is used to test for visibility.
		//////////////////////////////////////////////////////////////////////////////
		__mw zMax = _mmw_div_ps(_mmw_set1_ps(1.0f), _mmw_set1_ps(wmin));
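		// Worked example: a rectangle whose closest point lies at view distance
		// wmin = 2 gets zMax = 1 / 2 = 0.5; if any traversed subtile stores a
		// conservative zMin <= 0.5 the test below passes and the rectangle is
		// reported visible, otherwise it is occluded.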
		for (;;)
		{
			__mwi pixelX = startPixelX;
			for (int tx = txMin;;)
			{
				STATS_ADD(mStats.mOccludees.mNumTilesTraversed, 1);

				int tileIdx = tileRowIdx + tx;
				assert(tileIdx >= 0 && tileIdx < mTilesWidth * mTilesHeight);

				// Fetch zMin from masked hierarchical Z buffer
#if QUICK_MASK != 0
				__mw zBuf = mMaskedHiZBuffer[tileIdx].mZMin[0];
#else
				__mwi mask = mMaskedHiZBuffer[tileIdx].mMask;
				__mw zMin0 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[0], mMaskedHiZBuffer[tileIdx].mZMin[1], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_set1_epi32(~0))));
				__mw zMin1 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[1], mMaskedHiZBuffer[tileIdx].mZMin[0], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_setzero_epi32())));
				__mw zBuf = _mmw_min_ps(zMin0, zMin1);
#endif
				// Perform conservative greater than test against hierarchical Z buffer (zMax >= zBuf means the subtile is visible)
				__mwi zPass = simd_cast<__mwi>(_mmw_cmpge_ps(zMax, zBuf)); // zPass = zMax >= zBuf ? ~0 : 0

				// Mask out lanes corresponding to subtiles outside the bounding box
				__mwi bboxTestMin = _mmw_and_epi32(_mmw_cmpgt_epi32(pixelX, stxmin), _mmw_cmpgt_epi32(pixelY, stymin));
				__mwi bboxTestMax = _mmw_and_epi32(_mmw_cmpgt_epi32(stxmax, pixelX), _mmw_cmpgt_epi32(stymax, pixelY));
				__mwi boxMask = _mmw_and_epi32(bboxTestMin, bboxTestMax);
				zPass = _mmw_and_epi32(zPass, boxMask);

				// If any subtile passed the conservative z test we can immediately terminate the query
				if (!_mmw_testz_epi32(zPass, zPass))
				{
#if MOC_RECORDER_ENABLE
					{
						std::lock_guard<std::mutex> lock(mRecorderMutex);
						if (mRecorder != nullptr) mRecorder->RecordTestRect(CullingResult::VISIBLE, xmin, ymin, xmax, ymax, wmin);
					}
#endif
					return CullingResult::VISIBLE;
				}

				if (++tx >= txMax)
					break;
				pixelX = _mmw_add_epi32(pixelX, _mmw_set1_epi32(TILE_WIDTH));
			}

			tileRowIdx += mTilesWidth;
			if (tileRowIdx >= tileRowIdxEnd)
				break;
			pixelY = _mmw_add_epi32(pixelY, _mmw_set1_epi32(TILE_HEIGHT));
		}

#if MOC_RECORDER_ENABLE
		{
			std::lock_guard<std::mutex> lock(mRecorderMutex);
			if (mRecorder != nullptr) mRecorder->RecordTestRect(CullingResult::OCCLUDED, xmin, ymin, xmax, ymax, wmin);
		}
#endif
		return CullingResult::OCCLUDED;
	}
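	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Binning functions
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	// Bins triangles into per-bin triangle lists (TriList) according to screen
	// space bounding box overlap. Each bin can later be rasterized independently
	// through RenderTrilist, which allows occluder rendering to be distributed
	// over multiple threads.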
	template<bool FAST_GATHER>
	FORCE_INLINE void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout)
	{
		assert(mMaskedHiZBuffer != nullptr);

#if PRECISE_COVERAGE != 0
		int originalRoundingMode = _MM_GET_ROUNDING_MODE();
		_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
#endif

		STATS_ADD(mStats.mOccluders.mNumProcessedTriangles, nTris);

		int clipHead = 0;
		int clipTail = 0;
		__m128 clipTriBuffer[MAX_CLIPPED * 3];

		const unsigned int *inTrisPtr = inTris;
		int numLanes = SIMD_LANES;
		int triIndex = 0;
		while (triIndex < nTris || clipHead != clipTail)
		{
			unsigned int triMask = SIMD_ALL_LANES_MASK;

			__mw vtxX[3], vtxY[3], vtxW[3];
			GatherTransformClip<FAST_GATHER>(clipHead, clipTail, numLanes, nTris, triIndex, vtxX, vtxY, vtxW, inVtx, inTrisPtr, vtxLayout, modelToClipMatrix, clipTriBuffer, triMask, clipPlaneMask);

			if (triMask == 0x0)
				continue;

			//////////////////////////////////////////////////////////////////////////////
			// Project, transform to screen space and perform backface culling. Note
			// that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and
			// z = 1 is near. We must also use a greater than depth test, and in effect
			// everything is reversed compared to regular z implementations.
			//////////////////////////////////////////////////////////////////////////////
			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
#if PRECISE_COVERAGE != 0
			__mwi ipVtxX[3], ipVtxY[3];
			ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
#else
			ProjectVertices(pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW);
#endif

			// Perform backface test.
			__mw triArea1 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxY[2], pVtxY[0]));
			__mw triArea2 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[0], pVtxX[2]), _mmw_sub_ps(pVtxY[0], pVtxY[1]));
			__mw triArea = _mmw_sub_ps(triArea1, triArea2);
			__mw ccwMask = _mmw_cmpgt_ps(triArea, _mmw_setzero_ps());

#if PRECISE_COVERAGE != 0
			triMask &= CullBackfaces(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
#else
			triMask &= CullBackfaces(pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding);
#endif

			if (triMask == 0x0)
				continue;

			//////////////////////////////////////////////////////////////////////////////
			// Bin triangles
			//////////////////////////////////////////////////////////////////////////////
			unsigned int binWidth;
			unsigned int binHeight;
			ComputeBinWidthHeight(nBinsW, nBinsH, binWidth, binHeight);

			// Compute pixel bounding box
			__mwi bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY;
			ComputeBoundingBox(bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY, pVtxX, pVtxY, &mFullscreenScissor);

			while (triMask)
			{
				unsigned int triIdx = find_clear_lsb(&triMask);

				// Clamp bounding box to bins
				int startX = min(nBinsW - 1, simd_i32(bbPixelMinX)[triIdx] / binWidth);
				int startY = min(nBinsH - 1, simd_i32(bbPixelMinY)[triIdx] / binHeight);
				int endX = min(nBinsW, (simd_i32(bbPixelMaxX)[triIdx] + binWidth - 1) / binWidth);
				int endY = min(nBinsH, (simd_i32(bbPixelMaxY)[triIdx] + binHeight - 1) / binHeight);

				for (int y = startY; y < endY; ++y)
				{
					for (int x = startX; x < endX; ++x)
					{
						int binIdx = x + y * nBinsW;
						unsigned int writeTriIdx = triLists[binIdx].mTriIdx;
						for (int i = 0; i < 3; ++i)
						{
#if PRECISE_COVERAGE != 0
							((int*)triLists[binIdx].mPtr)[i * 3 + writeTriIdx * 9 + 0] = simd_i32(ipVtxX[i])[triIdx];
							((int*)triLists[binIdx].mPtr)[i * 3 + writeTriIdx * 9 + 1] = simd_i32(ipVtxY[i])[triIdx];
#else
							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 0] = simd_f32(pVtxX[i])[triIdx];
							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 1] = simd_f32(pVtxY[i])[triIdx];
#endif
							triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 2] = simd_f32(pVtxZ[i])[triIdx];
						}
						triLists[binIdx].mTriIdx++;
					}
				}
			}
		}

#if PRECISE_COVERAGE != 0
		_MM_SET_ROUNDING_MODE(originalRoundingMode);
#endif
	}
	void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override
	{
		if (vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12)
			BinTriangles<true>(inVtx, inTris, nTris, triLists, nBinsW, nBinsH, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
		else
			BinTriangles<false>(inVtx, inTris, nTris, triLists, nBinsW, nBinsH, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout);
	}
	template<int FAST_GATHER>
	void GatherTransformClip(int &clipHead, int &clipTail, int &numLanes, int nTris, int &triIndex, __mw *vtxX, __mw *vtxY, __mw *vtxW, const float *inVtx, const unsigned int *&inTrisPtr, const VertexLayout &vtxLayout, const float *modelToClipMatrix, __m128 *clipTriBuffer, unsigned int &triMask, ClipPlanes clipPlaneMask)
	{
		//////////////////////////////////////////////////////////////////////////////
		// Assemble triangles from the index list
		//////////////////////////////////////////////////////////////////////////////
		unsigned int triClipMask = SIMD_ALL_LANES_MASK;
		if (clipHead != clipTail)
		{
			int clippedTris = clipHead > clipTail ? clipHead - clipTail : MAX_CLIPPED + clipHead - clipTail;
			clippedTris = min(clippedTris, SIMD_LANES);

#if CLIPPING_PRESERVES_ORDER != 0
			// If preserving order, don't mix clipped and new triangles: handle the clip buffer fully
			// and then continue gathering. This is not as efficient - ideally we would gather at the
			// end (when the clip buffer holds fewer than SIMD_LANES triangles), but that requires
			// more modifications below - something to do in the future.
			numLanes = 0;
#else
			// Fill out SIMD registers by fetching more triangles.
			numLanes = max(0, min(SIMD_LANES - clippedTris, nTris - triIndex));
#endif

			if (numLanes > 0)
			{
				if (FAST_GATHER)
					GatherVerticesFast(vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes);
				else
					GatherVertices(vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes, vtxLayout);

				TransformVerts(vtxX, vtxY, vtxW, modelToClipMatrix);
			}

			for (int clipTri = numLanes; clipTri < numLanes + clippedTris; clipTri++)
			{
				int triIdx = clipTail * 3;
				for (int i = 0; i < 3; i++)
				{
					simd_f32(vtxX[i])[clipTri] = simd_f32(clipTriBuffer[triIdx + i])[0];
					simd_f32(vtxY[i])[clipTri] = simd_f32(clipTriBuffer[triIdx + i])[1];
					simd_f32(vtxW[i])[clipTri] = simd_f32(clipTriBuffer[triIdx + i])[2];
				}
				clipTail = (clipTail + 1) & (MAX_CLIPPED - 1);
			}

			triIndex += numLanes;
			inTrisPtr += numLanes * 3;

			triMask = (1U << (clippedTris + numLanes)) - 1;
			triClipMask = (1U << numLanes) - 1; // Don't re-clip already clipped triangles
		}
		else
		{
			numLanes = min(SIMD_LANES, nTris - triIndex);
			triMask = (1U << numLanes) - 1;
			triClipMask = triMask;

			if (FAST_GATHER)
				GatherVerticesFast(vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes);
			else
				GatherVertices(vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes, vtxLayout);

			TransformVerts(vtxX, vtxY, vtxW, modelToClipMatrix);
			triIndex += SIMD_LANES;
			inTrisPtr += SIMD_LANES * 3;
		}

		//////////////////////////////////////////////////////////////////////////////
		// Clip transformed triangles
		//////////////////////////////////////////////////////////////////////////////
		if (clipPlaneMask != ClipPlanes::CLIP_PLANE_NONE)
			ClipTriangleAndAddToBuffer(vtxX, vtxY, vtxW, clipTriBuffer, clipHead, triMask, triClipMask, clipPlaneMask);
	}
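	// Note: the clipped-triangle queue consumed above is a ring buffer indexed by
	// clipHead / clipTail; the wraparound "& (MAX_CLIPPED - 1)" relies on
	// MAX_CLIPPED being a power of two.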
	void RenderTrilist(const TriList &triList, const ScissorRect *scissor) override
	{
		assert(mMaskedHiZBuffer != nullptr);

		// Setup fullscreen scissor rect as default
		scissor = scissor == nullptr ? &mFullscreenScissor : scissor;

		for (unsigned int i = 0; i < triList.mTriIdx; i += SIMD_LANES)
		{
			//////////////////////////////////////////////////////////////////////////////
			// Fetch triangle vertices
			//////////////////////////////////////////////////////////////////////////////
			unsigned int numLanes = min((unsigned int)SIMD_LANES, triList.mTriIdx - i);
			unsigned int triMask = (1U << numLanes) - 1;

			__mw pVtxX[3], pVtxY[3], pVtxZ[3];
#if PRECISE_COVERAGE != 0
			__mwi ipVtxX[3], ipVtxY[3];
			for (unsigned int l = 0; l < numLanes; ++l)
			{
				unsigned int triIdx = i + l;
				for (int v = 0; v < 3; ++v)
				{
					simd_i32(ipVtxX[v])[l] = ((int*)triList.mPtr)[v * 3 + triIdx * 9 + 0];
					simd_i32(ipVtxY[v])[l] = ((int*)triList.mPtr)[v * 3 + triIdx * 9 + 1];
					simd_f32(pVtxZ[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 2];
				}
			}

			for (int v = 0; v < 3; ++v)
			{
				pVtxX[v] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxX[v]), _mmw_set1_ps(FP_INV));
				pVtxY[v] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxY[v]), _mmw_set1_ps(FP_INV));
			}

			//////////////////////////////////////////////////////////////////////////////
			// Setup and rasterize a SIMD batch of triangles
			//////////////////////////////////////////////////////////////////////////////
			RasterizeTriangleBatch<false>(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, scissor);
#else
			for (unsigned int l = 0; l < numLanes; ++l)
			{
				unsigned int triIdx = i + l;
				for (int v = 0; v < 3; ++v)
				{
					simd_f32(pVtxX[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 0];
					simd_f32(pVtxY[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 1];
					simd_f32(pVtxZ[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 2];
				}
			}

			//////////////////////////////////////////////////////////////////////////////
			// Setup and rasterize a SIMD batch of triangles
			//////////////////////////////////////////////////////////////////////////////
			RasterizeTriangleBatch<false>(pVtxX, pVtxY, pVtxZ, triMask, scissor);
#endif
		}
	}
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Debugging and statistics
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	MaskedOcclusionCulling::Implementation GetImplementation() override
	{
		return gInstructionSet;
	}

	void ComputePixelDepthBuffer(float *depthData, bool flipY) override
	{
		assert(mMaskedHiZBuffer != nullptr);
		for (int y = 0; y < mHeight; y++)
		{
			for (int x = 0; x < mWidth; x++)
			{
				// Compute 32xN tile index (SIMD value offset)
				int tx = x / TILE_WIDTH;
				int ty = y / TILE_HEIGHT;
				int tileIdx = ty * mTilesWidth + tx;

				// Compute 8x4 subtile index (SIMD lane offset)
				int stx = (x % TILE_WIDTH) / SUB_TILE_WIDTH;
				int sty = (y % TILE_HEIGHT) / SUB_TILE_HEIGHT;
				int subTileIdx = sty * 4 + stx;

				// Compute pixel index in subtile (bit index in 32-bit word)
				int px = (x % SUB_TILE_WIDTH);
				int py = (y % SUB_TILE_HEIGHT);
				int bitIdx = py * 8 + px;
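				// Decoding example (TILE_WIDTH = 32, 8x4 subtiles): pixel
				// (x, y) = (41, 2) lands in tile column tx = 1, subtile column
				// stx = (41 % 32) / 8 = 1, and bit bitIdx = 2 * 8 + 1 = 17 of that
				// subtile's 32-bit coverage word.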
				int pixelLayer = (simd_i32(mMaskedHiZBuffer[tileIdx].mMask)[subTileIdx] >> bitIdx) & 1;
				float pixelDepth = simd_f32(mMaskedHiZBuffer[tileIdx].mZMin[pixelLayer])[subTileIdx];

				if (flipY)
					depthData[(mHeight - y - 1) * mWidth + x] = pixelDepth;
				else
					depthData[y * mWidth + x] = pixelDepth;
			}
		}
	}

	OcclusionCullingStatistics GetStatistics() override
	{
		return mStats;
	}
};