CPUCullImpl.h

// Copyright 2022 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#if defined(USE_FMA)
#define VECTOR_NAMESPACE CPUCull_FMA
#elif defined(USE_AVX)
#define VECTOR_NAMESPACE CPUCull_AVX
#elif defined(USE_SSE41)
#define VECTOR_NAMESPACE CPUCull_SSE41
#elif defined(USE_SSE3)
#define VECTOR_NAMESPACE CPUCull_SSE3
#elif defined(USE_SSE)
#define VECTOR_NAMESPACE CPUCull_SSE
#elif defined(USE_NEON)
#define VECTOR_NAMESPACE CPUCull_NEON
#elif defined(NO_SIMD)
#define VECTOR_NAMESPACE CPUCull_Scalar
#else
#error This file is meant to be used by CPUCull.cpp only!
#endif
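
// This header is compiled once per instruction set: the including translation unit defines
// exactly one of the USE_* macros (or NO_SIMD) before each inclusion, so every inclusion
// stamps out an independent namespace. A minimal sketch of that usage pattern (assumed from
// the #error above, not spelled out in this file):
//
//   #define USE_AVX
//   #include "CPUCullImpl.h"  // emits the CPUCull_AVX namespace
//   #undef USE_AVX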
#if defined(__GNUC__) && defined(USE_FMA) && !(defined(__AVX__) && defined(__FMA__))
#define ATTR_TARGET __attribute__((target("avx,fma")))
#elif defined(__GNUC__) && defined(USE_AVX) && !defined(__AVX__)
#define ATTR_TARGET __attribute__((target("avx")))
#elif defined(__GNUC__) && defined(USE_SSE41) && !defined(__SSE4_1__)
#define ATTR_TARGET __attribute__((target("sse4.1")))
#elif defined(__GNUC__) && defined(USE_SSE3) && !defined(__SSE3__)
#define ATTR_TARGET __attribute__((target("sse3")))
#else
#define ATTR_TARGET
#endif
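
// On GCC and Clang, __attribute__((target(...))) allows these functions to use instructions
// above the translation unit's baseline (e.g. AVX intrinsics in a build targeting plain
// SSE2) without raising those flags for the whole file; the caller is then presumably
// responsible for dispatching to the right VECTOR_NAMESPACE for the host CPU at runtime.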

namespace VECTOR_NAMESPACE
{
#if defined(USE_SSE)
typedef __m128 Vector;
#elif defined(USE_NEON)
typedef float32x4_t Vector;
#else
struct alignas(16) Vector
{
  float x, y, z, w;
};
#endif

static_assert(sizeof(Vector) == 16);
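
// All three representations share the same 16-byte, four-float layout, which is what lets
// CullTriangle below reinterpret a CPUCull::TransformedVertex directly as a Vector.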

#ifdef USE_NEON
// NEON has no intrinsic that builds a vector from four scalars in register order, so go
// through a temporary array instead.
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vsetr_f32(float x, float y, float z, float w)
{
  float tmp[4] = {x, y, z, w};
  return vld1q_f32(tmp);
}

// In-place unzip: a receives the even lanes of both inputs, b the odd lanes.
ATTR_TARGET DOLPHIN_FORCE_INLINE static void vuzp12q_f32(Vector& a, Vector& b)
{
  Vector tmp = vuzp2q_f32(a, b);
  a = vuzp1q_f32(a, b);
  b = tmp;
}
#endif

#ifdef USE_SSE
// Replicates lane i across all four lanes.
template <int i>
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vector_broadcast(Vector v)
{
  return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
}
#endif

#ifdef USE_AVX
template <int i>
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 vector_broadcast(__m256 v)
{
  return _mm256_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
}
#endif
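
// Note that the YMM overload replicates lane i within each 128-bit half independently
// (_mm256_shuffle_ps never crosses halves), which is exactly what the
// two-vertices-per-register code below relies on.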

#ifdef USE_AVX
// Transposes the 4x4 matrix held in each 128-bit half; since the unpack instructions never
// cross halves, this transposes two matrices at once.
ATTR_TARGET DOLPHIN_FORCE_INLINE static void TransposeYMM(__m256& o0, __m256& o1,  //
                                                          __m256& o2, __m256& o3)
{
  __m256d tmp0 = _mm256_castps_pd(_mm256_unpacklo_ps(o0, o1));
  __m256d tmp1 = _mm256_castps_pd(_mm256_unpacklo_ps(o2, o3));
  __m256d tmp2 = _mm256_castps_pd(_mm256_unpackhi_ps(o0, o1));
  __m256d tmp3 = _mm256_castps_pd(_mm256_unpackhi_ps(o2, o3));
  o0 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp0, tmp1));
  o1 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp0, tmp1));
  o2 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp2, tmp3));
  o3 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp2, tmp3));
}

ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedYMM(const void* source, __m256& o0,
                                                               __m256& o1, __m256& o2, __m256& o3)
{
  const Vector* vsource = static_cast<const Vector*>(source);
  o0 = _mm256_broadcast_ps(&vsource[0]);
  o1 = _mm256_broadcast_ps(&vsource[1]);
  o2 = _mm256_broadcast_ps(&vsource[2]);
  o3 = _mm256_broadcast_ps(&vsource[3]);
  TransposeYMM(o0, o1, o2, o3);
}

// Loads one 3x4 position matrix into each half, without transposing.
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPosYMM(const void* sourcel, const void* sourceh,
                                                        __m256& o0, __m256& o1, __m256& o2)
{
  const Vector* vsourcel = static_cast<const Vector*>(sourcel);
  const Vector* vsourceh = static_cast<const Vector*>(sourceh);
  o0 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[0]), vsourceh[0], 1);
  o1 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[1]), vsourceh[1], 1);
  o2 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[2]), vsourceh[2], 1);
}

// Position matrices are 3x4, so supply the implicit (0, 0, 0, 1) fourth row before
// transposing.
ATTR_TARGET DOLPHIN_FORCE_INLINE static void
LoadTransposedPosYMM(const void* source, __m256& o0, __m256& o1, __m256& o2, __m256& o3)
{
  const Vector* vsource = static_cast<const Vector*>(source);
  o0 = _mm256_broadcast_ps(&vsource[0]);
  o1 = _mm256_broadcast_ps(&vsource[1]);
  o2 = _mm256_broadcast_ps(&vsource[2]);
  o3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f);
  TransposeYMM(o0, o1, o2, o3);
}
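
// After the transposed loads, each YMM register holds one matrix column duplicated into
// both halves, so a single sequence of broadcast-multiply-adds transforms two independent
// vertices (one per half) at a time.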
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 ApplyMatrixYMM(__m256 v, __m256 m0, __m256 m1,
                                                              __m256 m2, __m256 m3)
{
  __m256 output = _mm256_mul_ps(vector_broadcast<0>(v), m0);
#ifdef USE_FMA
  output = _mm256_fmadd_ps(vector_broadcast<1>(v), m1, output);
  output = _mm256_fmadd_ps(vector_broadcast<2>(v), m2, output);
  output = _mm256_fmadd_ps(vector_broadcast<3>(v), m3, output);
#else
  output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(v), m1));
  output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(v), m2));
  output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<3>(v), m3));
#endif
  return output;
}
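
// The per-vertex-matrix path below keeps the position matrix in row-major form (LoadPosYMM
// does not transpose), so the dot products are formed with horizontal adds instead; the
// constant mul3 = (0, 0, 0, 1) row makes the final hadd produce w = 1 in each half.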
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
TransformVertexNoTransposeYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2,  //
                              __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
{
  __m256 mul0 = _mm256_mul_ps(vertex, pos0);
  __m256 mul1 = _mm256_mul_ps(vertex, pos1);
  __m256 mul2 = _mm256_mul_ps(vertex, pos2);
  __m256 mul3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f);
  __m256 output = _mm256_hadd_ps(_mm256_hadd_ps(mul0, mul1), _mm256_hadd_ps(mul2, mul3));
  return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3);
}

template <bool PositionHas3Elems>
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
TransformVertexYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3,  //
                   __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
{
  __m256 output = pos3;  // vertex.w is always 1.0
#ifdef USE_FMA
  output = _mm256_fmadd_ps(vector_broadcast<0>(vertex), pos0, output);
  output = _mm256_fmadd_ps(vector_broadcast<1>(vertex), pos1, output);
  if constexpr (PositionHas3Elems)
    output = _mm256_fmadd_ps(vector_broadcast<2>(vertex), pos2, output);
#else
  output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<0>(vertex), pos0));
  output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(vertex), pos1));
  if constexpr (PositionHas3Elems)
    output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(vertex), pos2));
#endif
  return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3);
}

template <bool PositionHas3Elems, bool PerVertexPosMtx>
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
LoadTransform2Vertices(const u8* v0data, const u8* v1data,  //
                       __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3,  //
                       __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
{
  __m256 v01;
  if constexpr (PerVertexPosMtx)
  {
    // Vertex data layout always starts with posmtx data if available, then position data.
    // Conveniently, that means the offsets are always fixed.
    u32 v0idx = v0data[0] & 0x3f;
    u32 v1idx = v1data[0] & 0x3f;
    v0data += sizeof(u32);
    v1data += sizeof(u32);
    const float* v0fdata = reinterpret_cast<const float*>(v0data);
    const float* v1fdata = reinterpret_cast<const float*>(v1data);
    LoadPosYMM(&xfmem.posMatrices[v0idx * 4], &xfmem.posMatrices[v1idx * 4], pos0, pos1, pos2);
    if constexpr (PositionHas3Elems)
    {
      // Blend mask 0x88 overwrites lane 3 of each half, forcing w = 1.0.
      __m256 base = _mm256_set1_ps(1.0f);
      v01 = _mm256_blend_ps(_mm256_loadu2_m128(v1fdata, v0fdata), base, 0x88);
    }
    else
    {
      // Build (x, y, 0, 1) in each half from the two-float positions.
      __m256 base = _mm256_unpacklo_ps(_mm256_setzero_ps(), _mm256_set1_ps(1.0f));
      __m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast<const double*>(v1data)));
      __m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(v0data));
      v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30);
      v01 = _mm256_blend_ps(v01, base, 0xcc);
    }
    v01 = TransformVertexNoTransposeYMM(v01, pos0, pos1, pos2, proj0, proj1, proj2, proj3);
  }
  else
  {
    const float* v0fdata = reinterpret_cast<const float*>(v0data);
    const float* v1fdata = reinterpret_cast<const float*>(v1data);
    if constexpr (PositionHas3Elems)
    {
      v01 = _mm256_loadu2_m128(v1fdata, v0fdata);
    }
    else
    {
      __m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast<const double*>(v1data)));
      __m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(v0data));
      v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30);
    }
#ifdef __clang__
    // Clang's optimizer sees TransformVertexYMM doing broadcasts and decides to broadcast
    // *before* combining v0 and v1, so that it can use vbroadcastss. Prevent it from
    // "optimizing" here.
    asm("" : "+x"(v01)::);
#endif
    v01 = TransformVertexYMM<PositionHas3Elems>(v01, pos0, pos1, pos2, pos3,  //
                                                proj0, proj1, proj2, proj3);
  }
  return v01;
}
#endif

#ifndef USE_AVX
// Note: Assumes a 16-byte-aligned source.
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposed(const void* source, Vector& o0,
                                                            Vector& o1, Vector& o2, Vector& o3)
{
#if defined(USE_SSE)
  const Vector* vsource = static_cast<const Vector*>(source);
  o0 = vsource[0];
  o1 = vsource[1];
  o2 = vsource[2];
  o3 = vsource[3];
  _MM_TRANSPOSE4_PS(o0, o1, o2, o3);
#elif defined(USE_NEON)
  // vld4q_f32 deinterleaves as it loads, which is exactly a 4x4 transpose.
  float32x4x4_t ld = vld4q_f32(static_cast<const float*>(source));
  o0 = ld.val[0];
  o1 = ld.val[1];
  o2 = ld.val[2];
  o3 = ld.val[3];
#else
  const Vector* vsource = static_cast<const Vector*>(source);
  // clang-format off
  o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = vsource[3].x;
  o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = vsource[3].y;
  o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = vsource[3].z;
  o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = vsource[3].w;
  // clang-format on
#endif
}

// Loads a 3x4 position matrix transposed, supplying the implicit (0, 0, 0, 1) fourth row.
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedPos(const void* source, Vector& o0,
                                                               Vector& o1, Vector& o2, Vector& o3)
{
  const Vector* vsource = static_cast<const Vector*>(source);
#if defined(USE_SSE)
  o0 = vsource[0];
  o1 = vsource[1];
  o2 = vsource[2];
  o3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
  _MM_TRANSPOSE4_PS(o0, o1, o2, o3);
#elif defined(USE_NEON)
  float32x4x2_t ld01 = vld2q_f32(static_cast<const float*>(source));
  o0 = ld01.val[0];
  o1 = ld01.val[1];
  o2 = vsource[2];
  o3 = vsetr_f32(0.0f, 0.0f, 0.0f, 1.0f);
  vuzp12q_f32(o2, o3);
  vuzp12q_f32(o0, o2);
  vuzp12q_f32(o1, o3);
#else
  // clang-format off
  o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = 0.0f;
  o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = 0.0f;
  o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = 0.0f;
  o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = 1.0f;
  // clang-format on
#endif
}
#endif

#ifndef USE_NEON
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPos(const void* source,  //
                                                     Vector& o0, Vector& o1, Vector& o2)
{
  const Vector* vsource = static_cast<const Vector*>(source);
  o0 = vsource[0];
  o1 = vsource[1];
  o2 = vsource[2];
}
#endif

ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector ApplyMatrix(Vector v, Vector m0, Vector m1,
                                                           Vector m2, Vector m3)
{
#if defined(USE_SSE)
  Vector output = _mm_mul_ps(vector_broadcast<0>(v), m0);
#ifdef USE_FMA
  output = _mm_fmadd_ps(vector_broadcast<1>(v), m1, output);
  output = _mm_fmadd_ps(vector_broadcast<2>(v), m2, output);
  output = _mm_fmadd_ps(vector_broadcast<3>(v), m3, output);
#else
  output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(v), m1));
  output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(v), m2));
  output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<3>(v), m3));
#endif
  return output;
#elif defined(USE_NEON)
  Vector output = vmulq_laneq_f32(m0, v, 0);
  output = vfmaq_laneq_f32(output, m1, v, 1);
  output = vfmaq_laneq_f32(output, m2, v, 2);
  output = vfmaq_laneq_f32(output, m3, v, 3);
  return output;
#else
  Vector output;
  output.x = v.x * m0.x + v.y * m1.x + v.z * m2.x + v.w * m3.x;
  output.y = v.x * m0.y + v.y * m1.y + v.z * m2.y + v.w * m3.y;
  output.z = v.x * m0.z + v.y * m1.z + v.z * m2.z + v.w * m3.z;
  output.w = v.x * m0.w + v.y * m1.w + v.z * m2.w + v.w * m3.w;
  return output;
#endif
}
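
// In every variant, ApplyMatrix computes v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3, treating
// m0..m3 as the columns of the matrix; this is why the matrices are loaded transposed.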

#ifndef USE_NEON
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
TransformVertexNoTranspose(Vector vertex, Vector pos0, Vector pos1, Vector pos2,  //
                           Vector proj0, Vector proj1, Vector proj2, Vector proj3)
{
#ifdef USE_SSE
  Vector mul0 = _mm_mul_ps(vertex, pos0);
  Vector mul1 = _mm_mul_ps(vertex, pos1);
  Vector mul2 = _mm_mul_ps(vertex, pos2);
  Vector mul3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
#ifdef USE_SSE3
  Vector output = _mm_hadd_ps(_mm_hadd_ps(mul0, mul1), _mm_hadd_ps(mul2, mul3));
#else
  // Pre-SSE3 has no horizontal add, so sum each vector with an unpack/add reduction tree.
  Vector t0 = _mm_add_ps(_mm_unpacklo_ps(mul0, mul2), _mm_unpackhi_ps(mul0, mul2));
  Vector t1 = _mm_add_ps(_mm_unpacklo_ps(mul1, mul3), _mm_unpackhi_ps(mul1, mul3));
  Vector output = _mm_add_ps(_mm_unpacklo_ps(t0, t1), _mm_unpackhi_ps(t0, t1));
#endif
#else
  Vector output;
  output.x = vertex.x * pos0.x + vertex.y * pos0.y + vertex.z * pos0.z + vertex.w * pos0.w;
  output.y = vertex.x * pos1.x + vertex.y * pos1.y + vertex.z * pos1.z + vertex.w * pos1.w;
  output.z = vertex.x * pos2.x + vertex.y * pos2.y + vertex.z * pos2.z + vertex.w * pos2.w;
  output.w = 1.0f;
#endif
  output = ApplyMatrix(output, proj0, proj1, proj2, proj3);
  return output;
}
#endif

template <bool PositionHas3Elems>
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
TransformVertex(Vector vertex, Vector pos0, Vector pos1, Vector pos2, Vector pos3,  //
                Vector proj0, Vector proj1, Vector proj2, Vector proj3)
{
  Vector output = pos3;  // vertex.w is always 1.0
#if defined(USE_FMA)
  output = _mm_fmadd_ps(vector_broadcast<0>(vertex), pos0, output);
  output = _mm_fmadd_ps(vector_broadcast<1>(vertex), pos1, output);
  if constexpr (PositionHas3Elems)
    output = _mm_fmadd_ps(vector_broadcast<2>(vertex), pos2, output);
#elif defined(USE_SSE)
  output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<0>(vertex), pos0));
  output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(vertex), pos1));
  if constexpr (PositionHas3Elems)
    output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(vertex), pos2));
#elif defined(USE_NEON)
  output = vfmaq_laneq_f32(output, pos0, vertex, 0);
  output = vfmaq_laneq_f32(output, pos1, vertex, 1);
  if constexpr (PositionHas3Elems)
    output = vfmaq_laneq_f32(output, pos2, vertex, 2);
#else
  output.x += vertex.x * pos0.x + vertex.y * pos1.x;
  output.y += vertex.x * pos0.y + vertex.y * pos1.y;
  output.z += vertex.x * pos0.z + vertex.y * pos1.z;
  if constexpr (PositionHas3Elems)
  {
    output.x += vertex.z * pos2.x;
    output.y += vertex.z * pos2.y;
    output.z += vertex.z * pos2.z;
  }
#endif
  output = ApplyMatrix(output, proj0, proj1, proj2, proj3);
  return output;
}

template <bool PositionHas3Elems, bool PerVertexPosMtx>
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
LoadTransformVertex(const u8* data, Vector pos0, Vector pos1, Vector pos2, Vector pos3,
                    Vector proj0, Vector proj1, Vector proj2, Vector proj3)
{
  Vector vertex;
  if constexpr (PerVertexPosMtx)
  {
    // Vertex data layout always starts with posmtx data if available, then position data.
    // Conveniently, that means the offsets are always fixed.
    u32 idx = data[0] & 0x3f;
    data += sizeof(u32);
    const float* fdata = reinterpret_cast<const float*>(data);
#ifdef USE_NEON
    LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
    if constexpr (PositionHas3Elems)
    {
      vertex = vld1q_f32(fdata);
    }
    else
    {
      vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f));
    }
    vertex = TransformVertex<PositionHas3Elems>(vertex, pos0, pos1, pos2, pos3,  //
                                                proj0, proj1, proj2, proj3);
#else
    LoadPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2);
    if constexpr (PositionHas3Elems)
    {
#if defined(USE_SSE)
#ifdef USE_SSE41
      Vector base = _mm_set1_ps(1.0f);
      vertex = _mm_blend_ps(_mm_loadu_ps(fdata), base, 8);
#else
      // Without SSE4.1's blend, mask off the loaded w and OR in 1.0 instead.
      Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
      Vector mask = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0));
      vertex = _mm_or_ps(_mm_and_ps(_mm_loadu_ps(fdata), mask), base);
#endif
#else
      vertex.x = fdata[0];
      vertex.y = fdata[1];
      vertex.z = fdata[2];
      vertex.w = 1.0f;
#endif
    }
    else
    {
#if defined(USE_SSE)
      Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
      vertex = _mm_loadl_pi(base, reinterpret_cast<const __m64*>(fdata));
#else
      vertex.x = fdata[0];
      vertex.y = fdata[1];
      vertex.z = 0.0f;
      vertex.w = 1.0f;
#endif
    }
    vertex = TransformVertexNoTranspose(vertex, pos0, pos1, pos2, proj0, proj1, proj2, proj3);
#endif
  }
  else
  {
    const float* fdata = reinterpret_cast<const float*>(data);
    if constexpr (PositionHas3Elems)
    {
#if defined(USE_SSE)
      vertex = _mm_loadu_ps(fdata);
#elif defined(USE_NEON)
      vertex = vld1q_f32(fdata);
#else
      vertex.x = fdata[0];
      vertex.y = fdata[1];
      vertex.z = fdata[2];
      vertex.w = 1.0f;
#endif
    }
    else
    {
#if defined(USE_SSE)
      vertex = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(fdata));
#elif defined(USE_NEON)
      vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f));
#else
      vertex.x = fdata[0];
      vertex.y = fdata[1];
      vertex.z = 0.0f;
      vertex.w = 1.0f;
#endif
    }
    vertex = TransformVertex<PositionHas3Elems>(vertex, pos0, pos1, pos2, pos3,  //
                                                proj0, proj1, proj2, proj3);
  }
  return vertex;
}

template <bool PositionHas3Elems, bool PerVertexPosMtx>
ATTR_TARGET static void TransformVertices(void* output, const void* vertices, u32 stride, int count)
{
  const VertexShaderManager& vsmanager = Core::System::GetInstance().GetVertexShaderManager();
  const u8* cvertices = static_cast<const u8*>(vertices);
  Vector* voutput = static_cast<Vector*>(output);
  u32 idx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 0x3f;
#ifdef USE_AVX
  __m256 proj0, proj1, proj2, proj3;
  __m256 pos0, pos1, pos2, pos3;
  LoadTransposedYMM(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3);
  LoadTransposedPosYMM(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
  // Transform two vertices per iteration.
  for (int i = 1; i < count; i += 2)
  {
    const u8* v0data = cvertices;
    const u8* v1data = cvertices + stride;
    __m256 v01 = LoadTransform2Vertices<PositionHas3Elems, PerVertexPosMtx>(
        v0data, v1data, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3);
    _mm256_store_ps(reinterpret_cast<float*>(voutput), v01);
    cvertices += stride * 2;
    voutput += 2;
  }
  // Handle the odd vertex, if any, with the 128-bit path.
  if (count & 1)
  {
    *voutput = LoadTransformVertex<PositionHas3Elems, PerVertexPosMtx>(
        cvertices,  //
        _mm256_castps256_ps128(pos0), _mm256_castps256_ps128(pos1),  //
        _mm256_castps256_ps128(pos2), _mm256_castps256_ps128(pos3),  //
        _mm256_castps256_ps128(proj0), _mm256_castps256_ps128(proj1),  //
        _mm256_castps256_ps128(proj2), _mm256_castps256_ps128(proj3));
  }
#else
  Vector proj0, proj1, proj2, proj3;
  Vector pos0, pos1, pos2, pos3;
  LoadTransposed(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3);
  LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
  for (int i = 0; i < count; i++)
  {
    *voutput = LoadTransformVertex<PositionHas3Elems, PerVertexPosMtx>(
        cvertices, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3);
    cvertices += stride;
    voutput += 1;
  }
#endif
}
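
// The backface test below evaluates the determinant
//   | a.x  a.y  a.w |
//   | b.x  b.y  b.w |
//   | c.x  c.y  c.w |
// of the clip-space coordinates; its sign gives the triangle's winding after the
// perspective divide. The SIMD paths compute the same expression as the scalar fallback.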
template <CullMode Mode>
ATTR_TARGET DOLPHIN_FORCE_INLINE static bool CullTriangle(const CPUCull::TransformedVertex& a,
                                                          const CPUCull::TransformedVertex& b,
                                                          const CPUCull::TransformedVertex& c)
{
  if (Mode == CullMode::All)
    return true;

  Vector va = reinterpret_cast<const Vector&>(a);
  Vector vb = reinterpret_cast<const Vector&>(b);
  Vector vc = reinterpret_cast<const Vector&>(c);

  // See videosoftware Clipper.cpp
#if defined(USE_SSE)
  Vector wxzya = _mm_shuffle_ps(va, va, _MM_SHUFFLE(1, 2, 0, 3));
  Vector wxzyc = _mm_shuffle_ps(vc, vc, _MM_SHUFFLE(1, 2, 0, 3));
  Vector ywzxb = _mm_shuffle_ps(vb, vb, _MM_SHUFFLE(0, 2, 3, 1));
  Vector part0 = _mm_mul_ps(va, wxzyc);
  Vector part1 = _mm_mul_ps(vc, wxzya);
  Vector part2 = _mm_mul_ps(_mm_sub_ps(part0, part1), ywzxb);
#ifdef USE_SSE3
  Vector part3 = _mm_movehdup_ps(part2);
#else
  Vector part3 = vector_broadcast<1>(part2);
#endif
  Vector part4 = vector_broadcast<3>(part2);
  Vector part5 = _mm_add_ss(_mm_add_ss(part2, part3), part4);
  float normal_z_dir;
  _mm_store_ss(&normal_z_dir, part5);
#elif defined(USE_NEON)
  Vector zero = vdupq_n_f32(0.0f);
  Vector wx0ya = vextq_f32(va, vzip1q_f32(va, zero), 3);
  Vector wx0yc = vextq_f32(vc, vzip1q_f32(vc, zero), 3);
  Vector ywxxb = vuzp2q_f32(vb, vdupq_laneq_f32(vb, 0));
  Vector part0 = vmulq_f32(va, wx0yc);
  Vector part1 = vmulq_f32(vc, wx0ya);
  Vector part2 = vmulq_f32(vsubq_f32(part0, part1), ywxxb);
  float normal_z_dir = vaddvq_f32(part2);
#else
  float normal_z_dir = (c.w * a.x - a.w * c.x) * b.y +  //
                       (c.x * a.y - a.x * c.y) * b.w +  //
                       (c.y * a.w - a.y * c.w) * b.x;
#endif
  bool cull = false;
  switch (Mode)
  {
  case CullMode::None:
    cull = normal_z_dir == 0;
    break;
  case CullMode::Front:
    cull = normal_z_dir <= 0;
    break;
  case CullMode::Back:
    cull = normal_z_dir >= 0;
    break;
  case CullMode::All:
    cull = true;
    break;
  }
  if (cull)
    return true;

  // Also cull the triangle if all three vertices lie outside the same x or y clip plane,
  // since such a triangle can never intersect the viewport.
#if defined(USE_SSE)
  Vector xyab = _mm_unpacklo_ps(va, vb);
  Vector zwab = _mm_unpackhi_ps(va, vb);
  Vector allx = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(0, 0, 1, 0));
  Vector ally = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(1, 1, 3, 2));
  Vector allpw = _mm_shuffle_ps(zwab, vc, _MM_SHUFFLE(3, 3, 3, 2));
  Vector allnw = _mm_xor_ps(allpw, _mm_set1_ps(-0.0f));
  __m128i x_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, allx));
  __m128i y_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, ally));
  __m128i x_lt_nw = _mm_castps_si128(_mm_cmplt_ps(allx, allnw));
  __m128i y_lt_nw = _mm_castps_si128(_mm_cmplt_ps(ally, allnw));
  __m128i any_out_of_bounds = _mm_packs_epi16(_mm_packs_epi32(x_lt_nw, y_lt_nw),  //
                                              _mm_packs_epi32(x_gt_pw, y_gt_pw));
  cull |= 0 != _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_set1_epi32(~0), any_out_of_bounds));
#elif defined(USE_NEON)
  float64x2_t xyab = vreinterpretq_f64_f32(vzip1q_f32(va, vb));
  float64x2_t xycc = vreinterpretq_f64_f32(vzip1q_f32(vc, vc));
  float32x4_t allx = vreinterpretq_f32_f64(vzip1q_f64(xyab, xycc));
  float32x4_t ally = vreinterpretq_f32_f64(vzip2q_f64(xyab, xycc));
  float32x4_t allpw = vextq_f32(vzip2q_f32(va, vb), vdupq_laneq_f32(vc, 3), 2);
  float32x4_t allnw = vnegq_f32(allpw);
  uint16x8_t x_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(allx, allpw));
  uint16x8_t y_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(ally, allpw));
  uint16x8_t x_lt_nw = vreinterpretq_u16_u32(vcltq_f32(allx, allnw));
  uint16x8_t y_lt_nw = vreinterpretq_u16_u32(vcltq_f32(ally, allnw));
  uint8x16_t lt_nw = vreinterpretq_u8_u16(vuzp1q_u16(x_lt_nw, y_lt_nw));
  uint8x16_t gt_pw = vreinterpretq_u8_u16(vuzp1q_u16(x_gt_pw, y_gt_pw));
  uint32x4_t any_out_of_bounds = vreinterpretq_u32_u8(vuzp1q_u8(lt_nw, gt_pw));
  cull |= 0xFFFFFFFF == vmaxvq_u32(any_out_of_bounds);
#else
  cull |= a.x < -a.w && b.x < -b.w && c.x < -c.w;
  cull |= a.y < -a.w && b.y < -b.w && c.y < -c.w;
  cull |= a.x > a.w && b.x > b.w && c.x > c.w;
  cull |= a.y > a.w && b.y > b.w && c.y > c.w;
#endif
  return cull;
}
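
// Decomposes each primitive type into the triangles the hardware would draw (quads as two
// triangles, strips with alternating winding, fans around vertex 0) and reports whether
// every resulting triangle is culled.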
template <OpcodeDecoder::Primitive Primitive, CullMode Mode>
ATTR_TARGET static bool AreAllVerticesCulled(const CPUCull::TransformedVertex* transformed,
                                             int count)
{
  switch (Primitive)
  {
  case OpcodeDecoder::Primitive::GX_DRAW_QUADS:
  case OpcodeDecoder::Primitive::GX_DRAW_QUADS_2:
  {
    int i = 3;
    for (; i < count; i += 4)
    {
      if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 2], transformed[i - 1]))
        return false;
      if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 1], transformed[i - 0]))
        return false;
    }
    // Three vertices remaining, so render a triangle.
    if (i == count)
    {
      if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 2], transformed[i - 1]))
        return false;
    }
    break;
  }
  case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLES:
    for (int i = 2; i < count; i += 3)
    {
      if (!CullTriangle<Mode>(transformed[i - 2], transformed[i - 1], transformed[i - 0]))
        return false;
    }
    break;
  case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_STRIP:
  {
    bool wind = false;
    for (int i = 2; i < count; ++i)
    {
      if (!CullTriangle<Mode>(transformed[i - 2], transformed[i - !wind], transformed[i - wind]))
        return false;
      wind = !wind;
    }
    break;
  }
  case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN:
    for (int i = 2; i < count; ++i)
    {
      if (!CullTriangle<Mode>(transformed[0], transformed[i - 1], transformed[i]))
        return false;
    }
    break;
  }
  return true;
}
}  // namespace VECTOR_NAMESPACE

#undef ATTR_TARGET
#undef VECTOR_NAMESPACE