VertexLoaderX64.cpp

// Copyright 2015 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "VideoCommon/VertexLoaderX64.h"

#include <array>
#include <cstring>
#include <string>

#include "Common/BitSet.h"
#include "Common/CPUDetect.h"
#include "Common/Common.h"
#include "Common/CommonTypes.h"
#include "Common/Intrinsics.h"
#include "Common/JitRegister.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "VideoCommon/CPMemory.h"
#include "VideoCommon/VertexLoaderManager.h"

using namespace Gen;

static const X64Reg src_reg = ABI_PARAM1;
static const X64Reg dst_reg = ABI_PARAM2;
static const X64Reg scratch1 = RAX;
static const X64Reg scratch2 = ABI_PARAM3;
static const X64Reg scratch3 = ABI_PARAM4;
// The remaining number of vertices to be processed. Starts at count - 1, and the final loop has it
// at 0.
static const X64Reg remaining_reg = R10;
static const X64Reg skipped_reg = R11;
static const X64Reg base_reg = RBX;

static const u8* memory_base_ptr = (u8*)&g_main_cp_state.array_strides;

static OpArg MPIC(const void* ptr, X64Reg scale_reg, int scale = SCALE_1)
{
  return MComplex(base_reg, scale_reg, scale, PtrOffset(ptr, memory_base_ptr));
}

static OpArg MPIC(const void* ptr)
{
  return MDisp(base_reg, PtrOffset(ptr, memory_base_ptr));
}
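
// Note: the generated code receives memory_base_ptr in base_reg (see GenerateVertexLoader and
// RunVertices), so the MPIC() helpers above can address g_main_cp_state and the
// VertexLoaderManager caches as base + offset instead of baking absolute host pointers into the
// emitted instructions.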

VertexLoaderX64::VertexLoaderX64(const TVtxDesc& vtx_desc, const VAT& vtx_att)
    : VertexLoaderBase(vtx_desc, vtx_att)
{
  AllocCodeSpace(4096);
  ClearCodeSpace();
  GenerateVertexLoader();
  WriteProtect(true);

  Common::JitRegister::Register(region, GetCodePtr(), "VertexLoaderX64\nVtx desc: \n{}\nVAT:\n{}",
                                vtx_desc, vtx_att);
}

OpArg VertexLoaderX64::GetVertexAddr(CPArray array, VertexComponentFormat attribute)
{
  OpArg data = MDisp(src_reg, m_src_ofs);
  if (IsIndexed(attribute))
  {
    int bits = attribute == VertexComponentFormat::Index8 ? 8 : 16;
    LoadAndSwap(bits, scratch1, data);
    m_src_ofs += bits / 8;
    if (array == CPArray::Position)
    {
      CMP(bits, R(scratch1), Imm8(-1));
      m_skip_vertex = J_CC(CC_E, Jump::Near);
    }
    IMUL(32, scratch1, MPIC(&g_main_cp_state.array_strides[array]));
    MOV(64, R(scratch2), MPIC(&VertexLoaderManager::cached_arraybases[array]));
    return MRegSum(scratch1, scratch2);
  }
  else
  {
    return data;
  }
}
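
// For indexed attributes, GetVertexAddr emits code that reads an 8- or 16-bit big-endian index
// from the vertex stream, multiplies it by the CP array stride, and adds the cached array base,
// yielding the address of the actual component data. An all-ones position index (0xFF / 0xFFFF)
// is treated as the "skip this vertex" sentinel, which is why only the position array checks for
// it and branches to m_skip_vertex.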

void VertexLoaderX64::ReadVertex(OpArg data, VertexComponentFormat attribute,
                                 ComponentFormat format, int count_in, int count_out,
                                 bool dequantize, u8 scaling_exponent,
                                 AttributeFormat* native_format)
{
  using ShuffleRow = std::array<__m128i, 3>;
  static const Common::EnumMap<ShuffleRow, ComponentFormat::InvalidFloat7> shuffle_lut = {
      ShuffleRow{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF00L),   // 1x u8
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L),   // 2x u8
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L)},  // 3x u8
      ShuffleRow{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00FFFFFFL),   // 1x s8
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL),   // 2x s8
                 _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL)},  // 3x s8
      ShuffleRow{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0001L),   // 1x u16
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L),   // 2x u16
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L)},  // 3x u16
      ShuffleRow{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x0001FFFFL),   // 1x s16
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL),   // 2x s16
                 _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL)},  // 3x s16
      ShuffleRow{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L),   // 1x float
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L),   // 2x float
                 _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)},  // 3x float
      ShuffleRow{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L),   // 1x invalid
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L),   // 2x invalid
                 _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)},  // 3x invalid
      ShuffleRow{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L),   // 1x invalid
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L),   // 2x invalid
                 _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)},  // 3x invalid
      ShuffleRow{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L),   // 1x invalid
                 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L),   // 2x invalid
                 _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)},  // 3x invalid
  };
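
  // Each shuffle_lut entry is a PSHUFB control mask, indexed by [format][component count - 1].
  // PSHUFB picks source bytes by index and writes zero wherever the control byte has its high
  // bit set (the 0xFF entries), so a single shuffle both undoes the big-endian byte order of
  // each component and spreads the components into separate 32-bit lanes. Signed formats land
  // in the high bytes of their lanes so that the PSRAD below completes the sign extension.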

  static const __m128 scale_factors[32] = {
      _mm_set_ps1(1. / (1u << 0)),  _mm_set_ps1(1. / (1u << 1)),  _mm_set_ps1(1. / (1u << 2)),
      _mm_set_ps1(1. / (1u << 3)),  _mm_set_ps1(1. / (1u << 4)),  _mm_set_ps1(1. / (1u << 5)),
      _mm_set_ps1(1. / (1u << 6)),  _mm_set_ps1(1. / (1u << 7)),  _mm_set_ps1(1. / (1u << 8)),
      _mm_set_ps1(1. / (1u << 9)),  _mm_set_ps1(1. / (1u << 10)), _mm_set_ps1(1. / (1u << 11)),
      _mm_set_ps1(1. / (1u << 12)), _mm_set_ps1(1. / (1u << 13)), _mm_set_ps1(1. / (1u << 14)),
      _mm_set_ps1(1. / (1u << 15)), _mm_set_ps1(1. / (1u << 16)), _mm_set_ps1(1. / (1u << 17)),
      _mm_set_ps1(1. / (1u << 18)), _mm_set_ps1(1. / (1u << 19)), _mm_set_ps1(1. / (1u << 20)),
      _mm_set_ps1(1. / (1u << 21)), _mm_set_ps1(1. / (1u << 22)), _mm_set_ps1(1. / (1u << 23)),
      _mm_set_ps1(1. / (1u << 24)), _mm_set_ps1(1. / (1u << 25)), _mm_set_ps1(1. / (1u << 26)),
      _mm_set_ps1(1. / (1u << 27)), _mm_set_ps1(1. / (1u << 28)), _mm_set_ps1(1. / (1u << 29)),
      _mm_set_ps1(1. / (1u << 30)), _mm_set_ps1(1. / (1u << 31)),
  };
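
  // scale_factors[e] broadcasts 1 / 2^e into all four lanes. GX integer components are
  // fixed-point, so dequantization is float(raw) * 2^-frac; e.g. an s16 position with
  // PosFrac = 8 turns a raw 0x0100 (256) into 1.0f via the MULPS below.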

  X64Reg coords = XMM0;
  const auto write_zfreeze = [&]() {  // zfreeze
    if (native_format == &m_native_vtx_decl.position)
    {
      CMP(32, R(remaining_reg), Imm8(3));
      FixupBranch dont_store = J_CC(CC_AE);
      // The position cache is composed of 3 rows of 4 floats each; since each float is 4 bytes,
      // we need a total scale of 16. The addressing mode can scale by at most 8, so we scale by
      // 4 twice: once in the LEA and once more in the MPIC operand.
      LEA(32, scratch3, MScaled(remaining_reg, SCALE_4, 0));
      MOVUPS(MPIC(VertexLoaderManager::position_cache.data(), scratch3, SCALE_4), coords);
      SetJumpTarget(dont_store);
    }
    else if (native_format == &m_native_vtx_decl.normals[0])
    {
      TEST(32, R(remaining_reg), R(remaining_reg));
      FixupBranch dont_store = J_CC(CC_NZ);
      // For similar reasons, the cached normal is 4 floats.
      MOVUPS(MPIC(VertexLoaderManager::normal_cache.data()), coords);
      SetJumpTarget(dont_store);
    }
    else if (native_format == &m_native_vtx_decl.normals[1])
    {
      TEST(32, R(remaining_reg), R(remaining_reg));
      FixupBranch dont_store = J_CC(CC_NZ);
      // For similar reasons, the cached tangent and binormal are 4 floats each.
      MOVUPS(MPIC(VertexLoaderManager::tangent_cache.data()), coords);
      SetJumpTarget(dont_store);
    }
    else if (native_format == &m_native_vtx_decl.normals[2])
    {
      TEST(32, R(remaining_reg), R(remaining_reg));
      FixupBranch dont_store = J_CC(CC_NZ);
      // For similar reasons, the cached tangent and binormal are 4 floats each.
      MOVUPS(MPIC(VertexLoaderManager::binormal_cache.data()), coords);
      SetJumpTarget(dont_store);
    }
  };
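
  // write_zfreeze mirrors data from the end of each batch into VertexLoaderManager's caches:
  // the positions of the last three vertices (indexed by remaining_reg, which counts down to 0)
  // and the normal/tangent/binormal of the last vertex only. The rest of VideoCommon reads
  // these caches (e.g. to compute the zfreeze reference slope) after the raw vertex stream has
  // already been consumed.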

  int elem_size = GetElementSize(format);
  int load_bytes = elem_size * count_in;

  OpArg dest = MDisp(dst_reg, m_dst_ofs);

  native_format->components = count_out;
  native_format->enable = true;
  native_format->offset = m_dst_ofs;
  native_format->type = ComponentFormat::Float;
  native_format->integer = false;
  m_dst_ofs += sizeof(float) * count_out;

  if (attribute == VertexComponentFormat::Direct)
    m_src_ofs += load_bytes;

  if (cpu_info.bSSSE3)
  {
    if (load_bytes > 8)
      MOVDQU(coords, data);
    else if (load_bytes > 4)
      MOVQ_xmm(coords, data);
    else
      MOVD_xmm(coords, data);

    PSHUFB(coords, MPIC(&shuffle_lut[format][count_in - 1]));

    // Sign-extend.
    if (format == ComponentFormat::Byte)
      PSRAD(coords, 24);
    if (format == ComponentFormat::Short)
      PSRAD(coords, 16);
  }
  else
  {
    // SSE2
    X64Reg temp = XMM1;
    switch (format)
    {
    case ComponentFormat::UByte:
      MOVD_xmm(coords, data);
      PXOR(temp, R(temp));
      PUNPCKLBW(coords, R(temp));
      PUNPCKLWD(coords, R(temp));
      break;
    case ComponentFormat::Byte:
      MOVD_xmm(coords, data);
      PUNPCKLBW(coords, R(coords));
      PUNPCKLWD(coords, R(coords));
      PSRAD(coords, 24);
      break;
    case ComponentFormat::UShort:
    case ComponentFormat::Short:
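      // Without SSSE3 the 16-bit components are byte-swapped in a GPR, moved into the XMM
      // register, and shuffled so each value sits in the upper half of its own 32-bit lane;
      // the right shift by 16 below (arithmetic for signed, logical for unsigned) finishes the
      // sign/zero extension.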
      switch (count_in)
      {
      case 1:
        LoadAndSwap(32, scratch3, data);
        MOVD_xmm(coords, R(scratch3));  // ......X.
        break;
      case 2:
        LoadAndSwap(32, scratch3, data);
        MOVD_xmm(coords, R(scratch3));     // ......XY
        PSHUFLW(coords, R(coords), 0x24);  // ....Y.X.
        break;
      case 3:
        LoadAndSwap(64, scratch3, data);
        MOVQ_xmm(coords, R(scratch3));     // ....XYZ.
        PUNPCKLQDQ(coords, R(coords));     // ..Z.XYZ.
        PSHUFLW(coords, R(coords), 0xAC);  // ..Z.Y.X.
        break;
      }
      if (format == ComponentFormat::Short)
        PSRAD(coords, 16);
      else
        PSRLD(coords, 16);
      break;

    case ComponentFormat::Float:
    case ComponentFormat::InvalidFloat5:
    case ComponentFormat::InvalidFloat6:
    case ComponentFormat::InvalidFloat7:
      // Floats don't need to be scaled or converted,
      // so we can just load/swap/store them directly
      // and return early.
      // (In SSSE3 we still need to store them.)
      for (int i = 0; i < count_in; i++)
      {
        LoadAndSwap(32, scratch3, data);
        MOV(32, dest, R(scratch3));
        data.AddMemOffset(sizeof(float));
        dest.AddMemOffset(sizeof(float));

        // zfreeze
        if (native_format == &m_native_vtx_decl.position ||
            native_format == &m_native_vtx_decl.normals[1] ||
            native_format == &m_native_vtx_decl.normals[2])
        {
          if (cpu_info.bSSE4_1)
          {
            PINSRD(coords, R(scratch3), i);
          }
          else
          {
            PINSRW(coords, R(scratch3), 2 * i + 0);
            SHR(32, R(scratch3), Imm8(16));
            PINSRW(coords, R(scratch3), 2 * i + 1);
          }
        }
      }

      write_zfreeze();
      // Return early as described above: the floats have already been stored, and falling
      // through would store again from a coords register that is only populated for zfreeze
      // attributes.
      return;
    }
  }

  if (format < ComponentFormat::Float)
  {
    CVTDQ2PS(coords, R(coords));

    if (dequantize && scaling_exponent)
      MULPS(coords, MPIC(&scale_factors[scaling_exponent]));
  }

  switch (count_out)
  {
  case 1:
    MOVSS(dest, coords);
    break;
  case 2:
    MOVLPS(dest, coords);
    break;
  case 3:
    MOVUPS(dest, coords);
    break;
  }

  write_zfreeze();
}

void VertexLoaderX64::ReadColor(OpArg data, VertexComponentFormat attribute, ColorFormat format)
{
  int load_bytes = 0;
  switch (format)
  {
  case ColorFormat::RGB888:
  case ColorFormat::RGB888x:
  case ColorFormat::RGBA8888:
    MOV(32, R(scratch1), data);
    if (format != ColorFormat::RGBA8888)
      OR(32, R(scratch1), Imm32(0xFF000000));
    MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1));
    load_bytes = format == ColorFormat::RGB888 ? 3 : 4;
    break;

  case ColorFormat::RGB565:
    // RRRRRGGG GGGBBBBB
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
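    // Each channel is widened by replicating its top bits into the new low bits, the usual way
    // to map the full 5/6-bit range onto 0..255 (r8 = (r5 << 3) | (r5 >> 2), g8 = (g6 << 2) |
    // (g6 >> 4)), and alpha is forced to 0xFF. The BMI path uses PDEP to scatter the source
    // bits into the widened byte layout; the fallback builds the same layout with shifts and
    // masks.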
    LoadAndSwap(16, scratch1, data);
    if (cpu_info.bBMI1 && cpu_info.bBMI2FastParallelBitOps)
    {
      MOV(32, R(scratch2), Imm32(0x07C3F7C0));
      PDEP(32, scratch3, scratch1, R(scratch2));
      MOV(32, R(scratch2), Imm32(0xF8FCF800));
      PDEP(32, scratch1, scratch1, R(scratch2));
      ANDN(32, scratch2, scratch2, R(scratch3));
      OR(32, R(scratch1), R(scratch2));
    }
    else
    {
      SHL(32, R(scratch1), Imm8(11));
      LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0));
      LEA(32, scratch3, MScaled(scratch2, SCALE_8, 0));
      AND(32, R(scratch1), Imm32(0x0000F800));
      AND(32, R(scratch2), Imm32(0x00FC0000));
      AND(32, R(scratch3), Imm32(0xF8000000));
      OR(32, R(scratch1), R(scratch2));
      OR(32, R(scratch1), R(scratch3));

      MOV(32, R(scratch2), R(scratch1));
      SHR(32, R(scratch1), Imm8(5));
      AND(32, R(scratch1), Imm32(0x07000700));
      OR(32, R(scratch1), R(scratch2));

      SHR(32, R(scratch2), Imm8(6));
      AND(32, R(scratch2), Imm32(0x00030000));
      OR(32, R(scratch1), R(scratch2));
    }
    OR(32, R(scratch1), Imm32(0x000000FF));
    SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
    load_bytes = 2;
    break;

  case ColorFormat::RGBA4444:
    // RRRRGGGG BBBBAAAA
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
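    // Each 4-bit channel becomes 8 bits by duplicating the nibble (v8 = v4 * 0x11), so 0xF maps
    // to 0xFF. The code first spreads the four nibbles into the low nibble of each output byte
    // (via PDEP or shift/mask), then the common SHL 4 + OR below performs the duplication.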
    LoadAndSwap(16, scratch1, data);
    if (cpu_info.bBMI2FastParallelBitOps)
    {
      MOV(32, R(scratch2), Imm32(0x0F0F0F0F));
      PDEP(32, scratch1, scratch1, R(scratch2));
    }
    else
    {
      MOV(32, R(scratch2), R(scratch1));
      SHL(32, R(scratch1), Imm8(8));
      OR(32, R(scratch1), R(scratch2));
      AND(32, R(scratch1), Imm32(0x00FF00FF));

      MOV(32, R(scratch2), R(scratch1));
      SHL(32, R(scratch1), Imm8(4));
      OR(32, R(scratch1), R(scratch2));
      AND(32, R(scratch1), Imm32(0x0F0F0F0F));
    }
    MOV(32, R(scratch2), R(scratch1));
    SHL(32, R(scratch1), Imm8(4));
    OR(32, R(scratch1), R(scratch2));
    SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
    load_bytes = 2;
    break;

  case ColorFormat::RGBA6666:
    // RRRRRRGG GGGGBBBB BBAAAAAA
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
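    // Each 6-bit channel becomes 8 bits by replicating its top two bits into the new low bits
    // (v8 = (v6 << 2) | (v6 >> 4)); the shared SHR 6 / AND / OR sequence at the end does this
    // once all four channels sit in the top six bits of their output bytes.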
    data.AddMemOffset(-1);  // subtract one from address so we can use a 32bit load and bswap
    LoadAndSwap(32, scratch1, data);
    if (cpu_info.bBMI2FastParallelBitOps)
    {
      MOV(32, R(scratch2), Imm32(0xFCFCFCFC));
      PDEP(32, scratch1, scratch1, R(scratch2));
      MOV(32, R(scratch2), R(scratch1));
    }
    else
    {
      LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0));  // ______RR RRRRGGGG GGBBBBBB AAAAAA__
      AND(32, R(scratch2), Imm32(0x00003FFC));           // ________ ________ __BBBBBB AAAAAA__
      SHL(32, R(scratch1), Imm8(6));                     // __RRRRRR GGGGGGBB BBBBAAAA AA______
      AND(32, R(scratch1), Imm32(0x3FFC0000));           // __RRRRRR GGGGGG__ ________ ________
      OR(32, R(scratch1), R(scratch2));                  // __RRRRRR GGGGGG__ __BBBBBB AAAAAA__

      LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0));  // RRRRRRGG GGGG____ BBBBBBAA AAAA____
      AND(32, R(scratch2), Imm32(0xFC00FC00));           // RRRRRR__ ________ BBBBBB__ ________
      AND(32, R(scratch1), Imm32(0x00FC00FC));           // ________ GGGGGG__ ________ AAAAAA__
      OR(32, R(scratch1), R(scratch2));                  // RRRRRR__ GGGGGG__ BBBBBB__ AAAAAA__

      MOV(32, R(scratch2), R(scratch1));
    }
    SHR(32, R(scratch1), Imm8(6));
    AND(32, R(scratch1), Imm32(0x03030303));
    OR(32, R(scratch1), R(scratch2));
    SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
    load_bytes = 3;
    break;
  }

  if (attribute == VertexComponentFormat::Direct)
    m_src_ofs += load_bytes;
}

void VertexLoaderX64::GenerateVertexLoader()
{
  BitSet32 regs = {src_reg,  dst_reg,       scratch1,    scratch2,
                   scratch3, remaining_reg, skipped_reg, base_reg};
  regs &= ABI_ALL_CALLEE_SAVED;
  regs[RBP] = true;  // Give us a stack frame
  ABI_PushRegistersAndAdjustStack(regs, 0);

  // Backup count since we're going to count it down.
  PUSH(32, R(ABI_PARAM3));

  // ABI_PARAM3 is one of the lower registers, so free it for scratch2.
  // We also have it end at a value of 0, to simplify indexing for zfreeze;
  // this requires subtracting 1 at the start.
  LEA(32, remaining_reg, MDisp(ABI_PARAM3, -1));

  MOV(64, R(base_reg), R(ABI_PARAM4));

  if (IsIndexed(m_VtxDesc.low.Position))
    XOR(32, R(skipped_reg), R(skipped_reg));
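
  // skipped_reg counts vertices whose position index was the skip sentinel; it is subtracted
  // from the original count at the end so the caller only sees vertices that were actually
  // written to the output buffer.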

  // TODO: load constants into registers outside the main loop
  const u8* loop_start = GetCodePtr();
  if (m_VtxDesc.low.PosMatIdx)
  {
    MOVZX(32, 8, scratch1, MDisp(src_reg, m_src_ofs));
    AND(32, R(scratch1), Imm8(0x3F));
    MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1));

    // zfreeze
    CMP(32, R(remaining_reg), Imm8(3));
    FixupBranch dont_store = J_CC(CC_AE);
    MOV(32, MPIC(VertexLoaderManager::position_matrix_index_cache.data(), remaining_reg, SCALE_4),
        R(scratch1));
    SetJumpTarget(dont_store);

    m_native_vtx_decl.posmtx.components = 4;
    m_native_vtx_decl.posmtx.enable = true;
    m_native_vtx_decl.posmtx.offset = m_dst_ofs;
    m_native_vtx_decl.posmtx.type = ComponentFormat::UByte;
    m_native_vtx_decl.posmtx.integer = true;
    m_src_ofs += sizeof(u8);
    m_dst_ofs += sizeof(u32);
  }

  std::array<u32, 8> texmatidx_ofs;
  for (size_t i = 0; i < m_VtxDesc.low.TexMatIdx.Size(); i++)
  {
    if (m_VtxDesc.low.TexMatIdx[i])
      texmatidx_ofs[i] = m_src_ofs++;
  }

  OpArg data = GetVertexAddr(CPArray::Position, m_VtxDesc.low.Position);
  int pos_elements = m_VtxAttr.g0.PosElements == CoordComponentCount::XY ? 2 : 3;
  ReadVertex(data, m_VtxDesc.low.Position, m_VtxAttr.g0.PosFormat, pos_elements, pos_elements,
             m_VtxAttr.g0.ByteDequant, m_VtxAttr.g0.PosFrac, &m_native_vtx_decl.position);

  if (m_VtxDesc.low.Normal != VertexComponentFormat::NotPresent)
  {
    static constexpr Common::EnumMap<u8, ComponentFormat::InvalidFloat7> SCALE_MAP = {7, 6, 15, 14,
                                                                                      0, 0, 0, 0};
    const u8 scaling_exponent = SCALE_MAP[m_VtxAttr.g0.NormalFormat];

    // Normal
    data = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal);
    ReadVertex(data, m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, scaling_exponent,
               &m_native_vtx_decl.normals[0]);

    if (m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB)
    {
      const bool index3 = IsIndexed(m_VtxDesc.low.Normal) && m_VtxAttr.g0.NormalIndex3;
      const int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat);
      const int load_bytes = elem_size * 3;

      // Tangent
      // If in Index3 mode, and indexed components are used, replace the index with a new index.
      if (index3)
        data = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal);

      // The tangent comes after the normal; even in index3 mode, this offset is applied.
      // Note that this is different from adding 1 to the index, as the stride for indices may be
      // different from the size of the tangent itself.
      data.AddMemOffset(load_bytes);

      ReadVertex(data, m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true,
                 scaling_exponent, &m_native_vtx_decl.normals[1]);

      // Undo the offset above so that data points to the normal instead of the tangent.
      // This way, we can add 2 * load_bytes below to always point to the binormal, even if we
      // replace data with a new index (which would point to the normal).
      data.AddMemOffset(-load_bytes);

      // Binormal
      if (index3)
        data = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal);
      data.AddMemOffset(load_bytes * 2);
      ReadVertex(data, m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true,
                 scaling_exponent, &m_native_vtx_decl.normals[2]);
    }
  }

  for (u8 i = 0; i < m_VtxDesc.low.Color.Size(); i++)
  {
    if (m_VtxDesc.low.Color[i] != VertexComponentFormat::NotPresent)
    {
      data = GetVertexAddr(CPArray::Color0 + i, m_VtxDesc.low.Color[i]);
      ReadColor(data, m_VtxDesc.low.Color[i], m_VtxAttr.GetColorFormat(i));
      m_native_vtx_decl.colors[i].components = 4;
      m_native_vtx_decl.colors[i].enable = true;
      m_native_vtx_decl.colors[i].offset = m_dst_ofs;
      m_native_vtx_decl.colors[i].type = ComponentFormat::UByte;
      m_native_vtx_decl.colors[i].integer = false;
      m_dst_ofs += 4;
    }
  }

  for (u8 i = 0; i < m_VtxDesc.high.TexCoord.Size(); i++)
  {
    int elements = m_VtxAttr.GetTexElements(i) == TexComponentCount::ST ? 2 : 1;
    if (m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent)
    {
      data = GetVertexAddr(CPArray::TexCoord0 + i, m_VtxDesc.high.TexCoord[i]);
      u8 scaling_exponent = m_VtxAttr.GetTexFrac(i);
      ReadVertex(data, m_VtxDesc.high.TexCoord[i], m_VtxAttr.GetTexFormat(i), elements,
                 m_VtxDesc.low.TexMatIdx[i] ? 2 : elements, m_VtxAttr.g0.ByteDequant,
                 scaling_exponent, &m_native_vtx_decl.texcoords[i]);
    }
    if (m_VtxDesc.low.TexMatIdx[i])
    {
      m_native_vtx_decl.texcoords[i].components = 3;
      m_native_vtx_decl.texcoords[i].enable = true;
      m_native_vtx_decl.texcoords[i].type = ComponentFormat::Float;
      m_native_vtx_decl.texcoords[i].integer = false;
      MOVZX(64, 8, scratch1, MDisp(src_reg, texmatidx_ofs[i]));
      if (m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent)
      {
        CVTSI2SS(XMM0, R(scratch1));
        MOVSS(MDisp(dst_reg, m_dst_ofs), XMM0);
        m_dst_ofs += sizeof(float);
      }
      else
      {
        m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
        PXOR(XMM0, R(XMM0));
        CVTSI2SS(XMM0, R(scratch1));
        SHUFPS(XMM0, R(XMM0), 0x45);  // 000X -> 0X00
        MOVUPS(MDisp(dst_reg, m_dst_ofs), XMM0);
        m_dst_ofs += sizeof(float) * 3;
      }
    }
  }
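
  // When a per-vertex texture matrix index is present, it is converted to float and written as
  // an extra texcoord component (the third component, or a whole texcoord slot of its own if
  // the texcoord itself is absent), which is how the native vertex format passes the matrix
  // index on to the vertex shader.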

  // Prepare for the next vertex.
  ADD(64, R(dst_reg), Imm32(m_dst_ofs));
  const u8* cont = GetCodePtr();
  ADD(64, R(src_reg), Imm32(m_src_ofs));

  SUB(32, R(remaining_reg), Imm8(1));
  J_CC(CC_AE, loop_start);

  // Get the original count.
  POP(32, R(ABI_RETURN));

  ABI_PopRegistersAndAdjustStack(regs, 0);

  if (IsIndexed(m_VtxDesc.low.Position))
  {
    SUB(32, R(ABI_RETURN), R(skipped_reg));
    RET();
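
    // Vertices whose position index was the sentinel jump here from GetVertexAddr. Jumping to
    // "cont" advances src_reg past the vertex but leaves dst_reg where it was, so the next
    // vertex overwrites the partially written output; skipped_reg records how many vertices to
    // subtract from the returned count.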
    SetJumpTarget(m_skip_vertex);
    ADD(32, R(skipped_reg), Imm8(1));
    JMP(cont);
  }
  else
  {
    RET();
  }

  ASSERT_MSG(VIDEO, m_vertex_size == m_src_ofs,
             "Vertex size from vertex loader ({}) does not match expected vertex size ({})!\nVtx "
             "desc: {:08x} {:08x}\nVtx attr: {:08x} {:08x} {:08x}",
             m_src_ofs, m_vertex_size, m_VtxDesc.low.Hex, m_VtxDesc.high.Hex, m_VtxAttr.g0.Hex,
             m_VtxAttr.g1.Hex, m_VtxAttr.g2.Hex);

  m_native_vtx_decl.stride = m_dst_ofs;
}
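
// The generated code takes (src, dst, count, base) in ABI_PARAM1..4 and returns the number of
// vertices actually written; base must be memory_base_ptr so that the MPIC() operands resolve
// to the correct host addresses.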
int VertexLoaderX64::RunVertices(const u8* src, u8* dst, int count)
{
  m_numLoadedVertices += count;
  return ((int (*)(const u8* src, u8* dst, int count, const void* base))region)(src, dst, count,
                                                                                memory_base_ptr);
}