// Copyright 2015 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "VideoCommon/VertexLoaderARM64.h"

#include <array>

#include "Common/CommonTypes.h"
#include "VideoCommon/CPMemory.h"
#include "VideoCommon/VertexLoaderManager.h"

using namespace Arm64Gen;

constexpr ARM64Reg src_reg = ARM64Reg::X0;
constexpr ARM64Reg dst_reg = ARM64Reg::X1;
constexpr ARM64Reg remaining_reg = ARM64Reg::W2;
constexpr ARM64Reg skipped_reg = ARM64Reg::W17;
constexpr ARM64Reg scratch1_reg = ARM64Reg::W16;
constexpr ARM64Reg scratch2_reg = ARM64Reg::W15;
constexpr ARM64Reg scratch3_reg = ARM64Reg::W14;
constexpr ARM64Reg saved_count = ARM64Reg::W12;
constexpr ARM64Reg stride_reg = ARM64Reg::X11;
constexpr ARM64Reg arraybase_reg = ARM64Reg::X10;
constexpr ARM64Reg scale_reg = ARM64Reg::X9;

static constexpr int GetLoadSize(int load_bytes)
{
  if (load_bytes == 1)
    return 1;
  else if (load_bytes <= 2)
    return 2;
  else if (load_bytes <= 4)
    return 4;
  else if (load_bytes <= 8)
    return 8;
  else
    return 16;
}
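
// GetLoadSize() rounds a direct attribute's byte count up to the smallest
// power-of-two width (1/2/4/8/16 bytes) that a single emitted NEON load can
// cover, e.g. GetLoadSize(6) == 8 for a three-component 16-bit attribute.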

alignas(16) static const float scale_factors[] = {
    1.0 / (1ULL << 0),  1.0 / (1ULL << 1),  1.0 / (1ULL << 2),  1.0 / (1ULL << 3),
    1.0 / (1ULL << 4),  1.0 / (1ULL << 5),  1.0 / (1ULL << 6),  1.0 / (1ULL << 7),
    1.0 / (1ULL << 8),  1.0 / (1ULL << 9),  1.0 / (1ULL << 10), 1.0 / (1ULL << 11),
    1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15),
    1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19),
    1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23),
    1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27),
    1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31),
};
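
// scale_factors is a lookup table of 1 / 2^n used to dequantize fixed-point
// vertex data: the generated code loads scale_factors[frac] into a NEON
// register and multiplies the converted floats by it. For example, with a
// fraction of 8, a raw integer value of 256 becomes 256 * (1 / 256) = 1.0f.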

VertexLoaderARM64::VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_att)
    : VertexLoaderBase(vtx_desc, vtx_att), m_float_emit(this)
{
  AllocCodeSpace(4096);
  const Common::ScopedJITPageWriteAndNoExecute enable_jit_page_writes;
  ClearCodeSpace();
  GenerateVertexLoader();
  WriteProtect(true);
}
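
// The loader is JIT-compiled once, at construction time: a 4 KB code region is
// allocated, made writable while GenerateVertexLoader() runs (via the scoped
// JIT page-write guard), and write-protected again before it is executed.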

// Returns the register to use as the base and an offset from that register.
// For indexed attributes, the index is read into scratch1_reg, and then scratch1_reg with no offset
// is returned. For direct attributes, an offset from src_reg is returned.
std::pair<Arm64Gen::ARM64Reg, u32> VertexLoaderARM64::GetVertexAddr(CPArray array,
                                                                    VertexComponentFormat attribute)
{
  if (IsIndexed(attribute))
  {
    if (attribute == VertexComponentFormat::Index8)
    {
      LDURB(scratch1_reg, src_reg, m_src_ofs);
      m_src_ofs += 1;
    }
    else  // Index16
    {
      LDURH(scratch1_reg, src_reg, m_src_ofs);
      m_src_ofs += 2;
      REV16(scratch1_reg, scratch1_reg);
    }
    if (array == CPArray::Position)
    {
      EOR(scratch2_reg, scratch1_reg,
          attribute == VertexComponentFormat::Index8 ? LogicalImm(0xFF, GPRSize::B32) :
                                                       LogicalImm(0xFFFF, GPRSize::B32));
      m_skip_vertex = CBZ(scratch2_reg);
    }
    LDR(IndexType::Unsigned, scratch2_reg, stride_reg, static_cast<u8>(array) * 4);
    MUL(scratch1_reg, scratch1_reg, scratch2_reg);
    LDR(IndexType::Unsigned, EncodeRegTo64(scratch2_reg), arraybase_reg,
        static_cast<u8>(array) * 8);
    ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg));
    return {EncodeRegTo64(scratch1_reg), 0};
  }
  else
  {
    return {src_reg, m_src_ofs};
  }
}
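
// For an indexed attribute, the code emitted above is roughly equivalent to
// the following scalar logic (illustrative only; read_u8/read_u16/byteswap16
// are placeholders, not real helpers):
//
//   u32 index = Index8 ? read_u8(src + m_src_ofs) : byteswap16(read_u16(src + m_src_ofs));
//   const u8* addr = VertexLoaderManager::cached_arraybases[array] +
//                    index * g_main_cp_state.array_strides[array];
//
// An all-ones index (0xFF for Index8, 0xFFFF for Index16) on the position
// array marks a vertex to skip; the CBZ above branches to m_skip_vertex then.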

void VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentFormat format,
                                   int count_in, int count_out, bool dequantize,
                                   u8 scaling_exponent, AttributeFormat* native_format,
                                   ARM64Reg reg, u32 offset)
{
  ARM64Reg coords = count_in == 3 ? ARM64Reg::Q31 : ARM64Reg::D31;
  ARM64Reg scale = count_in == 3 ? ARM64Reg::Q30 : ARM64Reg::D30;

  int elem_size = GetElementSize(format);
  int load_bytes = elem_size * count_in;
  int load_size = GetLoadSize(load_bytes);
  load_size <<= 3;

  m_float_emit.LDUR(load_size, coords, reg, offset);

  if (format < ComponentFormat::Float)
  {
    // Extend and convert to float
    switch (format)
    {
    case ComponentFormat::UByte:
      m_float_emit.UXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case ComponentFormat::Byte:
      m_float_emit.SXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case ComponentFormat::UShort:
      m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case ComponentFormat::Short:
      m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    }

    m_float_emit.SCVTF(32, coords, coords);

    if (dequantize && scaling_exponent)
    {
      m_float_emit.LDR(32, IndexType::Unsigned, scale, scale_reg, scaling_exponent * 4);
      m_float_emit.FMUL(32, coords, coords, scale, 0);
    }
  }
  else
  {
    m_float_emit.REV32(8, coords, coords);
  }

  const u32 write_size = count_out == 3 ? 128 : count_out * 32;
  m_float_emit.STUR(write_size, coords, dst_reg, m_dst_ofs);

  // Z-Freeze
  if (native_format == &m_native_vtx_decl.position)
  {
    CMP(remaining_reg, 3);
    FixupBranch dont_store = B(CC_GE);
    MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::position_cache.data());
    m_float_emit.STR(128, coords, EncodeRegTo64(scratch2_reg), ArithOption(remaining_reg, true));
    SetJumpTarget(dont_store);
  }
  else if (native_format == &m_native_vtx_decl.normals[0])
  {
    FixupBranch dont_store = CBNZ(remaining_reg);
    MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::normal_cache.data());
    m_float_emit.STR(128, IndexType::Unsigned, coords, EncodeRegTo64(scratch2_reg), 0);
    SetJumpTarget(dont_store);
  }
  else if (native_format == &m_native_vtx_decl.normals[1])
  {
    FixupBranch dont_store = CBNZ(remaining_reg);
    MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::tangent_cache.data());
    m_float_emit.STR(128, IndexType::Unsigned, coords, EncodeRegTo64(scratch2_reg), 0);
    SetJumpTarget(dont_store);
  }
  else if (native_format == &m_native_vtx_decl.normals[2])
  {
    FixupBranch dont_store = CBNZ(remaining_reg);
    MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::binormal_cache.data());
    m_float_emit.STR(128, IndexType::Unsigned, coords, EncodeRegTo64(scratch2_reg), 0);
    SetJumpTarget(dont_store);
  }

  native_format->components = count_out;
  native_format->enable = true;
  native_format->offset = m_dst_ofs;
  native_format->type = ComponentFormat::Float;
  native_format->integer = false;
  m_dst_ofs += sizeof(float) * count_out;
  if (attribute == VertexComponentFormat::Direct)
    m_src_ofs += load_bytes;
}
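
// ReadVertex() emits the following pipeline for one attribute: load the raw
// bytes with a single NEON load, sign-/zero-extend (and byteswap 16-bit
// values), convert to float with SCVTF, optionally multiply by 1 / 2^frac from
// scale_factors, and store the result into the output vertex. Float inputs
// only need a REV32 byteswap, since the source data is big-endian.
//
// The "Z-Freeze" stores above additionally mirror the last three positions
// (indexed by remaining_reg) and the final vertex's normal/tangent/binormal
// into the VertexLoaderManager caches used for the z-freeze calculation.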

void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat format, ARM64Reg reg,
                                  u32 offset)
{
  int load_bytes = 0;
  switch (format)
  {
  case ColorFormat::RGB888:
  case ColorFormat::RGB888x:
  case ColorFormat::RGBA8888:
    LDUR(scratch2_reg, reg, offset);
    if (format != ColorFormat::RGBA8888)
      ORR(scratch2_reg, scratch2_reg, LogicalImm(0xFF000000, GPRSize::B32));
    STR(IndexType::Unsigned, scratch2_reg, dst_reg, m_dst_ofs);
    load_bytes = format == ColorFormat::RGB888 ? 3 : 4;
    break;

  case ColorFormat::RGB565:
    //                   RRRRRGGG GGGBBBBB
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
    LDURH(scratch3_reg, reg, offset);
    REV16(scratch3_reg, scratch3_reg);

    // B
    AND(scratch2_reg, scratch3_reg, LogicalImm(0x1F, GPRSize::B32));
    ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 3));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 5));
    ORR(scratch1_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 16));

    // G
    UBFM(scratch2_reg, scratch3_reg, 5, 10);
    ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 8));

    // R
    UBFM(scratch2_reg, scratch3_reg, 11, 15);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 3));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 2));

    // A
    ORR(scratch1_reg, scratch1_reg, LogicalImm(0xFF000000, GPRSize::B32));

    STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs);
    load_bytes = 2;
    break;

  case ColorFormat::RGBA4444:
    //          BBBBAAAA RRRRGGGG
    // REV16  - RRRRGGGG BBBBAAAA
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
    LDURH(scratch3_reg, reg, offset);

    // R
    UBFM(scratch1_reg, scratch3_reg, 4, 7);

    // G
    AND(scratch2_reg, scratch3_reg, LogicalImm(0xF, GPRSize::B32));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 8));

    // B
    UBFM(scratch2_reg, scratch3_reg, 12, 15);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 16));

    // A
    UBFM(scratch2_reg, scratch3_reg, 8, 11);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 24));

    // Final duplication
    ORR(scratch1_reg, scratch1_reg, scratch1_reg, ArithOption(scratch1_reg, ShiftType::LSL, 4));

    STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs);
    load_bytes = 2;
    break;

  case ColorFormat::RGBA6666:
    // RRRRRRGG GGGGBBBB BBAAAAAA
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
    LDUR(scratch3_reg, reg, offset - 1);
    REV32(scratch3_reg, scratch3_reg);

    // A
    UBFM(scratch2_reg, scratch3_reg, 0, 5);
    ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 6));
    ORR(scratch1_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 24));

    // B
    UBFM(scratch2_reg, scratch3_reg, 6, 11);
    ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 16));

    // G
    UBFM(scratch2_reg, scratch3_reg, 12, 17);
    ORR(scratch2_reg, ARM64Reg::WSP, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 8));

    // R
    UBFM(scratch2_reg, scratch3_reg, 18, 23);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSL, 2));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ShiftType::LSR, 4));

    STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs);
    load_bytes = 3;
    break;
  }
  if (attribute == VertexComponentFormat::Direct)
    m_src_ofs += load_bytes;
}
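
// All of the color cases above expand to RGBA8888 output. Each narrow channel
// is widened by replicating its high bits into the low bits, which maps the
// maximum input value exactly to 0xFF:
//   5-bit c -> (c << 3) | (c >> 2)
//   6-bit c -> (c << 2) | (c >> 4)
//   4-bit c -> (c << 4) | c        (i.e. c * 0x11)
// Formats without a stored alpha channel have alpha forced to 0xFF.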

void VertexLoaderARM64::GenerateVertexLoader()
{
  // The largest input vertex (with the position matrix index and all texture matrix indices
  // enabled, and all components set as direct) is 129 bytes (corresponding to a 156-byte
  // output). This is small enough that we can always use the unscaled load/store instructions
  // (which allow an offset from -256 to +255).
  ASSERT(m_vertex_size <= 255);

  // R0 - Source pointer
  // R1 - Destination pointer
  // R2 - Count
  // R30 - LR
  //
  // R0 returns how many vertices were written
  //
  // Registers we don't have to worry about saving
  // R9-R17 are caller-saved temporaries
  // R18 is a temporary or platform-specific register (iOS)
  //
  // VFP registers
  // We can touch all except v8-v15
  // If we need to use those, we need to retain the lower 64 bits (!) of the register

  bool has_tc = false;
  bool has_tc_scale = false;
  for (size_t i = 0; i < m_VtxDesc.high.TexCoord.Size(); i++)
  {
    has_tc |= m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent;
    has_tc_scale |= (m_VtxAttr.GetTexFrac(i) != 0);
  }

  bool need_scale = (m_VtxAttr.g0.ByteDequant && m_VtxAttr.g0.PosFrac) ||
                    (has_tc && has_tc_scale) ||
                    (m_VtxDesc.low.Normal != VertexComponentFormat::NotPresent);

  AlignCode16();
  if (IsIndexed(m_VtxDesc.low.Position))
    MOV(skipped_reg, ARM64Reg::WZR);
  ADD(saved_count, remaining_reg, 1);

  MOVP2R(stride_reg, g_main_cp_state.array_strides.data());
  MOVP2R(arraybase_reg, VertexLoaderManager::cached_arraybases.data());

  if (need_scale)
    MOVP2R(scale_reg, scale_factors);

  const u8* loop_start = GetCodePtr();

  if (m_VtxDesc.low.PosMatIdx)
  {
    LDRB(IndexType::Unsigned, scratch1_reg, src_reg, m_src_ofs);
    AND(scratch1_reg, scratch1_reg, LogicalImm(0x3F, GPRSize::B32));
    STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs);

    // Z-Freeze
    CMP(remaining_reg, 3);
    FixupBranch dont_store = B(CC_GE);
    MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::position_matrix_index_cache.data());
    STR(scratch1_reg, EncodeRegTo64(scratch2_reg), ArithOption(remaining_reg, true));
    SetJumpTarget(dont_store);

    m_native_vtx_decl.posmtx.components = 4;
    m_native_vtx_decl.posmtx.enable = true;
    m_native_vtx_decl.posmtx.offset = m_dst_ofs;
    m_native_vtx_decl.posmtx.type = ComponentFormat::UByte;
    m_native_vtx_decl.posmtx.integer = true;
    m_src_ofs += sizeof(u8);
    m_dst_ofs += sizeof(u32);
  }

  std::array<u32, 8> texmatidx_ofs;
  for (size_t i = 0; i < m_VtxDesc.low.TexMatIdx.Size(); i++)
  {
    if (m_VtxDesc.low.TexMatIdx[i])
      texmatidx_ofs[i] = m_src_ofs++;
  }
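
  // The texture matrix index bytes are only recorded here; they are read and
  // written out later, together with their corresponding texture coordinates.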

  // Position
  {
    const int pos_elements = m_VtxAttr.g0.PosElements == CoordComponentCount::XY ? 2 : 3;
    const auto [reg, offset] = GetVertexAddr(CPArray::Position, m_VtxDesc.low.Position);
    ReadVertex(m_VtxDesc.low.Position, m_VtxAttr.g0.PosFormat, pos_elements, pos_elements,
               m_VtxAttr.g0.ByteDequant, m_VtxAttr.g0.PosFrac, &m_native_vtx_decl.position, reg,
               offset);
  }

  if (m_VtxDesc.low.Normal != VertexComponentFormat::NotPresent)
  {
    static constexpr Common::EnumMap<u8, ComponentFormat::InvalidFloat7> SCALE_MAP = {7, 6, 15, 14,
                                                                                      0, 0, 0, 0};
    const u8 scaling_exponent = SCALE_MAP[m_VtxAttr.g0.NormalFormat];

    // Normal
    auto [reg, offset] = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal);
    ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, scaling_exponent,
               &m_native_vtx_decl.normals[0], reg, offset);

    if (m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB)
    {
      const bool index3 = IsIndexed(m_VtxDesc.low.Normal) && m_VtxAttr.g0.NormalIndex3;
      const int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat);
      const int load_bytes = elem_size * 3;

      // Tangent
      // If in Index3 mode, and indexed components are used, replace the index with a new index.
      if (index3)
        std::tie(reg, offset) = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal);

      // The tangent comes after the normal; even in index3 mode, an extra offset of load_bytes is
      // applied. Note that this is different from adding 1 to the index, as the stride for indices
      // may be different from the size of the tangent itself.
      ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, scaling_exponent,
                 &m_native_vtx_decl.normals[1], reg, offset + load_bytes);

      // Binormal
      if (index3)
        std::tie(reg, offset) = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal);
      ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, scaling_exponent,
                 &m_native_vtx_decl.normals[2], reg, offset + load_bytes * 2);
    }
  }
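
  // Normals always use an implicit scale taken from SCALE_MAP (a per-format
  // shift, e.g. 6 for Byte and 14 for Short) rather than a frac field from the
  // VAT. In NTB mode with NormalIndex3 set, each of normal/tangent/binormal
  // reads its own index from the input stream; otherwise all three share one
  // index, with the tangent and binormal at fixed byte offsets after the normal.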

  for (u8 i = 0; i < m_VtxDesc.low.Color.Size(); i++)
  {
    m_native_vtx_decl.colors[i].components = 4;
    m_native_vtx_decl.colors[i].type = ComponentFormat::UByte;
    m_native_vtx_decl.colors[i].integer = false;

    if (m_VtxDesc.low.Color[i] != VertexComponentFormat::NotPresent)
    {
      const auto [reg, offset] = GetVertexAddr(CPArray::Color0 + i, m_VtxDesc.low.Color[i]);
      ReadColor(m_VtxDesc.low.Color[i], m_VtxAttr.GetColorFormat(i), reg, offset);
      m_native_vtx_decl.colors[i].components = 4;
      m_native_vtx_decl.colors[i].enable = true;
      m_native_vtx_decl.colors[i].offset = m_dst_ofs;
      m_native_vtx_decl.colors[i].type = ComponentFormat::UByte;
      m_native_vtx_decl.colors[i].integer = false;
      m_dst_ofs += 4;
    }
  }

  for (u8 i = 0; i < m_VtxDesc.high.TexCoord.Size(); i++)
  {
    m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
    m_native_vtx_decl.texcoords[i].type = ComponentFormat::Float;
    m_native_vtx_decl.texcoords[i].integer = false;

    const int elements = m_VtxAttr.GetTexElements(i) == TexComponentCount::S ? 1 : 2;
    if (m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent)
    {
      const auto [reg, offset] = GetVertexAddr(CPArray::TexCoord0 + i, m_VtxDesc.high.TexCoord[i]);
      u8 scaling_exponent = m_VtxAttr.GetTexFrac(i);
      ReadVertex(m_VtxDesc.high.TexCoord[i], m_VtxAttr.GetTexFormat(i), elements,
                 m_VtxDesc.low.TexMatIdx[i] ? 2 : elements, m_VtxAttr.g0.ByteDequant,
                 scaling_exponent, &m_native_vtx_decl.texcoords[i], reg, offset);
    }
    if (m_VtxDesc.low.TexMatIdx[i])
    {
      m_native_vtx_decl.texcoords[i].components = 3;
      m_native_vtx_decl.texcoords[i].enable = true;
      m_native_vtx_decl.texcoords[i].type = ComponentFormat::Float;
      m_native_vtx_decl.texcoords[i].integer = false;
      LDRB(IndexType::Unsigned, scratch2_reg, src_reg, texmatidx_ofs[i]);
      m_float_emit.UCVTF(ARM64Reg::S31, scratch2_reg);

      if (m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent)
      {
        m_float_emit.STR(32, IndexType::Unsigned, ARM64Reg::D31, dst_reg, m_dst_ofs);
        m_dst_ofs += sizeof(float);
      }
      else
      {
        m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
        STUR(ARM64Reg::SP, dst_reg, m_dst_ofs);
        m_float_emit.STR(32, IndexType::Unsigned, ARM64Reg::D31, dst_reg, m_dst_ofs + 8);
        m_dst_ofs += sizeof(float) * 3;
      }
    }
  }
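
  // When a texture matrix index is present, the texcoord is widened to three
  // components and the matrix index (converted to float) is stored as the last
  // one. If the texcoord itself is not present, the first two components are
  // filled by a placeholder store and only the third (the matrix index) is
  // meaningful.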

  // Prepare for the next vertex.
  ADD(dst_reg, dst_reg, m_dst_ofs);
  const u8* cont = GetCodePtr();
  ADD(src_reg, src_reg, m_src_ofs);

  SUBS(remaining_reg, remaining_reg, 1);
  B(CCFlags::CC_GE, loop_start);

  if (IsIndexed(m_VtxDesc.low.Position))
  {
    SUB(ARM64Reg::W0, saved_count, skipped_reg);
    RET(ARM64Reg::X30);

    SetJumpTarget(m_skip_vertex);
    ADD(skipped_reg, skipped_reg, 1);
    B(cont);
  }
  else
  {
    MOV(ARM64Reg::W0, saved_count);
    RET(ARM64Reg::X30);
  }

  FlushIcache();

  ASSERT_MSG(VIDEO, m_vertex_size == m_src_ofs,
             "Vertex size from vertex loader ({}) does not match expected vertex size ({})!\nVtx "
             "desc: {:08x} {:08x}\nVtx attr: {:08x} {:08x} {:08x}",
             m_src_ofs, m_vertex_size, m_VtxDesc.low.Hex, m_VtxDesc.high.Hex, m_VtxAttr.g0.Hex,
             m_VtxAttr.g1.Hex, m_VtxAttr.g2.Hex);

  m_native_vtx_decl.stride = m_dst_ofs;
}
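
// The generated function has the signature int(const u8* src, u8* dst, int remaining),
// where `remaining` is count - 1: the loop decrements it each iteration and exits once
// it goes negative. With indexed positions, a vertex whose position index is all-ones
// jumps to the skip path: the destination pointer is not advanced, skipped_reg is
// incremented, and the loop continues with the next input vertex. W0 then returns
// saved_count - skipped_reg, the number of vertices actually written; otherwise it
// simply returns saved_count.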

int VertexLoaderARM64::RunVertices(const u8* src, u8* dst, int count)
{
  m_numLoadedVertices += count;
  return ((int (*)(const u8* src, u8* dst, int count))region)(src, dst, count - 1);
}