// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for
 * Armv8-A NEON.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */
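//
// Illustrative usage sketch (hypothetical helper, not part of this header):
// the fixed-width types compose with the comparison and select operators
// defined below, e.g. clamping negative lanes to zero:
//
//     vfloat4 clamp_negative_to_zero(vfloat4 v)
//     {
//         vmask4 is_neg = v < vfloat4::zero();
//         return select(v, vfloat4::zero(), is_neg);
//     }
//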
#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
    #error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>
#include <cstring>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vfloat4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with vectors if data is aligned
     * to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
    {
        m = vld1q_f32(p);
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a)
    {
        m = vdupq_n_f32(a);
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
    {
        float v[4] { a, b, c, d };
        m = vld1q_f32(v);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
    {
        m = a;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE float lane() const
    {
        return vgetq_lane_f32(m, l);
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
    {
        m = vsetq_lane_f32(a, m, l);
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vfloat4 zero()
    {
        return vfloat4(vdupq_n_f32(0.0f));
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
    {
        return vfloat4(vld1q_dup_f32(p));
    }

    /**
     * @brief Factory that returns a vector loaded from 16B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
    {
        return vfloat4(vld1q_f32(p));
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vfloat4 lane_id()
    {
        alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
        return vfloat4(vld1q_f32(data));
    }

    /**
     * @brief Return a swizzled float 2.
     */
    template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
    }

    /**
     * @brief Return a swizzled float 3.
     */
    template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
    }

    /**
     * @brief Return a swizzled float 4.
     */
    template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
    }

    /**
     * @brief The vector ...
     */
    float32x4_t m;
};
// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vint4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with vectors if data is aligned
     * to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const int *p)
    {
        m = vld1q_s32(p);
    }

    /**
     * @brief Construct from 4 uint8_t loaded from an unaligned address.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
    {
        // Cast is safe - NEON loads are allowed to be unaligned
        uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
        uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
        m = vreinterpretq_s32_u32(vmovl_u16(t16));
    }
    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using vint4::zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a)
    {
        m = vdupq_n_s32(a);
    }
    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
    {
        int v[4] { a, b, c, d };
        m = vld1q_s32(v);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
    {
        m = a;
    }

    /**
     * @brief Get the scalar from a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE int lane() const
    {
        return vgetq_lane_s32(m, l);
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
    {
        m = vsetq_lane_s32(a, m, l);
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vint4 zero()
    {
        return vint4(0);
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
    {
        return vint4(*p);
    }

    /**
     * @brief Factory that returns a vector loaded from unaligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
    {
        vint4 data;
        std::memcpy(&data.m, p, 4 * sizeof(int));
        return data;
    }

    /**
     * @brief Factory that returns a vector loaded from 16B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
    {
        return vint4(p);
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vint4 lane_id()
    {
        alignas(16) static const int data[4] { 0, 1, 2, 3 };
        return vint4(vld1q_s32(data));
    }

    /**
     * @brief The vector ...
     */
    int32x4_t m;
};
// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
    {
        m = a;
    }

#if !defined(_MSC_VER)
    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
    {
        m = vreinterpretq_u32_s32(a);
    }
#endif

    /**
     * @brief Construct from 1 scalar value.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a)
    {
        m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
    {
        int v[4] {
            a == true ? -1 : 0,
            b == true ? -1 : 0,
            c == true ? -1 : 0,
            d == true ? -1 : 0
        };

        int32x4_t ms = vld1q_s32(v);
        m = vreinterpretq_u32_s32(ms);
    }

    /**
     * @brief Get the scalar from a single lane.
     */
    template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
    {
        return vgetq_lane_u32(m, l) != 0;
    }

    /**
     * @brief The vector ...
     */
    uint32x4_t m;
};
// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
    return vmask4(vorrq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
    return vmask4(vandq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
    return vmask4(veorq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
    return vmask4(vmvnq_u32(a.m));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
    static const int shifta[4] { 0, 1, 2, 3 };
    static const int32x4_t shift = vld1q_s32(shifta);

    uint32x4_t tmp = vshrq_n_u32(a.m, 31);
    return vaddvq_u32(vshlq_u32(tmp, shift));
}
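
// Worked example for mask(): a vmask4 of {true, false, true, false} holds
// lanes {0xFFFFFFFF, 0, 0xFFFFFFFF, 0}. Shifting each lane right by 31 gives
// {1, 0, 1, 0}; shifting those left by {0, 1, 2, 3} and summing the lanes
// gives 0b0101 = 5, i.e. bit N of the result is set when lane N is set.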
// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
    return vint4(vaddq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
    return vint4(vsubq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
    return vint4(vmulq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
    return vint4(vmvnq_s32(a.m));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
    return vint4(vorrq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
    return vint4(vandq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
    return vint4(veorq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
    return vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
    return ~vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
    return vmask4(vcltq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
    return vmask4(vcgtq_s32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
    return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
    uint32x4_t ua = vreinterpretq_u32_s32(a.m);
    ua = vshlq_u32(ua, vdupq_n_s32(-s));
    return vint4(vreinterpretq_s32_u32(ua));
}
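
// Note on the shift helpers: the NEON variable-shift intrinsics (vshlq_u32 /
// vshlq_s32) only shift left for positive counts, but shift right when given
// a negative per-lane count; lsr() above and asr() below rely on that.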
/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
    return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
    return vint4(vminq_s32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
    return vint4(vmaxq_s32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
    return vint4(vminvq_s32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
    return vint4(vmaxvq_s32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
    int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
    return vget_lane_s32(vpadd_s32(t, t), 0);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
    vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
    vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
    std::memcpy(p, &a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
    vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
    alignas(16) int idx[4];
    storea(indices, idx);

    alignas(16) int vals[4];
    vals[0] = base[idx[0]];
    vals[1] = base[idx[1]];
    vals[2] = base[idx[2]];
    vals[3] = base[idx[3]];

    return vint4(vals);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
    alignas(16) uint8_t shuf[16] {
        0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    uint8x16_t idx = vld1q_u8(shuf);
    int8x16_t av = vreinterpretq_s8_s32(a.m);
    return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
}
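
// Worked example for pack_low_bytes(): with a = {0x11, 0x22, 0x33, 0x44} the
// shuffle gathers source bytes 0, 4, 8 and 12, so lane 0 of the result is
// 0x44332211; the upper lanes are not part of the contract (here they end up
// replicating source byte 0).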
/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
    return vint4(vbslq_s32(cond.m, b.m, a.m));
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
    return vfloat4(vaddq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
    return vfloat4(vsubq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
    return vfloat4(vmulq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
    return vfloat4(vdivq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
    return vmask4(vceqq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
    return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
    return vmask4(vcltq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
    return vmask4(vcgtq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
    return vmask4(vcleq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
    return vmask4(vcgeq_f32(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
    // Do not reorder - second operand will return if either is NaN
    return vfloat4(vminnmq_f32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
    // Do not reorder - second operand will return if either is NaN
    return vfloat4(vmaxnmq_f32(a.m, b.m));
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
    float32x4_t zero = vdupq_n_f32(0.0f);
    float32x4_t inv = vsubq_f32(zero, a.m);
    return vfloat4(vmaxq_f32(a.m, inv));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
    return vfloat4(vrndnq_f32(a.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
    return vfloat4(vminvq_f32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
    return vfloat4(vmaxvq_f32(a.m));
}
/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
    // Perform halving add to ensure invariance; we cannot use vaddvq as this
    // does (0 + 1 + 2 + 3) which is not invariant with x86 (0 + 2) + (1 + 3).
    float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
    return vget_lane_f32(vpadd_f32(t, t), 0);
}
/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
    return vfloat4(vsqrtq_f32(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
    return vfloat4(vbslq_f32(cond.m, b.m, a.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
    static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
    uint32x4_t mask = vcgeq_u32(cond.m, msb);
    return vfloat4(vbslq_f32(mask, b.m, a.m));
}
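
// Note on select_msb(): interpreting a mask lane as an unsigned integer, it
// compares >= 0x80000000 exactly when its most significant bit is set, so the
// vcgeq_u32() above expands just the sign bit back into a full all-ones or
// all-zeros lane before the bit select.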
/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
    alignas(16) int idx[4];
    storea(indices, idx);

    alignas(16) float vals[4];
    vals[0] = base[idx[0]];
    vals[1] = base[idx[1]];
    vals[2] = base[idx[2]];
    vals[3] = base[idx[3]];

    return vfloat4(vals);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
    vst1q_f32(p, a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
    vst1q_f32(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
    return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
    a = a + vfloat4(0.5f);
    return vint4(vcvtq_s32_f32(a.m));
}
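
// Note on float_to_int_rtn(): rounding is implemented as "add 0.5, then
// truncate towards zero", which matches round-to-nearest for non-negative
// inputs; negative inputs would be rounded differently than a true
// round-to-nearest conversion.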
/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
    return vfloat4(vcvtq_f32_s32(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
    // Generate float16 value
    float16x4_t f16 = vcvt_f16_f32(a.m);

    // Convert each 16-bit float pattern to a 32-bit pattern
    uint16x4_t u16 = vreinterpret_u16_f16(f16);
    uint32x4_t u32 = vmovl_u16(u16);
    return vint4(vreinterpretq_s32_u32(u32));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
    vfloat4 av(a);
    return static_cast<uint16_t>(float_to_float16(av).lane<0>());
}
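
// Worked example: float_to_float16(1.0f) returns 0x3C00, the IEEE 754
// binary16 encoding of 1.0, and the float16_to_float() helpers below invert
// the conversion.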
/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
    // Convert each 32-bit pattern to a 16-bit pattern
    uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
    uint16x4_t u16 = vmovn_u32(u32);
    float16x4_t f16 = vreinterpret_f16_u16(u16);

    // Generate float32 value
    return vfloat4(vcvt_f32_f16(f16));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
    vint4 av(a);
    return float16_to_float(av).lane<0>();
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
    return vint4(vreinterpretq_s32_f32(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
    return vfloat4(vreinterpretq_f32_s32(v.m));
}
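
// Illustrative use of the bit-pattern casts (hypothetical helper, not part of
// this header): clearing the IEEE 754 sign bit through the integer view is
// another way to compute an absolute value:
//
//     ASTCENC_SIMD_INLINE vfloat4 abs_via_bits(vfloat4 a)
//     {
//         return int_as_float(float_as_int(a) & vint4(0x7FFFFFFF));
//     }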
/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
    t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
    t0p = t0;
    t1p = t1;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vint4 t0, vint4 t1, vint4 t2, vint4 t3,
    vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
    t0p = t0;
    t1p = t1;
    t2p = t2;
    t3p = t3;
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
    int8x16_t table {
        vreinterpretq_s8_s32(t0.m)
    };

    // Set index byte above max index for unused bytes so table lookup returns zero
    int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
    uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

    return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
}
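
// Worked example for the vtable lookups: with t0 holding bytes 0..15 and
// idx = vint4(3, 0, 1, 2), the OR with 0xFFFFFF00 leaves only the low byte of
// each index in range, so vqtbl returns table byte 3 in lane 0 and zero for
// the three upper bytes of every lane, giving vint4(3, 0, 1, 2).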
/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
    int8x16x2_t table {
        vreinterpretq_s8_s32(t0.m),
        vreinterpretq_s8_s32(t1.m)
    };

    // Set index byte above max index for unused bytes so table lookup returns zero
    int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
    uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

    return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
    int8x16x4_t table {
        vreinterpretq_s8_s32(t0.m),
        vreinterpretq_s8_s32(t1.m),
        vreinterpretq_s8_s32(t2.m),
        vreinterpretq_s8_s32(t3.m)
    };

    // Set index byte above max index for unused bytes so table lookup returns zero
    int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
    uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

    return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
}

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
    return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
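
// Worked example for interleave_rgba8(): lane values r = 0x12, g = 0x34,
// b = 0x56, a = 0x78 produce 0x78563412 in that lane, i.e. R in the least
// significant byte and A in the most significant byte.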
/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
    std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
    if (mask.lane<3>())
    {
        store(data, base);
    }
    else if (mask.lane<2>())
    {
        store_lane(base + 0, data.lane<0>());
        store_lane(base + 4, data.lane<1>());
        store_lane(base + 8, data.lane<2>());
    }
    else if (mask.lane<1>())
    {
        store_lane(base + 0, data.lane<0>());
        store_lane(base + 4, data.lane<1>());
    }
    else if (mask.lane<0>())
    {
        store_lane(base + 0, data.lane<0>());
    }
}
#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
    return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
}
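
// Worked example: popcount(0xF0F0) counts the set bits per byte with vcnt_u8,
// giving {4, 4, 0, 0, 0, 0, 0, 0}, then sums across the vector with
// vaddlv_u8, returning 8.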

#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED