astcenc_vecmathlib_neon_4.h

// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for
 * Armv8-A NEON.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */
#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
	{
		m = vld1q_f32(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m = vdupq_n_f32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		float v[4] { a, b, c, d };
		m = vld1q_f32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return vgetq_lane_f32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
		m = vsetq_lane_f32(a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(vdupq_n_f32(0.0f));
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(vld1q_dup_f32(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(vld1q_f32(p));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
	{
		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
		return vfloat4(vld1q_f32(data));
	}

	/**
	 * @brief Return a swizzled float 2.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
	}

	/**
	 * @brief Return a swizzled float 3.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
	}

	/**
	 * @brief Return a swizzled float 4.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
	}

	/**
	 * @brief The vector ...
	 */
	float32x4_t m;
};
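
// Illustrative usage of the lane accessors and swizzles above; the values
// shown assume the vector constructed on the first line.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     float x = v.lane<0>();            // 1.0f, lane 0 is the LSB lane
//     vfloat4 r = v.swz<2, 1, 0>();     // (3.0f, 2.0f, 1.0f, 0.0f)
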
// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
	{
		m = vld1q_s32(p);
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
	{
		// Cast is safe - NEON loads are allowed to be unaligned
		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
		m = vreinterpretq_s32_u32(vmovl_u16(t16));
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m = vdupq_n_s32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		int v[4] { a, b, c, d };
		m = vld1q_s32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return vgetq_lane_s32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
		m = vsetq_lane_s32(a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(0);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		alignas(16) static const int data[4] { 0, 1, 2, 3 };
		return vint4(vld1q_s32(data));
	}

	/**
	 * @brief The vector ...
	 */
	int32x4_t m;
};
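
// Illustrative use of the uint8_t constructor above: four consecutive bytes
// are zero-extended into the four 32-bit lanes.
//
//     const uint8_t bytes[4] { 10, 20, 30, 40 };
//     vint4 v(bytes);                   // lanes are (10, 20, 30, 40)
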
// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
	{
		m = a;
	}

#if !defined(_MSC_VER)
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
	{
		m = vreinterpretq_u32_s32(a);
	}
#endif

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		int v[4] {
			a == true ? -1 : 0,
			b == true ? -1 : 0,
			c == true ? -1 : 0,
			d == true ? -1 : 0
		};

		int32x4_t ms = vld1q_s32(v);
		m = vreinterpretq_u32_s32(ms);
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const
	{
		return vgetq_lane_u32(m, l);
	}

	/**
	 * @brief The vector ...
	 */
	uint32x4_t m;
};
// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(vorrq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(vandq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(veorq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(vmvnq_u32(a.m));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
	static const int shifta[4] { 0, 1, 2, 3 };
	static const int32x4_t shift = vld1q_s32(shifta);

	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
	return vaddvq_u32(vshlq_u32(tmp, shift));
}
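
// Illustrative use of mask(): each set lane contributes one bit, with lane 0
// mapping to bit 0.
//
//     vmask4 m(true, false, true, false);
//     unsigned int bits = mask(m);      // 0b0101 == 5
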
// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(vaddq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(vsubq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
	return vint4(vmulq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(vmvnq_s32(a.m));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(vorrq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(vandq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(veorq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return ~vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(vcltq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(vcgtq_s32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	uint32x4_t ua = vreinterpretq_u32_s32(a.m);
	ua = vshlq_u32(ua, vdupq_n_s32(-s));
	return vint4(vreinterpretq_s32_u32(ua));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
	return vint4(vminq_s32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
	return vint4(vmaxq_s32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	return vint4(vminvq_s32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	return vint4(vmaxvq_s32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
	int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m));
	return vget_lane_s32(vpadd_s32(t, t), 0);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
	alignas(16) int idx[4];
	storea(indices, idx);

	alignas(16) int vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];

	return vint4(vals);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
	alignas(16) uint8_t shuf[16] {
		0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};
	uint8x16_t idx = vld1q_u8(shuf);
	int8x16_t av = vreinterpretq_s8_s32(a.m);
	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	return vint4(vbslq_s32(cond.m, b.m, a.m));
}
// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(vaddq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(vsubq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(vmulq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(vdivq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(vceqq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(vcltq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgtq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcleq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgeq_f32(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	// Do not reorder - second operand will return if either is NaN
	return vfloat4(vminnmq_f32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	// Do not reorder - second operand will return if either is NaN
	return vfloat4(vmaxnmq_f32(a.m, b.m));
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	float32x4_t zero = vdupq_n_f32(0.0f);
	float32x4_t inv = vsubq_f32(zero, a.m);
	return vfloat4(vmaxq_f32(a.m, inv));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
	return vfloat4(vrndnq_f32(a.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	return vfloat4(vminvq_f32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	return vfloat4(vmaxvq_f32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Perform a halving add to ensure invariance; we cannot use vaddvq_f32 as
	// this does (0 + 1 + 2 + 3), which is not invariant with the x86
	// implementation's (0 + 2) + (1 + 3) ordering.
	float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
	return vget_lane_f32(vpadd_f32(t, t), 0);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(vsqrtq_f32(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
	uint32x4_t mask = vcgeq_u32(cond.m, msb);
	return vfloat4(vbslq_f32(mask, b.m, a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
	alignas(16) int idx[4];
	storea(indices, idx);

	alignas(16) float vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];

	return vfloat4(vals);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
	a = round(a);
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(vcvtq_f32_s32(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
	// Generate the float16 values
	float16x4_t f16 = vcvt_f16_f32(a.m);

	// Convert each 16-bit float pattern to a 32-bit pattern
	uint16x4_t u16 = vreinterpret_u16_f16(f16);
	uint32x4_t u32 = vmovl_u16(u16);
	return vint4(vreinterpretq_s32_u32(u32));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
	vfloat4 av(a);
	return static_cast<uint16_t>(float_to_float16(av).lane<0>());
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
	// Narrow each 32-bit lane to the 16-bit float pattern it holds
	uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
	uint16x4_t u16 = vmovn_u32(u32);
	float16x4_t f16 = vreinterpret_f16_u16(u16);

	// Generate the float32 values
	return vfloat4(vcvt_f32_f16(f16));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
	vint4 av(a);
	return float16_to_float(av).lane<0>();
}
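
// Illustrative round-trip through the float16 helpers above; 0x3C00 is the
// IEEE 754 binary16 encoding of 1.0.
//
//     uint16_t h = float_to_float16(1.0f);    // 0x3C00
//     float f = float16_to_float(h);          // 1.0f
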
/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge that they use the IEEE 754 layout, and
 * then convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	return vint4(vreinterpretq_s32_f32(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge that they use the IEEE 754 layout, and
 * then convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
	return vfloat4(vreinterpretq_f32_s32(v.m));
}
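
// Illustrative use of the bit-pattern helpers above: clearing the IEEE 754
// sign bit computes an absolute value without any numeric conversion.
//
//     vint4 bits = float_as_int(vfloat4(-1.25f));
//     vfloat4 av = int_as_float(bits & vint4(0x7FFFFFFF));   // 1.25f in all lanes
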
/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
	t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
	t0p = t0;
	t1p = t1;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
	t0p = t0;
	t1p = t1;
	t2p = t2;
	t3p = t3;
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
	int8x16_t table {
		vreinterpretq_s8_s32(t0.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
	int8x16x2_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
	int8x16x4_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m),
		vreinterpretq_s8_s32(t2.m),
		vreinterpretq_s8_s32(t3.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
}
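
// Illustrative use of the 16-entry lookup above, with a table whose byte i
// holds the value i; out-of-range indices would return zero.
//
//     vint4 t0(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C);
//     vint4 t0p;
//     vtable_prepare(t0, t0p);
//     vint4 r = vtable_8bt_32bi(t0p, vint4(0, 5, 10, 15));    // (0, 5, 10, 15)
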
/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
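
// Worked example for the packing above: lanes r=0x11, g=0x22, b=0x33, a=0x44
// produce the texel value 0x44332211 (alpha in the top byte).
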
/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
{
	if (mask.lane<3>())
	{
		store(data, base);
	}
	else if (mask.lane<2>())
	{
		base[0] = data.lane<0>();
		base[1] = data.lane<1>();
		base[2] = data.lane<2>();
	}
	else if (mask.lane<1>())
	{
		base[0] = data.lane<0>();
		base[1] = data.lane<1>();
	}
	else if (mask.lane<0>())
	{
		base[0] = data.lane<0>();
	}
}
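
// Illustrative use of the masked store above: a mask covering only the low
// three lanes writes base[0..2] and leaves base[3] untouched.
//
//     int out[4] { -1, -1, -1, -1 };
//     store_lanes_masked(out, vint4(7, 8, 9, 10), vmask4(true, true, true, false));
//     // out is now { 7, 8, 9, -1 }
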
#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
	return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
}
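
// For example, popcount(0xF0F0u) returns 8.
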
#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED