astcenc_vecmathlib_none_4.h

// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using plain C++.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors. It
 * provides a scalar fallback for VLA code, primarily useful for debugging VLA
 * algorithms without the complexity of handling SIMD. Only the baseline level
 * of functionality needed to support VLA is provided.
 *
 * Note that the vector conditional operators implemented by this module are
 * designed to behave like SIMD conditional operators that generate lane
 * masks. Rather than returning 0/1 booleans like normal C++ code, they return
 * 0/-1 to give a full lane-width bitmask.
 *
 * Note that the documentation for this module still talks about "vectors" to
 * help developers think about the implied VLA behavior when writing optimized
 * paths.
 */
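
// For illustration only (not part of the library API): comparisons on these
// types produce 0/-1 lane masks rather than 0/1 booleans, and those masks can
// then drive select(). A minimal sketch using types defined later in this file:
//
//     vint4 a(1, 2, 3, 4);
//     vint4 b(2, 2, 2, 2);
//     vmask4 gt = a > b;           // lanes are { 0, 0, -1, -1 }
//     vint4 c = select(a, b, gt);  // c is { 1, 2, 2, 2 }; b is taken where the mask is set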

#ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
#define ASTC_VECMATHLIB_NONE_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
    #error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cfenv>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vfloat4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with wider VLA vectors if data is
     * aligned to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
    {
        m[0] = p[0];
        m[1] = p[1];
        m[2] = p[2];
        m[3] = p[3];
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a)
    {
        m[0] = a;
        m[1] = a;
        m[2] = a;
        m[3] = a;
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
    {
        m[0] = a;
        m[1] = b;
        m[2] = c;
        m[3] = d;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE float lane() const
    {
        return m[l];
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
    {
        m[l] = a;
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vfloat4 zero()
    {
        return vfloat4(0.0f);
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
    {
        return vfloat4(*p);
    }

    /**
     * @brief Factory that returns a vector loaded from aligned memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
    {
        return vfloat4(p);
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vfloat4 lane_id()
    {
        return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
    }

    /**
     * @brief Return a swizzled float 2.
     */
    template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
    }

    /**
     * @brief Return a swizzled float 3.
     */
    template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
    }

    /**
     * @brief Return a swizzled float 4.
     */
    template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
    }

    /**
     * @brief The vector lane data.
     */
    float m[4];
};
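
// Construction and swizzle usage sketch (illustrative only, not part of the
// library API):
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     float x = v.lane<0>();       // 1.0f
//     vfloat4 s = v.swz<2, 0>();   // { 3.0f, 1.0f, 0.0f, 0.0f }; unused lanes are zeroed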

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vint4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using vint4::loada() which is better with wider VLA vectors
     * if data is aligned.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const int* p)
    {
        m[0] = p[0];
        m[1] = p[1];
        m[2] = p[2];
        m[3] = p[3];
    }

    /**
     * @brief Construct from 4 uint8_t loaded from an unaligned address.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const uint8_t* p)
    {
        m[0] = p[0];
        m[1] = p[1];
        m[2] = p[2];
        m[3] = p[3];
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
    {
        m[0] = a;
        m[1] = b;
        m[2] = c;
        m[3] = d;
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using vint4::zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a)
    {
        m[0] = a;
        m[1] = a;
        m[2] = a;
        m[3] = a;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE int lane() const
    {
        return m[l];
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
    {
        m[l] = a;
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vint4 zero()
    {
        return vint4(0);
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
    {
        return vint4(*p);
    }

    /**
     * @brief Factory that returns a vector loaded from unaligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
    {
        vint4 data;
        std::memcpy(&data.m, p, 4 * sizeof(int));
        return data;
    }

    /**
     * @brief Factory that returns a vector loaded from 16B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
    {
        return vint4(p);
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vint4 lane_id()
    {
        return vint4(0, 1, 2, 3);
    }

    /**
     * @brief The vector lane data.
     */
    int m[4];
};

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
    /**
     * @brief Construct from an existing mask value.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(int* p)
    {
        m[0] = p[0];
        m[1] = p[1];
        m[2] = p[2];
        m[3] = p[3];
    }

    /**
     * @brief Construct from 1 scalar value.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a)
    {
        m[0] = a == false ? 0 : -1;
        m[1] = a == false ? 0 : -1;
        m[2] = a == false ? 0 : -1;
        m[3] = a == false ? 0 : -1;
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
    {
        m[0] = a == false ? 0 : -1;
        m[1] = b == false ? 0 : -1;
        m[2] = c == false ? 0 : -1;
        m[3] = d == false ? 0 : -1;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE bool lane() const
    {
        return m[l] != 0;
    }

    /**
     * @brief The vector lane data.
     */
    int m[4];
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
    return vmask4(a.m[0] | b.m[0],
                  a.m[1] | b.m[1],
                  a.m[2] | b.m[2],
                  a.m[3] | b.m[3]);
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
    return vmask4(a.m[0] & b.m[0],
                  a.m[1] & b.m[1],
                  a.m[2] & b.m[2],
                  a.m[3] & b.m[3]);
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
    return vmask4(a.m[0] ^ b.m[0],
                  a.m[1] ^ b.m[1],
                  a.m[2] ^ b.m[2],
                  a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
    return vmask4(~a.m[0],
                  ~a.m[1],
                  ~a.m[2],
                  ~a.m[3]);
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
    return ((a.m[0] >> 31) & 0x1) |
           ((a.m[1] >> 30) & 0x2) |
           ((a.m[2] >> 29) & 0x4) |
           ((a.m[3] >> 28) & 0x8);
}
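
// For example (illustrative only): mask(vmask4(true, false, false, true))
// returns 0x9, i.e. bit 0 set because lane 0 is true and bit 3 set because
// lane 3 is true.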

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
    return vint4(a.m[0] + b.m[0],
                 a.m[1] + b.m[1],
                 a.m[2] + b.m[2],
                 a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
    return vint4(a.m[0] - b.m[0],
                 a.m[1] - b.m[1],
                 a.m[2] - b.m[2],
                 a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
    return vint4(a.m[0] * b.m[0],
                 a.m[1] * b.m[1],
                 a.m[2] * b.m[2],
                 a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
    return vint4(~a.m[0],
                 ~a.m[1],
                 ~a.m[2],
                 ~a.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
    return vint4(a.m[0] | b.m[0],
                 a.m[1] | b.m[1],
                 a.m[2] | b.m[2],
                 a.m[3] | b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
    return vint4(a.m[0] & b.m[0],
                 a.m[1] & b.m[1],
                 a.m[2] & b.m[2],
                 a.m[3] & b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
    return vint4(a.m[0] ^ b.m[0],
                 a.m[1] ^ b.m[1],
                 a.m[2] ^ b.m[2],
                 a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
    return vmask4(a.m[0] == b.m[0],
                  a.m[1] == b.m[1],
                  a.m[2] == b.m[2],
                  a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
    return vmask4(a.m[0] != b.m[0],
                  a.m[1] != b.m[1],
                  a.m[2] != b.m[2],
                  a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
    return vmask4(a.m[0] < b.m[0],
                  a.m[1] < b.m[1],
                  a.m[2] < b.m[2],
                  a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
    return vmask4(a.m[0] > b.m[0],
                  a.m[1] > b.m[1],
                  a.m[2] > b.m[2],
                  a.m[3] > b.m[3]);
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
    // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
    unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
    unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
    unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
    unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;

    return vint4(static_cast<int>(as0),
                 static_cast<int>(as1),
                 static_cast<int>(as2),
                 static_cast<int>(as3));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
    // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
    unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
    unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
    unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
    unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;

    return vint4(static_cast<int>(as0),
                 static_cast<int>(as1),
                 static_cast<int>(as2),
                 static_cast<int>(as3));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
    return vint4(a.m[0] >> s,
                 a.m[1] >> s,
                 a.m[2] >> s,
                 a.m[3] >> s);
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
    return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
                 a.m[1] < b.m[1] ? a.m[1] : b.m[1],
                 a.m[2] < b.m[2] ? a.m[2] : b.m[2],
                 a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
    return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
                 a.m[1] > b.m[1] ? a.m[1] : b.m[1],
                 a.m[2] > b.m[2] ? a.m[2] : b.m[2],
                 a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the horizontal minimum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
    int b = std::min(a.m[0], a.m[1]);
    int c = std::min(a.m[2], a.m[3]);
    return vint4(std::min(b, c));
}

/**
 * @brief Return the horizontal maximum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
    int b = std::max(a.m[0], a.m[1]);
    int c = std::max(a.m[2], a.m[3]);
    return vint4(std::max(b, c));
}

/**
 * @brief Return the horizontal sum of vector lanes as a scalar.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
    return a.m[0] + a.m[1] + a.m[2] + a.m[3];
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
    p[0] = a.m[0];
    p[1] = a.m[1];
    p[2] = a.m[2];
    p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
    p[0] = a.m[0];
    p[1] = a.m[1];
    p[2] = a.m[2];
    p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
    std::memcpy(p, a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
    std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
    return vint4(base[indices.m[0]],
                 base[indices.m[1]],
                 base[indices.m[2]],
                 base[indices.m[3]]);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
    int b0 = a.m[0] & 0xFF;
    int b1 = a.m[1] & 0xFF;
    int b2 = a.m[2] & 0xFF;
    int b3 = a.m[3] & 0xFF;

    int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
    return vint4(b, 0, 0, 0);
}
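
// For example (illustrative only): pack_low_bytes(vint4(0x11, 0x22, 0x33, 0x44))
// returns a vector whose lane 0 holds 0x44332211 and whose other lanes are
// zero. This is typically paired with store_nbytes() above, which writes only
// the 4 bytes of lane 0 back to memory.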

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
    return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
                 (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
                 (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
                 (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] + b.m[0],
                   a.m[1] + b.m[1],
                   a.m[2] + b.m[2],
                   a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] - b.m[0],
                   a.m[1] - b.m[1],
                   a.m[2] - b.m[2],
                   a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] * b.m[0],
                   a.m[1] * b.m[1],
                   a.m[2] * b.m[2],
                   a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] / b.m[0],
                   a.m[1] / b.m[1],
                   a.m[2] / b.m[2],
                   a.m[3] / b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] == b.m[0],
                  a.m[1] == b.m[1],
                  a.m[2] == b.m[2],
                  a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] != b.m[0],
                  a.m[1] != b.m[1],
                  a.m[2] != b.m[2],
                  a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] < b.m[0],
                  a.m[1] < b.m[1],
                  a.m[2] < b.m[2],
                  a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] > b.m[0],
                  a.m[1] > b.m[1],
                  a.m[2] > b.m[2],
                  a.m[3] > b.m[3]);
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] <= b.m[0],
                  a.m[1] <= b.m[1],
                  a.m[2] <= b.m[2],
                  a.m[3] <= b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] >= b.m[0],
                  a.m[1] >= b.m[1],
                  a.m[2] >= b.m[2],
                  a.m[3] >= b.m[3]);
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
                   a.m[1] < b.m[1] ? a.m[1] : b.m[1],
                   a.m[2] < b.m[2] ? a.m[2] : b.m[2],
                   a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
                   a.m[1] > b.m[1] ? a.m[1] : b.m[1],
                   a.m[2] > b.m[2] ? a.m[2] : b.m[2],
                   a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
    return vfloat4(std::abs(a.m[0]),
                   std::abs(a.m[1]),
                   std::abs(a.m[2]),
                   std::abs(a.m[3]));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
    assert(std::fegetround() == FE_TONEAREST);
    return vfloat4(std::nearbyint(a.m[0]),
                   std::nearbyint(a.m[1]),
                   std::nearbyint(a.m[2]),
                   std::nearbyint(a.m[3]));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
    float tmp1 = std::min(a.m[0], a.m[1]);
    float tmp2 = std::min(a.m[2], a.m[3]);
    return vfloat4(std::min(tmp1, tmp2));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
    float tmp1 = std::max(a.m[0], a.m[1]);
    float tmp2 = std::max(a.m[2], a.m[3]);
    return vfloat4(std::max(tmp1, tmp2));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
    // Use halving add, gives invariance with SIMD versions
    return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
    return vfloat4(std::sqrt(a.m[0]),
                   std::sqrt(a.m[1]),
                   std::sqrt(a.m[2]),
                   std::sqrt(a.m[3]));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
    return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
                   (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
                   (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
                   (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
    return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
                   (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
                   (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
                   (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}
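
// Note: in this scalar fallback both select() and select_msb() test only the
// sign bit of each mask lane. The masks produced by this module are always
// 0 or -1 per lane, so testing the sign bit is equivalent to testing the
// whole lane and the two functions behave identically here.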

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
    return vfloat4(base[indices.m[0]],
                   base[indices.m[1]],
                   base[indices.m[2]],
                   base[indices.m[3]]);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr)
{
    ptr[0] = a.m[0];
    ptr[1] = a.m[1];
    ptr[2] = a.m[2];
    ptr[3] = a.m[3];
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
{
    ptr[0] = a.m[0];
    ptr[1] = a.m[1];
    ptr[2] = a.m[2];
    ptr[3] = a.m[3];
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
    return vint4(static_cast<int>(a.m[0]),
                 static_cast<int>(a.m[1]),
                 static_cast<int>(a.m[2]),
                 static_cast<int>(a.m[3]));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
    a = a + vfloat4(0.5f);
    return vint4(static_cast<int>(a.m[0]),
                 static_cast<int>(a.m[1]),
                 static_cast<int>(a.m[2]),
                 static_cast<int>(a.m[3]));
}
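
// Note: the add-0.5-then-truncate sequence above implements round-to-nearest
// only for non-negative inputs; negative inputs would be biased upwards.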

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
    return vfloat4(static_cast<float>(a.m[0]),
                   static_cast<float>(a.m[1]),
                   static_cast<float>(a.m[2]),
                   static_cast<float>(a.m[3]));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
    return vint4(
        float_to_sf16(a.lane<0>()),
        float_to_sf16(a.lane<1>()),
        float_to_sf16(a.lane<2>()),
        float_to_sf16(a.lane<3>()));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
    return float_to_sf16(a);
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
    return vfloat4(
        sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
    return sf16_to_float(a);
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
    vint4 r;
    std::memcpy(r.m, a.m, 4 * 4);
    return r;
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
{
    vfloat4 r;
    std::memcpy(r.m, a.m, 4 * 4);
    return r;
}
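
// A common use of this bit-pattern round trip (illustrative sketch only, not
// part of this file's API) is sign-bit manipulation on IEEE 754 floats, e.g.
// clearing the sign bit of every lane of some vfloat4 v to get its absolute
// value:
//
//     vfloat4 av = int_as_float(float_as_int(v) & vint4(0x7FFFFFFF));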

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
    t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
    t0p = t0;
    t1p = t1;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vint4 t0, vint4 t1, vint4 t2, vint4 t3,
    vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
    t0p = t0;
    t1p = t1;
    t2p = t2;
    t3p = t3;
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
    uint8_t table[16];
    std::memcpy(table + 0, t0.m, 4 * sizeof(int));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
    uint8_t table[32];
    std::memcpy(table +  0, t0.m, 4 * sizeof(int));
    std::memcpy(table + 16, t1.m, 4 * sizeof(int));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
    uint8_t table[64];
    std::memcpy(table +  0, t0.m, 4 * sizeof(int));
    std::memcpy(table + 16, t1.m, 4 * sizeof(int));
    std::memcpy(table + 32, t2.m, 4 * sizeof(int));
    std::memcpy(table + 48, t3.m, 4 * sizeof(int));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
}
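
// Note: vtable_prepare() is a pass-through in this fallback, but callers still
// route table data through it, which leaves other backends free to repack the
// data if they need to. Each vint4 table argument contributes 16 bytes (its
// four ints copied with memcpy), and each idx lane selects one byte of the
// assembled table, so index values must be smaller than the table size
// (16, 32, or 64 entries).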

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
    return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
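
// For example (illustrative only): each output lane packs R into bits 0-7,
// G into bits 8-15, B into bits 16-23 and A into bits 24-31, i.e. the lane
// value reads 0xAABBGGRR when written as hex.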

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
    std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * Input is a byte array of at least 4 bytes per unmasked entry.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
    if (mask.m[3])
    {
        store(data, base);
    }
    else if (mask.m[2])
    {
        store_lane(base + 0, data.lane<0>());
        store_lane(base + 4, data.lane<1>());
        store_lane(base + 8, data.lane<2>());
    }
    else if (mask.m[1])
    {
        store_lane(base + 0, data.lane<0>());
        store_lane(base + 4, data.lane<1>());
    }
    else if (mask.m[0])
    {
        store_lane(base + 0, data.lane<0>());
    }
}
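
// Usage sketch (illustrative only): with mask = vmask4(true, true, false, false)
// only the first two lanes (8 bytes) are written to base and the remaining
// bytes are left untouched. Lanes with the mask set are the ones stored, and
// they must form a contiguous run starting at lane 0, as the doc comment above
// requires.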

#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED