// astcenc_vecmathlib_common_4.h
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2020-2021 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. /**
  18. * @brief Generic 4x32-bit vector functions.
  19. *
  20. * This module implements generic 4-wide vector functions that are valid for
  21. * all instruction sets, typically implemented using lower level 4-wide
  22. * operations that are ISA-specific.
  23. */
  24. #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
  25. #define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
  26. #ifndef ASTCENC_SIMD_INLINE
  27. #error "Include astcenc_vecmathlib.h, do not include directly"
  28. #endif
  29. #include <cstdio>
  30. // ============================================================================
  31. // vmask4 operators and functions
  32. // ============================================================================
  33. /**
  34. * @brief True if any lanes are enabled, false otherwise.
  35. */
  36. ASTCENC_SIMD_INLINE bool any(vmask4 a)
  37. {
  38. return mask(a) != 0;
  39. }
  40. /**
  41. * @brief True if all lanes are enabled, false otherwise.
  42. */
  43. ASTCENC_SIMD_INLINE bool all(vmask4 a)
  44. {
  45. return mask(a) == 0xF;
  46. }
  47. // ============================================================================
  48. // vint4 operators and functions
  49. // ============================================================================
  50. /**
  51. * @brief Overload: vector by scalar addition.
  52. */
  53. ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
  54. {
  55. return a + vint4(b);
  56. }
  57. /**
  58. * @brief Overload: vector by vector incremental addition.
  59. */
  60. ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
  61. {
  62. a = a + b;
  63. return a;
  64. }
  65. /**
  66. * @brief Overload: vector by scalar subtraction.
  67. */
  68. ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
  69. {
  70. return a - vint4(b);
  71. }
  72. /**
  73. * @brief Overload: vector by scalar multiplication.
  74. */
  75. ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
  76. {
  77. return a * vint4(b);
  78. }
  79. /**
  80. * @brief Overload: vector by scalar bitwise or.
  81. */
  82. ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
  83. {
  84. return a | vint4(b);
  85. }
  86. /**
  87. * @brief Overload: vector by scalar bitwise and.
  88. */
  89. ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
  90. {
  91. return a & vint4(b);
  92. }
  93. /**
  94. * @brief Overload: vector by scalar bitwise xor.
  95. */
  96. ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
  97. {
  98. return a ^ vint4(b);
  99. }
  100. /**
  101. * @brief Return the clamped value between min and max.
  102. */
  103. ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
  104. {
  105. return min(max(a, vint4(minv)), vint4(maxv));
  106. }
  107. /**
  108. * @brief Return the horizontal sum of RGB vector lanes as a scalar.
  109. */
  110. ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
  111. {
  112. return a.lane<0>() + a.lane<1>() + a.lane<2>();
  113. }
  114. // ============================================================================
  115. // vfloat4 operators and functions
  116. // ============================================================================
  117. /**
  118. * @brief Overload: vector by vector incremental addition.
  119. */
  120. ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
  121. {
  122. a = a + b;
  123. return a;
  124. }
  125. /**
  126. * @brief Overload: vector by scalar addition.
  127. */
  128. ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
  129. {
  130. return a + vfloat4(b);
  131. }
  132. /**
  133. * @brief Overload: vector by scalar subtraction.
  134. */
  135. ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
  136. {
  137. return a - vfloat4(b);
  138. }
  139. /**
  140. * @brief Overload: vector by scalar multiplication.
  141. */
  142. ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
  143. {
  144. return a * vfloat4(b);
  145. }
  146. /**
  147. * @brief Overload: scalar by vector multiplication.
  148. */
  149. ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
  150. {
  151. return vfloat4(a) * b;
  152. }
  153. /**
  154. * @brief Overload: vector by scalar division.
  155. */
  156. ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
  157. {
  158. return a / vfloat4(b);
  159. }
  160. /**
  161. * @brief Overload: scalar by vector division.
  162. */
  163. ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
  164. {
  165. return vfloat4(a) / b;
  166. }
  167. /**
  168. * @brief Return the min vector of a vector and a scalar.
  169. *
  170. * If either lane value is NaN, @c b will be returned for that lane.
  171. */
  172. ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
  173. {
  174. return min(a, vfloat4(b));
  175. }
  176. /**
  177. * @brief Return the max vector of a vector and a scalar.
  178. *
  179. * If either lane value is NaN, @c b will be returned for that lane.
  180. */
  181. ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
  182. {
  183. return max(a, vfloat4(b));
  184. }
  185. /**
  186. * @brief Return the clamped value between min and max.
  187. *
  188. * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
  189. * then @c min will be returned for that lane.
  190. */
  191. ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
  192. {
  193. // Do not reorder - second operand will return if either is NaN
  194. return min(max(a, minv), maxv);
  195. }
  196. /**
  197. * @brief Return the clamped value between 0.0f and max.
  198. *
  199. * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
  200. * be returned for that lane.
  201. */
  202. ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
  203. {
  204. // Do not reorder - second operand will return if either is NaN
  205. return min(max(a, vfloat4::zero()), maxv);
  206. }
  207. /**
  208. * @brief Return the clamped value between 0.0f and 1.0f.
  209. *
  210. * If @c a is NaN then zero will be returned for that lane.
  211. */
  212. ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
  213. {
  214. // Do not reorder - second operand will return if either is NaN
  215. return min(max(a, vfloat4::zero()), 1.0f);
  216. }
  217. /**
  218. * @brief Return the horizontal minimum of a vector.
  219. */
  220. ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
  221. {
  222. return hmin(a).lane<0>();
  223. }
  224. /**
  225. * @brief Return the horizontal min of RGB vector lanes as a scalar.
  226. */
  227. ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
  228. {
  229. a.set_lane<3>(a.lane<0>());
  230. return hmin_s(a);
  231. }
  232. /**
  233. * @brief Return the horizontal maximum of a vector.
  234. */
  235. ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
  236. {
  237. return hmax(a).lane<0>();
  238. }
  239. /**
  240. * @brief Accumulate lane-wise sums for a vector.
  241. */
  242. ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
  243. {
  244. accum = accum + a;
  245. }
  246. /**
  247. * @brief Accumulate lane-wise sums for a masked vector.
  248. */
  249. ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
  250. {
  251. a = select(vfloat4::zero(), a, m);
  252. haccumulate(accum, a);
  253. }
  254. /**
  255. * @brief Return the horizontal sum of RGB vector lanes as a scalar.
  256. */
  257. ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
  258. {
  259. return a.lane<0>() + a.lane<1>() + a.lane<2>();
  260. }
  261. #if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
  262. /**
  263. * @brief Return the dot product for the full 4 lanes, returning scalar.
  264. */
  265. ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
  266. {
  267. vfloat4 m = a * b;
  268. return hadd_s(m);
  269. }
  270. /**
  271. * @brief Return the dot product for the full 4 lanes, returning vector.
  272. */
  273. ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
  274. {
  275. vfloat4 m = a * b;
  276. return vfloat4(hadd_s(m));
  277. }
  278. /**
  279. * @brief Return the dot product for the bottom 3 lanes, returning scalar.
  280. */
  281. ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
  282. {
  283. vfloat4 m = a * b;
  284. return hadd_rgb_s(m);
  285. }
  286. /**
  287. * @brief Return the dot product for the bottom 3 lanes, returning vector.
  288. */
  289. ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
  290. {
  291. vfloat4 m = a * b;
  292. float d3 = hadd_rgb_s(m);
  293. return vfloat4(d3, d3, d3, 0.0f);
  294. }
  295. #endif
  296. #if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
  297. /**
  298. * @brief Population bit count.
  299. *
  300. * @param v The value to population count.
  301. *
  302. * @return The number of 1 bits.
  303. */
  304. static inline int popcount(uint64_t v)
  305. {
  306. uint64_t mask1 = 0x5555555555555555ULL;
  307. uint64_t mask2 = 0x3333333333333333ULL;
  308. uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
  309. v -= (v >> 1) & mask1;
  310. v = (v & mask2) + ((v >> 2) & mask2);
  311. v += v >> 4;
  312. v &= mask3;
  313. v *= 0x0101010101010101ULL;
  314. v >>= 56;
  315. return static_cast<int>(v);
  316. }
  317. #endif
/**
 * @brief Apply signed bit transfer.
 *
 * Moves the top bit (0x80) of each @c input0 lane into the high bit of the
 * corresponding @c input1 lane, then rebuilds @c input0 as a sign-extended
 * 6-bit value in the range [-32, 31].
 *
 * Statement order matters: @c input1 must consume the original @c input0
 * before @c input0 is rewritten.
 *
 * @param input0 The first encoded endpoint.
 * @param input1 The second encoded endpoint.
 */
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
	vint4& input0,
	vint4& input1
) {
	// Shift input1 down one bit and splice in input0's top bit.
	input1 = lsr<1>(input1) | (input0 & 0x80);
	// Shift input0 down one bit and keep the low six bits.
	input0 = lsr<1>(input0) & 0x3F;
	// Sign extend: lanes with bit 5 set are remapped to negative values.
	vmask4 mask = (input0 & 0x20) != vint4::zero();
	input0 = select(input0, input0 - 0x40, mask);
}
  333. /**
  334. * @brief Debug function to print a vector of ints.
  335. */
  336. ASTCENC_SIMD_INLINE void print(vint4 a)
  337. {
  338. alignas(16) int v[4];
  339. storea(a, v);
  340. printf("v4_i32:\n %8d %8d %8d %8d\n",
  341. v[0], v[1], v[2], v[3]);
  342. }
  343. /**
  344. * @brief Debug function to print a vector of ints.
  345. */
  346. ASTCENC_SIMD_INLINE void printx(vint4 a)
  347. {
  348. alignas(16) int v[4];
  349. storea(a, v);
  350. printf("v4_i32:\n %08x %08x %08x %08x\n",
  351. v[0], v[1], v[2], v[3]);
  352. }
  353. /**
  354. * @brief Debug function to print a vector of floats.
  355. */
  356. ASTCENC_SIMD_INLINE void print(vfloat4 a)
  357. {
  358. alignas(16) float v[4];
  359. storea(a, v);
  360. printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
  361. static_cast<double>(v[0]), static_cast<double>(v[1]),
  362. static_cast<double>(v[2]), static_cast<double>(v[3]));
  363. }
  364. /**
  365. * @brief Debug function to print a vector of masks.
  366. */
  367. ASTCENC_SIMD_INLINE void print(vmask4 a)
  368. {
  369. print(select(vint4(0), vint4(1), a));
  370. }
  371. #endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED