transcendental.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526
  1. // Copyright 2009-2021 Intel Corporation
  2. // SPDX-License-Identifier: Apache-2.0
  3. #pragma once
  4. // Transcendental functions from "ispc": https://github.com/ispc/ispc/
  5. // Most of the transcendental implementations in ispc code come from
  6. // Solomon Boulos's "syrah": https://github.com/boulos/syrah/
  7. #include "../simd/simd.h"
  8. namespace embree
  9. {
  10. namespace fastapprox
  11. {
  12. template <typename T>
  13. __forceinline T sin(const T &v)
  14. {
  15. static const float piOverTwoVec = 1.57079637050628662109375;
  16. static const float twoOverPiVec = 0.636619746685028076171875;
  17. auto scaled = v * twoOverPiVec;
  18. auto kReal = floor(scaled);
  19. auto k = toInt(kReal);
  20. // Reduced range version of x
  21. auto x = v - kReal * piOverTwoVec;
  22. auto kMod4 = k & 3;
  23. auto sinUseCos = (kMod4 == 1) | (kMod4 == 3);
  24. auto flipSign = (kMod4 > 1);
  25. // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
  26. // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
  27. static const float sinC2 = -0.16666667163372039794921875;
  28. static const float sinC4 = +8.333347737789154052734375e-3;
  29. static const float sinC6 = -1.9842604524455964565277099609375e-4;
  30. static const float sinC8 = +2.760012648650445044040679931640625e-6;
  31. static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
  32. static const float cosC2 = -0.5;
  33. static const float cosC4 = +4.166664183139801025390625e-2;
  34. static const float cosC6 = -1.388833043165504932403564453125e-3;
  35. static const float cosC8 = +2.47562347794882953166961669921875e-5;
  36. static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
  37. auto outside = select(sinUseCos, 1., x);
  38. auto c2 = select(sinUseCos, T(cosC2), T(sinC2));
  39. auto c4 = select(sinUseCos, T(cosC4), T(sinC4));
  40. auto c6 = select(sinUseCos, T(cosC6), T(sinC6));
  41. auto c8 = select(sinUseCos, T(cosC8), T(sinC8));
  42. auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
  43. auto x2 = x * x;
  44. auto formula = x2 * c10 + c8;
  45. formula = x2 * formula + c6;
  46. formula = x2 * formula + c4;
  47. formula = x2 * formula + c2;
  48. formula = x2 * formula + 1.;
  49. formula *= outside;
  50. formula = select(flipSign, -formula, formula);
  51. return formula;
  52. }
  53. template <typename T>
  54. __forceinline T cos(const T &v)
  55. {
  56. static const float piOverTwoVec = 1.57079637050628662109375;
  57. static const float twoOverPiVec = 0.636619746685028076171875;
  58. auto scaled = v * twoOverPiVec;
  59. auto kReal = floor(scaled);
  60. auto k = toInt(kReal);
  61. // Reduced range version of x
  62. auto x = v - kReal * piOverTwoVec;
  63. auto kMod4 = k & 3;
  64. auto cosUseCos = (kMod4 == 0) | (kMod4 == 2);
  65. auto flipSign = (kMod4 == 1) | (kMod4 == 2);
  66. const float sinC2 = -0.16666667163372039794921875;
  67. const float sinC4 = +8.333347737789154052734375e-3;
  68. const float sinC6 = -1.9842604524455964565277099609375e-4;
  69. const float sinC8 = +2.760012648650445044040679931640625e-6;
  70. const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
  71. const float cosC2 = -0.5;
  72. const float cosC4 = +4.166664183139801025390625e-2;
  73. const float cosC6 = -1.388833043165504932403564453125e-3;
  74. const float cosC8 = +2.47562347794882953166961669921875e-5;
  75. const float cosC10 = -2.59630184018533327616751194000244140625e-7;
  76. auto outside = select(cosUseCos, 1., x);
  77. auto c2 = select(cosUseCos, T(cosC2), T(sinC2));
  78. auto c4 = select(cosUseCos, T(cosC4), T(sinC4));
  79. auto c6 = select(cosUseCos, T(cosC6), T(sinC6));
  80. auto c8 = select(cosUseCos, T(cosC8), T(sinC8));
  81. auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
  82. auto x2 = x * x;
  83. auto formula = x2 * c10 + c8;
  84. formula = x2 * formula + c6;
  85. formula = x2 * formula + c4;
  86. formula = x2 * formula + c2;
  87. formula = x2 * formula + 1.;
  88. formula *= outside;
  89. formula = select(flipSign, -formula, formula);
  90. return formula;
  91. }
  92. template <typename T>
  93. __forceinline void sincos(const T &v, T &sinResult, T &cosResult)
  94. {
  95. const float piOverTwoVec = 1.57079637050628662109375;
  96. const float twoOverPiVec = 0.636619746685028076171875;
  97. auto scaled = v * twoOverPiVec;
  98. auto kReal = floor(scaled);
  99. auto k = toInt(kReal);
  100. // Reduced range version of x
  101. auto x = v - kReal * piOverTwoVec;
  102. auto kMod4 = k & 3;
  103. auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
  104. auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
  105. auto sinFlipSign = (kMod4 > 1);
  106. auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
  107. const float oneVec = +1.;
  108. const float sinC2 = -0.16666667163372039794921875;
  109. const float sinC4 = +8.333347737789154052734375e-3;
  110. const float sinC6 = -1.9842604524455964565277099609375e-4;
  111. const float sinC8 = +2.760012648650445044040679931640625e-6;
  112. const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
  113. const float cosC2 = -0.5;
  114. const float cosC4 = +4.166664183139801025390625e-2;
  115. const float cosC6 = -1.388833043165504932403564453125e-3;
  116. const float cosC8 = +2.47562347794882953166961669921875e-5;
  117. const float cosC10 = -2.59630184018533327616751194000244140625e-7;
  118. auto x2 = x * x;
  119. auto sinFormula = x2 * sinC10 + sinC8;
  120. auto cosFormula = x2 * cosC10 + cosC8;
  121. sinFormula = x2 * sinFormula + sinC6;
  122. cosFormula = x2 * cosFormula + cosC6;
  123. sinFormula = x2 * sinFormula + sinC4;
  124. cosFormula = x2 * cosFormula + cosC4;
  125. sinFormula = x2 * sinFormula + sinC2;
  126. cosFormula = x2 * cosFormula + cosC2;
  127. sinFormula = x2 * sinFormula + oneVec;
  128. cosFormula = x2 * cosFormula + oneVec;
  129. sinFormula *= x;
  130. sinResult = select(sinUseCos, cosFormula, sinFormula);
  131. cosResult = select(cosUseCos, cosFormula, sinFormula);
  132. sinResult = select(sinFlipSign, -sinResult, sinResult);
  133. cosResult = select(cosFlipSign, -cosResult, cosResult);
  134. }
  135. template <typename T>
  136. __forceinline T tan(const T &v)
  137. {
  138. const float piOverFourVec = 0.785398185253143310546875;
  139. const float fourOverPiVec = 1.27323949337005615234375;
  140. auto xLt0 = v < 0.;
  141. auto y = select(xLt0, -v, v);
  142. auto scaled = y * fourOverPiVec;
  143. auto kReal = floor(scaled);
  144. auto k = toInt(kReal);
  145. auto x = y - kReal * piOverFourVec;
  146. // If k & 1, x -= Pi/4
  147. auto needOffset = (k & 1) != 0;
  148. x = select(needOffset, x - piOverFourVec, x);
  149. // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
  150. auto kMod4 = k & 3;
  151. auto useCotan = (kMod4 == 1) | (kMod4 == 2);
  152. const float oneVec = 1.0;
  153. const float tanC2 = +0.33333075046539306640625;
  154. const float tanC4 = +0.13339905440807342529296875;
  155. const float tanC6 = +5.3348250687122344970703125e-2;
  156. const float tanC8 = +2.46033705770969390869140625e-2;
  157. const float tanC10 = +2.892402000725269317626953125e-3;
  158. const float tanC12 = +9.500005282461643218994140625e-3;
  159. const float cotC2 = -0.3333333432674407958984375;
  160. const float cotC4 = -2.222204394638538360595703125e-2;
  161. const float cotC6 = -2.11752182804048061370849609375e-3;
  162. const float cotC8 = -2.0846328698098659515380859375e-4;
  163. const float cotC10 = -2.548247357481159269809722900390625e-5;
  164. const float cotC12 = -3.5257363606433500535786151885986328125e-7;
  165. auto x2 = x * x;
  166. T z;
  167. if (any(useCotan))
  168. {
  169. auto cotVal = x2 * cotC12 + cotC10;
  170. cotVal = x2 * cotVal + cotC8;
  171. cotVal = x2 * cotVal + cotC6;
  172. cotVal = x2 * cotVal + cotC4;
  173. cotVal = x2 * cotVal + cotC2;
  174. cotVal = x2 * cotVal + oneVec;
  175. // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
  176. cotVal /= -x;
  177. z = cotVal;
  178. }
  179. auto useTan = !useCotan;
  180. if (any(useTan))
  181. {
  182. auto tanVal = x2 * tanC12 + tanC10;
  183. tanVal = x2 * tanVal + tanC8;
  184. tanVal = x2 * tanVal + tanC6;
  185. tanVal = x2 * tanVal + tanC4;
  186. tanVal = x2 * tanVal + tanC2;
  187. tanVal = x2 * tanVal + oneVec;
  188. // Equation was for tan(x)/x
  189. tanVal *= x;
  190. z = select(useTan, tanVal, z);
  191. }
  192. return select(xLt0, -z, z);
  193. }
  194. template <typename T>
  195. __forceinline T asin(const T &x0)
  196. {
  197. auto isneg = (x0 < 0.f);
  198. auto x = abs(x0);
  199. auto isnan = (x > 1.f);
  200. // sollya
  201. // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
  202. // [1e-20;.9999999999999999]);
  203. // avg error: 1.1105439e-06, max error 1.3187528e-06
  204. auto v = 1.57079517841339111328125f +
  205. x * (-0.21450997889041900634765625f +
  206. x * (8.78556668758392333984375e-2f +
  207. x * (-4.489909112453460693359375e-2f +
  208. x * (1.928029954433441162109375e-2f +
  209. x * (-4.3095736764371395111083984375e-3f)))));
  210. v *= -sqrt(1.f - x);
  211. v = v + 1.57079637050628662109375f;
  212. v = select(v < 0.f, T(0.f), v);
  213. v = select(isneg, -v, v);
  214. v = select(isnan, T(cast_i2f(0x7fc00000)), v);
  215. return v;
  216. }
  217. template <typename T>
  218. __forceinline T acos(const T &v)
  219. {
  220. return 1.57079637050628662109375f - asin(v);
  221. }
  222. template <typename T>
  223. __forceinline T atan(const T &v)
  224. {
  225. const float piOverTwoVec = 1.57079637050628662109375;
  226. // atan(-x) = -atan(x) (so flip from negative to positive first)
  227. // If x > 1 -> atan(x) = Pi/2 - atan(1/x)
  228. auto xNeg = v < 0.f;
  229. auto xFlipped = select(xNeg, -v, v);
  230. auto xGt1 = xFlipped > 1.;
  231. auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
  232. // These coefficients approximate atan(x)/x
  233. const float atanC0 = +0.99999988079071044921875;
  234. const float atanC2 = -0.3333191573619842529296875;
  235. const float atanC4 = +0.199689209461212158203125;
  236. const float atanC6 = -0.14015688002109527587890625;
  237. const float atanC8 = +9.905083477497100830078125e-2;
  238. const float atanC10 = -5.93664981424808502197265625e-2;
  239. const float atanC12 = +2.417283318936824798583984375e-2;
  240. const float atanC14 = -4.6721356920897960662841796875e-3;
  241. auto x2 = x * x;
  242. auto result = x2 * atanC14 + atanC12;
  243. result = x2 * result + atanC10;
  244. result = x2 * result + atanC8;
  245. result = x2 * result + atanC6;
  246. result = x2 * result + atanC4;
  247. result = x2 * result + atanC2;
  248. result = x2 * result + atanC0;
  249. result *= x;
  250. result = select(xGt1, piOverTwoVec - result, result);
  251. result = select(xNeg, -result, result);
  252. return result;
  253. }
  254. template <typename T>
  255. __forceinline T atan2(const T &y, const T &x)
  256. {
  257. const float piVec = 3.1415926536;
  258. // atan2(y, x) =
  259. //
  260. // atan2(y > 0, x = +-0) -> Pi/2
  261. // atan2(y < 0, x = +-0) -> -Pi/2
  262. // atan2(y = +-0, x < +0) -> +-Pi
  263. // atan2(y = +-0, x >= +0) -> +-0
  264. //
  265. // atan2(y >= 0, x < 0) -> Pi + atan(y/x)
  266. // atan2(y < 0, x < 0) -> -Pi + atan(y/x)
  267. // atan2(y, x > 0) -> atan(y/x)
  268. //
  269. // and then a bunch of code for dealing with infinities.
  270. auto yOverX = y * rcpSafe(x);
  271. auto atanArg = atan(yOverX);
  272. auto xLt0 = x < 0.f;
  273. auto yLt0 = y < 0.f;
  274. auto offset = select(xLt0,
  275. select(yLt0, T(-piVec), T(piVec)), 0.f);
  276. return offset + atanArg;
  277. }
  278. template <typename T>
  279. __forceinline T exp(const T &v)
  280. {
  281. const float ln2Part1 = 0.6931457519;
  282. const float ln2Part2 = 1.4286067653e-6;
  283. const float oneOverLn2 = 1.44269502162933349609375;
  284. auto scaled = v * oneOverLn2;
  285. auto kReal = floor(scaled);
  286. auto k = toInt(kReal);
  287. // Reduced range version of x
  288. auto x = v - kReal * ln2Part1;
  289. x -= kReal * ln2Part2;
  290. // These coefficients are for e^x in [0, ln(2)]
  291. const float one = 1.;
  292. const float c2 = 0.4999999105930328369140625;
  293. const float c3 = 0.166668415069580078125;
  294. const float c4 = 4.16539050638675689697265625e-2;
  295. const float c5 = 8.378830738365650177001953125e-3;
  296. const float c6 = 1.304379315115511417388916015625e-3;
  297. const float c7 = 2.7555381529964506626129150390625e-4;
  298. auto result = x * c7 + c6;
  299. result = x * result + c5;
  300. result = x * result + c4;
  301. result = x * result + c3;
  302. result = x * result + c2;
  303. result = x * result + one;
  304. result = x * result + one;
  305. // Compute 2^k (should differ for float and double, but I'll avoid
  306. // it for now and just do floats)
  307. const int fpbias = 127;
  308. auto biasedN = k + fpbias;
  309. auto overflow = kReal > fpbias;
  310. // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
  311. // we've got underflow. -127 * ln(2) -> -88.02. So the most
  312. // negative float input that doesn't result in zero is like -88.
  313. auto underflow = kReal <= -fpbias;
  314. const int infBits = 0x7f800000;
  315. biasedN <<= 23;
  316. // Reinterpret this thing as float
  317. auto twoToTheN = asFloat(biasedN);
  318. // Handle both doubles and floats (hopefully eliding the copy for float)
  319. auto elemtype2n = twoToTheN;
  320. result *= elemtype2n;
  321. result = select(overflow, cast_i2f(infBits), result);
  322. result = select(underflow, 0., result);
  323. return result;
  324. }
  325. // Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
  326. // * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
  327. template <typename T, typename R>
  328. __forceinline void __rangeReduceLog(const T &input,
  329. T &reduced,
  330. R &exponent)
  331. {
  332. auto intVersion = asInt(input);
  333. // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
  334. // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
  335. // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
  336. // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
  337. // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
  338. //const int exponentMask(0x7F800000)
  339. static const int nonexponentMask = 0x807FFFFF;
  340. // We want the reduced version to have an exponent of -1 which is
  341. // -1 + 127 after biasing or 126
  342. static const int exponentNeg1 = (126l << 23);
  343. // NOTE(boulos): We don't need to mask anything out since we know
  344. // the sign bit has to be 0. If it's 1, we need to return infinity/nan
  345. // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
  346. auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
  347. auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
  348. exponent = offsetExponent - 127; // get the real value
  349. // Blend the offset_exponent with the original input (do this in
  350. // int for now, until I decide if float can have & and &not)
  351. auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
  352. reduced = asFloat(blended);
  353. }
  354. template <typename T> struct ExponentType { };
  355. template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; };
  356. template <> struct ExponentType<float> { typedef int Ty; };
  357. template <typename T>
  358. __forceinline T log(const T &v)
  359. {
  360. T reduced;
  361. typename ExponentType<T>::Ty exponent;
  362. const int nanBits = 0x7fc00000;
  363. const int negInfBits = 0xFF800000;
  364. const float nan = cast_i2f(nanBits);
  365. const float negInf = cast_i2f(negInfBits);
  366. auto useNan = v < 0.;
  367. auto useInf = v == 0.;
  368. auto exceptional = useNan | useInf;
  369. const float one = 1.0;
  370. auto patched = select(exceptional, one, v);
  371. __rangeReduceLog(patched, reduced, exponent);
  372. const float ln2 = 0.693147182464599609375;
  373. auto x1 = one - reduced;
  374. const float c1 = +0.50000095367431640625;
  375. const float c2 = +0.33326041698455810546875;
  376. const float c3 = +0.2519190013408660888671875;
  377. const float c4 = +0.17541764676570892333984375;
  378. const float c5 = +0.3424419462680816650390625;
  379. const float c6 = -0.599632322788238525390625;
  380. const float c7 = +1.98442304134368896484375;
  381. const float c8 = -2.4899270534515380859375;
  382. const float c9 = +1.7491014003753662109375;
  383. auto result = x1 * c9 + c8;
  384. result = x1 * result + c7;
  385. result = x1 * result + c6;
  386. result = x1 * result + c5;
  387. result = x1 * result + c4;
  388. result = x1 * result + c3;
  389. result = x1 * result + c2;
  390. result = x1 * result + c1;
  391. result = x1 * result + one;
  392. // Equation was for -(ln(red)/(1-red))
  393. result *= -x1;
  394. result += toFloat(exponent) * ln2;
  395. return select(exceptional,
  396. select(useNan, T(nan), T(negInf)),
  397. result);
  398. }
  399. template <typename T>
  400. __forceinline T pow(const T &x, const T &y)
  401. {
  402. auto x1 = abs(x);
  403. auto z = exp(y * log(x1));
  404. // Handle special cases
  405. const float twoOver23 = 8388608.0f;
  406. auto yInt = y == round(y);
  407. auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
  408. // x == 0
  409. z = select(x == 0.0f,
  410. select(y < 0.0f, T(inf) | signmsk(x),
  411. select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
  412. // x < 0
  413. auto xNegative = x < 0.0f;
  414. if (any(xNegative))
  415. {
  416. auto z1 = z | asFloat(yOddInt);
  417. z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
  418. z = select(xNegative, z1, z);
  419. }
  420. auto xFinite = isfinite(x);
  421. auto yFinite = isfinite(y);
  422. if (all(xFinite & yFinite))
  423. return z;
  424. // x finite and y infinite
  425. z = select(andn(xFinite, yFinite),
  426. select(x1 == 1.0f, 1.0f,
  427. select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
  428. // x infinite
  429. z = select(xFinite, z,
  430. select(y == 0.0f, 1.0f,
  431. select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
  432. return z;
  433. }
  434. template <typename T>
  435. __forceinline T pow(const T &x, float y)
  436. {
  437. return pow(x, T(y));
  438. }
  439. } // namespace fastapprox
  440. } // namespace embree