quantization.cpp 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>
// Overlays a float and an unsigned int in the same storage so the bit pattern of one can be viewed as the other.
// Aggregate initialization of a union sets its first member, so 'f' must stay first for `FloatBits u = {value}`
// to store a float.
// NOTE(review): the bit manipulation built on top of this presumably requires both members to be 32 bits wide - confirm for new targets.
union FloatBits
{
	float f;         // storage viewed as a floating-point value
	unsigned int ui; // the same storage viewed as raw bits
};
  9. unsigned short meshopt_quantizeHalf(float v)
  10. {
  11. FloatBits u = {v};
  12. unsigned int ui = u.ui;
  13. int s = (ui >> 16) & 0x8000;
  14. int em = ui & 0x7fffffff;
  15. // bias exponent and round to nearest; 112 is relative exponent bias (127-15)
  16. int h = (em - (112 << 23) + (1 << 12)) >> 13;
  17. // underflow: flush to zero; 113 encodes exponent -14
  18. h = (em < (113 << 23)) ? 0 : h;
  19. // overflow: infinity; 143 encodes exponent 16
  20. h = (em >= (143 << 23)) ? 0x7c00 : h;
  21. // NaN; note that we convert all types of NaN to qNaN
  22. h = (em > (255 << 23)) ? 0x7e00 : h;
  23. return (unsigned short)(s | h);
  24. }
  25. float meshopt_quantizeFloat(float v, int N)
  26. {
  27. assert(N >= 0 && N <= 23);
  28. FloatBits u = {v};
  29. unsigned int ui = u.ui;
  30. const int mask = (1 << (23 - N)) - 1;
  31. const int round = (1 << (23 - N)) >> 1;
  32. int e = ui & 0x7f800000;
  33. unsigned int rui = (ui + round) & ~mask;
  34. // round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0
  35. ui = e == 0x7f800000 ? ui : rui;
  36. // flush denormals to zero
  37. ui = e == 0 ? 0 : ui;
  38. u.ui = ui;
  39. return u.f;
  40. }
  41. float meshopt_dequantizeHalf(unsigned short h)
  42. {
  43. unsigned int s = unsigned(h & 0x8000) << 16;
  44. int em = h & 0x7fff;
  45. // bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
  46. int r = (em + (112 << 10)) << 13;
  47. // denormal: flush to zero
  48. r = (em < (1 << 10)) ? 0 : r;
  49. // infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
  50. // 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
  51. r += (em >= (31 << 10)) ? (112 << 23) : 0;
  52. FloatBits u;
  53. u.ui = s | r;
  54. return u.f;
  55. }