// Dither.cpp
  1. #include <algorithm>
  2. #include <string.h>
  3. #include "Dither.hpp"
  4. #include "Math.hpp"
  5. #ifdef __SSE4_1__
  6. # ifdef _MSC_VER
  7. # include <intrin.h>
  8. # include <Windows.h>
  9. # else
  10. # include <x86intrin.h>
  11. # endif
  12. #endif
  13. #ifdef __AVX2__
  14. void DitherAvx2( uint8_t* data, __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
  15. {
  16. static constexpr uint8_t a31[] = { 0, 0, 0, 1, 2, 0, 4, 0, 0, 2, 0, 0, 4, 0, 3, 0 };
  17. static constexpr uint8_t a63[] = { 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0 };
  18. static constexpr uint8_t s31[] = { 5, 0, 4, 0, 0, 2, 0, 1, 3, 0, 4, 0, 0, 0, 0, 2 };
  19. static constexpr uint8_t s63[] = { 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1 };
  20. const __m256i BayerAdd0 = _mm256_setr_epi8(
  21. a31[0], a63[0], a31[0], 0, a31[1], a63[1], a31[1], 0, a31[2], a63[2], a31[2], 0, a31[3], a63[3], a31[3], 0,
  22. a31[4], a63[4], a31[4], 0, a31[5], a63[5], a31[5], 0, a31[6], a63[6], a31[6], 0, a31[7], a63[7], a31[7], 0
  23. );
  24. const __m256i BayerAdd1 = _mm256_setr_epi8(
  25. a31[8], a63[8], a31[8], 0, a31[9], a63[9], a31[9], 0, a31[10], a63[10], a31[10], 0, a31[11], a63[11], a31[11], 0,
  26. a31[12], a63[12], a31[12], 0, a31[13], a63[13], a31[13], 0, a31[14], a63[14], a31[14], 0, a31[15], a63[15], a31[15], 0
  27. );
  28. const __m256i BayerSub0 = _mm256_setr_epi8(
  29. s31[0], s63[0], s31[0], 0, s31[1], s63[1], s31[1], 0, s31[2], s63[2], s31[2], 0, s31[3], s63[3], s31[3], 0,
  30. s31[4], s63[4], s31[4], 0, s31[5], s63[5], s31[5], 0, s31[6], s63[6], s31[6], 0, s31[7], s63[7], s31[7], 0
  31. );
  32. const __m256i BayerSub1 = _mm256_setr_epi8(
  33. s31[8], s63[8], s31[8], 0, s31[9], s63[9], s31[9], 0, s31[10], s63[10], s31[10], 0, s31[11], s63[11], s31[11], 0,
  34. s31[12], s63[12], s31[12], 0, s31[13], s63[13], s31[13], 0, s31[14], s63[14], s31[14], 0, s31[15], s63[15], s31[15], 0
  35. );
  36. __m256i l0 = _mm256_inserti128_si256( _mm256_castsi128_si256( px0 ), px1, 1 );
  37. __m256i l1 = _mm256_inserti128_si256( _mm256_castsi128_si256( px2 ), px3, 1 );
  38. __m256i a0 = _mm256_adds_epu8( l0, BayerAdd0 );
  39. __m256i a1 = _mm256_adds_epu8( l1, BayerAdd1 );
  40. __m256i s0 = _mm256_subs_epu8( a0, BayerSub0 );
  41. __m256i s1 = _mm256_subs_epu8( a1, BayerSub1 );
  42. _mm256_storeu_si256( (__m256i*)(data ), s0 );
  43. _mm256_storeu_si256( (__m256i*)(data+32), s1 );
  44. }
  45. #endif
  46. void Dither( uint8_t* data )
  47. {
  48. #ifdef __AVX2__
  49. static constexpr uint8_t a31[] = { 0, 0, 0, 1, 2, 0, 4, 0, 0, 2, 0, 0, 4, 0, 3, 0 };
  50. static constexpr uint8_t a63[] = { 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0 };
  51. static constexpr uint8_t s31[] = { 5, 0, 4, 0, 0, 2, 0, 1, 3, 0, 4, 0, 0, 0, 0, 2 };
  52. static constexpr uint8_t s63[] = { 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1 };
  53. const __m256i BayerAdd0 = _mm256_setr_epi8(
  54. a31[0], a63[0], a31[0], 0, a31[1], a63[1], a31[1], 0, a31[2], a63[2], a31[2], 0, a31[3], a63[3], a31[3], 0,
  55. a31[4], a63[4], a31[4], 0, a31[5], a63[5], a31[5], 0, a31[6], a63[6], a31[6], 0, a31[7], a63[7], a31[7], 0
  56. );
  57. const __m256i BayerAdd1 = _mm256_setr_epi8(
  58. a31[8], a63[8], a31[8], 0, a31[9], a63[9], a31[9], 0, a31[10], a63[10], a31[10], 0, a31[11], a63[11], a31[11], 0,
  59. a31[12], a63[12], a31[12], 0, a31[13], a63[13], a31[13], 0, a31[14], a63[14], a31[14], 0, a31[15], a63[15], a31[15], 0
  60. );
  61. const __m256i BayerSub0 = _mm256_setr_epi8(
  62. s31[0], s63[0], s31[0], 0, s31[1], s63[1], s31[1], 0, s31[2], s63[2], s31[2], 0, s31[3], s63[3], s31[3], 0,
  63. s31[4], s63[4], s31[4], 0, s31[5], s63[5], s31[5], 0, s31[6], s63[6], s31[6], 0, s31[7], s63[7], s31[7], 0
  64. );
  65. const __m256i BayerSub1 = _mm256_setr_epi8(
  66. s31[8], s63[8], s31[8], 0, s31[9], s63[9], s31[9], 0, s31[10], s63[10], s31[10], 0, s31[11], s63[11], s31[11], 0,
  67. s31[12], s63[12], s31[12], 0, s31[13], s63[13], s31[13], 0, s31[14], s63[14], s31[14], 0, s31[15], s63[15], s31[15], 0
  68. );
  69. __m256i px0 = _mm256_loadu_si256( (__m256i*)(data ) );
  70. __m256i px1 = _mm256_loadu_si256( (__m256i*)(data+32) );
  71. __m256i a0 = _mm256_adds_epu8( px0, BayerAdd0 );
  72. __m256i a1 = _mm256_adds_epu8( px1, BayerAdd1 );
  73. __m256i s0 = _mm256_subs_epu8( a0, BayerSub0 );
  74. __m256i s1 = _mm256_subs_epu8( a1, BayerSub1 );
  75. _mm256_storeu_si256( (__m256i*)(data ), s0 );
  76. _mm256_storeu_si256( (__m256i*)(data+32), s1 );
  77. #else
  78. static constexpr int8_t Bayer31[16] = {
  79. ( 0-8)*2/3, ( 8-8)*2/3, ( 2-8)*2/3, (10-8)*2/3,
  80. (12-8)*2/3, ( 4-8)*2/3, (14-8)*2/3, ( 6-8)*2/3,
  81. ( 3-8)*2/3, (11-8)*2/3, ( 1-8)*2/3, ( 9-8)*2/3,
  82. (15-8)*2/3, ( 7-8)*2/3, (13-8)*2/3, ( 5-8)*2/3
  83. };
  84. static constexpr int8_t Bayer63[16] = {
  85. ( 0-8)*2/6, ( 8-8)*2/6, ( 2-8)*2/6, (10-8)*2/6,
  86. (12-8)*2/6, ( 4-8)*2/6, (14-8)*2/6, ( 6-8)*2/6,
  87. ( 3-8)*2/6, (11-8)*2/6, ( 1-8)*2/6, ( 9-8)*2/6,
  88. (15-8)*2/6, ( 7-8)*2/6, (13-8)*2/6, ( 5-8)*2/6
  89. };
  90. for( int i=0; i<16; i++ )
  91. {
  92. uint32_t col;
  93. memcpy( &col, data, 4 );
  94. uint8_t r = col & 0xFF;
  95. uint8_t g = ( col >> 8 ) & 0xFF;
  96. uint8_t b = ( col >> 16 ) & 0xFF;
  97. r = clampu8( r + Bayer31[i] );
  98. g = clampu8( g + Bayer63[i] );
  99. b = clampu8( b + Bayer31[i] );
  100. col = r | ( g << 8 ) | ( b << 16 );
  101. memcpy( data, &col, 4 );
  102. data += 4;
  103. }
  104. #endif
  105. }