dec_loop.c 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. static inline int
  2. dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
  3. {
  4. const __m256i lut_lo = _mm256_setr_epi8(
  5. 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
  6. 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
  7. 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
  8. 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
  9. const __m256i lut_hi = _mm256_setr_epi8(
  10. 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
  11. 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  12. 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
  13. 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
  14. const __m256i lut_roll = _mm256_setr_epi8(
  15. 0, 16, 19, 4, -65, -65, -71, -71,
  16. 0, 0, 0, 0, 0, 0, 0, 0,
  17. 0, 16, 19, 4, -65, -65, -71, -71,
  18. 0, 0, 0, 0, 0, 0, 0, 0);
  19. const __m256i mask_2F = _mm256_set1_epi8(0x2F);
  20. // Load input:
  21. __m256i str = _mm256_loadu_si256((__m256i *) *s);
  22. // See the SSSE3 decoder for an explanation of the algorithm.
  23. const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
  24. const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
  25. const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
  26. const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
  27. if (!_mm256_testz_si256(lo, hi)) {
  28. return 0;
  29. }
  30. const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
  31. const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
  32. // Now simply add the delta values to the input:
  33. str = _mm256_add_epi8(str, roll);
  34. // Reshuffle the input to packed 12-byte output format:
  35. str = dec_reshuffle(str);
  36. // Store the output:
  37. _mm256_storeu_si256((__m256i *) *o, str);
  38. *s += 32;
  39. *o += 24;
  40. *rounds -= 1;
  41. return 1;
  42. }
  /*
   * Bulk AVX2 decoding driver: runs as many 32-byte rounds as the input
   * allows, in unrolled batches of 8, 4, 2 or 1 calls to the inner routine.
   *
   * s, o      Addresses of the input/output cursors, advanced by the inner
   *           routine (32 and 24 bytes per completed round respectively).
   * slen      Remaining input length; reduced by 32 per completed round.
   * olen      Output length so far; increased by 24 per completed round.
   *
   * Inputs shorter than 45 bytes are left entirely untouched. Processing also
   * stops early as soon as the inner routine reports failure for a block.
   */
  static inline void
  dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
  {
      // Not enough data for one 32-byte round plus the 13-byte reserve.
      if (*slen < 45) {
          return;
      }

      // Rounds consume 32 input bytes each. Since 8 extra zero bytes are
      // written past the output of a round, at least 13 input bytes (11 data
      // bytes and up to two end-of-string markers) are held back so that the
      // gap is covered afterwards.
      size_t rounds = (*slen - 13) / 32;

      // Book all planned rounds up front; whatever is not executed gets
      // credited back after the loop.
      *slen -= rounds * 32;
      *olen += rounds * 24;

      do {
          int ok;

          if (rounds >= 8) {
              ok = dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds);
          }
          else if (rounds >= 4) {
              ok = dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds);
          }
          else if (rounds >= 2) {
              ok = dec_loop_avx2_inner(s, o, &rounds)
                && dec_loop_avx2_inner(s, o, &rounds);
          }
          else {
              // Last single round: its result does not matter, the loop
              // ends either way.
              dec_loop_avx2_inner(s, o, &rounds);
              break;
          }

          // A failed block aborts the batch; its round (and all later ones)
          // remain counted in `rounds`.
          if (!ok) {
              break;
          }

      } while (rounds > 0);

      // Give back the bytes of every round that did not run:
      *slen += rounds * 32;
      *olen -= rounds * 24;
  }