// enc_loop.c
  1. static inline void
  2. enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
  3. {
  4. // First load is done at s - 0 to not get a segfault:
  5. __m256i src = _mm256_loadu_si256((__m256i *) *s);
  6. // Shift by 4 bytes, as required by enc_reshuffle:
  7. src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
  8. // Reshuffle, translate, store:
  9. src = enc_reshuffle(src);
  10. src = enc_translate(src);
  11. _mm256_storeu_si256((__m256i *) *o, src);
  12. // Subsequent loads will be done at s - 4, set pointer for next round:
  13. *s += 20;
  14. *o += 32;
  15. }
  16. static inline void
  17. enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
  18. {
  19. // Load input:
  20. __m256i src = _mm256_loadu_si256((__m256i *) *s);
  21. // Reshuffle, translate, store:
  22. src = enc_reshuffle(src);
  23. src = enc_translate(src);
  24. _mm256_storeu_si256((__m256i *) *o, src);
  25. *s += 24;
  26. *o += 32;
  27. }
  28. static inline void
  29. enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
  30. {
  31. if (*slen < 32) {
  32. return;
  33. }
  34. // Process blocks of 24 bytes at a time. Because blocks are loaded 32
  35. // bytes at a time an offset of -4, ensure that there will be at least
  36. // 4 remaining bytes after the last round, so that the final read will
  37. // not pass beyond the bounds of the input buffer:
  38. size_t rounds = (*slen - 4) / 24;
  39. *slen -= rounds * 24; // 24 bytes consumed per round
  40. *olen += rounds * 32; // 32 bytes produced per round
  41. // The first loop iteration requires special handling to ensure that
  42. // the read, which is done at an offset, does not underflow the buffer:
  43. enc_loop_avx2_inner_first(s, o);
  44. rounds--;
  45. while (rounds > 0) {
  46. if (rounds >= 8) {
  47. enc_loop_avx2_inner(s, o);
  48. enc_loop_avx2_inner(s, o);
  49. enc_loop_avx2_inner(s, o);
  50. enc_loop_avx2_inner(s, o);
  51. enc_loop_avx2_inner(s, o);
  52. enc_loop_avx2_inner(s, o);
  53. enc_loop_avx2_inner(s, o);
  54. enc_loop_avx2_inner(s, o);
  55. rounds -= 8;
  56. continue;
  57. }
  58. if (rounds >= 4) {
  59. enc_loop_avx2_inner(s, o);
  60. enc_loop_avx2_inner(s, o);
  61. enc_loop_avx2_inner(s, o);
  62. enc_loop_avx2_inner(s, o);
  63. rounds -= 4;
  64. continue;
  65. }
  66. if (rounds >= 2) {
  67. enc_loop_avx2_inner(s, o);
  68. enc_loop_avx2_inner(s, o);
  69. rounds -= 2;
  70. continue;
  71. }
  72. enc_loop_avx2_inner(s, o);
  73. break;
  74. }
  75. // Add the offset back:
  76. *s += 4;
  77. }