enc_loop.c 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. static inline void
  2. enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
  3. {
  4. // Load input.
  5. __m512i src = _mm512_loadu_si512((__m512i *) *s);
  6. // Reshuffle, translate, store.
  7. src = enc_reshuffle_translate(src);
  8. _mm512_storeu_si512((__m512i *) *o, src);
  9. *s += 48;
  10. *o += 64;
  11. }
  // Bulk driver: runs enc_loop_avx512_inner() over as many 48-byte input
  // blocks as can be handled safely and updates the caller's bookkeeping.
  //
  //   s, o - input/output cursors, advanced by the inner routine
  //   slen - remaining input length; reduced by 48 for every block handled
  //   olen - output length so far; increased by 64 for every block handled
  //
  // Returns without touching anything when fewer than 64 input bytes remain.
  static inline void
  enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
  {
      // Not enough input for even one full-width load.
      if (*slen < 64) {
          return;
      }

      // Each block consumes 48 bytes but is fetched with a 64-byte load.
      // Holding back 24 bytes when counting blocks guarantees that at least
      // that many bytes remain after the final block, so the last load
      // cannot run past the end of the input buffer.
      size_t blocks = (*slen - 24) / 48;

      // Settle the length accounting up front.
      *slen -= blocks * 48; // 48 bytes consumed per block
      *olen += blocks * 64; // 64 bytes produced per block

      // Main body, manually unrolled eight times.
      for (; blocks >= 8; blocks -= 8) {
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
      }

      // At most seven blocks are left; peel them off as 4 + 2 + 1.
      if (blocks >= 4) {
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          blocks -= 4;
      }

      if (blocks >= 2) {
          enc_loop_avx512_inner(s, o);
          enc_loop_avx512_inner(s, o);
          blocks -= 2;
      }

      if (blocks > 0) {
          enc_loop_avx512_inner(s, o);
      }
  }