enc_reshuffle_translate.c 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. // AVX512 algorithm is based on permutevar and multishift. The code is based on
  2. // https://github.com/WojciechMula/base64simd which is under BSD-2 license.
  3. static inline __m512i
  4. enc_reshuffle_translate (const __m512i input)
  5. {
  6. // 32-bit input
  7. // [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0|
  8. // b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
  9. // output order [1, 2, 0, 1]
  10. // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
  11. // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
  12. const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
  13. 0x04050304,
  14. 0x07080607,
  15. 0x0a0b090a,
  16. 0x0d0e0c0d,
  17. 0x10110f10,
  18. 0x13141213,
  19. 0x16171516,
  20. 0x191a1819,
  21. 0x1c1d1b1c,
  22. 0x1f201e1f,
  23. 0x22232122,
  24. 0x25262425,
  25. 0x28292728,
  26. 0x2b2c2a2b,
  27. 0x2e2f2d2e);
  28. // Reorder bytes
  29. // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
  30. // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
  31. const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
  32. // After multishift a single 32-bit lane has following layout
  33. // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
  34. // a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
  35. // (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
  36. // 48, 54, 36, 42, 16, 22, 4, 10
  37. const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
  38. __m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
  39. // Translate immediatedly after reshuffled.
  40. const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
  41. // Translation 6-bit values to ASCII.
  42. return _mm512_permutexvar_epi8(shuffled_in, lookup);
  43. }