// codec.c: NEON64 (AArch64) base64 codec.

  1. #include <stdint.h>
  2. #include <stddef.h>
  3. #include <string.h>
  4. #include "../../../include/libbase64.h"
  5. #include "../../tables/tables.h"
  6. #include "../../codecs.h"
  7. #include "config.h"
  8. #include "../../env.h"
  9. #ifdef __aarch64__
  10. # if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64
  11. # define BASE64_USE_NEON64
  12. # endif
  13. #endif
  14. #ifdef BASE64_USE_NEON64
  15. #include <arm_neon.h>
  16. // Only enable inline assembly on supported compilers.
  17. #if defined(__GNUC__) || defined(__clang__)
  18. #define BASE64_NEON64_USE_ASM
  19. #endif
  20. static inline uint8x16x4_t
  21. load_64byte_table (const uint8_t *p)
  22. {
  23. #ifdef BASE64_NEON64_USE_ASM
  24. // Force the table to be loaded into contiguous registers. GCC will not
  25. // normally allocate contiguous registers for a `uint8x16x4_t'. These
  26. // registers are chosen to not conflict with the ones in the enc loop.
  27. register uint8x16_t t0 __asm__ ("v8");
  28. register uint8x16_t t1 __asm__ ("v9");
  29. register uint8x16_t t2 __asm__ ("v10");
  30. register uint8x16_t t3 __asm__ ("v11");
  31. __asm__ (
  32. "ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t"
  33. : [src] "+r" (p),
  34. [t0] "=w" (t0),
  35. [t1] "=w" (t1),
  36. [t2] "=w" (t2),
  37. [t3] "=w" (t3)
  38. );
  39. return (uint8x16x4_t) {
  40. .val[0] = t0,
  41. .val[1] = t1,
  42. .val[2] = t2,
  43. .val[3] = t3,
  44. };
  45. #else
  46. return vld1q_u8_x4(p);
  47. #endif
  48. }
  49. #include "../generic/32/dec_loop.c"
  50. #include "../generic/64/enc_loop.c"
  51. #include "dec_loop.c"
  52. #ifdef BASE64_NEON64_USE_ASM
  53. # include "enc_loop_asm.c"
  54. #else
  55. # include "enc_reshuffle.c"
  56. # include "enc_loop.c"
  57. #endif
  58. #endif // BASE64_USE_NEON64
  59. // Stride size is so large on these NEON 64-bit functions
  60. // (48 bytes encode, 64 bytes decode) that we inline the
  61. // uint64 codec to stay performant on smaller inputs.
  62. BASE64_ENC_FUNCTION(neon64)
  63. {
  64. #ifdef BASE64_USE_NEON64
  65. #include "../generic/enc_head.c"
  66. enc_loop_neon64(&s, &slen, &o, &olen);
  67. enc_loop_generic_64(&s, &slen, &o, &olen);
  68. #include "../generic/enc_tail.c"
  69. #else
  70. BASE64_ENC_STUB
  71. #endif
  72. }
  73. BASE64_DEC_FUNCTION(neon64)
  74. {
  75. #ifdef BASE64_USE_NEON64
  76. #include "../generic/dec_head.c"
  77. dec_loop_neon64(&s, &slen, &o, &olen);
  78. dec_loop_generic_32(&s, &slen, &o, &olen);
  79. #include "../generic/dec_tail.c"
  80. #else
  81. BASE64_DEC_STUB
  82. #endif
  83. }