12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- static inline void
- enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
- {
- // Load 48 bytes and deinterleave:
- uint8x16x3_t src = vld3q_u8(*s);
- // Divide bits of three input bytes over four output bytes:
- uint8x16x4_t out = enc_reshuffle(src);
- // The bits have now been shifted to the right locations;
- // translate their values 0..63 to the Base64 alphabet.
- // Use a 64-byte table lookup:
- out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]);
- out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]);
- out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]);
- out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]);
- // Interleave and store output:
- vst4q_u8(*o, out);
- *s += 48;
- *o += 64;
- }
- static inline void
- enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
- {
- size_t rounds = *slen / 48;
- *slen -= rounds * 48; // 48 bytes consumed per round
- *olen += rounds * 64; // 64 bytes produced per round
- // Load the encoding table:
- const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
- while (rounds > 0) {
- if (rounds >= 8) {
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- rounds -= 8;
- continue;
- }
- if (rounds >= 4) {
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- rounds -= 4;
- continue;
- }
- if (rounds >= 2) {
- enc_loop_neon64_inner(s, o, tbl_enc);
- enc_loop_neon64_inner(s, o, tbl_enc);
- rounds -= 2;
- continue;
- }
- enc_loop_neon64_inner(s, o, tbl_enc);
- break;
- }
- }
|