// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads three user-defined registers
// A, B, C from memory and deinterleaves them, post-incrementing the src
// pointer. The register set should be sequential.
#define LOAD(A, B, C) \
	"ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t"

// Generate a block of inline assembly that takes three deinterleaved registers
// and shuffles the bytes. The output is in temporary registers t0..t3.
#define SHUF(A, B, C) \
	"ushr %[t0].16b, "A".16b,   #2          \n\t" \
	"ushr %[t1].16b, "B".16b,   #4          \n\t" \
	"ushr %[t2].16b, "C".16b,   #6          \n\t" \
	"sli  %[t1].16b, "A".16b,   #4          \n\t" \
	"sli  %[t2].16b, "B".16b,   #2          \n\t" \
	"and  %[t1].16b, %[t1].16b, %[n63].16b  \n\t" \
	"and  %[t2].16b, %[t2].16b, %[n63].16b  \n\t" \
	"and  %[t3].16b, "C".16b,   %[n63].16b  \n\t"

// Generate a block of inline assembly that takes temporary registers t0..t3
// and translates them to the base64 alphabet, using a table loaded into
// v8..v11. The output is in user-defined registers A..D.
#define TRAN(A, B, C, D) \
	"tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
	"tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
	"tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
	"tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"

// Generate a block of inline assembly that interleaves four registers and
// stores them, post-incrementing the destination pointer.
#define STOR(A, B, C, D) \
	"st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result.
#define ROUND() \
	LOAD("v12", "v13", "v14") \
	SHUF("v12", "v13", "v14") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")

// Generate a block of inline assembly that generates a type A interleaved
// encoder round. It uses registers that were loaded by the previous type B
// round, and in turn loads registers for the next type B round.
#define ROUND_A() \
	SHUF("v2",  "v3",  "v4") \
	LOAD("v12", "v13", "v14") \
	TRAN("v2",  "v3",  "v4", "v5") \
	STOR("v2",  "v3",  "v4", "v5")

// Type B interleaved encoder round. Same as type A, but register sets swapped.
#define ROUND_B() \
	SHUF("v12", "v13", "v14") \
	LOAD("v2",  "v3",  "v4") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")

// The first type A round needs to load its own registers.
#define ROUND_A_FIRST() \
	LOAD("v2", "v3", "v4") \
	ROUND_A()

// The last type B round omits the load for the next step.
#define ROUND_B_LAST() \
	SHUF("v12", "v13", "v14") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")

// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the 4095-character minimum that ISO C99 requires
// compilers to accept). That may be true, but the goal here is not strict
// C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"

static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	if (rounds == 0) {
		return;
	}

	*slen -= rounds * 48;	// 48 bytes consumed per round.
	*olen += rounds * 64;	// 64 bytes produced per round.
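
	// For example, with *slen = 200: rounds = 200 / 48 = 4, so this loop
	// consumes 192 input bytes, produces 256 output bytes, and leaves
	// *slen = 8 bytes for the caller to finish.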

	// Number of times to go through the 8x loop.
	size_t loops = rounds / 8;

	// Number of rounds remaining after the 8x loop.
	rounds %= 8;
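
	// Continuing the example: rounds = 4 gives loops = 0 and rounds = 4,
	// so control skips the 8x loop, runs the 4x block once and performs no
	// single rounds. rounds = 13 would give loops = 1 and rounds = 5: one
	// pass through the 8x loop, one 4x block, then one single round.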

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;

	__asm__ volatile (

		// Load the encoding table into v8..v11.
		"    ld1 {v8.16b-v11.16b}, [%[tbl]]     \n\t"

		// If there are eight rounds or more, enter an 8x unrolled loop
		// of interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations to maximize
		// pipeline throughput.
		"    cbz %[loops], 4f                   \n\t"

		// The SIMD instructions do not touch the flags, so the condition
		// flags set by this subs survive the unrolled rounds below and
		// still control the b.ne at the bottom of the loop.
		"88: subs %[loops], %[loops], #1        \n\t"
- " " ROUND_A_FIRST()
- " " ROUND_B()
- " " ROUND_A()
- " " ROUND_B()
- " " ROUND_A()
- " " ROUND_B()
- " " ROUND_A()
- " " ROUND_B_LAST()
- " b.ne 88b \n\t"
- // Enter a 4x unrolled loop for rounds of 4 or more.
- "4: cmp %[rounds], #4 \n\t"
- " b.lt 30f \n\t"
- " " ROUND_A_FIRST()
- " " ROUND_B()
- " " ROUND_A()
- " " ROUND_B_LAST()
- " sub %[rounds], %[rounds], #4 \n\t"
- // Dispatch the remaining rounds 0..3.
- "30: cbz %[rounds], 0f \n\t"
- " cmp %[rounds], #2 \n\t"
- " b.eq 2f \n\t"
- " b.lt 1f \n\t"

		// Block of non-interleaved encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next:
		// three remaining rounds fall through from the compare above,
		// two enter at 2f, and one enters at 1f.
		"3:  " ROUND()
		"2:  " ROUND()
		"1:  " ROUND()
		"0:                                     \n\t"

		// Outputs (modified). The rounds count is decremented by the 4x
		// block above, so it must be a read-write operand rather than an
		// input-only operand.
		: [rounds] "+r"  (rounds),
		  [loops]  "+r"  (loops),
		  [src]    "+r"  (*s),
		  [dst]    "+r"  (*o),
		  [t0]     "=&w" (tmp0),
		  [t1]     "=&w" (tmp1),
		  [t2]     "=&w" (tmp2),
		  [t3]     "=&w" (tmp3)

		// Inputs (not modified).
		: [tbl] "r" (base64_table_enc_6bit),
		  [n63] "w" (vdupq_n_u8(63))

		// Clobbers.
		: "v2",  "v3",  "v4",  "v5",
		  "v8",  "v9",  "v10", "v11",
		  "v12", "v13", "v14", "v15",
		  "cc", "memory"
	);
}

#pragma GCC diagnostic pop
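
// For reference only: a minimal intrinsics sketch of one self-contained
// encoder round, mirroring the dataflow of the ROUND() macro above. The
// function name is hypothetical and nothing in this file calls it; it
// assumes <arm_neon.h> and a `tbl` argument holding the 64 contiguous bytes
// of base64_table_enc_6bit (e.g. loaded with four vld1q_u8 loads).
static inline void
enc_round_neon64_sketch (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl)
{
	// Load 48 bytes and deinterleave them by threes, as LOAD() does.
	const uint8x16x3_t in = vld3q_u8(*s);
	*s += 48;

	// Split each 3-byte group into four 6-bit indices, as SHUF() does.
	const uint8x16_t n63 = vdupq_n_u8(63);
	uint8x16x4_t out;
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), n63);
	out.val[2] = vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), n63);
	out.val[3] = vandq_u8(in.val[2], n63);

	// Translate the indices through the alphabet table, as TRAN() does.
	out.val[0] = vqtbl4q_u8(tbl, out.val[0]);
	out.val[1] = vqtbl4q_u8(tbl, out.val[1]);
	out.val[2] = vqtbl4q_u8(tbl, out.val[2]);
	out.val[3] = vqtbl4q_u8(tbl, out.val[3]);

	// Interleave the four registers and store 64 bytes, as STOR() does.
	vst4q_u8(*o, out);
	*o += 64;
}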