// enc_loop_asm.c — AArch64 NEON base64 encoder inner loop (inline assembly).
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads three user-defined registers
// A, B, C from memory and deinterleaves them, post-incrementing the src
// pointer. The register set should be sequential.
//
// ld3 reads 48 bytes and splits them lane-wise: byte 0 goes to A, byte 1 to
// B, byte 2 to C, byte 3 to A, and so on. A/B/C must name three consecutive
// vector registers (an architectural requirement of the ld3 register list).
#define LOAD(A, B, C) \
	"ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t"
// Generate a block of inline assembly that takes three deinterleaved registers
// and shuffles the bytes. The output is in temporary registers t0..t3.
//
// Repacks each triple of input bytes (one lane of A, B, C) into four 6-bit
// base64 indices, per lane:
//   t0 =  A >> 2                     (top six bits of A; already <= 0x3F)
//   t1 = ((A << 4) | (B >> 4)) & 0x3F
//   t2 = ((B << 2) | (C >> 6)) & 0x3F
//   t3 =   C & 0x3F
// sli (shift-left-insert) merges the shifted byte into the low bits left by
// the preceding ushr; the and with n63 (a vector of 0x3F) masks off the
// bits that sli shifted in above bit 5.
#define SHUF(A, B, C) \
	"ushr %[t0].16b, "A".16b, #2          \n\t" \
	"ushr %[t1].16b, "B".16b, #4          \n\t" \
	"ushr %[t2].16b, "C".16b, #6          \n\t" \
	"sli  %[t1].16b, "A".16b, #4          \n\t" \
	"sli  %[t2].16b, "B".16b, #2          \n\t" \
	"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
	"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
	"and  %[t3].16b, "C".16b, %[n63].16b  \n\t"
// Generate a block of inline assembly that takes temporary registers t0..t3
// and translates them to the base64 alphabet, using a table loaded into
// v8..v11. The output is in user-defined registers A..D.
//
// Each tbl does a 64-entry byte lookup: the 6-bit indices in t0..t3 select
// bytes from the 64-byte table held across v8..v11.
#define TRAN(A, B, C, D) \
	"tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
	"tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
	"tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
	"tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"
// Generate a block of inline assembly that interleaves four registers and
// stores them, post-incrementing the destination pointer.
//
// st4 is the inverse of ld3's deinterleave: it writes 64 bytes as
// A[0], B[0], C[0], D[0], A[1], ... — i.e. the four base64 characters of
// each input triple come out adjacent. A..D must be consecutive registers.
#define STOR(A, B, C, D) \
	"st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t"
// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result.
// Uses the v12..v15 register set; 48 bytes in, 64 bytes out.
#define ROUND() \
	LOAD("v12", "v13", "v14") \
	SHUF("v12", "v13", "v14") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")
// Generate a block of assembly that generates a type A interleaved encoder
// round. It uses registers that were loaded by the previous type B round, and
// in turn loads registers for the next type B round.
//
// This is software pipelining: the load for the *next* round (into v12..v14)
// is issued between the shuffle and the table lookups of the *current* round
// (v2..v5), overlapping memory traffic with ALU work.
#define ROUND_A() \
	SHUF("v2", "v3", "v4") \
	LOAD("v12", "v13", "v14") \
	TRAN("v2", "v3", "v4", "v5") \
	STOR("v2", "v3", "v4", "v5")
// Type B interleaved encoder round. Same as type A, but register sets swapped:
// processes v12..v15 while prefetching v2..v4 for the next type A round.
#define ROUND_B() \
	SHUF("v12", "v13", "v14") \
	LOAD("v2", "v3", "v4") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")
// The first type A round needs to load its own registers, since there is no
// preceding type B round to have prefetched them.
#define ROUND_A_FIRST() \
	LOAD("v2", "v3", "v4") \
	ROUND_A()
// The last type B round omits the load for the next step, since there is no
// following type A round to consume it.
#define ROUND_B_LAST() \
	SHUF("v12", "v13", "v14") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")
  64. // Suppress clang's warning that the literal string in the asm statement is
  65. // overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
  66. // compilers). It may be true, but the goal here is not C99 portability.
  67. #pragma GCC diagnostic push
  68. #pragma GCC diagnostic ignored "-Woverlength-strings"
  69. static inline void
  70. enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
  71. {
  72. size_t rounds = *slen / 48;
  73. if (rounds == 0) {
  74. return;
  75. }
  76. *slen -= rounds * 48; // 48 bytes consumed per round.
  77. *olen += rounds * 64; // 64 bytes produced per round.
  78. // Number of times to go through the 8x loop.
  79. size_t loops = rounds / 8;
  80. // Number of rounds remaining after the 8x loop.
  81. rounds %= 8;
  82. // Temporary registers, used as scratch space.
  83. uint8x16_t tmp0, tmp1, tmp2, tmp3;
  84. __asm__ volatile (
  85. // Load the encoding table into v8..v11.
  86. " ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"
  87. // If there are eight rounds or more, enter an 8x unrolled loop
  88. // of interleaved encoding rounds. The rounds interleave memory
  89. // operations (load/store) with data operations to maximize
  90. // pipeline throughput.
  91. " cbz %[loops], 4f \n\t"
  92. // The SIMD instructions do not touch the flags.
  93. "88: subs %[loops], %[loops], #1 \n\t"
  94. " " ROUND_A_FIRST()
  95. " " ROUND_B()
  96. " " ROUND_A()
  97. " " ROUND_B()
  98. " " ROUND_A()
  99. " " ROUND_B()
  100. " " ROUND_A()
  101. " " ROUND_B_LAST()
  102. " b.ne 88b \n\t"
  103. // Enter a 4x unrolled loop for rounds of 4 or more.
  104. "4: cmp %[rounds], #4 \n\t"
  105. " b.lt 30f \n\t"
  106. " " ROUND_A_FIRST()
  107. " " ROUND_B()
  108. " " ROUND_A()
  109. " " ROUND_B_LAST()
  110. " sub %[rounds], %[rounds], #4 \n\t"
  111. // Dispatch the remaining rounds 0..3.
  112. "30: cbz %[rounds], 0f \n\t"
  113. " cmp %[rounds], #2 \n\t"
  114. " b.eq 2f \n\t"
  115. " b.lt 1f \n\t"
  116. // Block of non-interlaced encoding rounds, which can each
  117. // individually be jumped to. Rounds fall through to the next.
  118. "3: " ROUND()
  119. "2: " ROUND()
  120. "1: " ROUND()
  121. "0: \n\t"
  122. // Outputs (modified).
  123. : [loops] "+r" (loops),
  124. [src] "+r" (*s),
  125. [dst] "+r" (*o),
  126. [t0] "=&w" (tmp0),
  127. [t1] "=&w" (tmp1),
  128. [t2] "=&w" (tmp2),
  129. [t3] "=&w" (tmp3)
  130. // Inputs (not modified).
  131. : [rounds] "r" (rounds),
  132. [tbl] "r" (base64_table_enc_6bit),
  133. [n63] "w" (vdupq_n_u8(63))
  134. // Clobbers.
  135. : "v2", "v3", "v4", "v5",
  136. "v8", "v9", "v10", "v11",
  137. "v12", "v13", "v14", "v15",
  138. "cc", "memory"
  139. );
  140. }
  141. #pragma GCC diagnostic pop