aesgcm-neon.c 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. /*
  2. * Implementation of the GCM polynomial hash using Arm NEON vector
  3. * intrinsics, in particular the multiplication operation for
  4. * polynomials over GF(2).
  5. *
  6. * Follows the reference implementation in aesgcm-ref-poly.c; see
  7. * there for comments on the underlying technique. Here the comments
  8. * just discuss the NEON-specific details.
  9. */
  10. #include "ssh.h"
  11. #include "aesgcm.h"
  12. #if USE_ARM64_NEON_H
  13. #include <arm64_neon.h>
  14. #else
  15. #include <arm_neon.h>
  16. #endif
  17. typedef struct aesgcm_neon {
  18. AESGCM_COMMON_FIELDS;
  19. poly128_t var, acc, mask;
  20. } aesgcm_neon;
  /*
   * Report whether this implementation may be used on the running
   * platform. The decision is delegated entirely to
   * platform_pmull_neon_available().
   */
  static bool aesgcm_neon_available(void)
  {
      const bool have_pmull = platform_pmull_neon_available();
      return have_pmull;
  }
  25. /*
  26. * The NEON types involved are:
  27. *
  28. * 'poly128_t' is a type that lives in a 128-bit vector register and
  29. * represents a 128-bit polynomial over GF(2)
  30. *
  31. * 'poly64x2_t' is a type that lives in a 128-bit vector register and
  32. * represents a vector of two 64-bit polynomials. These appear as
  33. * intermediate results in some of the helper functions below, but we
  34. * never need to actually have a variable of that type.
  35. *
  36. * 'poly64x1_t' is a type that lives in a 128-bit vector register and
  37. * represents a vector of one 64-bit polynomial.
  38. *
  39. * That is distinct from 'poly64_t', which is a type that lives in
  40. * ordinary scalar registers and is a typedef for an integer type.
  41. *
  42. * Generally here we try to work in terms of poly128_t and 64-bit
  43. * integer types, and let everything else be handled as internal
  44. * details of these helper functions.
  45. */
  46. /* Make a poly128_t from two halves */
  47. static inline poly128_t create_p128(poly64_t hi, poly64_t lo)
  48. {
  49. return vreinterpretq_p128_p64(
  50. vcombine_p64(vcreate_p64(lo), vcreate_p64(hi)));
  51. }
  52. /* Retrieve the high and low halves of a poly128_t */
  53. static inline poly64_t hi_half(poly128_t v)
  54. {
  55. return vgetq_lane_p64(vreinterpretq_p64_p128(v), 1);
  56. }
  57. static inline poly64_t lo_half(poly128_t v)
  58. {
  59. return vgetq_lane_p64(vreinterpretq_p64_p128(v), 0);
  60. }
  61. /* 64x64 -> 128 bit polynomial multiplication, the largest we can do
  62. * in one CPU operation */
  63. static inline poly128_t pmul(poly64_t v, poly64_t w)
  64. {
  65. return vmull_p64(v, w);
  66. }
  67. /* Load and store a poly128_t in the form of big-endian bytes. This
  68. * involves separately swapping the halves of the register and
  69. * reversing the bytes within each half. */
  70. static inline poly128_t load_p128_be(const void *p)
  71. {
  72. poly128_t swapped = vreinterpretq_p128_u8(vrev64q_u8(vld1q_u8(p)));
  73. return create_p128(lo_half(swapped), hi_half(swapped));
  74. }
  75. static inline void store_p128_be(void *p, poly128_t v)
  76. {
  77. poly128_t swapped = create_p128(lo_half(v), hi_half(v));
  78. vst1q_u8(p, vrev64q_u8(vreinterpretq_u8_p128(swapped)));
  79. }
  80. #if !HAVE_NEON_VADDQ_P128
  81. static inline poly128_t vaddq_p128(poly128_t a, poly128_t b)
  82. {
  83. return vreinterpretq_p128_u32(veorq_u32(
  84. vreinterpretq_u32_p128(a), vreinterpretq_u32_p128(b)));
  85. }
  86. #endif
  87. /*
  88. * Key setup is just like in aesgcm-ref-poly.c. There's no point using
  89. * vector registers to accelerate this, because it happens rarely.
  90. */
  91. static void aesgcm_neon_setkey_impl(aesgcm_neon *ctx, const unsigned char *var)
  92. {
  93. uint64_t hi = GET_64BIT_MSB_FIRST(var);
  94. uint64_t lo = GET_64BIT_MSB_FIRST(var + 8);
  95. uint64_t bit = 1 & (hi >> 63);
  96. hi = (hi << 1) ^ (lo >> 63);
  97. lo = (lo << 1) ^ bit;
  98. hi ^= 0xC200000000000000 & -bit;
  99. ctx->var = create_p128(hi, lo);
  100. }
  101. static inline void aesgcm_neon_setup(aesgcm_neon *ctx,
  102. const unsigned char *mask)
  103. {
  104. ctx->mask = load_p128_be(mask);
  105. ctx->acc = create_p128(0, 0);
  106. }
  107. /*
  108. * Folding a coefficient into the accumulator is done by exactly the
  109. * algorithm in aesgcm-ref-poly.c, translated line by line.
  110. *
  111. * It's possible that this could be improved by some clever manoeuvres
  112. * that avoid having to break vectors in half and put them together
  113. * again. Patches welcome if anyone has better ideas.
  114. */
  115. static inline void aesgcm_neon_coeff(aesgcm_neon *ctx,
  116. const unsigned char *coeff)
  117. {
  118. ctx->acc = vaddq_p128(ctx->acc, load_p128_be(coeff));
  119. poly64_t ah = hi_half(ctx->acc), al = lo_half(ctx->acc);
  120. poly64_t bh = hi_half(ctx->var), bl = lo_half(ctx->var);
  121. poly128_t md = pmul(ah ^ al, bh ^ bl);
  122. poly128_t lo = pmul(al, bl);
  123. poly128_t hi = pmul(ah, bh);
  124. md = vaddq_p128(md, vaddq_p128(hi, lo));
  125. hi = create_p128(hi_half(hi), lo_half(hi) ^ hi_half(md));
  126. lo = create_p128(hi_half(lo) ^ lo_half(md), lo_half(lo));
  127. poly128_t r1 = pmul((poly64_t)0xC200000000000000, lo_half(lo));
  128. hi = create_p128(hi_half(hi), lo_half(hi) ^ lo_half(lo) ^ hi_half(r1));
  129. lo = create_p128(hi_half(lo) ^ lo_half(r1), lo_half(lo));
  130. poly128_t r2 = pmul((poly64_t)0xC200000000000000, hi_half(lo));
  131. hi = vaddq_p128(hi, r2);
  132. hi = create_p128(hi_half(hi) ^ hi_half(lo), lo_half(hi));
  133. ctx->acc = hi;
  134. }
  135. static inline void aesgcm_neon_output(aesgcm_neon *ctx, unsigned char *output)
  136. {
  137. store_p128_be(output, vaddq_p128(ctx->acc, ctx->mask));
  138. ctx->acc = create_p128(0, 0);
  139. ctx->mask = create_p128(0, 0);
  140. }
  /*
   * Define the flavour token and the descriptive name string for this
   * implementation, then pull in aesgcm-footer.h.
   * NOTE(review): presumably the footer uses these two macros to wire
   * the aesgcm_neon_* functions above into the common GCM code --
   * confirm against aesgcm-footer.h.
   */
  141. #define AESGCM_FLAVOUR neon
  142. #define AESGCM_NAME "NEON accelerated"
  143. #include "aesgcm-footer.h"