/*
 * Implementation of the GCM polynomial hash using the x86 CLMUL
 * extension, which provides 64x64->128 polynomial multiplication (or
 * 'carry-less', which is what the CL stands for).
 *
 * Follows the reference implementation in aesgcm-ref-poly.c; see
 * there for comments on the underlying technique. Here the comments
 * just discuss the x86-specific details.
 */
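
/*
 * A worked example of what 'carry-less' means: the operands are
 * treated as polynomials over GF(2), so that 3 * 3 = 5 rather than 9,
 * because (x+1)^2 = x^2 + 2x + 1 = x^2 + 1 mod 2, and no carry ever
 * propagates between bit positions.
 */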

#include <wmmintrin.h>
#include <tmmintrin.h>
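
/*
 * gcc and clang provide a __cpuid macro in <cpuid.h> that takes the
 * leaf number followed by four output lvalues; the fallback branch
 * assumes an MSVC-style __cpuid intrinsic taking an output array and
 * the leaf number. GET_CPU_ID papers over the difference.
 */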
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID(out) __cpuid(1, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID(out) __cpuid(out, 1)
#endif

#include "ssh.h"
#include "aesgcm.h"

typedef struct aesgcm_clmul {
    AESGCM_COMMON_FIELDS;
    __m128i var, acc, mask;
    void *ptr_to_free;
} aesgcm_clmul;

static bool aesgcm_clmul_available(void)
{
    /*
     * Determine if CLMUL is available on this CPU: bit 1 of ECX
     * returned by CPUID leaf 1 is the PCLMULQDQ feature flag.
     */
    unsigned int CPUInfo[4];
    GET_CPU_ID(CPUInfo);
    return (CPUInfo[2] & (1 << 1));
}

/*
 * __m128i has to be aligned to 16 bytes, and x86 mallocs may not
 * guarantee that, so we must over-allocate to make sure a large
 * enough 16-byte region can be found, and ensure the aesgcm_clmul
 * struct pointer is at least that well aligned.
 */
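
/*
 * (For example, if smalloc returned 0x1001, rounding up gives 0x1010;
 * the 15 extra bytes requested guarantee that the aligned struct
 * still fits within the allocation.)
 */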
#define SPECIAL_ALLOC
static aesgcm_clmul *aesgcm_clmul_alloc(void)
{
    char *p = smalloc(sizeof(aesgcm_clmul) + 15);
    uintptr_t ip = (uintptr_t)p;
    ip = (ip + 15) & ~15;      /* round up to a 16-byte boundary */
    aesgcm_clmul *ctx = (aesgcm_clmul *)ip;
    memset(ctx, 0, sizeof(aesgcm_clmul));
    ctx->ptr_to_free = p;      /* so we can free the original pointer */
    return ctx;
}

#define SPECIAL_FREE
static void aesgcm_clmul_free(aesgcm_clmul *ctx)
{
    void *ptf = ctx->ptr_to_free;
    smemclr(ctx, sizeof(*ctx));    /* wipe key material before freeing */
    sfree(ptf);
}

/* Helper function to reverse the 16 bytes in a 128-bit vector */
static inline __m128i mm_byteswap(__m128i vec)
{
    const __m128i reverse = _mm_set_epi64x(
        0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
    return _mm_shuffle_epi8(vec, reverse);
}
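
/* (_mm_shuffle_epi8 takes each byte of its control vector as the
 * source index for the corresponding output byte, so the descending
 * index sequence 15,14,...,0 above reverses the whole vector.) */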

/* Helper function to swap the two 64-bit words in a 128-bit vector.
 * The shuffle control 0x4E = 0b01001110 selects the source dwords in
 * the order 2,3,0,1. */
static inline __m128i mm_wordswap(__m128i vec)
{
    return _mm_shuffle_epi32(vec, 0x4E);
}

/* Load and store a 128-bit vector in big-endian fashion */
static inline __m128i mm_load_be(const void *p)
{
    return mm_byteswap(_mm_loadu_si128(p));
}
static inline void mm_store_be(void *p, __m128i vec)
{
    _mm_storeu_si128(p, mm_byteswap(vec));
}

/*
 * Key setup is just like in aesgcm-ref-poly.c. There's no point using
 * vector registers to accelerate this, because it happens rarely.
 */
static void aesgcm_clmul_setkey_impl(aesgcm_clmul *ctx,
                                     const unsigned char *var)
{
    uint64_t hi = GET_64BIT_MSB_FIRST(var);
    uint64_t lo = GET_64BIT_MSB_FIRST(var + 8);

    /* Shift the 128-bit key left by one bit, rotating the top bit
     * round into the bottom ... */
    uint64_t bit = 1 & (hi >> 63);
    hi = (hi << 1) ^ (lo >> 63);
    lo = (lo << 1) ^ bit;
    /* ... and if the bit shifted out was set, also XOR in the rest
     * of the reduction constant; see aesgcm-ref-poly.c for why.
     * (-bit is all-ones exactly when bit == 1.) */
    hi ^= 0xC200000000000000 & -bit;

    ctx->var = _mm_set_epi64x(hi, lo);
}

static inline void aesgcm_clmul_setup(aesgcm_clmul *ctx,
                                      const unsigned char *mask)
{
    ctx->mask = mm_load_be(mask);
    ctx->acc = _mm_set_epi64x(0, 0);
}

/*
 * Folding a coefficient into the accumulator is done by essentially
 * the algorithm in aesgcm-ref-poly.c. I don't speak these intrinsics
 * all that well, so in the parts where I needed to XOR half of one
 * vector into half of another, I did a lot of faffing about with
 * masks like 0xFFFFFFFFFFFFFFFF0000000000000000. Very likely this
 * can be streamlined by a better x86-speaker than me. Patches
 * welcome.
 */
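
/*
 * As a reminder, the Karatsuba identity in use, writing a =
 * ah*x^64 + al and b = bh*x^64 + bl over GF(2):
 *
 *   a*b = (ah*bh)*x^128
 *       + ((ah^al)*(bh^bl) ^ ah*bh ^ al*bl)*x^64
 *       + (al*bl)
 *
 * so three 64x64 multiplications suffice instead of four.
 */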
static inline void aesgcm_clmul_coeff(aesgcm_clmul *ctx,
                                      const unsigned char *coeff)
{
    ctx->acc = _mm_xor_si128(ctx->acc, mm_load_be(coeff));

    /* Compute ah^al and bh^bl by word-swapping each of a and b and
     * XORing with the original. That does more work than necessary -
     * you end up with each of the desired values repeated twice -
     * but I don't know of a neater way. */
    __m128i aswap = mm_wordswap(ctx->acc);
    __m128i vswap = mm_wordswap(ctx->var);
    aswap = _mm_xor_si128(ctx->acc, aswap);
    vswap = _mm_xor_si128(ctx->var, vswap);

    /* Do the three multiplications required by Karatsuba. In
     * _mm_clmulepi64_si128's immediate operand, bit 0 selects which
     * 64-bit half of the first input to use and bit 4 which half of
     * the second, so 0x00 multiplies the two low halves and 0x11 the
     * two high halves. */
    __m128i md = _mm_clmulepi64_si128(aswap, vswap, 0x00);
    __m128i lo = _mm_clmulepi64_si128(ctx->acc, ctx->var, 0x00);
    __m128i hi = _mm_clmulepi64_si128(ctx->acc, ctx->var, 0x11);

    /* Combine lo and hi into md */
    md = _mm_xor_si128(md, lo);
    md = _mm_xor_si128(md, hi);

    /* Now we must XOR the high half of md into the low half of hi,
     * and the low half of md into the high half of lo. Simplest thing
     * is to swap the words of md (so that each one lines up with the
     * register it's going to end up in), and then mask one off in
     * each case. */
    md = mm_wordswap(md);
    lo = _mm_xor_si128(lo, _mm_and_si128(md, _mm_set_epi64x(~0ULL, 0ULL)));
    hi = _mm_xor_si128(hi, _mm_and_si128(md, _mm_set_epi64x(0ULL, ~0ULL)));

    /* The reduction stage is transformed similarly from the version
     * in aesgcm-ref-poly.c. (The 0x00 and 0x10 immediates multiply
     * the low half of the constant vector, i.e. 0xC2...00, by the
     * low and high halves respectively of the other operand.) */
    __m128i r1 = _mm_clmulepi64_si128(
        _mm_set_epi64x(0, 0xC200000000000000), lo, 0x00);
    r1 = mm_wordswap(r1);
    r1 = _mm_xor_si128(r1, lo);
    hi = _mm_xor_si128(hi, _mm_and_si128(r1, _mm_set_epi64x(~0ULL, 0ULL)));

    __m128i r2 = _mm_clmulepi64_si128(
        _mm_set_epi64x(0, 0xC200000000000000), r1, 0x10);
    hi = _mm_xor_si128(hi, r2);
    hi = _mm_xor_si128(hi, _mm_and_si128(r1, _mm_set_epi64x(0ULL, ~0ULL)));

    ctx->acc = hi;
}

static inline void aesgcm_clmul_output(aesgcm_clmul *ctx,
                                       unsigned char *output)
{
    mm_store_be(output, _mm_xor_si128(ctx->acc, ctx->mask));
    /* Wipe the per-message secrets now the hash has been emitted */
    smemclr(&ctx->acc, 16);
    smemclr(&ctx->mask, 16);
}

#define AESGCM_FLAVOUR clmul
#define AESGCM_NAME "CLMUL accelerated"

#include "aesgcm-footer.h"