aesgcm-ref-poly.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. /*
  2. * Implementation of the GCM polynomial hash in pure software, but
  3. * based on a primitive that performs 64x64->128 bit polynomial
  4. * multiplication over GF(2).
  5. *
  6. * This implementation is technically correct (should even be
  7. * side-channel safe as far as I can see), but it's hopelessly slow,
  8. * so no live SSH connection should ever use it. Therefore, it's
  9. * deliberately not included in the lists in aesgcm-select.c. For pure
  10. * software GCM in live use, you want aesgcm-sw.c, and that's what the
  11. * selection system will choose.
  12. *
  13. * However, this implementation _is_ made available to testcrypt, so
  14. * all the GCM tests in cryptsuite.py are run over this as well as the
  15. * other implementations.
  16. *
  17. * The reason why this code exists at all is to act as a reference for
  18. * GCM implementations that use a CPU-specific polynomial multiply
  19. * intrinsic or asm statement. This version will run on whatever
  20. * platform you're trying to port to, and will generate all the same
  21. * intermediate results you expect the CPU-specific one to go through.
  22. * So you can insert parallel diagnostics in this version and in your
  23. * new version, to see where the two diverge.
  24. *
  25. * Also, this version is a good place to put long comments explaining
  26. * the entire strategy of this implementation and its rationale. That
  27. * avoids those comments having to be duplicated in the multiple
  28. * platform-specific implementations, which can focus on commenting
  29. * the way the local platform's idioms match up to this version, and
  30. * refer to this file for the explanation of the underlying technique.
  31. */
  32. #include "ssh.h"
  33. #include "aesgcm.h"
  34. /*
  35. * Store a 128-bit value in the most convenient form standard C will
  36. * let us, namely two uint64_t giving its most and least significant
  37. * halves.
  38. */
  39. typedef struct {
  40. uint64_t hi, lo;
  41. } value128_t;
  42. typedef struct aesgcm_ref_poly {
  43. AESGCM_COMMON_FIELDS;
  44. /*
  45. * The state of our GCM implementation is represented entirely by
  46. * three 128-bit values:
  47. */
  48. /*
  49. * The value at which we're evaluating the polynomial. The GCM
  50. * spec calls this 'H'. It's defined once at GCM key setup time,
  51. * by encrypting the all-zeroes value with our block cipher.
  52. */
  53. value128_t var;
  54. /*
  55. * Accumulator containing the result of evaluating the polynomial
  56. * so far.
  57. */
  58. value128_t acc;
  59. /*
  60. * The mask value that is XORed into the final value of 'acc' to
  61. * produce the output MAC. This is different for every MAC
  62. * generated, because its purpose is to ensure that no information
  63. * gathered from a legal MAC can be used to help the forgery of
  64. * another one, and that comparing two legal MACs gives you no
  65. * useful information about the text they cover, because in each
  66. * case, the masks are different and pseudorandom.
  67. */
  68. value128_t mask;
  69. } aesgcm_ref_poly;
  70. static bool aesgcm_ref_poly_available(void)
  71. {
  72. return true; /* pure software implementation, always available */
  73. }
  74. /*
  75. * Primitive function that takes two uint64_t values representing
  76. * polynomials, multiplies them, and returns a value128_t struct
  77. * containing the full product.
  78. *
  79. * Because the input polynomials have maximum degree 63, the output
  80. * has max degree 63+63 = 127, not 128. As a result, the topmost bit
  81. * of the output is always zero.
  82. *
  83. * The inside of this function is implemented in the simplest way,
  84. * with no attention paid to performance. The important feature of
  85. * this implementation is not what's _inside_ this function, but
  86. * what's _outside_ it: aesgcm_ref_poly_coeff() tries to minimise the
  87. * number of these operations.
  88. */
  89. static value128_t pmul(uint64_t x, uint64_t y)
  90. {
  91. value128_t r;
  92. r.hi = r.lo = 0;
  93. uint64_t bit = 1 & y;
  94. r.lo ^= x & -bit;
  95. for (unsigned i = 1; i < 64; i++) {
  96. bit = 1 & (y >> i);
  97. uint64_t z = x & -bit;
  98. r.lo ^= z << i;
  99. r.hi ^= z >> (64-i);
  100. }
  101. return r;
  102. }
  103. /*
  104. * OK, I promised a long comment explaining what's going on in this
  105. * implementation, and now it's time.
  106. *
  107. * The way AES-GCM _itself_ is defined by its own spec, its finite
  108. * field consists of polynomials over GF(2), constrained to be 128
  109. * bits long by reducing them modulo P = x^128 + x^7 + x^2 + x + 1.
  110. * Using the usual binary representation in which bit i is the
  111. * coefficient of x^i, that's 0x100000000000000000000000000000087.
  112. *
  113. * That is, whenever you multiply two polynomials and find a term
  114. * x^128, you can replace it with x^7+x^2+x+1. More generally,
  115. * x^(128+n) can be replaced with x^(7+n)+x^(2+n)+x^(1+n)+x^n. In
  116. * binary terms, a 1 bit at the 128th position or above is replaced by
  117. * 0x87 exactly 128 bits further down.
  118. *
  119. * So you'd think that multiplying two 128-bit polynomials mod P would
  120. * be a matter of generating their full 256-bit product in the form of
  121. * four words HI:HU:LU:LO, and then reducing it mod P by a two-stage
  122. * process of computing HI * 0x87 and XORing it into HU:LU, then
  123. * computing HU * 0x87 and XORing it into LU:LO.
  124. *
  125. * But it's not!
  126. *
  127. * The reason why not is because when AES-GCM is applied to SSH,
  128. * somehow the _bit_ order got reversed. A 16-byte block of data in
  129. * memory is converted into a polynomial by regarding bit 7 of the
  130. * first byte as the constant term, bit 0 of the first byte as the x^7
  131. * coefficient, ..., bit 0 of the last byte as the x^127 coefficient.
  132. * So if we load that 16-byte block as a big-endian 128-bit integer,
  133. * we end up with it representing a polynomial back to front, with the
  134. * constant term at the top and the x^127 bit at the bottom.
  135. *
  136. * Well, that _shouldn't_ be a problem, right? The nice thing about
  137. * polynomial multiplication is that it's essentially reversible. If
  138. * you reverse the order of the coefficients of two polynomials, then
  139. * the product of the reversed polys is exactly the reversal of the
  140. * product of the original ones. So we bit-reverse our modulo
  141. * polynomial to get 0x1c2000000000000000000000000000001, and we just
  142. * pretend we're working mod that instead.
  143. *
  144. * And that is basically what we're about to do. But there's one
  145. * complication, that arises from the semantics of the polynomial
  146. * multiplication function we're using as our primitive operation.
  147. *
  148. * That function multiplies two polynomials of degree at most 63, to
  149. * give one with degree at most 127. So it returns a 128-bit integer
  150. * whose low bit is the constant term, and its very highest bit is 0,
  151. * and its _next_ highest bit is the product of the high bits of the
  152. * two inputs.
  153. *
  154. * That operation is _not_ symmetric in bit-reversal. If you give it
  155. * the 64-bit-wise reversals of two polynomials P,Q, then its output
  156. * is not the 128-bit-wise reversal of their product PQ, because that
  157. * would require the constant term of PQ to appear in bit 127 of the
  158. * output, and in fact it appears in bit 126. So in fact, what we get
  159. * is offset by one bit from where we'd like it: it's the bit-reversal
  160. * of PQx, not of PQ.
  161. *
  162. * There's more than one way we could fix this. One approach would be
  163. * to work around this off-by-one error by taking the 128-bit output
  164. * of pmul() and shifting it left by a bit. Then it _is_ the bitwise
  165. * reversal of the 128-bit value we'd have wanted to get, and we could
  166. * implement the exact algorithm described above, in fully
  167. * bit-reversed form.
  168. *
  169. * But a 128-bit left shift is not a trivial operation in the vector
  170. * architectures that this code is acting as a reference for. So we'd
  171. * prefer to find a fix that doesn't need compensation during the
  172. * actual per-block multiplication step.
  173. *
  174. * If we did the obvious thing anyway - compute the unshifted 128-bit
  175. * product representing the bit-reversal of PQx, and reduce it mod
  176. * 0x1c2000000000000000000000000000001 - then we'd get a result which
  177. * is exactly what we want, except that it's got a factor of x in it
  178. * that we need to get rid of. The obvious answer is to divide by x
  179. * (which is legal and safe, since mod P, x is invertible).
  180. *
  181. * Dividing a 128-bit polynomial by x is easy in principle. Shift left
  182. * (because we're still bit-reversed), and if that shifted a 1 bit off
  183. * the top, XOR 0xc2000000000000000000000000000001 into the remaining
  184. * 128 bits.
  185. *
  186. * But we're back to having that expensive left shift. What can we do
  187. * about that?
  188. *
  189. * Happily, one of the two input values to our per-block multiply
  190. * operation is fixed! It's given to us at key setup stage, and never
  191. * changed until the next rekey. So if at key setup time we do this
  192. * left shift business _once_, replacing the logical value Q with Q/x,
  193. * then that exactly cancels out the unwanted factor of x that shows
  194. * up in our multiply operation. And now it doesn't matter that it's
  195. * expensive (in the sense of 'a few more instructions than you'd
  196. * like'), because it only happens once per SSH key exchange, not once
  197. * per 16 bytes of data transferred.
  198. */
  199. static void aesgcm_ref_poly_setkey_impl(aesgcm_ref_poly *ctx,
  200. const unsigned char *var)
  201. {
  202. /*
  203. * Key setup function. We copy the provided 16-byte 'var'
  204. * value into our polynomial. But, as discussed above, we also
  205. * need to divide it by x.
  206. */
  207. ctx->var.hi = GET_64BIT_MSB_FIRST(var);
  208. ctx->var.lo = GET_64BIT_MSB_FIRST(var + 8);
  209. uint64_t bit = 1 & (ctx->var.hi >> 63);
  210. ctx->var.hi = (ctx->var.hi << 1) ^ (ctx->var.lo >> 63);
  211. ctx->var.lo = (ctx->var.lo << 1) ^ bit;
  212. ctx->var.hi ^= 0xC200000000000000 & -bit;
  213. }
  214. static inline void aesgcm_ref_poly_setup(aesgcm_ref_poly *ctx,
  215. const unsigned char *mask)
  216. {
  217. /*
  218. * Set up to start evaluating a particular MAC. Copy in the mask
  219. * value for this packet, and initialise acc to zero.
  220. */
  221. ctx->mask.hi = GET_64BIT_MSB_FIRST(mask);
  222. ctx->mask.lo = GET_64BIT_MSB_FIRST(mask + 8);
  223. ctx->acc.hi = ctx->acc.lo = 0;
  224. }
  225. static inline void aesgcm_ref_poly_coeff(aesgcm_ref_poly *ctx,
  226. const unsigned char *coeff)
  227. {
  228. /*
  229. * One step of Horner's-rule polynomial evaluation (with each
  230. * coefficient of the polynomial being an element of GF(2^128),
  231. * itself composed of polynomials over GF(2) mod P).
  232. *
  233. * We take our accumulator value, add the incoming coefficient
  234. * (which means XOR, by GF(2) rules), and multiply by x (that is,
  235. * 'var').
  236. */
  237. /*
  238. * The addition first, which is easy.
  239. */
  240. ctx->acc.hi ^= GET_64BIT_MSB_FIRST(coeff);
  241. ctx->acc.lo ^= GET_64BIT_MSB_FIRST(coeff + 8);
  242. /*
  243. * First, create the 256-bit product of the two 128-bit
  244. * polynomials over GF(2) stored in ctx->acc and ctx->var.
  245. *
  246. * The obvious way to do this is by four smaller multiplications
  247. * of 64x64 -> 128 bits. But we can do better using a single
  248. * iteration of the Karatsuba technique, which is actually more
  249. * convenient in polynomials over GF(2) than it is in integers,
  250. * because there aren't all those awkward carries complicating
  251. * things.
  252. *
  253. * Letting B denote x^64, and imagining our two inputs are split
  254. * up into 64-bit chunks ah,al,bh,bl, the product we want is
  255. *
  256. * (ah B + al) (bh B + bl)
  257. * = (ah bh) B^2 + (al bh + ah bl) B + (al bl)
  258. *
  259. * which looks like four smaller multiplications of each of ah,al
  260. * with each of bh,bl. But Karatsuba's trick is to first compute
  261. *
  262. * (ah + al) (bh + bl)
  263. * = ah bh + al bh + ah bl + al bl
  264. *
  265. * and then subtract the terms (ah bh) and (al bl), which we had
  266. * to compute anyway, to get the middle two terms (al bh + ah bl)
  267. * which are our coefficient of B.
  268. *
  269. * This involves more bookkeeping instructions like XORs, but with
  270. * any luck those are faster than the main multiplication.
  271. */
  272. uint64_t ah = ctx->acc.hi, al = ctx->acc.lo;
  273. uint64_t bh = ctx->var.hi, bl = ctx->var.lo;
  274. /* Compute the outer two terms */
  275. value128_t lo = pmul(al, bl);
  276. value128_t hi = pmul(ah, bh);
  277. /* Compute the trick product (ah+al)(bh+bl) */
  278. value128_t md = pmul(ah ^ al, bh ^ bl);
  279. /* Subtract off the outer two terms to get md = al bh + ah bl */
  280. md.hi ^= lo.hi ^ hi.hi;
  281. md.lo ^= lo.lo ^ hi.lo;
  282. /* And add that into the 256-bit value given by hi * x^128 + lo */
  283. lo.hi ^= md.lo;
  284. hi.lo ^= md.hi;
  285. /*
  286. * OK. Now hi and lo together make up the 256-bit full product.
  287. * Now reduce it mod the reversal of the GCM modulus polynomial.
  288. * As discussed above, that's 0x1c2000000000000000000000000000001.
  289. *
  290. * We want the _topmost_ 128 bits of this, because we're working
  291. * in a bit-reversed world. So what we fundamentally want to do is
  292. * to take our 256-bit product, and add to it the product of its
  293. * low 128 bits with 0x1c2000000000000000000000000000001. Then the
  294. * top 128 bits will be the output we want.
  295. *
  296. * Since there's no carrying in this arithmetic, it's enough to
  297. * discard the 1 bit at the bottom of that, because it won't
  298. * affect anything in the half we're keeping. So it's enough to
  299. * add 0x1c2000000000000000000000000000000 * lo to (hi:lo).
  300. *
  301. * We can only work with 64 bits at a time, so the first thing we
  302. * do is to break that up:
  303. *
  304. * - add 0x1c200000000000000 * lo.lo to (hi.lo : lo.hi)
  305. * - add 0x1c200000000000000 * lo.hi to (hi.hi : hi.lo)
  306. *
  307. * But there's still a problem: 0x1c200000000000000 is just too
  308. * _big_ to fit in 64 bits. So we have to break it up into the low
  309. * 64 bits 0xc200000000000000, and its leading 1. So each of those
  310. * steps of the form 'add 0x1c200000000000000 * x to y:z' becomes
  311. * 'add 0xc200000000000000 * x to y:z' followed by 'add x to y',
  312. * the latter step dealing with the leading 1.
  313. */
  314. /* First step, adding to the middle two words of our number. After
  315. * this the lowest word (in lo.lo) is ignored. */
  316. value128_t r1 = pmul(0xC200000000000000, lo.lo);
  317. hi.lo ^= r1.hi ^ lo.lo;
  318. lo.hi ^= r1.lo;
  319. /* Second of those steps, adding to the top two words, and
  320. * discarding lo.hi. */
  321. value128_t r2 = pmul(0xC200000000000000, lo.hi);
  322. hi.hi ^= r2.hi ^ lo.hi;
  323. hi.lo ^= r2.lo;
  324. /* Now 'hi' is precisely what we have left. */
  325. ctx->acc = hi;
  326. }
  327. static inline void aesgcm_ref_poly_output(aesgcm_ref_poly *ctx,
  328. unsigned char *output)
  329. {
  330. PUT_64BIT_MSB_FIRST(output, ctx->acc.hi ^ ctx->mask.hi);
  331. PUT_64BIT_MSB_FIRST(output + 8, ctx->acc.lo ^ ctx->mask.lo);
  332. smemclr(&ctx->acc, 16);
  333. smemclr(&ctx->mask, 16);
  334. }
  335. #define AESGCM_FLAVOUR ref_poly
  336. #define AESGCM_NAME "reference polynomial-based implementation"
  337. #include "aesgcm-footer.h"