crypto_aes_arm.c

#include "cpusupport.h"
#ifdef CPUSUPPORT_ARM_AES
/**
 * CPUSUPPORT CFLAGS: ARM_AES
 */

#include <stdint.h>
#include <stdlib.h>

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

#include "align_ptr.h"
#include "insecure_memzero.h"
#include "warnp.h"

#include "crypto_aes_arm.h"
#include "crypto_aes_arm_u8.h"

/* Expanded-key structure. */
struct crypto_aes_key_arm {
	ALIGN_PTR_DECL(uint8x16_t, rkeys, 15, sizeof(uint8x16_t));
	size_t nr;
};

/**
 * vdupq_laneq_u32_u8(a, lane):
 * Set all 32-bit vector lanes to the same value.  Exactly the same as
 * vdupq_laneq_u32(), except that it accepts (and returns) uint8x16_t.
 */
#define vdupq_laneq_u32_u8(a, lane)				\
	vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(a), lane))

/**
 * vshlq_n_u128(a, n):
 * Shift left (immediate), applied to the whole vector at once.
 *
 * Implementation note: this concatenates ${a} with a vector containing zeros,
 * then extracts a new vector from the pair (similar to a sliding window).
 * For example, vshlq_n_u128(a, 3) would do:
 *             0xaaaaaaaaaaaaaaaa0000000000000000
 *     return:       ~~~~~~~~~~~~~~~~
 * This is the recommended method of shifting an entire vector with Neon
 * intrinsics; all of the built-in shift instructions operate on multiple
 * values (such as a pair of 64-bit values).
 */
#define vshlq_n_u128(a, n) vextq_u8(vdupq_n_u8(0), a, 16 - (n))

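/*-
 * Illustrative note (a sketch, with lanes written low to high as loaded by
 * vld1q_u8): for the byte shifts used by the key-expansion macros below,
 * the 32-bit word-lane view is
 *     vshlq_n_u128([w0, w1, w2, w3], 4) == [ 0, w0, w1, w2]
 *     vshlq_n_u128([w0, w1, w2, w3], 8) == [ 0,  0, w0, w1]
 */
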
/**
 * SubWord_duplicate(a):
 * Perform the AES SubWord operation on the final 32-bit word (bits 96..127)
 * of ${a}, and return a vector consisting of that value copied to all lanes.
 */
static inline uint8x16_t
SubWord_duplicate(uint8x16_t a)
{

	/*
	 * Duplicate the final 32-bit word in all other lanes.  By having four
	 * copies of the same uint32_t, we cause the ShiftRows in the upcoming
	 * AESE to have no effect.
	 */
	a = vdupq_laneq_u32_u8(a, 3);

	/* AESE does AddRoundKey (nop), ShiftRows (nop), and SubBytes. */
	a = vaeseq_u8(a, vdupq_n_u8(0));

	return (a);
}

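/*-
 * Why the duplication trick works (a sketch): ShiftRows only moves bytes
 * between columns within the same row, so once all four columns hold the
 * same 32-bit word, that permutation maps the state to itself.  AESE with a
 * zero round key therefore reduces to SubBytes, which applied to four
 * identical words is exactly SubWord in every lane.
 */
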
/**
 * SubWord_RotWord_XOR_duplicate(a, rcon):
 * Perform the AES key schedule operations of SubWord, RotWord, and XOR with
 * ${rcon}, acting on the final 32-bit word (bits 96..127) of ${a}, and return
 * a vector consisting of that value copied to all lanes.
 */
static inline uint8x16_t
SubWord_RotWord_XOR_duplicate(uint8x16_t a, const uint32_t rcon)
{
	uint32_t x3;

	/* Perform SubWord on the final 32-bit word and copy it to all lanes. */
	a = SubWord_duplicate(a);

	/* We'll use non-Neon code for the rest. */
	x3 = vgetq_lane_u32(vreinterpretq_u32_u8(a), 0);

	/*-
	 * x3 gets RotWord.  Note that
	 *     RotWord(SubWord(a)) == SubWord(RotWord(a)).
	 */
	x3 = (x3 >> 8) | (x3 << (32 - 8));

	/* x3 gets XOR'd with rcon. */
	x3 = x3 ^ rcon;

	/* Copy x3 to all 128 bits, and convert it to a uint8x16_t. */
	return (vreinterpretq_u8_u32(vdupq_n_u32(x3)));
}

/* Compute an AES-128 round key. */
#define MKRKEY128(rkeys, i, rcon) do {				\
	uint8x16_t _s = rkeys[i - 1];				\
	uint8x16_t _t = rkeys[i - 1];				\
	_s = veorq_u8(_s, vshlq_n_u128(_s, 4));			\
	_s = veorq_u8(_s, vshlq_n_u128(_s, 8));			\
	_t = SubWord_RotWord_XOR_duplicate(_t, rcon);		\
	rkeys[i] = veorq_u8(_s, _t);				\
} while (0)

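/*-
 * Sketch of why MKRKEY128 matches the FIPS-197 key schedule: writing the
 * previous round key as the words [w0, w1, w2, w3], the two shift-and-XOR
 * steps turn _s into the running XORs
 *     [w0, w0^w1, w0^w1^w2, w0^w1^w2^w3],
 * while _t holds T = SubWord(RotWord(w3)) ^ rcon in every lane.  XORing the
 * two therefore yields w0^T, w1^(w0^T), w2^(w1^w0^T), w3^(w2^w1^w0^T);
 * i.e. each new word is the previous new word XOR the corresponding old one,
 * which is exactly the key-expansion recurrence.
 */
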
/**
 * crypto_aes_key_expand_128_arm(key_unexpanded, rkeys):
 * Expand the 128-bit unexpanded AES key ${key_unexpanded} into the 11 round
 * keys ${rkeys}.  This implementation uses ARM AES instructions, and should
 * only be used if CPUSUPPORT_ARM_AES is defined and cpusupport_arm_aes()
 * returns nonzero.
 */
static void
crypto_aes_key_expand_128_arm(const uint8_t key_unexpanded[16],
    uint8x16_t rkeys[11])
{

	/* The first round key is just the key. */
	rkeys[0] = vld1q_u8(&key_unexpanded[0]);

	/*
	 * Each of the remaining round keys is computed from the preceding
	 * round key: rotword+subword+rcon (provided here by
	 * SubWord_RotWord_XOR_duplicate()) to compute the 'temp' value, then
	 * xor with 1, 2, 3, or all 4 of the 32-bit words from the preceding
	 * round key.
	 */
	MKRKEY128(rkeys, 1, 0x01);
	MKRKEY128(rkeys, 2, 0x02);
	MKRKEY128(rkeys, 3, 0x04);
	MKRKEY128(rkeys, 4, 0x08);
	MKRKEY128(rkeys, 5, 0x10);
	MKRKEY128(rkeys, 6, 0x20);
	MKRKEY128(rkeys, 7, 0x40);
	MKRKEY128(rkeys, 8, 0x80);
	MKRKEY128(rkeys, 9, 0x1b);
	MKRKEY128(rkeys, 10, 0x36);
}

/* Compute an AES-256 round key. */
#define MKRKEY256(rkeys, i, rcon) do {				\
	uint8x16_t _s = rkeys[i - 2];				\
	uint8x16_t _t = rkeys[i - 1];				\
	_s = veorq_u8(_s, vshlq_n_u128(_s, 4));			\
	_s = veorq_u8(_s, vshlq_n_u128(_s, 8));			\
	_t = (i % 2 == 1) ?					\
	    SubWord_duplicate(_t) :				\
	    SubWord_RotWord_XOR_duplicate(_t, rcon);		\
	rkeys[i] = veorq_u8(_s, _t);				\
} while (0)

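/*-
 * Sketch of the AES-256 variant: the same running-XOR trick is applied to
 * the round key from two steps back, while _t is derived from the most
 * recent one.  In FIPS-197 terms (Nk = 8), words whose index is 0 mod 8 get
 * SubWord+RotWord+Rcon and words whose index is 4 mod 8 get SubWord only;
 * since round key i starts at word 4*i, that is why odd-numbered round keys
 * here need only SubWord_duplicate().
 */
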
/**
 * crypto_aes_key_expand_256_arm(key_unexpanded, rkeys):
 * Expand the 256-bit unexpanded AES key ${key_unexpanded} into the 15 round
 * keys ${rkeys}.  This implementation uses ARM AES instructions, and should
 * only be used if CPUSUPPORT_ARM_AES is defined and cpusupport_arm_aes()
 * returns nonzero.
 */
static void
crypto_aes_key_expand_256_arm(const uint8_t key_unexpanded[32],
    uint8x16_t rkeys[15])
{

	/* The first two round keys are just the key. */
	rkeys[0] = vld1q_u8(&key_unexpanded[0]);
	rkeys[1] = vld1q_u8(&key_unexpanded[16]);

	/*
	 * Each of the remaining round keys is computed from the preceding
	 * pair of keys.  Even rounds use rotword+subword+rcon, while odd
	 * rounds just use subword.  The rcon value used is irrelevant for
	 * odd rounds since we ignore the value which it feeds into.
	 */
	MKRKEY256(rkeys, 2, 0x01);
	MKRKEY256(rkeys, 3, 0x00);
	MKRKEY256(rkeys, 4, 0x02);
	MKRKEY256(rkeys, 5, 0x00);
	MKRKEY256(rkeys, 6, 0x04);
	MKRKEY256(rkeys, 7, 0x00);
	MKRKEY256(rkeys, 8, 0x08);
	MKRKEY256(rkeys, 9, 0x00);
	MKRKEY256(rkeys, 10, 0x10);
	MKRKEY256(rkeys, 11, 0x00);
	MKRKEY256(rkeys, 12, 0x20);
	MKRKEY256(rkeys, 13, 0x00);
	MKRKEY256(rkeys, 14, 0x40);
}

/**
 * crypto_aes_key_expand_arm(key_unexpanded, len):
 * Expand the ${len}-byte unexpanded AES key ${key_unexpanded} into a
 * structure which can be passed to crypto_aes_encrypt_block_arm().  The
 * length must be 16 or 32.  This implementation uses ARM AES instructions,
 * and should only be used if CPUSUPPORT_ARM_AES is defined and
 * cpusupport_arm_aes() returns nonzero.
 */
void *
crypto_aes_key_expand_arm(const uint8_t * key_unexpanded, size_t len)
{
	struct crypto_aes_key_arm * kexp;

	/* Allocate structure. */
	if ((kexp = malloc(sizeof(struct crypto_aes_key_arm))) == NULL)
		goto err0;

	/* Figure out where to put the round keys. */
	ALIGN_PTR_INIT(kexp->rkeys, sizeof(uint8x16_t));

	/* Compute round keys. */
	if (len == 16) {
		kexp->nr = 10;
		crypto_aes_key_expand_128_arm(key_unexpanded, kexp->rkeys);
	} else if (len == 32) {
		kexp->nr = 14;
		crypto_aes_key_expand_256_arm(key_unexpanded, kexp->rkeys);
	} else {
		warn0("Unsupported AES key length: %zu bytes", len);
		goto err1;
	}

	/* Success! */
	return (kexp);

err1:
	free(kexp);
err0:
	/* Failure! */
	return (NULL);
}

/**
 * crypto_aes_encrypt_block_arm_u8(in, key):
 * Using the expanded AES key ${key}, encrypt the block ${in} and return the
 * resulting ciphertext.  This implementation uses ARM AES instructions,
 * and should only be used if CPUSUPPORT_ARM_AES is defined and
 * cpusupport_arm_aes() returns nonzero.
 */
uint8x16_t
crypto_aes_encrypt_block_arm_u8(uint8x16_t in, const void * key)
{
	const struct crypto_aes_key_arm * _key = key;
	const uint8x16_t * aes_key = _key->rkeys;
	uint8x16_t aes_state = in;
	size_t nr = _key->nr;

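	/*-
	 * How the rounds map onto the intrinsics (a sketch): AESE computes
	 * SubBytes(ShiftRows(state ^ roundkey)) and AESMC computes
	 * MixColumns, so each vaesmcq_u8(vaeseq_u8(state, k)) below adds the
	 * round key k and then performs the SubBytes, ShiftRows, and
	 * MixColumns of the following round; each round's AddRoundKey is
	 * thus folded into the next AESE.  The final round has no
	 * MixColumns, so it ends with a bare AESE plus a plain XOR with the
	 * last round key.
	 */
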
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[0]));
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[1]));
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[2]));
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[3]));
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[4]));
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[5]));
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[6]));
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[7]));
	aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[8]));
	if (nr > 10) {
		aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[9]));
		aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[10]));
		aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[11]));
		aes_state = vaesmcq_u8(vaeseq_u8(aes_state, aes_key[12]));
	}

	/* Last round. */
	aes_state = vaeseq_u8(aes_state, aes_key[nr - 1]);
	aes_state = veorq_u8(aes_state, aes_key[nr]);

	return (aes_state);
}

/**
 * crypto_aes_encrypt_block_arm(in, out, key):
 * Using the expanded AES key ${key}, encrypt the block ${in} and write the
 * resulting ciphertext to ${out}.  ${in} and ${out} can overlap.  This
 * implementation uses ARM AES instructions, and should only be used if
 * CPUSUPPORT_ARM_AES is defined and cpusupport_arm_aes() returns nonzero.
 */
void
crypto_aes_encrypt_block_arm(const uint8_t in[16], uint8_t out[16],
    const void * key)
{
	uint8x16_t aes_state;

	aes_state = vld1q_u8(in);
	aes_state = crypto_aes_encrypt_block_arm_u8(aes_state, key);
	vst1q_u8(out, aes_state);
}

/**
 * crypto_aes_key_free_arm(key):
 * Free the expanded AES key ${key}.
 */
void
crypto_aes_key_free_arm(void * key)
{

	/* Behave consistently with free(NULL). */
	if (key == NULL)
		return;

	/* Attempt to zero the expanded key. */
	insecure_memzero(key, sizeof(struct crypto_aes_key_arm));

	/* Free the key. */
	free(key);
}

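/*
 * Illustrative usage sketch (kept out of the build with #if 0): one way the
 * functions above could be exercised, checking a single AES-128 block
 * encryption against the known-answer test vector from FIPS-197 Appendix
 * C.1.  This assumes the caller has already confirmed that
 * cpusupport_arm_aes() returns nonzero, as required by the comments above;
 * example_selftest() is a hypothetical name, not part of this API.
 */
#if 0
#include <string.h>

static int
example_selftest(void)
{
	/* FIPS-197 Appendix C.1 key, plaintext, and expected ciphertext. */
	static const uint8_t key[16] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	};
	static const uint8_t ptext[16] = {
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
		0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff
	};
	static const uint8_t ctext[16] = {
		0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
		0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a
	};
	uint8_t out[16];
	void * kexp;

	/* Expand the 16-byte key; only 16- and 32-byte keys are supported. */
	if ((kexp = crypto_aes_key_expand_arm(key, 16)) == NULL)
		return (-1);

	/* Encrypt one block and release the expanded key. */
	crypto_aes_encrypt_block_arm(ptext, out, kexp);
	crypto_aes_key_free_arm(kexp);

	/* Compare against the expected ciphertext. */
	return (memcmp(out, ctext, 16) ? -1 : 0);
}
#endif
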
#endif /* CPUSUPPORT_ARM_AES */