/*
 * Hardware-accelerated implementation of SHA-512 using Arm NEON.
 */

#include "ssh.h"
#include "sha512.h"

#if USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

static bool sha512_neon_available(void)
{
    /*
     * For Arm, we delegate to a per-platform detection function (see
     * explanation in aes-neon.c).
     */
    return platform_sha512_neon_available();
}

#if !HAVE_NEON_SHA512_INTRINSICS
/*
 * clang 12 and before do not provide the SHA-512 NEON intrinsics, but
 * do provide assembler support for the underlying instructions. So I
 * define the intrinsic functions myself, using inline assembler.
 */
static inline uint64x2_t vsha512su0q_u64(uint64x2_t x, uint64x2_t y)
{
    __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y));
    return x;
}

static inline uint64x2_t vsha512su1q_u64(uint64x2_t x, uint64x2_t y,
                                         uint64x2_t z)
{
    __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}

static inline uint64x2_t vsha512hq_u64(uint64x2_t x, uint64x2_t y,
                                       uint64x2_t z)
{
    __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}

static inline uint64x2_t vsha512h2q_u64(uint64x2_t x, uint64x2_t y,
                                        uint64x2_t z)
{
    __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
#endif /* HAVE_NEON_SHA512_INTRINSICS */

typedef struct sha512_neon_core sha512_neon_core;
struct sha512_neon_core {
    uint64x2_t ab, cd, ef, gh;
};
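
/* Load 16 bytes of input and byte-swap each 64-bit lane, converting
 * the big-endian SHA-512 message words into the lane order that the
 * rest of the code works with. */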
static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
{
    return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
}

static inline uint64x2_t sha512_neon_schedule_update(
    uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
{
    /*
     * vsha512su0q_u64() takes words from a long way back in the
     * schedule and performs the sigma_0 half of the computation of
     * the next two 64-bit message-schedule words.
     *
     * vsha512su1q_u64() combines the result of that with the sigma_1
     * steps, to output the finished version of those two words. The
     * total amount of input data it requires fits nicely into three
     * 128-bit vector registers, but one of those registers is
     * misaligned compared to the 128-bit chunks that the message
     * schedule is stored in. So we use vextq_u64 to make one of its
     * input words out of the second half of m4 and the first half of
     * m3.
     */
    return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
}
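
/*
 * For reference, in FIPS 180-4 terms: if m8..m1 hold the word pairs
 * W[t-16..t-15], W[t-14..t-13], W[t-8..t-7], W[t-6..t-5] and
 * W[t-2..t-1], then one call to the function above returns the pair
 * W[t..t+1], computed by the usual schedule recurrence
 *
 *   W[t]   = sigma_1(W[t-2]) + W[t-7] + sigma_0(W[t-15]) + W[t-16]
 *   W[t+1] = sigma_1(W[t-1]) + W[t-6] + sigma_0(W[t-14]) + W[t-15]
 */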

static inline void sha512_neon_round2(
    unsigned round_index, uint64x2_t schedule_words,
    uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
{
    /*
     * vsha512hq_u64 performs the Sigma_1 and Ch half of the
     * computation of two rounds of SHA-512 (including feeding back
     * one of the outputs from the first of those half-rounds into the
     * second one).
     *
     * vsha512h2q_u64 combines the result of that with the Sigma_0 and
     * Maj steps, and outputs one 128-bit vector that replaces the gh
     * piece of the input hash state, and a second that updates cd by
     * addition.
     *
     * Similarly to vsha512su1q_u64 above, some of the input registers
     * expected by these instructions are misaligned by 64 bits
     * relative to the chunks we've divided the hash state into, so we
     * have to start by making 'de' and 'fg' words out of our input
     * cd,ef,gh, using vextq_u64.
     *
     * Also, one of the inputs to vsha512hq_u64 is expected to contain
     * the results of summing gh + two round constants + two words of
     * message schedule, but the two words of the message schedule
     * have to be the opposite way round in the vector register from
     * the way that vsha512su1q_u64 output them. Hence, there's
     * another vextq_u64 in here that swaps the two halves of the
     * initial_sum vector register.
     *
     * (This also means that I don't have to prepare a specially
     * reordered version of the sha512_round_constants[] array: as
     * long as I'm unavoidably doing a swap at run time _anyway_, I
     * can load from the normally ordered version of that array, and
     * just take care to fold in that data _before_ the swap rather
     * than after.)
     */

    /* Load two round constants, with the first one in the low half */
    uint64x2_t round_constants = vld1q_u64(
        sha512_round_constants + round_index);

    /* Add schedule words to round constants */
    uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);

    /* Swap that sum around so the word used in the first of the two
     * rounds is in the _high_ half of the vector, matching where h
     * lives in the gh vector */
    uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);

    /* Add gh to that, now that they're matching ways round */
    uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);

    /* Make the misaligned de and fg words */
    uint64x2_t de = vextq_u64(*cd, *ef, 1);
    uint64x2_t fg = vextq_u64(*ef, *gh, 1);

    /* Now we're ready to put all the pieces together. The output from
     * vsha512h2q_u64 can be used directly as the new gh, and the
     * output from vsha512hq_u64 is simultaneously the intermediate
     * value passed to h2 and the thing you have to add on to cd. */
    uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
    *gh = vsha512h2q_u64(intermed, *cd, *ab);
    *cd = vaddq_u64(*cd, intermed);
}
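
/*
 * For reference: roughly speaking, one call to sha512_neon_round2
 * covers two scalar SHA-512 rounds of the standard FIPS 180-4 form
 * (with the hash state packed as ab = {a,b}, cd = {c,d}, ef = {e,f},
 * gh = {g,h}):
 *
 *   T1 = h + Sigma_1(e) + Ch(e,f,g) + K[t] + W[t]
 *   T2 = Sigma_0(a) + Maj(a,b,c)
 *   h = g; g = f; f = e; e = d + T1;
 *   d = c; c = b; b = a; a = T1 + T2;
 *
 * applied for t = round_index and then t = round_index + 1.
 */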

static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
{
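    /*
     * Note that the (ab,cd,ef,gh) arguments to sha512_neon_round2 are
     * rotated by one position after every two rounds: that rotation of
     * the argument list stands in for the renaming of the scalar state
     * words a..h, so no data has to move between vector registers.
     */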

    uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;
    uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;

    s0 = sha512_neon_load_input(p + 16*0);
    sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_load_input(p + 16*1);
    sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_load_input(p + 16*2);
    sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_load_input(p + 16*3);
    sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_load_input(p + 16*4);
    sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_load_input(p + 16*5);
    sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_load_input(p + 16*6);
    sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_load_input(p + 16*7);
    sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);

    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);

    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);

    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);

    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);

    core->ab = vaddq_u64(core->ab, ab);
    core->cd = vaddq_u64(core->cd, cd);
    core->ef = vaddq_u64(core->ef, ef);
    core->gh = vaddq_u64(core->gh, gh);
}

typedef struct sha512_neon {
    sha512_neon_core core;
    sha512_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha512_neon;

static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len);

static ssh_hash *sha512_neon_new(const ssh_hashalg *alg)
{
    const struct sha512_extra *extra = (const struct sha512_extra *)alg->extra;
    if (!check_availability(extra))
        return NULL;

    sha512_neon *s = snew(sha512_neon);
    s->hash.vt = alg;
    BinarySink_INIT(s, sha512_neon_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}

static void sha512_neon_reset(ssh_hash *hash)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);
    const struct sha512_extra *extra =
        (const struct sha512_extra *)hash->vt->extra;

    s->core.ab = vld1q_u64(extra->initial_state);
    s->core.cd = vld1q_u64(extra->initial_state+2);
    s->core.ef = vld1q_u64(extra->initial_state+4);
    s->core.gh = vld1q_u64(extra->initial_state+6);

    sha512_block_setup(&s->blk);
}

static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha512_neon *copy = container_of(hcopy, sha512_neon, hash);
    sha512_neon *orig = container_of(horig, sha512_neon, hash);

    *copy = *orig; /* structure copy */

    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha512_neon_free(ssh_hash *hash)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);
    smemclr(s, sizeof(*s));
    sfree(s);
}

static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len)
{
    sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon);
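
    /* Accumulate the incoming data into the block buffer, compressing
     * each complete 128-byte block into the running hash state as it
     * is filled. */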
    while (len > 0)
        if (sha512_block_write(&s->blk, &vp, &len))
            sha512_neon_block(&s->core, s->blk.block);
}

static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
    vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
    vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh)));
}
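
/*
 * SHA-384 runs the identical SHA-512 transform (with a different
 * initial state supplied via the vtable's 'extra' data); its digest is
 * simply the first 48 bytes of the final state, so only ab, cd and ef
 * are emitted here.
 */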
static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
    vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
}

SHA512_VTABLES(neon, "NEON accelerated");