/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>

.syntax unified
.code 32
.fpu neon

.text

/* structure of SHA512_CONTEXT */
#define hd_a 0
#define hd_b ((hd_a) + 8)
#define hd_c ((hd_b) + 8)
#define hd_d ((hd_c) + 8)
#define hd_e ((hd_d) + 8)
#define hd_f ((hd_e) + 8)
#define hd_g ((hd_f) + 8)
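
/* Note: the hd_* offsets above describe eight consecutive u64 state words
 * (a at byte offset 0 through g at 48, with h following at 56). The code
 * below does not index the context through these defines; it loads and
 * stores the whole state with vld1.64/vst1.64 through %r0 instead.
 */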
/* register macros */
#define RK %r2

#define RA d0
#define RB d1
#define RC d2
#define RD d3
#define RE d4
#define RF d5
#define RG d6
#define RH d7

#define RT0 d8
#define RT1 d9
#define RT2 d10
#define RT3 d11
#define RT4 d12
#define RT5 d13
#define RT6 d14
#define RT7 d15

#define RT01q q4
#define RT23q q5
#define RT45q q6
#define RT67q q7

#define RW0 d16
#define RW1 d17
#define RW2 d18
#define RW3 d19
#define RW4 d20
#define RW5 d21
#define RW6 d22
#define RW7 d23
#define RW8 d24
#define RW9 d25
#define RW10 d26
#define RW11 d27
#define RW12 d28
#define RW13 d29
#define RW14 d30
#define RW15 d31

#define RW01q q8
#define RW23q q9
#define RW45q q10
#define RW67q q11
#define RW89q q12
#define RW1011q q13
#define RW1213q q14
#define RW1415q q15

/***********************************************************************
 * ARM assembly implementation of sha512 transform
 ***********************************************************************/
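
/* For reference (FIPS 180-4), the per-round functions computed by the
 * vshr/vshl/veor/vbsl sequences in the macros below are:
 *
 *   Ch(e, f, g)  = (e & f) ^ (~e & g)            -> vbsl with e as selector
 *   Maj(a, b, c) = (a & b) ^ (a & c) ^ (b & c)   -> vbsl with (a ^ b) as selector
 *   Sum1(e)      = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41)
 *   Sum0(a)      = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39)
 *
 * NEON has no 64-bit rotate instruction, so each rotate is built from a
 * vshr.u64/vshl.u64 pair combined with veor.
 */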
#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
		     rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT2, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	interleave_op(arg1); \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, re, #41; \
	vshl.u64 RT5, re, #64 - 41; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, re; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, rf, rg; \
\
	vadd.u64 RT1, RT1, rh; \
	vshr.u64 RT2, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, ra, #39; \
	vshl.u64 RT5, ra, #64 - 39; \
	veor.64 RT0, ra, rb; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rc, rb; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 rh, RT2, RT3; \
\
	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
	vshr.u64 RT2, rd, #14; \
	vshl.u64 RT3, rd, #64 - 14; \
	vadd.u64 rh, rh, RT0; \
	vshr.u64 RT4, rd, #18; \
	vshl.u64 RT5, rd, #64 - 18; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rd, #41; \
	vshl.u64 RT5, rd, #64 - 41; \
	vadd.u64 RT0, RT0, rw1; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, rd; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, re, rf; \
\
	vadd.u64 RT1, RT1, rg; \
	vshr.u64 RT2, rh, #28; \
	vshl.u64 RT3, rh, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, rh, #34; \
	vshl.u64 RT5, rh, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
\
	/* g = Sum0 (h) + Maj (h, a, b); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rh, #39; \
	vshl.u64 RT5, rh, #64 - 39; \
	veor.64 RT0, rh, ra; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rb, ra; \
	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
	veor.64 rg, RT2, RT3; \
\
	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
	/* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
\
	/**** S0(w[1:2]) */ \
\
	/* w[0:1] += w[9:10] */ \
	/* RT23q = rw1:rw2 */ \
	vext.u64 RT23q, rw01q, rw23q, #1; \
	vadd.u64 rw0, rw9; \
	vadd.u64 rg, rg, RT0; \
	vadd.u64 rw1, rw10; \
	vadd.u64 rg, rg, RT1; /* g+=t1; */ \
\
	vshr.u64 RT45q, RT23q, #1; \
	vshl.u64 RT67q, RT23q, #64 - 1; \
	vshr.u64 RT01q, RT23q, #8; \
	veor.u64 RT45q, RT45q, RT67q; \
	vshl.u64 RT67q, RT23q, #64 - 8; \
	veor.u64 RT45q, RT45q, RT01q; \
	vshr.u64 RT01q, RT23q, #7; \
	veor.u64 RT45q, RT45q, RT67q; \
\
	/**** S1(w[14:15]) */ \
	vshr.u64 RT23q, rw1415q, #6; \
	veor.u64 RT01q, RT01q, RT45q; \
	vshr.u64 RT45q, rw1415q, #19; \
	vshl.u64 RT67q, rw1415q, #64 - 19; \
	veor.u64 RT23q, RT23q, RT45q; \
	vshr.u64 RT45q, rw1415q, #61; \
	veor.u64 RT23q, RT23q, RT67q; \
	vshl.u64 RT67q, rw1415q, #64 - 61; \
	veor.u64 RT23q, RT23q, RT45q; \
	vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
	veor.u64 RT01q, RT23q, RT67q;
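
/* For reference, the message-schedule sigma functions expanded above are
 * (FIPS 180-4):
 *
 *   S0(x) = ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7)
 *   S1(x) = ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6)
 *
 * rounds2_0_63 processes two rounds and expands two schedule words per
 * invocation.  The final "w[0:1] += S1(w[14:15])" addition is only left
 * pending in RT01q here; the next invocation applies it through its
 * interleave_op slot (vadd_RT01q), overlapping it with the start of the
 * following two rounds.
 */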
#define vadd_RT01q(rw01q) \
	/* w[0:1] += S(w[14:15]) */ \
	vadd.u64 rw01q, RT01q;

#define dummy(_) /*_*/
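
/* The interleave_op/arg parameters of the round macros splice one extra
 * instruction into a fixed slot of the round sequence, e.g. the pending
 * vadd_RT01q above or the vadd_rg_* fixups used during rounds 64..79;
 * dummy() is passed when there is nothing to interleave.
 */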
#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
		      interleave_op1, arg1, interleave_op2, arg2) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT2, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	interleave_op1(arg1); \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	interleave_op2(arg2); \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, re, #41; \
	vshl.u64 RT5, re, #64 - 41; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, re; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, rf, rg; \
\
	vadd.u64 RT1, RT1, rh; \
	vshr.u64 RT2, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, ra, #39; \
	vshl.u64 RT5, ra, #64 - 39; \
	veor.64 RT0, ra, rb; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rc, rb; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 rh, RT2, RT3; \
\
	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
	vshr.u64 RT2, rd, #14; \
	vshl.u64 RT3, rd, #64 - 14; \
	vadd.u64 rh, rh, RT0; \
	vshr.u64 RT4, rd, #18; \
	vshl.u64 RT5, rd, #64 - 18; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rd, #41; \
	vshl.u64 RT5, rd, #64 - 41; \
	vadd.u64 RT0, RT0, rw1; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, rd; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, re, rf; \
\
	vadd.u64 RT1, RT1, rg; \
	vshr.u64 RT2, rh, #28; \
	vshl.u64 RT3, rh, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, rh, #34; \
	vshl.u64 RT5, rh, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
\
	/* g = Sum0 (h) + Maj (h, a, b); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rh, #39; \
	vshl.u64 RT5, rh, #64 - 39; \
	veor.64 RT0, rh, ra; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rb, ra; \
	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
	veor.64 rg, RT2, RT3;

#define vadd_rg_RT0(rg) \
	vadd.u64 rg, rg, RT0;
#define vadd_rg_RT1(rg) \
	vadd.u64 rg, rg, RT1; /* g+=t1; */

.align 3
ENTRY(sha512_transform_neon)
	/* Input:
	 *	%r0: SHA512_CONTEXT
	 *	%r1: data
	 *	%r2: u64 k[] constants
	 *	%r3: nblks
	 */
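	/*
	 * A matching C prototype sketch (parameter names are illustrative,
	 * derived from the register assignment above):
	 *
	 *   void sha512_transform_neon(u64 *digest, const void *data,
	 *                              const u64 k[], unsigned int num_blks);
	 */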
	push {%lr};

	mov %lr, #0;

	/* Load context to d0-d7 */
	vld1.64 {RA-RD}, [%r0]!;
	vld1.64 {RE-RH}, [%r0];
	sub %r0, #(4*8);

	/* Load input to w[16], d16-d31 */
	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
	vld1.64 {RW0-RW3}, [%r1]!;
	vld1.64 {RW4-RW7}, [%r1]!;
	vld1.64 {RW8-RW11}, [%r1]!;
	vld1.64 {RW12-RW15}, [%r1]!;
#ifdef __ARMEL__
	/* byteswap */
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif
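	/* SHA-512 treats the message as big-endian 64-bit words, so on a
	 * little-endian (__ARMEL__) build each doubleword loaded above is
	 * byte-reversed with vrev64.8.
	 */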
	/* EABI says that d8-d15 must be preserved by callee. */
	/*vpush {RT0-RT7};*/

.Loop:
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
		     RW23q, RW1415q, RW9, RW10, dummy, _);
	b .Lenter_rounds;

.Loop_rounds:
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
		     RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
.Lenter_rounds:
	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
		     RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
		     RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
		     RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
		     RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
		     RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
	add %lr, #16;
	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
		     RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
	cmp %lr, #64;
	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
		     RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
	bne .Loop_rounds;
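	/* %lr counts 16 rounds per .Loop_rounds pass, so the loop above runs
	 * until 64 of the 80 rounds are done.  Rounds 64..79 need no further
	 * message expansion and are unrolled below, interleaved with loading
	 * (and byteswapping) the next input block when one remains.
	 */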
	subs %r3, #1;

	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
		      vadd_RT01q, RW1415q, dummy, _);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
	beq .Lhandle_tail;
	vld1.64 {RW0-RW3}, [%r1]!;
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
#endif
	vld1.64 {RW4-RW7}, [%r1]!;
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
#ifdef __ARMEL__
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
#endif
	vld1.64 {RW8-RW11}, [%r1]!;
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
#endif
	vld1.64 {RW12-RW15}, [%r1]!;

	vadd_rg_RT0(RA);
	vadd_rg_RT1(RA);

	/* Load context */
	vld1.64 {RT0-RT3}, [%r0]!;
	vld1.64 {RT4-RT7}, [%r0];
	sub %r0, #(4*8);

#ifdef __ARMEL__
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif

	vadd.u64 RA, RT0;
	vadd.u64 RB, RT1;
	vadd.u64 RC, RT2;
	vadd.u64 RD, RT3;
	vadd.u64 RE, RT4;
	vadd.u64 RF, RT5;
	vadd.u64 RG, RT6;
	vadd.u64 RH, RT7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;
	sub RK, $(8*80);
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	mov %lr, #0;
	sub %r0, #(4*8);

	b .Loop;

.Lhandle_tail:
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);

	/* Load context to d16-d23 */
	vld1.64 {RW0-RW3}, [%r0]!;
	vadd_rg_RT0(RA);
	vld1.64 {RW4-RW7}, [%r0];
	vadd_rg_RT1(RA);
	sub %r0, #(4*8);

	vadd.u64 RA, RW0;
	vadd.u64 RB, RW1;
	vadd.u64 RC, RW2;
	vadd.u64 RD, RW3;
	vadd.u64 RE, RW4;
	vadd.u64 RF, RW5;
	vadd.u64 RG, RW6;
	vadd.u64 RH, RW7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;

	/* Clear used registers */
	/* d16-d31 */
	veor.u64 RW01q, RW01q;
	veor.u64 RW23q, RW23q;
	veor.u64 RW45q, RW45q;
	veor.u64 RW67q, RW67q;
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	veor.u64 RW89q, RW89q;
	veor.u64 RW1011q, RW1011q;
	veor.u64 RW1213q, RW1213q;
	veor.u64 RW1415q, RW1415q;
	/* d8-d15 */
	/*vpop {RT0-RT7};*/
	/* d0-d7 (q0-q3) */
	veor.u64 %q0, %q0;
	veor.u64 %q1, %q1;
	veor.u64 %q2, %q2;
	veor.u64 %q3, %q3;

	pop {%pc};
ENDPROC(sha512_transform_neon)