/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */
  10. #include <linux/linkage.h>
  11. .syntax unified
  12. .code 32
  13. .fpu neon
  14. .text
  15. /* Context structure */
  16. #define state_h0 0
  17. #define state_h1 4
  18. #define state_h2 8
  19. #define state_h3 12
  20. #define state_h4 16
  21. /* Constants */
  22. #define K1 0x5A827999
  23. #define K2 0x6ED9EBA1
  24. #define K3 0x8F1BBCDC
  25. #define K4 0xCA62C1D6
  26. .align 4
  27. .LK_VEC:
  28. .LK1: .long K1, K1, K1, K1
  29. .LK2: .long K2, K2, K2, K2
  30. .LK3: .long K3, K3, K3, K3
  31. .LK4: .long K4, K4, K4, K4
  32. /* Register macros */
  33. #define RSTATE r0
  34. #define RDATA r1
  35. #define RNBLKS r2
  36. #define ROLDSTACK r3
  37. #define RWK lr
  38. #define _a r4
  39. #define _b r5
  40. #define _c r6
  41. #define _d r7
  42. #define _e r8
  43. #define RT0 r9
  44. #define RT1 r10
  45. #define RT2 r11
  46. #define RT3 r12
  47. #define W0 q0
  48. #define W1 q1
  49. #define W2 q2
  50. #define W3 q3
  51. #define W4 q4
  52. #define W5 q5
  53. #define W6 q6
  54. #define W7 q7
  55. #define tmp0 q8
  56. #define tmp1 q9
  57. #define tmp2 q10
  58. #define tmp3 q11
  59. #define qK1 q12
  60. #define qK2 q13
  61. #define qK3 q14
  62. #define qK4 q15
  63. /* Round function macros. */
  64. #define WK_offs(i) (((i) & 15) * 4)
  65. #define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  66. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  67. ldr RT3, [sp, WK_offs(i)]; \
  68. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  69. bic RT0, d, b; \
  70. add e, e, a, ror #(32 - 5); \
  71. and RT1, c, b; \
  72. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  73. add RT0, RT0, RT3; \
  74. add e, e, RT1; \
  75. ror b, #(32 - 30); \
  76. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  77. add e, e, RT0;
  78. #define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  79. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  80. ldr RT3, [sp, WK_offs(i)]; \
  81. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  82. eor RT0, d, b; \
  83. add e, e, a, ror #(32 - 5); \
  84. eor RT0, RT0, c; \
  85. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  86. add e, e, RT3; \
  87. ror b, #(32 - 30); \
  88. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  89. add e, e, RT0; \
  90. #define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  91. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  92. ldr RT3, [sp, WK_offs(i)]; \
  93. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  94. eor RT0, b, c; \
  95. and RT1, b, c; \
  96. add e, e, a, ror #(32 - 5); \
  97. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  98. and RT0, RT0, d; \
  99. add RT1, RT1, RT3; \
  100. add e, e, RT0; \
  101. ror b, #(32 - 30); \
  102. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  103. add e, e, RT1;
  104. #define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  105. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  106. _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  107. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  108. #define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
  109. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  110. _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  111. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  112. #define R(a,b,c,d,e,f,i) \
  113. _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
  114. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  115. #define dummy(...)
  116. /* Input expansion macros. */
  117. /********* Precalc macros for rounds 0-15 *************************************/
  118. #define W_PRECALC_00_15() \
  119. add RWK, sp, #(WK_offs(0)); \
  120. \
  121. vld1.32 {tmp0, tmp1}, [RDATA]!; \
  122. vrev32.8 W0, tmp0; /* big => little */ \
  123. vld1.32 {tmp2, tmp3}, [RDATA]!; \
  124. vadd.u32 tmp0, W0, curK; \
  125. vrev32.8 W7, tmp1; /* big => little */ \
  126. vrev32.8 W6, tmp2; /* big => little */ \
  127. vadd.u32 tmp1, W7, curK; \
  128. vrev32.8 W5, tmp3; /* big => little */ \
  129. vadd.u32 tmp2, W6, curK; \
  130. vst1.32 {tmp0, tmp1}, [RWK]!; \
  131. vadd.u32 tmp3, W5, curK; \
  132. vst1.32 {tmp2, tmp3}, [RWK]; \
  133. #define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  134. vld1.32 {tmp0, tmp1}, [RDATA]!; \
  135. #define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  136. add RWK, sp, #(WK_offs(0)); \
  137. #define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  138. vrev32.8 W0, tmp0; /* big => little */ \
  139. #define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  140. vld1.32 {tmp2, tmp3}, [RDATA]!; \
  141. #define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  142. vadd.u32 tmp0, W0, curK; \
  143. #define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  144. vrev32.8 W7, tmp1; /* big => little */ \
  145. #define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  146. vrev32.8 W6, tmp2; /* big => little */ \
  147. #define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  148. vadd.u32 tmp1, W7, curK; \
  149. #define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  150. vrev32.8 W5, tmp3; /* big => little */ \
  151. #define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  152. vadd.u32 tmp2, W6, curK; \
  153. #define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  154. vst1.32 {tmp0, tmp1}, [RWK]!; \
  155. #define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  156. vadd.u32 tmp3, W5, curK; \
  157. #define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  158. vst1.32 {tmp2, tmp3}, [RWK]; \
  159. /********* Precalc macros for rounds 16-31 ************************************/
  160. #define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  161. veor tmp0, tmp0; \
  162. vext.8 W, W_m16, W_m12, #8; \
  163. #define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  164. add RWK, sp, #(WK_offs(i)); \
  165. vext.8 tmp0, W_m04, tmp0, #4; \
  166. #define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  167. veor tmp0, tmp0, W_m16; \
  168. veor.32 W, W, W_m08; \
  169. #define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  170. veor tmp1, tmp1; \
  171. veor W, W, tmp0; \
  172. #define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  173. vshl.u32 tmp0, W, #1; \
  174. #define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  175. vext.8 tmp1, tmp1, W, #(16-12); \
  176. vshr.u32 W, W, #31; \
  177. #define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  178. vorr tmp0, tmp0, W; \
  179. vshr.u32 W, tmp1, #30; \
  180. #define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  181. vshl.u32 tmp1, tmp1, #2; \
  182. #define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  183. veor tmp0, tmp0, W; \
  184. #define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  185. veor W, tmp0, tmp1; \
  186. #define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  187. vadd.u32 tmp0, W, curK; \
  188. #define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  189. vst1.32 {tmp0}, [RWK];
  190. /********* Precalc macros for rounds 32-79 ************************************/
  191. #define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  192. veor W, W_m28; \
  193. #define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  194. vext.8 tmp0, W_m08, W_m04, #8; \
  195. #define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  196. veor W, W_m16; \
  197. #define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  198. veor W, tmp0; \
  199. #define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  200. add RWK, sp, #(WK_offs(i&~3)); \
  201. #define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  202. vshl.u32 tmp1, W, #2; \
  203. #define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  204. vshr.u32 tmp0, W, #30; \
  205. #define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  206. vorr W, tmp0, tmp1; \
  207. #define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  208. vadd.u32 tmp0, W, curK; \
  209. #define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  210. vst1.32 {tmp0}, [RWK];
  211. /*
  212. * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
  213. *
  214. * unsigned int
  215. * sha1_transform_neon (void *ctx, const unsigned char *data,
  216. * unsigned int nblks)
  217. */
  218. .align 3
  219. ENTRY(sha1_transform_neon)
  220. /* input:
  221. * r0: ctx, CTX
  222. * r1: data (64*nblks bytes)
  223. * r2: nblks
  224. */
  225. cmp RNBLKS, #0;
  226. beq .Ldo_nothing;
  227. push {r4-r12, lr};
  228. /*vpush {q4-q7};*/
  229. adr RT3, .LK_VEC;
  230. mov ROLDSTACK, sp;
  231. /* Align stack. */
  232. sub RT0, sp, #(16*4);
  233. and RT0, #(~(16-1));
  234. mov sp, RT0;
  235. vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
  236. /* Get the values of the chaining variables. */
  237. ldm RSTATE, {_a-_e};
  238. vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
  239. #undef curK
  240. #define curK qK1
  241. /* Precalc 0-15. */
  242. W_PRECALC_00_15();
  243. .Loop:
  244. /* Transform 0-15 + Precalc 16-31. */
  245. _R( _a, _b, _c, _d, _e, F1, 0,
  246. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
  247. W4, W5, W6, W7, W0, _, _, _ );
  248. _R( _e, _a, _b, _c, _d, F1, 1,
  249. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
  250. W4, W5, W6, W7, W0, _, _, _ );
  251. _R( _d, _e, _a, _b, _c, F1, 2,
  252. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
  253. W4, W5, W6, W7, W0, _, _, _ );
  254. _R( _c, _d, _e, _a, _b, F1, 3,
  255. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
  256. W4, W5, W6, W7, W0, _, _, _ );
  257. #undef curK
  258. #define curK qK2
  259. _R( _b, _c, _d, _e, _a, F1, 4,
  260. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
  261. W3, W4, W5, W6, W7, _, _, _ );
  262. _R( _a, _b, _c, _d, _e, F1, 5,
  263. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
  264. W3, W4, W5, W6, W7, _, _, _ );
  265. _R( _e, _a, _b, _c, _d, F1, 6,
  266. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
  267. W3, W4, W5, W6, W7, _, _, _ );
  268. _R( _d, _e, _a, _b, _c, F1, 7,
  269. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
  270. W3, W4, W5, W6, W7, _, _, _ );
  271. _R( _c, _d, _e, _a, _b, F1, 8,
  272. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
  273. W2, W3, W4, W5, W6, _, _, _ );
  274. _R( _b, _c, _d, _e, _a, F1, 9,
  275. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
  276. W2, W3, W4, W5, W6, _, _, _ );
  277. _R( _a, _b, _c, _d, _e, F1, 10,
  278. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
  279. W2, W3, W4, W5, W6, _, _, _ );
  280. _R( _e, _a, _b, _c, _d, F1, 11,
  281. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
  282. W2, W3, W4, W5, W6, _, _, _ );
  283. _R( _d, _e, _a, _b, _c, F1, 12,
  284. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
  285. W1, W2, W3, W4, W5, _, _, _ );
  286. _R( _c, _d, _e, _a, _b, F1, 13,
  287. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
  288. W1, W2, W3, W4, W5, _, _, _ );
  289. _R( _b, _c, _d, _e, _a, F1, 14,
  290. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
  291. W1, W2, W3, W4, W5, _, _, _ );
  292. _R( _a, _b, _c, _d, _e, F1, 15,
  293. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
  294. W1, W2, W3, W4, W5, _, _, _ );
  295. /* Transform 16-63 + Precalc 32-79. */
  296. _R( _e, _a, _b, _c, _d, F1, 16,
  297. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
  298. W0, W1, W2, W3, W4, W5, W6, W7);
  299. _R( _d, _e, _a, _b, _c, F1, 17,
  300. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
  301. W0, W1, W2, W3, W4, W5, W6, W7);
  302. _R( _c, _d, _e, _a, _b, F1, 18,
  303. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
  304. W0, W1, W2, W3, W4, W5, W6, W7);
  305. _R( _b, _c, _d, _e, _a, F1, 19,
  306. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
  307. W0, W1, W2, W3, W4, W5, W6, W7);
  308. _R( _a, _b, _c, _d, _e, F2, 20,
  309. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
  310. W7, W0, W1, W2, W3, W4, W5, W6);
  311. _R( _e, _a, _b, _c, _d, F2, 21,
  312. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
  313. W7, W0, W1, W2, W3, W4, W5, W6);
  314. _R( _d, _e, _a, _b, _c, F2, 22,
  315. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
  316. W7, W0, W1, W2, W3, W4, W5, W6);
  317. _R( _c, _d, _e, _a, _b, F2, 23,
  318. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
  319. W7, W0, W1, W2, W3, W4, W5, W6);
  320. #undef curK
  321. #define curK qK3
  322. _R( _b, _c, _d, _e, _a, F2, 24,
  323. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
  324. W6, W7, W0, W1, W2, W3, W4, W5);
  325. _R( _a, _b, _c, _d, _e, F2, 25,
  326. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
  327. W6, W7, W0, W1, W2, W3, W4, W5);
  328. _R( _e, _a, _b, _c, _d, F2, 26,
  329. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
  330. W6, W7, W0, W1, W2, W3, W4, W5);
  331. _R( _d, _e, _a, _b, _c, F2, 27,
  332. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
  333. W6, W7, W0, W1, W2, W3, W4, W5);
  334. _R( _c, _d, _e, _a, _b, F2, 28,
  335. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
  336. W5, W6, W7, W0, W1, W2, W3, W4);
  337. _R( _b, _c, _d, _e, _a, F2, 29,
  338. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
  339. W5, W6, W7, W0, W1, W2, W3, W4);
  340. _R( _a, _b, _c, _d, _e, F2, 30,
  341. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
  342. W5, W6, W7, W0, W1, W2, W3, W4);
  343. _R( _e, _a, _b, _c, _d, F2, 31,
  344. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
  345. W5, W6, W7, W0, W1, W2, W3, W4);
  346. _R( _d, _e, _a, _b, _c, F2, 32,
  347. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
  348. W4, W5, W6, W7, W0, W1, W2, W3);
  349. _R( _c, _d, _e, _a, _b, F2, 33,
  350. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
  351. W4, W5, W6, W7, W0, W1, W2, W3);
  352. _R( _b, _c, _d, _e, _a, F2, 34,
  353. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
  354. W4, W5, W6, W7, W0, W1, W2, W3);
  355. _R( _a, _b, _c, _d, _e, F2, 35,
  356. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
  357. W4, W5, W6, W7, W0, W1, W2, W3);
  358. _R( _e, _a, _b, _c, _d, F2, 36,
  359. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
  360. W3, W4, W5, W6, W7, W0, W1, W2);
  361. _R( _d, _e, _a, _b, _c, F2, 37,
  362. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
  363. W3, W4, W5, W6, W7, W0, W1, W2);
  364. _R( _c, _d, _e, _a, _b, F2, 38,
  365. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
  366. W3, W4, W5, W6, W7, W0, W1, W2);
  367. _R( _b, _c, _d, _e, _a, F2, 39,
  368. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
  369. W3, W4, W5, W6, W7, W0, W1, W2);
  370. _R( _a, _b, _c, _d, _e, F3, 40,
  371. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
  372. W2, W3, W4, W5, W6, W7, W0, W1);
  373. _R( _e, _a, _b, _c, _d, F3, 41,
  374. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
  375. W2, W3, W4, W5, W6, W7, W0, W1);
  376. _R( _d, _e, _a, _b, _c, F3, 42,
  377. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
  378. W2, W3, W4, W5, W6, W7, W0, W1);
  379. _R( _c, _d, _e, _a, _b, F3, 43,
  380. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
  381. W2, W3, W4, W5, W6, W7, W0, W1);
  382. #undef curK
  383. #define curK qK4
  384. _R( _b, _c, _d, _e, _a, F3, 44,
  385. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
  386. W1, W2, W3, W4, W5, W6, W7, W0);
  387. _R( _a, _b, _c, _d, _e, F3, 45,
  388. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
  389. W1, W2, W3, W4, W5, W6, W7, W0);
  390. _R( _e, _a, _b, _c, _d, F3, 46,
  391. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
  392. W1, W2, W3, W4, W5, W6, W7, W0);
  393. _R( _d, _e, _a, _b, _c, F3, 47,
  394. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
  395. W1, W2, W3, W4, W5, W6, W7, W0);
  396. _R( _c, _d, _e, _a, _b, F3, 48,
  397. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
  398. W0, W1, W2, W3, W4, W5, W6, W7);
  399. _R( _b, _c, _d, _e, _a, F3, 49,
  400. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
  401. W0, W1, W2, W3, W4, W5, W6, W7);
  402. _R( _a, _b, _c, _d, _e, F3, 50,
  403. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
  404. W0, W1, W2, W3, W4, W5, W6, W7);
  405. _R( _e, _a, _b, _c, _d, F3, 51,
  406. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
  407. W0, W1, W2, W3, W4, W5, W6, W7);
  408. _R( _d, _e, _a, _b, _c, F3, 52,
  409. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
  410. W7, W0, W1, W2, W3, W4, W5, W6);
  411. _R( _c, _d, _e, _a, _b, F3, 53,
  412. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
  413. W7, W0, W1, W2, W3, W4, W5, W6);
  414. _R( _b, _c, _d, _e, _a, F3, 54,
  415. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
  416. W7, W0, W1, W2, W3, W4, W5, W6);
  417. _R( _a, _b, _c, _d, _e, F3, 55,
  418. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
  419. W7, W0, W1, W2, W3, W4, W5, W6);
  420. _R( _e, _a, _b, _c, _d, F3, 56,
  421. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
  422. W6, W7, W0, W1, W2, W3, W4, W5);
  423. _R( _d, _e, _a, _b, _c, F3, 57,
  424. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
  425. W6, W7, W0, W1, W2, W3, W4, W5);
  426. _R( _c, _d, _e, _a, _b, F3, 58,
  427. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
  428. W6, W7, W0, W1, W2, W3, W4, W5);
  429. _R( _b, _c, _d, _e, _a, F3, 59,
  430. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
  431. W6, W7, W0, W1, W2, W3, W4, W5);
  432. subs RNBLKS, #1;
  433. _R( _a, _b, _c, _d, _e, F4, 60,
  434. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
  435. W5, W6, W7, W0, W1, W2, W3, W4);
  436. _R( _e, _a, _b, _c, _d, F4, 61,
  437. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
  438. W5, W6, W7, W0, W1, W2, W3, W4);
  439. _R( _d, _e, _a, _b, _c, F4, 62,
  440. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
  441. W5, W6, W7, W0, W1, W2, W3, W4);
  442. _R( _c, _d, _e, _a, _b, F4, 63,
  443. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
  444. W5, W6, W7, W0, W1, W2, W3, W4);
  445. beq .Lend;
  446. /* Transform 64-79 + Precalc 0-15 of next block. */
  447. #undef curK
  448. #define curK qK1
  449. _R( _b, _c, _d, _e, _a, F4, 64,
  450. WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  451. _R( _a, _b, _c, _d, _e, F4, 65,
  452. WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  453. _R( _e, _a, _b, _c, _d, F4, 66,
  454. WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  455. _R( _d, _e, _a, _b, _c, F4, 67,
  456. WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  457. _R( _c, _d, _e, _a, _b, F4, 68,
  458. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  459. _R( _b, _c, _d, _e, _a, F4, 69,
  460. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  461. _R( _a, _b, _c, _d, _e, F4, 70,
  462. WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  463. _R( _e, _a, _b, _c, _d, F4, 71,
  464. WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  465. _R( _d, _e, _a, _b, _c, F4, 72,
  466. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  467. _R( _c, _d, _e, _a, _b, F4, 73,
  468. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  469. _R( _b, _c, _d, _e, _a, F4, 74,
  470. WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  471. _R( _a, _b, _c, _d, _e, F4, 75,
  472. WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  473. _R( _e, _a, _b, _c, _d, F4, 76,
  474. WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  475. _R( _d, _e, _a, _b, _c, F4, 77,
  476. WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  477. _R( _c, _d, _e, _a, _b, F4, 78,
  478. WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  479. _R( _b, _c, _d, _e, _a, F4, 79,
  480. WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
  481. /* Update the chaining variables. */
  482. ldm RSTATE, {RT0-RT3};
  483. add _a, RT0;
  484. ldr RT0, [RSTATE, #state_h4];
  485. add _b, RT1;
  486. add _c, RT2;
  487. add _d, RT3;
  488. add _e, RT0;
  489. stm RSTATE, {_a-_e};
  490. b .Loop;
  491. .Lend:
  492. /* Transform 64-79 */
  493. R( _b, _c, _d, _e, _a, F4, 64 );
  494. R( _a, _b, _c, _d, _e, F4, 65 );
  495. R( _e, _a, _b, _c, _d, F4, 66 );
  496. R( _d, _e, _a, _b, _c, F4, 67 );
  497. R( _c, _d, _e, _a, _b, F4, 68 );
  498. R( _b, _c, _d, _e, _a, F4, 69 );
  499. R( _a, _b, _c, _d, _e, F4, 70 );
  500. R( _e, _a, _b, _c, _d, F4, 71 );
  501. R( _d, _e, _a, _b, _c, F4, 72 );
  502. R( _c, _d, _e, _a, _b, F4, 73 );
  503. R( _b, _c, _d, _e, _a, F4, 74 );
  504. R( _a, _b, _c, _d, _e, F4, 75 );
  505. R( _e, _a, _b, _c, _d, F4, 76 );
  506. R( _d, _e, _a, _b, _c, F4, 77 );
  507. R( _c, _d, _e, _a, _b, F4, 78 );
  508. R( _b, _c, _d, _e, _a, F4, 79 );
  509. mov sp, ROLDSTACK;
  510. /* Update the chaining variables. */
  511. ldm RSTATE, {RT0-RT3};
  512. add _a, RT0;
  513. ldr RT0, [RSTATE, #state_h4];
  514. add _b, RT1;
  515. add _c, RT2;
  516. add _d, RT3;
  517. /*vpop {q4-q7};*/
  518. add _e, RT0;
  519. stm RSTATE, {_a-_e};
  520. pop {r4-r12, pc};
  521. .Ldo_nothing:
  522. bx lr
  523. ENDPROC(sha1_transform_neon)