/* sha1-armv7-neon.S */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */
  10. #include <linux/linkage.h>
  11. #include <asm/assembler.h>
  12. .syntax unified
  13. .fpu neon
  14. .text
  15. /* Context structure */
  16. #define state_h0 0
  17. #define state_h1 4
  18. #define state_h2 8
  19. #define state_h3 12
  20. #define state_h4 16
  21. /* Constants */
  22. #define K1 0x5A827999
  23. #define K2 0x6ED9EBA1
  24. #define K3 0x8F1BBCDC
  25. #define K4 0xCA62C1D6
  26. .align 4
  27. .LK_VEC:
  28. .LK1: .long K1, K1, K1, K1
  29. .LK2: .long K2, K2, K2, K2
  30. .LK3: .long K3, K3, K3, K3
  31. .LK4: .long K4, K4, K4, K4
  32. /* Register macros */
  33. #define RSTATE r0
  34. #define RDATA r1
  35. #define RNBLKS r2
  36. #define ROLDSTACK r3
  37. #define RWK lr
  38. #define _a r4
  39. #define _b r5
  40. #define _c r6
  41. #define _d r7
  42. #define _e r8
  43. #define RT0 r9
  44. #define RT1 r10
  45. #define RT2 r11
  46. #define RT3 r12
  47. #define W0 q0
  48. #define W1 q7
  49. #define W2 q2
  50. #define W3 q3
  51. #define W4 q4
  52. #define W5 q6
  53. #define W6 q5
  54. #define W7 q1
  55. #define tmp0 q8
  56. #define tmp1 q9
  57. #define tmp2 q10
  58. #define tmp3 q11
  59. #define qK1 q12
  60. #define qK2 q13
  61. #define qK3 q14
  62. #define qK4 q15
  63. #ifdef CONFIG_CPU_BIG_ENDIAN
  64. #define ARM_LE(code...)
  65. #else
  66. #define ARM_LE(code...) code
  67. #endif
  68. /* Round function macros. */
  69. #define WK_offs(i) (((i) & 15) * 4)
  70. #define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  71. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  72. ldr RT3, [sp, WK_offs(i)]; \
  73. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  74. bic RT0, d, b; \
  75. add e, e, a, ror #(32 - 5); \
  76. and RT1, c, b; \
  77. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  78. add RT0, RT0, RT3; \
  79. add e, e, RT1; \
  80. ror b, #(32 - 30); \
  81. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  82. add e, e, RT0;
  83. #define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  84. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  85. ldr RT3, [sp, WK_offs(i)]; \
  86. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  87. eor RT0, d, b; \
  88. add e, e, a, ror #(32 - 5); \
  89. eor RT0, RT0, c; \
  90. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  91. add e, e, RT3; \
  92. ror b, #(32 - 30); \
  93. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  94. add e, e, RT0; \
  95. #define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  96. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  97. ldr RT3, [sp, WK_offs(i)]; \
  98. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  99. eor RT0, b, c; \
  100. and RT1, b, c; \
  101. add e, e, a, ror #(32 - 5); \
  102. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  103. and RT0, RT0, d; \
  104. add RT1, RT1, RT3; \
  105. add e, e, RT0; \
  106. ror b, #(32 - 30); \
  107. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  108. add e, e, RT1;
  109. #define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  110. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  111. _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  112. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  113. #define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
  114. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  115. _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  116. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  117. #define R(a,b,c,d,e,f,i) \
  118. _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
  119. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  120. #define dummy(...)
  121. /* Input expansion macros. */
  122. /********* Precalc macros for rounds 0-15 *************************************/
  123. #define W_PRECALC_00_15() \
  124. add RWK, sp, #(WK_offs(0)); \
  125. \
  126. vld1.32 {W0, W7}, [RDATA]!; \
  127. ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
  128. vld1.32 {W6, W5}, [RDATA]!; \
  129. vadd.u32 tmp0, W0, curK; \
  130. ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
  131. ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
  132. vadd.u32 tmp1, W7, curK; \
  133. ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
  134. vadd.u32 tmp2, W6, curK; \
  135. vst1.32 {tmp0, tmp1}, [RWK]!; \
  136. vadd.u32 tmp3, W5, curK; \
  137. vst1.32 {tmp2, tmp3}, [RWK]; \
  138. #define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  139. vld1.32 {W0, W7}, [RDATA]!; \
  140. #define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  141. add RWK, sp, #(WK_offs(0)); \
  142. #define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  143. ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
  144. #define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  145. vld1.32 {W6, W5}, [RDATA]!; \
  146. #define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  147. vadd.u32 tmp0, W0, curK; \
  148. #define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  149. ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
  150. #define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  151. ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
  152. #define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  153. vadd.u32 tmp1, W7, curK; \
  154. #define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  155. ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
  156. #define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  157. vadd.u32 tmp2, W6, curK; \
  158. #define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  159. vst1.32 {tmp0, tmp1}, [RWK]!; \
  160. #define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  161. vadd.u32 tmp3, W5, curK; \
  162. #define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  163. vst1.32 {tmp2, tmp3}, [RWK]; \
  164. /********* Precalc macros for rounds 16-31 ************************************/
  165. #define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  166. veor tmp0, tmp0; \
  167. vext.8 W, W_m16, W_m12, #8; \
  168. #define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  169. add RWK, sp, #(WK_offs(i)); \
  170. vext.8 tmp0, W_m04, tmp0, #4; \
  171. #define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  172. veor tmp0, tmp0, W_m16; \
  173. veor.32 W, W, W_m08; \
  174. #define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  175. veor tmp1, tmp1; \
  176. veor W, W, tmp0; \
  177. #define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  178. vshl.u32 tmp0, W, #1; \
  179. #define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  180. vext.8 tmp1, tmp1, W, #(16-12); \
  181. vshr.u32 W, W, #31; \
  182. #define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  183. vorr tmp0, tmp0, W; \
  184. vshr.u32 W, tmp1, #30; \
  185. #define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  186. vshl.u32 tmp1, tmp1, #2; \
  187. #define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  188. veor tmp0, tmp0, W; \
  189. #define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  190. veor W, tmp0, tmp1; \
  191. #define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  192. vadd.u32 tmp0, W, curK; \
  193. #define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  194. vst1.32 {tmp0}, [RWK];
  195. /********* Precalc macros for rounds 32-79 ************************************/
  196. #define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  197. veor W, W_m28; \
  198. #define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  199. vext.8 tmp0, W_m08, W_m04, #8; \
  200. #define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  201. veor W, W_m16; \
  202. #define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  203. veor W, tmp0; \
  204. #define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  205. add RWK, sp, #(WK_offs(i&~3)); \
  206. #define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  207. vshl.u32 tmp1, W, #2; \
  208. #define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  209. vshr.u32 tmp0, W, #30; \
  210. #define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  211. vorr W, tmp0, tmp1; \
  212. #define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  213. vadd.u32 tmp0, W, curK; \
  214. #define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  215. vst1.32 {tmp0}, [RWK];
  216. /*
  217. * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
  218. *
  219. * unsigned int
  220. * sha1_transform_neon (void *ctx, const unsigned char *data,
  221. * unsigned int nblks)
  222. */
  223. .align 3
  224. ENTRY(sha1_transform_neon)
  225. /* input:
  226. * r0: ctx, CTX
  227. * r1: data (64*nblks bytes)
  228. * r2: nblks
  229. */
  230. cmp RNBLKS, #0;
  231. beq .Ldo_nothing;
  232. push {r4-r12, lr};
  233. /*vpush {q4-q7};*/
  234. adr RT3, .LK_VEC;
  235. mov ROLDSTACK, sp;
  236. /* Align stack. */
  237. sub RT0, sp, #(16*4);
  238. and RT0, #(~(16-1));
  239. mov sp, RT0;
  240. vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
  241. /* Get the values of the chaining variables. */
  242. ldm RSTATE, {_a-_e};
  243. vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
  244. #undef curK
  245. #define curK qK1
  246. /* Precalc 0-15. */
  247. W_PRECALC_00_15();
  248. .Loop:
  249. /* Transform 0-15 + Precalc 16-31. */
  250. _R( _a, _b, _c, _d, _e, F1, 0,
  251. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
  252. W4, W5, W6, W7, W0, _, _, _ );
  253. _R( _e, _a, _b, _c, _d, F1, 1,
  254. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
  255. W4, W5, W6, W7, W0, _, _, _ );
  256. _R( _d, _e, _a, _b, _c, F1, 2,
  257. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
  258. W4, W5, W6, W7, W0, _, _, _ );
  259. _R( _c, _d, _e, _a, _b, F1, 3,
  260. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
  261. W4, W5, W6, W7, W0, _, _, _ );
  262. #undef curK
  263. #define curK qK2
  264. _R( _b, _c, _d, _e, _a, F1, 4,
  265. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
  266. W3, W4, W5, W6, W7, _, _, _ );
  267. _R( _a, _b, _c, _d, _e, F1, 5,
  268. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
  269. W3, W4, W5, W6, W7, _, _, _ );
  270. _R( _e, _a, _b, _c, _d, F1, 6,
  271. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
  272. W3, W4, W5, W6, W7, _, _, _ );
  273. _R( _d, _e, _a, _b, _c, F1, 7,
  274. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
  275. W3, W4, W5, W6, W7, _, _, _ );
  276. _R( _c, _d, _e, _a, _b, F1, 8,
  277. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
  278. W2, W3, W4, W5, W6, _, _, _ );
  279. _R( _b, _c, _d, _e, _a, F1, 9,
  280. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
  281. W2, W3, W4, W5, W6, _, _, _ );
  282. _R( _a, _b, _c, _d, _e, F1, 10,
  283. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
  284. W2, W3, W4, W5, W6, _, _, _ );
  285. _R( _e, _a, _b, _c, _d, F1, 11,
  286. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
  287. W2, W3, W4, W5, W6, _, _, _ );
  288. _R( _d, _e, _a, _b, _c, F1, 12,
  289. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
  290. W1, W2, W3, W4, W5, _, _, _ );
  291. _R( _c, _d, _e, _a, _b, F1, 13,
  292. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
  293. W1, W2, W3, W4, W5, _, _, _ );
  294. _R( _b, _c, _d, _e, _a, F1, 14,
  295. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
  296. W1, W2, W3, W4, W5, _, _, _ );
  297. _R( _a, _b, _c, _d, _e, F1, 15,
  298. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
  299. W1, W2, W3, W4, W5, _, _, _ );
  300. /* Transform 16-63 + Precalc 32-79. */
  301. _R( _e, _a, _b, _c, _d, F1, 16,
  302. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
  303. W0, W1, W2, W3, W4, W5, W6, W7);
  304. _R( _d, _e, _a, _b, _c, F1, 17,
  305. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
  306. W0, W1, W2, W3, W4, W5, W6, W7);
  307. _R( _c, _d, _e, _a, _b, F1, 18,
  308. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
  309. W0, W1, W2, W3, W4, W5, W6, W7);
  310. _R( _b, _c, _d, _e, _a, F1, 19,
  311. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
  312. W0, W1, W2, W3, W4, W5, W6, W7);
  313. _R( _a, _b, _c, _d, _e, F2, 20,
  314. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
  315. W7, W0, W1, W2, W3, W4, W5, W6);
  316. _R( _e, _a, _b, _c, _d, F2, 21,
  317. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
  318. W7, W0, W1, W2, W3, W4, W5, W6);
  319. _R( _d, _e, _a, _b, _c, F2, 22,
  320. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
  321. W7, W0, W1, W2, W3, W4, W5, W6);
  322. _R( _c, _d, _e, _a, _b, F2, 23,
  323. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
  324. W7, W0, W1, W2, W3, W4, W5, W6);
  325. #undef curK
  326. #define curK qK3
  327. _R( _b, _c, _d, _e, _a, F2, 24,
  328. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
  329. W6, W7, W0, W1, W2, W3, W4, W5);
  330. _R( _a, _b, _c, _d, _e, F2, 25,
  331. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
  332. W6, W7, W0, W1, W2, W3, W4, W5);
  333. _R( _e, _a, _b, _c, _d, F2, 26,
  334. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
  335. W6, W7, W0, W1, W2, W3, W4, W5);
  336. _R( _d, _e, _a, _b, _c, F2, 27,
  337. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
  338. W6, W7, W0, W1, W2, W3, W4, W5);
  339. _R( _c, _d, _e, _a, _b, F2, 28,
  340. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
  341. W5, W6, W7, W0, W1, W2, W3, W4);
  342. _R( _b, _c, _d, _e, _a, F2, 29,
  343. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
  344. W5, W6, W7, W0, W1, W2, W3, W4);
  345. _R( _a, _b, _c, _d, _e, F2, 30,
  346. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
  347. W5, W6, W7, W0, W1, W2, W3, W4);
  348. _R( _e, _a, _b, _c, _d, F2, 31,
  349. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
  350. W5, W6, W7, W0, W1, W2, W3, W4);
  351. _R( _d, _e, _a, _b, _c, F2, 32,
  352. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
  353. W4, W5, W6, W7, W0, W1, W2, W3);
  354. _R( _c, _d, _e, _a, _b, F2, 33,
  355. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
  356. W4, W5, W6, W7, W0, W1, W2, W3);
  357. _R( _b, _c, _d, _e, _a, F2, 34,
  358. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
  359. W4, W5, W6, W7, W0, W1, W2, W3);
  360. _R( _a, _b, _c, _d, _e, F2, 35,
  361. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
  362. W4, W5, W6, W7, W0, W1, W2, W3);
  363. _R( _e, _a, _b, _c, _d, F2, 36,
  364. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
  365. W3, W4, W5, W6, W7, W0, W1, W2);
  366. _R( _d, _e, _a, _b, _c, F2, 37,
  367. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
  368. W3, W4, W5, W6, W7, W0, W1, W2);
  369. _R( _c, _d, _e, _a, _b, F2, 38,
  370. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
  371. W3, W4, W5, W6, W7, W0, W1, W2);
  372. _R( _b, _c, _d, _e, _a, F2, 39,
  373. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
  374. W3, W4, W5, W6, W7, W0, W1, W2);
  375. _R( _a, _b, _c, _d, _e, F3, 40,
  376. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
  377. W2, W3, W4, W5, W6, W7, W0, W1);
  378. _R( _e, _a, _b, _c, _d, F3, 41,
  379. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
  380. W2, W3, W4, W5, W6, W7, W0, W1);
  381. _R( _d, _e, _a, _b, _c, F3, 42,
  382. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
  383. W2, W3, W4, W5, W6, W7, W0, W1);
  384. _R( _c, _d, _e, _a, _b, F3, 43,
  385. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
  386. W2, W3, W4, W5, W6, W7, W0, W1);
  387. #undef curK
  388. #define curK qK4
  389. _R( _b, _c, _d, _e, _a, F3, 44,
  390. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
  391. W1, W2, W3, W4, W5, W6, W7, W0);
  392. _R( _a, _b, _c, _d, _e, F3, 45,
  393. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
  394. W1, W2, W3, W4, W5, W6, W7, W0);
  395. _R( _e, _a, _b, _c, _d, F3, 46,
  396. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
  397. W1, W2, W3, W4, W5, W6, W7, W0);
  398. _R( _d, _e, _a, _b, _c, F3, 47,
  399. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
  400. W1, W2, W3, W4, W5, W6, W7, W0);
  401. _R( _c, _d, _e, _a, _b, F3, 48,
  402. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
  403. W0, W1, W2, W3, W4, W5, W6, W7);
  404. _R( _b, _c, _d, _e, _a, F3, 49,
  405. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
  406. W0, W1, W2, W3, W4, W5, W6, W7);
  407. _R( _a, _b, _c, _d, _e, F3, 50,
  408. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
  409. W0, W1, W2, W3, W4, W5, W6, W7);
  410. _R( _e, _a, _b, _c, _d, F3, 51,
  411. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
  412. W0, W1, W2, W3, W4, W5, W6, W7);
  413. _R( _d, _e, _a, _b, _c, F3, 52,
  414. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
  415. W7, W0, W1, W2, W3, W4, W5, W6);
  416. _R( _c, _d, _e, _a, _b, F3, 53,
  417. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
  418. W7, W0, W1, W2, W3, W4, W5, W6);
  419. _R( _b, _c, _d, _e, _a, F3, 54,
  420. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
  421. W7, W0, W1, W2, W3, W4, W5, W6);
  422. _R( _a, _b, _c, _d, _e, F3, 55,
  423. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
  424. W7, W0, W1, W2, W3, W4, W5, W6);
  425. _R( _e, _a, _b, _c, _d, F3, 56,
  426. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
  427. W6, W7, W0, W1, W2, W3, W4, W5);
  428. _R( _d, _e, _a, _b, _c, F3, 57,
  429. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
  430. W6, W7, W0, W1, W2, W3, W4, W5);
  431. _R( _c, _d, _e, _a, _b, F3, 58,
  432. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
  433. W6, W7, W0, W1, W2, W3, W4, W5);
  434. _R( _b, _c, _d, _e, _a, F3, 59,
  435. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
  436. W6, W7, W0, W1, W2, W3, W4, W5);
  437. subs RNBLKS, #1;
  438. _R( _a, _b, _c, _d, _e, F4, 60,
  439. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
  440. W5, W6, W7, W0, W1, W2, W3, W4);
  441. _R( _e, _a, _b, _c, _d, F4, 61,
  442. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
  443. W5, W6, W7, W0, W1, W2, W3, W4);
  444. _R( _d, _e, _a, _b, _c, F4, 62,
  445. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
  446. W5, W6, W7, W0, W1, W2, W3, W4);
  447. _R( _c, _d, _e, _a, _b, F4, 63,
  448. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
  449. W5, W6, W7, W0, W1, W2, W3, W4);
  450. beq .Lend;
  451. /* Transform 64-79 + Precalc 0-15 of next block. */
  452. #undef curK
  453. #define curK qK1
  454. _R( _b, _c, _d, _e, _a, F4, 64,
  455. WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  456. _R( _a, _b, _c, _d, _e, F4, 65,
  457. WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  458. _R( _e, _a, _b, _c, _d, F4, 66,
  459. WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  460. _R( _d, _e, _a, _b, _c, F4, 67,
  461. WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  462. _R( _c, _d, _e, _a, _b, F4, 68,
  463. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  464. _R( _b, _c, _d, _e, _a, F4, 69,
  465. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  466. _R( _a, _b, _c, _d, _e, F4, 70,
  467. WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  468. _R( _e, _a, _b, _c, _d, F4, 71,
  469. WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  470. _R( _d, _e, _a, _b, _c, F4, 72,
  471. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  472. _R( _c, _d, _e, _a, _b, F4, 73,
  473. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  474. _R( _b, _c, _d, _e, _a, F4, 74,
  475. WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  476. _R( _a, _b, _c, _d, _e, F4, 75,
  477. WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  478. _R( _e, _a, _b, _c, _d, F4, 76,
  479. WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  480. _R( _d, _e, _a, _b, _c, F4, 77,
  481. WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  482. _R( _c, _d, _e, _a, _b, F4, 78,
  483. WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  484. _R( _b, _c, _d, _e, _a, F4, 79,
  485. WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
  486. /* Update the chaining variables. */
  487. ldm RSTATE, {RT0-RT3};
  488. add _a, RT0;
  489. ldr RT0, [RSTATE, #state_h4];
  490. add _b, RT1;
  491. add _c, RT2;
  492. add _d, RT3;
  493. add _e, RT0;
  494. stm RSTATE, {_a-_e};
  495. b .Loop;
  496. .Lend:
  497. /* Transform 64-79 */
  498. R( _b, _c, _d, _e, _a, F4, 64 );
  499. R( _a, _b, _c, _d, _e, F4, 65 );
  500. R( _e, _a, _b, _c, _d, F4, 66 );
  501. R( _d, _e, _a, _b, _c, F4, 67 );
  502. R( _c, _d, _e, _a, _b, F4, 68 );
  503. R( _b, _c, _d, _e, _a, F4, 69 );
  504. R( _a, _b, _c, _d, _e, F4, 70 );
  505. R( _e, _a, _b, _c, _d, F4, 71 );
  506. R( _d, _e, _a, _b, _c, F4, 72 );
  507. R( _c, _d, _e, _a, _b, F4, 73 );
  508. R( _b, _c, _d, _e, _a, F4, 74 );
  509. R( _a, _b, _c, _d, _e, F4, 75 );
  510. R( _e, _a, _b, _c, _d, F4, 76 );
  511. R( _d, _e, _a, _b, _c, F4, 77 );
  512. R( _c, _d, _e, _a, _b, F4, 78 );
  513. R( _b, _c, _d, _e, _a, F4, 79 );
  514. mov sp, ROLDSTACK;
  515. /* Update the chaining variables. */
  516. ldm RSTATE, {RT0-RT3};
  517. add _a, RT0;
  518. ldr RT0, [RSTATE, #state_h4];
  519. add _b, RT1;
  520. add _c, RT2;
  521. add _d, RT3;
  522. /*vpop {q4-q7};*/
  523. add _e, RT0;
  524. stm RSTATE, {_a-_e};
  525. pop {r4-r12, pc};
  526. .Ldo_nothing:
  527. bx lr
  528. ENDPROC(sha1_transform_neon)