/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#define CTX         %rdi    // arg1
#define BUF         %rsi    // arg2
#define CNT         %rdx    // arg3

#define REG_A       %ecx
#define REG_B       %esi
#define REG_C       %edi
#define REG_D       %ebp
#define REG_E       %edx

#define REG_T1      %eax
#define REG_T2      %ebx

#define K_BASE      %r8
#define HASH_PTR    %r9
#define BUFFER_PTR  %r10
#define BUFFER_END  %r11

#define W_TMP1      %xmm0
#define W_TMP2      %xmm9

#define W0          %xmm1
#define W4          %xmm2
#define W8          %xmm3
#define W12         %xmm4
#define W16         %xmm5
#define W20         %xmm6
#define W24         %xmm7
#define W28         %xmm8

#define XMM_SHUFB_BSWAP %xmm10
/* we keep a window of 64 bytes (16 dwords) of pre-calculated w[i]+K values in
 * a circular buffer on the stack */
#define WK(t)   (((t) & 15) * 4)(%rsp)

#define W_PRECALC_AHEAD 16  /* pre-compute this many rounds ahead of use */
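
/*
 * For reference, the stack workspace behaves like a 16-entry ring of 32-bit
 * w[i]+K values: the producer runs W_PRECALC_AHEAD rounds ahead of the scalar
 * rounds, so WK(t) and WK(t + 16) alias the same slot.  A rough C sketch
 * (w() and K() are stand-ins for the message schedule and round constant):
 *
 *      u32 wk[16];                             // the 64-byte workspace on %rsp
 *      // producer, 16 rounds ahead of the consumer:
 *      wk[(t + 16) & 15] = w(t + 16) + K(t + 16);
 *      // consumer, inside round t:
 *      e += wk[t & 15];
 */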
/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
        .global \name
        .type   \name, @function
        .align  32
\name:
        push    %rbx
        push    %rbp
        push    %r12

        mov     %rsp, %r12
        sub     $64, %rsp       # allocate workspace
        and     $~15, %rsp      # align stack

        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        shl     $6, CNT         # multiply by 64
        add     BUF, CNT
        mov     CNT, BUFFER_END

        lea     K_XMM_AR(%rip), K_BASE
        xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        # cleanup workspace
        mov     $8, %ecx
        mov     %rsp, %rdi
        xor     %rax, %rax
        rep stosq

        mov     %r12, %rsp      # deallocate workspace

        pop     %r12
        pop     %rbp
        pop     %rbx
        ret

        .size   \name, .-\name
.endm
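
/*
 * For orientation only, a C-level sketch of what one expansion of the macro
 * above does (hypothetical names, not the kernel's glue interface): the third
 * argument is treated as a count of 64-byte blocks and the pipelined body is
 * run over each block in turn.
 *
 *      void sha1_blocks(u32 *digest, const char *data, u32 blocks)
 *      {
 *              const char *end = data + blocks * 64;
 *
 *              while (data < end) {
 *                      sha1_one_block(digest, data);   // SHA1_PIPELINED_MAIN_BODY
 *                      data += 64;
 *              }
 *      }
 *
 * The assembly folds the loop into SHA1_PIPELINED_MAIN_BODY itself and, once
 * past the last block, redirects BUFFER_PTR at K_XMM_AR so the look-ahead
 * pre-calculation reads harmless constant data instead of running off the end.
 */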
/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
        INIT_REGALLOC

        mov       (HASH_PTR), A
        mov      4(HASH_PTR), B
        mov      8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

        .set i, 0
        .rept W_PRECALC_AHEAD
                W_PRECALC i
                .set i, (i+1)
        .endr

        .align 4
1:
        RR F1,A,B,C,D,E,0
        RR F1,D,E,A,B,C,2
        RR F1,B,C,D,E,A,4
        RR F1,E,A,B,C,D,6
        RR F1,C,D,E,A,B,8

        RR F1,A,B,C,D,E,10
        RR F1,D,E,A,B,C,12
        RR F1,B,C,D,E,A,14
        RR F1,E,A,B,C,D,16
        RR F1,C,D,E,A,B,18

        RR F2,A,B,C,D,E,20
        RR F2,D,E,A,B,C,22
        RR F2,B,C,D,E,A,24
        RR F2,E,A,B,C,D,26
        RR F2,C,D,E,A,B,28

        RR F2,A,B,C,D,E,30
        RR F2,D,E,A,B,C,32
        RR F2,B,C,D,E,A,34
        RR F2,E,A,B,C,D,36
        RR F2,C,D,E,A,B,38

        RR F3,A,B,C,D,E,40
        RR F3,D,E,A,B,C,42
        RR F3,B,C,D,E,A,44
        RR F3,E,A,B,C,D,46
        RR F3,C,D,E,A,B,48

        RR F3,A,B,C,D,E,50
        RR F3,D,E,A,B,C,52
        RR F3,B,C,D,E,A,54
        RR F3,E,A,B,C,D,56
        RR F3,C,D,E,A,B,58

        add     $64, BUFFER_PTR         # move to the next 64-byte block
        cmp     BUFFER_END, BUFFER_PTR  # if the current is the last one use
        cmovae  K_BASE, BUFFER_PTR      # dummy source to avoid buffer overrun

        RR F4,A,B,C,D,E,60
        RR F4,D,E,A,B,C,62
        RR F4,B,C,D,E,A,64
        RR F4,E,A,B,C,D,66
        RR F4,C,D,E,A,B,68

        RR F4,A,B,C,D,E,70
        RR F4,D,E,A,B,C,72
        RR F4,B,C,D,E,A,74
        RR F4,E,A,B,C,D,76
        RR F4,C,D,E,A,B,78

        UPDATE_HASH   (HASH_PTR), A
        UPDATE_HASH  4(HASH_PTR), B
        UPDATE_HASH  8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        RESTORE_RENAMED_REGS
        cmp     K_BASE, BUFFER_PTR      # K_BASE means we reached the end
        jne     1b
.endm
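
/*
 * Round structure for reference: a plain C sketch of one block (not the code
 * generated above, which additionally interleaves the W[] pre-calculation and
 * renames registers).  rol32() is a 32-bit rotate left.
 *
 *      for (t = 0; t < 80; t++) {
 *              if (t < 20)      { f = (b & c) | (~b & d);          k = K1; }
 *              else if (t < 40) { f = b ^ c ^ d;                   k = K2; }
 *              else if (t < 60) { f = (b & c) | (b & d) | (c & d); k = K3; }
 *              else             { f = b ^ c ^ d;                   k = K4; }
 *              tmp = rol32(a, 5) + f + e + k + w[t];
 *              e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
 *      }
 *      digest[0] += a; digest[1] += b; digest[2] += c;
 *      digest[3] += d; digest[4] += e;
 *
 * F1..F4 below implement f for the four stages, UPDATE_HASH does the final
 * adds, and each RR invocation covers two of these rounds with w[t]+k taken
 * pre-summed from WK(t).
 */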
.macro INIT_REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set T1, REG_T1
        .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
        # order is important (REG_C is where it should be)
        mov     B, REG_B
        mov     D, REG_D
        mov     A, REG_A
        mov     E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
        .set _T, \a
        .set \a, \b
        .set \b, _T
.endm

.macro F1  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES  \c, T1
        xor     \d, T1
        and     \b, T1
        xor     \d, T1
.endm

.macro F2  b, c, d
        mov     \d, T1
        SWAP_REG_NAMES  \d, T1
        xor     \c, T1
        xor     \b, T1
.endm

.macro F3  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES  \c, T1
        mov     \b, T2
        or      \b, T1
        and     \c, T2
        and     \d, T1
        or      T2, T1
.endm

.macro F4  b, c, d
        F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
        add     \hash, \val
        mov     \val, \hash
.endm
/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
        add     WK(\round), \e
        \F      \b, \c, \d      # t1 = F(b, c, d);
        W_PRECALC (\round + W_PRECALC_AHEAD)
        rol     $30, \b
        add     T1, \e
        add     WK(\round + 1), \d

        \F      \a, \b, \c
        W_PRECALC (\round + W_PRECALC_AHEAD + 1)
        rol     $5, \a
        add     \a, \e
        add     T1, \d
        ror     $7, \a          # (a <<r 5) >>r 7 => a <<r 30

        mov     \e, T1
        SWAP_REG_NAMES  \e, T1
        rol     $5, T1
        add     T1, \d

        # write:  \a, \b
        # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
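
/*
 * In C terms, one RR invocation performs the following two rounds, with wk[]
 * holding the pre-computed w[i]+K values from WK() and rol32() a 32-bit
 * rotate left (the SWAP_REG_NAMES renaming means the names track values, not
 * fixed machine registers):
 *
 *      e += wk[i]     + F(b, c, d);    b = rol32(b, 30);
 *      d += wk[i + 1] + F(a, b, c);    // b already rotated by 30
 *      e += rol32(a, 5);
 *      a  = rol32(a, 30);              // done as rol 5 followed by ror 7
 *      d += rol32(e, 5);               // e is the freshly updated value
 */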
.macro W_PRECALC r
        .set i, \r

        .if (i < 20)
                .set K_XMM, 0
        .elseif (i < 40)
                .set K_XMM, 16
        .elseif (i < 60)
                .set K_XMM, 32
        .elseif (i < 80)
                .set K_XMM, 48
        .endif

        .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
                .set i, ((\r) % 80)     # pre-compute for the next iteration
                .if (i == 0)
                        W_PRECALC_RESET
                .endif
                W_PRECALC_00_15
        .elseif (i < 32)
                W_PRECALC_16_31
        .elseif (i < 80)                // rounds 32-79
                W_PRECALC_32_79
        .endif
.endm

.macro W_PRECALC_RESET
        .set W,          W0
        .set W_minus_04, W4
        .set W_minus_08, W8
        .set W_minus_12, W12
        .set W_minus_16, W16
        .set W_minus_20, W20
        .set W_minus_24, W24
        .set W_minus_28, W28
        .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
        .set W_minus_32, W_minus_28
        .set W_minus_28, W_minus_24
        .set W_minus_24, W_minus_20
        .set W_minus_20, W_minus_16
        .set W_minus_16, W_minus_12
        .set W_minus_12, W_minus_08
        .set W_minus_08, W_minus_04
        .set W_minus_04, W
        .set W, W_minus_32
.endm
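
/*
 * W and the W_minus_* symbols are assembler-time names over the eight XMM
 * registers W0..W28; W always aliases W_minus_32, and W_PRECALC_ROTATE only
 * shifts which physical register each name refers to.  Roughly, in C terms
 * (illustrative only):
 *
 *      __m128i ring[8];        // W0 .. W28
 *      int head;               // ring[head] is both W and W_minus_32
 *      // W_minus_(4*k) is ring[(head + k) & 7]; W_PRECALC_ROTATE is just:
 *      head = (head + 7) & 7;
 *
 * so aging the window never copies data between registers: the group holding
 * w[i-32..i-29] is simply overwritten in place with the newest four values.
 */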
.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
        W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
        W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
        W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
        .if ((i & 3) == 0)
                movdqu  (i*4)(BUFFER_PTR), W_TMP1
        .elseif ((i & 3) == 1)
                pshufb  XMM_SHUFB_BSWAP, W_TMP1
                movdqa  W_TMP1, W
        .elseif ((i & 3) == 2)
                paddd   (K_BASE), W_TMP1
        .elseif ((i & 3) == 3)
                movdqa  W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm
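
/*
 * Per group of four rounds the sequence above amounts to this C sketch, where
 * be32_to_cpu() stands for the byte swap done by pshufb with XMM_SHUFB_BSWAP
 * and K1 is the constant loaded from (K_BASE):
 *
 *      for (j = 0; j < 4; j++) {
 *              w[i + j] = be32_to_cpu(((const u32 *)block)[i + j]);
 *              wk[(i + j) & 15] = w[i + j] + K1;
 *      }
 *
 * spread over four macro invocations so the vector work interleaves with the
 * scalar rounds; the swapped words are also kept in W for the later schedule.
 */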
/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
 * dependency, but improves for 32-79
 */
.macro W_PRECALC_16_31_SSSE3
        # blended scheduling of vector and scalar instruction streams,
        # one 4-wide vector iteration / 4 scalar rounds
        .if ((i & 3) == 0)
                movdqa  W_minus_12, W
                palignr $8, W_minus_16, W       # w[i-14]
                movdqa  W_minus_04, W_TMP1
                psrldq  $4, W_TMP1              # w[i-3]
                pxor    W_minus_08, W
        .elseif ((i & 3) == 1)
                pxor    W_minus_16, W_TMP1
                pxor    W_TMP1, W
                movdqa  W, W_TMP2
                movdqa  W, W_TMP1
                pslldq  $12, W_TMP2
        .elseif ((i & 3) == 2)
                psrld   $31, W
                pslld   $1, W_TMP1
                por     W, W_TMP1
                movdqa  W_TMP2, W
                psrld   $30, W_TMP2
                pslld   $2, W
        .elseif ((i & 3) == 3)
                pxor    W, W_TMP1
                pxor    W_TMP2, W_TMP1
                movdqa  W_TMP1, W
                paddd   K_XMM(K_BASE), W_TMP1
                movdqa  W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm
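
/*
 * Scalar reference for this schedule step (rounds 16..31):
 *
 *      w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *      wk[i & 15] = w[i] + K;
 *
 * Doing four of these at once is awkward because lane 3 of the vector needs
 * w[i], which is being produced in lane 0 of the same vector; the
 * pslldq/psrld/pslld sequence above patches that lane up afterwards.
 */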
/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification:       w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
 */
.macro W_PRECALC_32_79_SSSE3
        .if ((i & 3) == 0)
                movdqa  W_minus_04, W_TMP1
                pxor    W_minus_28, W           # W is W_minus_32 before xor
                palignr $8, W_minus_08, W_TMP1
        .elseif ((i & 3) == 1)
                pxor    W_minus_16, W
                pxor    W_TMP1, W
                movdqa  W, W_TMP1
        .elseif ((i & 3) == 2)
                psrld   $30, W
                pslld   $2, W_TMP1
                por     W, W_TMP1
        .elseif ((i & 3) == 3)
                movdqa  W_TMP1, W
                paddd   K_XMM(K_BASE), W_TMP1
                movdqa  W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm
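
/*
 * Why the alternate recurrence is valid: substituting the standard recurrence
 * into each of w[i-3], w[i-8], w[i-14] and w[i-16] makes every term except
 * w[i-6], w[i-16], w[i-28] and w[i-32] appear twice and cancel under xor,
 * leaving one extra rol by 1 (hence rol 2 overall); this needs i >= 32.
 * A C self-check of the identity:
 *
 *      for (i = 16; i < 80; i++)
 *              w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *      for (i = 32; i < 80; i++)
 *              assert(w[i] == rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2));
 */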
.endm           // W_PRECALC_SSSE3


#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
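
/*
 * BSWAP_SHUFB_CTL is the pshufb control mask that reverses the bytes within
 * each 32-bit lane, converting the big-endian message words to host order;
 * per lane it is the equivalent of the C expression
 *
 *      w = __builtin_bswap32(w);
 */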
.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
        movdqu  \a,\b
.endm

/* SSSE3 optimized implementation:
 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
 *                                      unsigned int rounds);
 */
SHA1_VECTOR_ASM sha1_transform_ssse3
#ifdef SHA1_ENABLE_AVX_SUPPORT

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
        W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
        W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
        W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
        .if ((i & 3) == 0)
                vmovdqu (i*4)(BUFFER_PTR), W_TMP1
        .elseif ((i & 3) == 1)
                vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
        .elseif ((i & 3) == 2)
                vpaddd  (K_BASE), W, W_TMP1
        .elseif ((i & 3) == 3)
                vmovdqa W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm

.macro W_PRECALC_16_31_AVX
        .if ((i & 3) == 0)
                vpalignr $8, W_minus_16, W_minus_12, W  # w[i-14]
                vpsrldq $4, W_minus_04, W_TMP1          # w[i-3]
                vpxor   W_minus_08, W, W
                vpxor   W_minus_16, W_TMP1, W_TMP1
        .elseif ((i & 3) == 1)
                vpxor   W_TMP1, W, W
                vpslldq $12, W, W_TMP2
                vpslld  $1, W, W_TMP1
        .elseif ((i & 3) == 2)
                vpsrld  $31, W, W
                vpor    W, W_TMP1, W_TMP1
                vpslld  $2, W_TMP2, W
                vpsrld  $30, W_TMP2, W_TMP2
        .elseif ((i & 3) == 3)
                vpxor   W, W_TMP1, W_TMP1
                vpxor   W_TMP2, W_TMP1, W
                vpaddd  K_XMM(K_BASE), W, W_TMP1
                vmovdqu W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm

.macro W_PRECALC_32_79_AVX
        .if ((i & 3) == 0)
                vpalignr $8, W_minus_08, W_minus_04, W_TMP1
                vpxor   W_minus_28, W, W        # W is W_minus_32 before xor
        .elseif ((i & 3) == 1)
                vpxor   W_minus_16, W_TMP1, W_TMP1
                vpxor   W_TMP1, W, W
        .elseif ((i & 3) == 2)
                vpslld  $2, W, W_TMP1
                vpsrld  $30, W, W
                vpor    W, W_TMP1, W
        .elseif ((i & 3) == 3)
                vpaddd  K_XMM(K_BASE), W, W_TMP1
                vmovdqu W_TMP1, WK(i&~3)
                W_PRECALC_ROTATE
        .endif
.endm

.endm           // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
        vmovdqu \a,\b
.endm


/* AVX optimized implementation:
 * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
 *                                    unsigned int rounds);
 */
SHA1_VECTOR_ASM sha1_transform_avx

#endif
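
/*
 * Both entry points share the prototype shown above; callers are expected to
 * check CPU features before using either one.  A hedged, userspace-style
 * sketch of such a dispatch (illustrative only, not the kernel's glue code):
 *
 *      void (*xform)(u32 *digest, const char *data, u32 *ws, unsigned int rounds);
 *
 *      if (__builtin_cpu_supports("avx"))      // and built with AVX support
 *              xform = sha1_transform_avx;
 *      else if (__builtin_cpu_supports("ssse3"))
 *              xform = sha1_transform_ssse3;
 *      else
 *              xform = NULL;                   // fall back to a generic C SHA-1
 */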