/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		crypto-neon-fp-armv8	@ enable v8 Crypto Extensions insns
	.align		3

	@ One full AES encryption round on \state:
	@ AddRoundKey + SubBytes + ShiftRows (aese) followed by MixColumns.
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	@ One full (equivalent inverse cipher) AES decryption round on \state:
	@ AddRoundKey + InvSubBytes + InvShiftRows (aesd), then InvMixColumns.
	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

	@ Two consecutive encryption rounds on q0, using \key1 then \key2.
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	@ Two consecutive decryption rounds on q0, using \key1 then \key2.
	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	@ Final two encryption rounds on q0: a normal round with \key1, then
	@ the last round -- which omits MixColumns -- with \key2, finished by
	@ AddRoundKey with the final round key \key3.
	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	@ Final two decryption rounds on q0 (last round omits InvMixColumns).
	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm
	@ 3-way versions of the macros above, operating on q0 - q2.  The three
	@ blocks are interleaved per round to hide the latency of the AES
	@ instructions.

	@ Two encryption rounds on each of q0, q1 and q2.
	.macro		enc_dround_3x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	.endm

	@ Two decryption rounds on each of q0, q1 and q2.
	.macro		dec_dround_3x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	.endm

	@ Final two encryption rounds on each of q0, q1 and q2 (the last round
	@ omits MixColumns and ends with AddRoundKey using \key3).
	.macro		enc_fround_3x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	@ Final two decryption rounds on each of q0, q1 and q2.
	.macro		dec_fround_3x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm
	@ Perform a complete AES en/decryption of q0 (and q1/q2 for the 3x
	@ variants).  On entry q8/q9 hold the first two round keys, q14 the
	@ final one, ip points to the 3rd round key and r3 holds the number of
	@ rounds (10, 12 or 14).  The remaining round keys are loaded two at a
	@ time into q10-q13, interleaved with the rounds to hide load latency.
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.8		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.8		{q12-q13}, [ip]
	\dround		q10, q11		@ AES-256: fall through, 14 rounds
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm
	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2 and ip
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_3x version only)
	 *   q2        : third in/output block (_3x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:				@ alternate entry with ip (and
						@ q8/q9/q14) preset -- used by
						@ ce_aes_xts_init's tail call
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
ENDPROC(aes_encrypt_3x)

	.align		6
aes_decrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
ENDPROC(aes_decrypt_3x)
	@ Load the first two round keys into q8/q9 and the final round key into
	@ q14 from the key schedule at \rk for \rounds rounds (the final key
	@ lives at byte offset \rounds * 16).  Clobbers ip.
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
	.endm
	/*
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg, on stack)
	prepare_key	r2, r3
.Lecbencloop3x:					@ main loop: 3 blocks / iteration
	subs		r4, r4, #3
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbencloop3x
.Lecbenc1x:					@ handle up to 2 remaining blocks
	adds		r4, r4, #3
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)
ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg, on stack)
	prepare_key	r2, r3
.Lecbdecloop3x:					@ main loop: 3 blocks / iteration
	subs		r4, r4, #3
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbdecloop3x
.Lecbdec1x:					@ handle up to 2 remaining blocks
	adds		r4, r4, #3
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)
	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q0}, [r5]		@ q0 = iv / previous ct block
	prepare_key	r2, r3
.Lcbcencloop:					@ CBC encryption is serial by
						@ nature: one block per iteration
	vld1.8		{q1}, [r1, :64]!	@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ store final ct block as next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3
.Lcbcdecloop3x:					@ main loop: 3 blocks / iteration
	subs		r4, r4, #3
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	vmov		q3, q0			@ preserve the ct blocks: each one
	vmov		q4, q1			@ is the "iv" of its successor
	vmov		q5, q2
	bl		aes_decrypt_3x
	veor		q0, q0, q6		@ xor with iv / preceding ct block
	veor		q1, q1, q3
	veor		q2, q2, q4
	vmov		q6, q5			@ last ct block is the next iv
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lcbcdecloop3x
.Lcbcdec1x:					@ handle up to 2 remaining blocks
	adds		r4, r4, #3
	beq		.Lcbcdecout
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1, :64]!	@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key:
						@ AddRoundKey is the final step of
						@ aes_decrypt, so this folds the
						@ CBC xor into the last round
	vmov		q6, q0			@ this ct block is the next iv
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ store the iv for the next call
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
	/*
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		      int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6			@ (s27 = low word of the BE ctr)
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop		@ yes: use the carry-aware 1x path
.Lctrloop3x:					@ fast path: 3 blocks / iteration,
						@ no carry out of the low word
	subs		r4, r4, #3
	bmi		.Lctr1x
	add		r6, r6, #1		@ q0/q1/q2 = ctr, ctr+1, ctr+2
	vmov		q0, q6
	vmov		q1, q6
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q6
	vmov		s7, ip			@ patch low ctr word of q1
	rev		ip, r6
	add		r6, r6, #1
	vmov		s11, ip			@ patch low ctr word of q2
	vld1.8		{q3-q4}, [r1, :64]!
	vld1.8		{q5}, [r1, :64]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3		@ xor keystream with plaintext
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	vmov		s27, ip			@ write next ctr value back to q6
	b		.Lctrloop3x
.Lctr1x:
	adds		r4, r4, #3
	beq		.Lctrout
.Lctrloop:					@ slow path: 1 block / iteration
	vmov		q0, q6
	bl		aes_encrypt
	subs		r4, r4, #1
	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
	vld1.8		{q3}, [r1, :64]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0, :64]!
	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s27, ip			@ (rev/vmov leave the flags alone)
	bcs		.Lctrcarry		@ low word wrapped: ripple carry up
	teq		r4, #0
	bne		.Lctrloop
.Lctrout:
	vst1.8		{q6}, [r5]		@ return next ctr value
	pop		{r4-r6, pc}
.Lctrhalfblock:					@ final partial (8 byte) block:
						@ xor with keystream, no ctr
						@ increment needed afterwards
	vld1.8		{d1}, [r1, :64]
	veor		d0, d0, d1
	vst1.8		{d0}, [r0, :64]
	pop		{r4-r6, pc}
.Lctrcarry:					@ propagate the carry into the
						@ three upper words of the ctr
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		0f			@ stop once no carry remains
	.endr
0:	teq		r4, #0
	beq		.Lctrout
	b		.Lctrloop
ENDPROC(ce_aes_ctr_encrypt)
	/*
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		      int blocks, u8 iv[], u8 const rk2[], int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		      int blocks, u8 iv[], u8 const rk2[], int first)
	 */

	@ Compute the next XTS tweak: multiply \in by x in GF(2^128), i.e.,
	@ shift left by one bit and xor in the reduction constant 0x87 if the
	@ top bit was set.  \const holds the { 1, 0x87 } pair from .Lxts_mul_x,
	@ \tmp is clobbered as scratch.
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63		@ arithmetic shift replicates the
						@ top bit of each half into a mask
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in		@ shift the 64-bit halves left
	vext.8		\tmp, \tmp, \tmp, #8	@ swap the two mask halves: carry
						@ from low half, 0x87 from high
	veor		\out, \out, \tmp
	.endm
	.align		3
.Lxts_mul_x:					@ GF(2^128) mul-by-x constants
	.quad		1, 0x87

	@ Shared prologue for the two XTS entry points below: loads the
	@ .Lxts_mul_x constant into q7 (d14/d15), the stacked args into
	@ r4 (blocks) / r5 (iv) / r6 (first), and the iv into q0.  When
	@ 'first' == 1, it additionally encrypts the iv with the second AES
	@ key (rk2) by tail-calling into aes_encrypt, producing the initial
	@ tweak in q0.  Internal helper: non-AAPCS, expects the caller to have
	@ pushed {r4-r6, lr} so the args sit at [sp, #16] and up.
ce_aes_xts_init:
	vldr		d14, .Lxts_mul_x
	vldr		d15, .Lxts_mul_x + 8
	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr
	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}
	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0			@ q3 = current tweak
	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc3x		@ yes: use the fresh tweak as is
.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:
	subs		r4, r4, #3
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6		@ q3/q4/q5 = consecutive tweaks
	veor		q0, q0, q3		@ pre-whitening
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3		@ post-whitening
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x
.Lxtsenc1x:					@ handle up to 2 remaining blocks
	adds		r4, r4, #3
	beq		.Lxtsencout
.Lxtsencloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3		@ pre-whitening
	bl		aes_encrypt
	veor		q0, q0, q3		@ post-whitening
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q3}, [r5]		@ return current tweak as next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_encrypt)
ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}
	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0			@ q3 = current tweak
	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec3x		@ yes: use the fresh tweak as is
.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:
	subs		r4, r4, #3
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6		@ q3/q4/q5 = consecutive tweaks
	veor		q0, q0, q3		@ pre-whitening
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3		@ post-whitening
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x
.Lxtsdec1x:					@ handle up to 2 remaining blocks
	adds		r4, r4, #3
	beq		.Lxtsdecout
.Lxtsdecloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3		@ pre-whitening
	add		ip, r2, #32		@ 3rd round key
						@ NOTE(review): aes_decrypt sets
						@ ip itself on entry, so this
						@ looks redundant -- confirm
	bl		aes_decrypt
	veor		q0, q0, q3		@ post-whitening
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q3}, [r5]		@ return current tweak as next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_decrypt)
	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *			      AES sbox substitution on each byte in
	 *			      'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0			@ replicate input in all 4 words
	veor		q0, q0, q0		@ zero the state, so the aese
	aese.8		q0, q1			@ AddRoundKey step just copies q1
						@ in; ShiftRows is a no-op on a
						@ vector of identical words,
						@ leaving pure SubBytes
	vmov		r0, s0			@ return word 0
	bx		lr
ENDPROC(ce_aes_sub)
	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *					  operation on round key *src
	 *					  (used to convert an encryption
	 *					  key schedule for use with the
	 *					  equivalent inverse cipher)
	 */
ENTRY(ce_aes_invert)
	vld1.8		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.8		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)