/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is an AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/linkage.h>
#include <asm/inst.h>

#define CONCAT(a,b)	a##b
#define VMOVDQ		vmovdqu

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10
#define DDQ(i)		CONCAT(ddq_add_,i)
#define XMM(i)		CONCAT(%xmm, i)
#define DDQ_DATA	0
#define XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
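
/*
 * Register use, for reference: p_in/p_iv/p_keys/p_out/num_bytes are the
 * five arguments in x86_64 SysV calling-convention order (%rdi, %rsi,
 * %rdx, %rcx, %r8). Per the prototype comments further down, each ENTRY
 * below corresponds to a C declaration of the form:
 *
 *	void aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys,
 *				     void *out, unsigned int num_bytes);
 */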
.section .rodata
.align 16
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
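
/*
 * byteswap_const reverses byte order so the counter can be kept in
 * little-endian lane order internally while CTR mode mandates a
 * big-endian block counter. ddq_add_1..8 are the per-block increments;
 * ddq_low_msk and ddq_high_add_1 implement the carry from the low
 * 64-bit half into the high half. A hedged C sketch of the carry logic
 * used in do_aes below (illustrative only, not part of the build):
 *
 *	// ctr is the 128-bit counter split into two 64-bit halves.
 *	// At most one block in a window of <= 8 increments can wrap the
 *	// low half to exactly zero; bumping the base's high half at that
 *	// point makes every later block in the window inherit the carry.
 *	struct { unsigned long long lo, hi; } ctr, blk;
 *	for (unsigned int i = 1; i < by; i++) {
 *		blk.lo = ctr.lo + i;	// vpaddq ddq_add_i
 *		blk.hi = ctr.hi;
 *		if (blk.lo == 0) {	// vptest ddq_low_msk / jnz 1f
 *			blk.hi++;	// vpaddq ddq_high_add_1, blk
 *			ctr.hi++;	// vpaddq ddq_high_add_1, ctr
 *		}
 *	}
 */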
.text

/* generate a unique variable for ddq_add_x */
.macro setddq n
	var_ddq_add = DDQ(\n)
.endm

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = XMM(\n)
.endm

/* join the numeric 'id' onto the symbol 'name' */
.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
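
/*
 * Example expansion, for reference: with ".set i, 3", "club XDATA, i"
 * evaluates \id under .altmacro so setxdata receives the literal 3,
 * leaving var_xdata = %xmm3; "club DDQ_DATA, i" likewise leaves
 * var_ddq_add = ddq_add_3. The .rept loops below use this to index
 * registers and constants with a computed counter.
 */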
/*
 * do_aes num_in_par load_keys key_len
 * This increments p_in, but not p_out
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	vpshufb	xbyteswap, xcounter, xdata0

	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr
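
	/*
	 * At this point all "by" counter blocks have been byte-swapped
	 * to big-endian and XORed with round key 0 (AddRoundKey). The
	 * rounds below interleave round-key loads with vaesenc across
	 * all blocks to hide instruction latency.
	 */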
	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast	xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif
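
	/*
	 * The keystream is now complete in xdata0..xdata(by-1). XOR it
	 * with the input; p_in was already advanced by 16*by above,
	 * hence the negative offsets. Inputs are loaded two at a time
	 * through xkeyA/xkeyB, which are free after the last round.
	 */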
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
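
/*
 * Hedged C-equivalent of one do_aes expansion (illustrative only;
 * u128, aes_encrypt and bswap128 are stand-in names, not kernel APIs):
 *
 *	// encrypt "by" consecutive counter blocks, then XOR with input
 *	for (int blk = 0; blk < by; blk++) {
 *		u128 ks = aes_encrypt(round_keys, bswap128(ctr + blk));
 *		out[blk] = in[blk] ^ ks;
 *	}
 *	ctr += by;
 */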
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
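
/*
 * do_aes_load also loads the four cached round keys
 * (xkey0/xkey4/xkey8/xkey12); do_aes_noload skips those loads and
 * reuses the values already resident in the registers. xkeyA/xkeyB are
 * reloaded on every expansion either way.
 */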
/* main body of the AES CTR routines */
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp/16 <= 7 trailing blocks (tmp is a byte count, a multiple of 16) */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm
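
/*
 * Hedged C sketch of the dispatch above (illustrative only; do_aes,
 * klen and load_round_key_cache are stand-ins for the macro expansions):
 *
 *	unsigned long trail = num_bytes & (7 * 16);
 *	if (num_bytes < 16)
 *		goto done;
 *	if (trail) {
 *		do_aes(trail / 16, 1, klen);	// do_aes_load, 1..7 blocks
 *		out += trail;
 *	} else {
 *		load_round_key_cache();		// .Lmult_of_8_blks path
 *	}
 *	num_bytes &= (~7UL) * 16;		// keep whole groups of 8
 *	while (num_bytes) {
 *		do_aes(8, 0, klen);		// do_aes_noload
 *		out += 8 * 16;
 *		num_bytes -= 8 * 16;
 *	}
 * done:
 *	write the byte-swapped counter back to (p_iv)
 */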
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)
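
/*
 * Hedged usage sketch from the C side (illustrative): since these
 * routines clobber XMM state, a kernel caller would typically wrap
 * them in an FPU section. The glue-code names below (walk, ctx) are
 * assumptions, not defined by this file.
 *
 *	kernel_fpu_begin();
 *	aes_ctr_enc_128_avx_by8(src, walk.iv, ctx->key_enc, dst, nbytes);
 *	kernel_fpu_end();
 */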