chacha20-avx2-x86_64.S

/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#include <linux/linkage.h>

.data
.align 32

ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
        .octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
        .octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004
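
# ROT8 and ROT16 are vpshufb masks that rotate each 32-bit word left by
# 8 and 16 bits, respectively. CTRINC holds the per-block counter
# increments 0..7 added to the counter word of the eight parallel blocks.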
.text

ENTRY(chacha20_8block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: 8 data blocks output, o
        # %rdx: 8 data blocks input, i

        # This function encrypts eight consecutive ChaCha20 blocks by loading
        # the state matrix into AVX registers eight times. As we need some
        # scratch registers, we save the first four registers on the stack.
        # The algorithm performs each operation on the corresponding word of
        # each state matrix, hence requires no word shuffling. For the final
        # XOR step we transpose the matrix by interleaving 32-, 64- and then
        # 128-bit words, which allows us to do the XOR in AVX registers.
        # 8/16-bit word rotation is done with the slightly better performing
        # byte shuffling, 7/12-bit word rotation uses traditional shift+OR.

        vzeroupper

        # 4 * 32 byte stack, 32-byte aligned
        mov %rsp, %r8
        and $~31, %rsp
        sub $0x80, %rsp
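        # the caller's %rsp was saved in %r8 above and is restored on return;
        # keeping %rsp 32-byte aligned lets us use vmovdqa on the scratch area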

        # x0..15[0-7] = s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpbroadcastd 0x04(%rdi),%ymm1
        vpbroadcastd 0x08(%rdi),%ymm2
        vpbroadcastd 0x0c(%rdi),%ymm3
        vpbroadcastd 0x10(%rdi),%ymm4
        vpbroadcastd 0x14(%rdi),%ymm5
        vpbroadcastd 0x18(%rdi),%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm7
        vpbroadcastd 0x20(%rdi),%ymm8
        vpbroadcastd 0x24(%rdi),%ymm9
        vpbroadcastd 0x28(%rdi),%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm11
        vpbroadcastd 0x30(%rdi),%ymm12
        vpbroadcastd 0x34(%rdi),%ymm13
        vpbroadcastd 0x38(%rdi),%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm15

        # x0..3 on stack
        vmovdqa %ymm0,0x00(%rsp)
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa %ymm3,0x60(%rsp)

        vmovdqa CTRINC(%rip),%ymm1
        vmovdqa ROT8(%rip),%ymm2
        vmovdqa ROT16(%rip),%ymm3

        # x12 += counter values 0-7
        vpaddd %ymm1,%ymm12,%ymm12

        mov $10,%ecx
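        # ten iterations of the double round below give the 20 ChaCha20 rounds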
.Ldoubleround8:
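        # first half: quarter-rounds on the columns
        # (x0,x4,x8,x12) (x1,x5,x9,x13) (x2,x6,x10,x14) (x3,x7,x11,x15)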
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd 0x00(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm3,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd 0x20(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm3,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd 0x40(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm3,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd 0x60(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm3,%ymm15,%ymm15
        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxor %ymm8,%ymm4,%ymm4
        vpslld $12,%ymm4,%ymm0
        vpsrld $20,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxor %ymm9,%ymm5,%ymm5
        vpslld $12,%ymm5,%ymm0
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxor %ymm10,%ymm6,%ymm6
        vpslld $12,%ymm6,%ymm0
        vpsrld $20,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxor %ymm11,%ymm7,%ymm7
        vpslld $12,%ymm7,%ymm0
        vpsrld $20,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd 0x00(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm2,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd 0x20(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm2,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd 0x40(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm2,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd 0x60(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm2,%ymm15,%ymm15
        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxor %ymm8,%ymm4,%ymm4
        vpslld $7,%ymm4,%ymm0
        vpsrld $25,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxor %ymm9,%ymm5,%ymm5
        vpslld $7,%ymm5,%ymm0
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxor %ymm10,%ymm6,%ymm6
        vpslld $7,%ymm6,%ymm0
        vpsrld $25,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxor %ymm11,%ymm7,%ymm7
        vpslld $7,%ymm7,%ymm0
        vpsrld $25,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
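        # second half: quarter-rounds on the diagonals
        # (x0,x5,x10,x15) (x1,x6,x11,x12) (x2,x7,x8,x13) (x3,x4,x9,x14)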
        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd 0x00(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm3,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd 0x20(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm3,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd 0x40(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm3,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd 0x60(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm3,%ymm14,%ymm14
        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxor %ymm10,%ymm5,%ymm5
        vpslld $12,%ymm5,%ymm0
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxor %ymm11,%ymm6,%ymm6
        vpslld $12,%ymm6,%ymm0
        vpsrld $20,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxor %ymm8,%ymm7,%ymm7
        vpslld $12,%ymm7,%ymm0
        vpsrld $20,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxor %ymm9,%ymm4,%ymm4
        vpslld $12,%ymm4,%ymm0
        vpsrld $20,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd 0x00(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm2,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd 0x20(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm2,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd 0x40(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm2,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd 0x60(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm2,%ymm14,%ymm14
        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxor %ymm10,%ymm5,%ymm5
        vpslld $7,%ymm5,%ymm0
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxor %ymm11,%ymm6,%ymm6
        vpslld $7,%ymm6,%ymm0
        vpsrld $25,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxor %ymm8,%ymm7,%ymm7
        vpslld $7,%ymm7,%ymm0
        vpsrld $25,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxor %ymm9,%ymm4,%ymm4
        vpslld $7,%ymm4,%ymm0
        vpsrld $25,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4

        dec %ecx
        jnz .Ldoubleround8
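
        # add the original state back in (block finalization), one state word
        # at a time, to turn the working state into keystream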
        # x0..15[0-7] += s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpaddd 0x00(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpbroadcastd 0x04(%rdi),%ymm0
        vpaddd 0x20(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpbroadcastd 0x08(%rdi),%ymm0
        vpaddd 0x40(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpbroadcastd 0x0c(%rdi),%ymm0
        vpaddd 0x60(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpbroadcastd 0x10(%rdi),%ymm0
        vpaddd %ymm0,%ymm4,%ymm4
        vpbroadcastd 0x14(%rdi),%ymm0
        vpaddd %ymm0,%ymm5,%ymm5
        vpbroadcastd 0x18(%rdi),%ymm0
        vpaddd %ymm0,%ymm6,%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm0
        vpaddd %ymm0,%ymm7,%ymm7
        vpbroadcastd 0x20(%rdi),%ymm0
        vpaddd %ymm0,%ymm8,%ymm8
        vpbroadcastd 0x24(%rdi),%ymm0
        vpaddd %ymm0,%ymm9,%ymm9
        vpbroadcastd 0x28(%rdi),%ymm0
        vpaddd %ymm0,%ymm10,%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm0
        vpaddd %ymm0,%ymm11,%ymm11
        vpbroadcastd 0x30(%rdi),%ymm0
        vpaddd %ymm0,%ymm12,%ymm12
        vpbroadcastd 0x34(%rdi),%ymm0
        vpaddd %ymm0,%ymm13,%ymm13
        vpbroadcastd 0x38(%rdi),%ymm0
        vpaddd %ymm0,%ymm14,%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm0
        vpaddd %ymm0,%ymm15,%ymm15

        # x12 += counter values 0-7
        vpaddd %ymm1,%ymm12,%ymm12
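
        # transpose: after three interleaving passes (32-, 64- and 128-bit),
        # each 32-byte register or stack slot holds contiguous keystream bytes
        # of a single 64-byte block, ready to be XORed with the input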
        # interleave 32-bit words in state n, n+1
        vmovdqa 0x00(%rsp),%ymm0
        vmovdqa 0x20(%rsp),%ymm1
        vpunpckldq %ymm1,%ymm0,%ymm2
        vpunpckhdq %ymm1,%ymm0,%ymm1
        vmovdqa %ymm2,0x00(%rsp)
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa 0x40(%rsp),%ymm0
        vmovdqa 0x60(%rsp),%ymm1
        vpunpckldq %ymm1,%ymm0,%ymm2
        vpunpckhdq %ymm1,%ymm0,%ymm1
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa %ymm1,0x60(%rsp)
        vmovdqa %ymm4,%ymm0
        vpunpckldq %ymm5,%ymm0,%ymm4
        vpunpckhdq %ymm5,%ymm0,%ymm5
        vmovdqa %ymm6,%ymm0
        vpunpckldq %ymm7,%ymm0,%ymm6
        vpunpckhdq %ymm7,%ymm0,%ymm7
        vmovdqa %ymm8,%ymm0
        vpunpckldq %ymm9,%ymm0,%ymm8
        vpunpckhdq %ymm9,%ymm0,%ymm9
        vmovdqa %ymm10,%ymm0
        vpunpckldq %ymm11,%ymm0,%ymm10
        vpunpckhdq %ymm11,%ymm0,%ymm11
        vmovdqa %ymm12,%ymm0
        vpunpckldq %ymm13,%ymm0,%ymm12
        vpunpckhdq %ymm13,%ymm0,%ymm13
        vmovdqa %ymm14,%ymm0
        vpunpckldq %ymm15,%ymm0,%ymm14
        vpunpckhdq %ymm15,%ymm0,%ymm15

        # interleave 64-bit words in state n, n+2
        vmovdqa 0x00(%rsp),%ymm0
        vmovdqa 0x40(%rsp),%ymm2
        vpunpcklqdq %ymm2,%ymm0,%ymm1
        vpunpckhqdq %ymm2,%ymm0,%ymm2
        vmovdqa %ymm1,0x00(%rsp)
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa 0x20(%rsp),%ymm0
        vmovdqa 0x60(%rsp),%ymm2
        vpunpcklqdq %ymm2,%ymm0,%ymm1
        vpunpckhqdq %ymm2,%ymm0,%ymm2
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa %ymm2,0x60(%rsp)
        vmovdqa %ymm4,%ymm0
        vpunpcklqdq %ymm6,%ymm0,%ymm4
        vpunpckhqdq %ymm6,%ymm0,%ymm6
        vmovdqa %ymm5,%ymm0
        vpunpcklqdq %ymm7,%ymm0,%ymm5
        vpunpckhqdq %ymm7,%ymm0,%ymm7
        vmovdqa %ymm8,%ymm0
        vpunpcklqdq %ymm10,%ymm0,%ymm8
        vpunpckhqdq %ymm10,%ymm0,%ymm10
        vmovdqa %ymm9,%ymm0
        vpunpcklqdq %ymm11,%ymm0,%ymm9
        vpunpckhqdq %ymm11,%ymm0,%ymm11
        vmovdqa %ymm12,%ymm0
        vpunpcklqdq %ymm14,%ymm0,%ymm12
        vpunpckhqdq %ymm14,%ymm0,%ymm14
        vmovdqa %ymm13,%ymm0
        vpunpcklqdq %ymm15,%ymm0,%ymm13
        vpunpckhqdq %ymm15,%ymm0,%ymm15

        # interleave 128-bit words in state n, n+4
        vmovdqa 0x00(%rsp),%ymm0
        vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
        vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
        vmovdqa %ymm1,0x00(%rsp)
        vmovdqa 0x20(%rsp),%ymm0
        vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
        vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa 0x40(%rsp),%ymm0
        vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
        vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
        vmovdqa %ymm1,0x40(%rsp)
        vmovdqa 0x60(%rsp),%ymm0
        vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
        vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
        vmovdqa %ymm1,0x60(%rsp)
        vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
        vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
        vmovdqa %ymm0,%ymm8
        vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
        vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
        vmovdqa %ymm0,%ymm9
        vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
        vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
        vmovdqa %ymm0,%ymm10
        vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
        vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
        vmovdqa %ymm0,%ymm11
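
        # the offsets below are not sequential because each register or stack
        # slot ended up holding a specific 32-byte half of a specific block
        # after the transpose; the offsets place it at the matching position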
        # xor with corresponding input, write to output
        vmovdqa 0x00(%rsp),%ymm0
        vpxor 0x0000(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0000(%rsi)
        vmovdqa 0x20(%rsp),%ymm0
        vpxor 0x0080(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0080(%rsi)
        vmovdqa 0x40(%rsp),%ymm0
        vpxor 0x0040(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0040(%rsi)
        vmovdqa 0x60(%rsp),%ymm0
        vpxor 0x00c0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x00c0(%rsi)
        vpxor 0x0100(%rdx),%ymm4,%ymm4
        vmovdqu %ymm4,0x0100(%rsi)
        vpxor 0x0180(%rdx),%ymm5,%ymm5
        vmovdqu %ymm5,0x0180(%rsi)
        vpxor 0x0140(%rdx),%ymm6,%ymm6
        vmovdqu %ymm6,0x0140(%rsi)
        vpxor 0x01c0(%rdx),%ymm7,%ymm7
        vmovdqu %ymm7,0x01c0(%rsi)
        vpxor 0x0020(%rdx),%ymm8,%ymm8
        vmovdqu %ymm8,0x0020(%rsi)
        vpxor 0x00a0(%rdx),%ymm9,%ymm9
        vmovdqu %ymm9,0x00a0(%rsi)
        vpxor 0x0060(%rdx),%ymm10,%ymm10
        vmovdqu %ymm10,0x0060(%rsi)
        vpxor 0x00e0(%rdx),%ymm11,%ymm11
        vmovdqu %ymm11,0x00e0(%rsi)
        vpxor 0x0120(%rdx),%ymm12,%ymm12
        vmovdqu %ymm12,0x0120(%rsi)
        vpxor 0x01a0(%rdx),%ymm13,%ymm13
        vmovdqu %ymm13,0x01a0(%rsi)
        vpxor 0x0160(%rdx),%ymm14,%ymm14
        vmovdqu %ymm14,0x0160(%rsi)
        vpxor 0x01e0(%rdx),%ymm15,%ymm15
        vmovdqu %ymm15,0x01e0(%rsi)
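
        # clear the upper YMM state and restore the caller's stack pointer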
        vzeroupper
        mov %r8,%rsp
        ret
ENDPROC(chacha20_8block_xor_avx2)