camellia-x86_64-asm_64.S 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521
  1. /*
  2. * Camellia Cipher Algorithm (x86_64)
  3. *
  4. * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  19. * USA
  20. *
  21. */
  22. .file "camellia-x86_64-asm_64.S"
  23. .text
  /*
   * Lookup tables used by the round macros.  The symbols are declared
   * external here; each one is paired with the short alias that the rest
   * of this file uses for it.
   */
  .extern camellia_sp10011110;
  #define sp10011110              camellia_sp10011110
  .extern camellia_sp22000222;
  #define sp22000222              camellia_sp22000222
  .extern camellia_sp03303033;
  #define sp03303033              camellia_sp03303033
  .extern camellia_sp00444404;
  #define sp00444404              camellia_sp00444404
  .extern camellia_sp02220222;
  #define sp02220222              camellia_sp02220222
  .extern camellia_sp30333033;
  #define sp30333033              camellia_sp30333033
  .extern camellia_sp44044404;
  #define sp44044404              camellia_sp44044404
  .extern camellia_sp11101110;
  #define sp11101110              camellia_sp11101110

  /*
   * struct camellia_ctx, expressed as byte offsets from CTX: the subkey
   * table starts at offset 0 and key_length sits directly after it.
   */
  #define CAMELLIA_TABLE_BYTE_LEN 272
  #define key_table               0
  #define key_length              CAMELLIA_TABLE_BYTE_LEN
  /*
   * Register allocation.
   *
   * Suffix convention: NAME is the 64-bit register, NAMEd its low 32 bits,
   * NAMEbl bits 0-7 and NAMEbh bits 8-15.
   */

  /* context pointer and I/O pointer */
  #define CTX     %rdi
  #define RIO     %rsi            /* same register as RT0 */
  #define RIOd    %esi

  /* block data, first block */
  #define RAB0    %rax
  #define RAB0d   %eax
  #define RAB0bl  %al
  #define RAB0bh  %ah
  #define RCD0    %rcx
  #define RCD0d   %ecx
  #define RCD0bl  %cl
  #define RCD0bh  %ch

  /* block data, second block (2-way routines only) */
  #define RAB1    %rbx
  #define RAB1d   %ebx
  #define RAB1bl  %bl
  #define RAB1bh  %bh
  #define RCD1    %rdx
  #define RCD1d   %edx
  #define RCD1bl  %dl
  #define RCD1bh  %dh

  /* scratch; RT0 shares %rsi with RIO and RT1 is %rbp */
  #define RT0     %rsi
  #define RT0d    %esi
  #define RT1     %rbp
  #define RT1d    %ebp
  #define RT2     %r8
  #define RT2d    %r8d
  #define RT2bl   %r8b

  /* values parked for the duration of a routine */
  #define RXOR    %r9             /* xor flag (encrypt); reused by decrypt */
  #define RXORd   %r9d
  #define RXORbl  %r9b
  #define RRBP    %r10            /* %rbp on entry */
  #define RDST    %r11            /* destination pointer */
  /*
   * xor2ror16(tab_lo, tab_hi, ihi, ilo, src, acc)
   *
   * Uses the two lowest bytes of src as table indices, rotates src right
   * by 16 bits and XORs one 8-byte entry from each table into acc:
   *
   *     acc ^= tab_lo[src bits 0-7];
   *     acc ^= tab_hi[src bits 8-15];
   *
   * src must be a register name that has bl and bh macros.  ihi and ilo
   * are scratch registers and are overwritten, as are the flags.
   */
  #define xor2ror16(tab_lo, tab_hi, ihi, ilo, src, acc) \
          movzbl  src ## bl,                      ilo ## d; \
          movzbl  src ## bh,                      ihi ## d; \
          rorq    $16,                            src; \
          xorq    tab_lo(, ilo, 8),               acc; \
          xorq    tab_hi(, ihi, 8),               acc;
  82. /**********************************************************************
  83. 1-way camellia
  84. **********************************************************************/
  /*
   * roundsm(x, k, y): one round on a single block.
   *
   * Loads 8-byte subkey number k into RT2, folds eight table lookups (one
   * per byte of x ## 0) alternately into y ## 0 and RT2, then XORs RT2
   * into y ## 0.  The four 16-bit rotations add up to 64 bits, so x ## 0
   * ends with the value it started with.
   * Clobbers RT0, RT1, RT2 and the flags.
   */
  #define roundsm(x, k, y) \
          movq    (key_table + (k) * 8)(CTX),     RT2; \
          \
          xor2ror16(sp00444404, sp03303033, RT0, RT1, x ## 0, y ## 0); \
          xor2ror16(sp22000222, sp10011110, RT0, RT1, x ## 0, RT2); \
          xor2ror16(sp11101110, sp44044404, RT0, RT1, x ## 0, y ## 0); \
          xor2ror16(sp30333033, sp02220222, RT0, RT1, x ## 0, RT2); \
          \
          xorq    RT2,                            y ## 0;
  /*
   * fls(lhs, rhs, kl, kr): updates lhs ## 0 and rhs ## 0 using the 8-byte
   * subkeys numbered kl and kr.  With "lo" the low 32 bits and "hi" the
   * high 32 bits of each 64-bit value, the four steps are, in order:
   *
   *     lhs.hi ^= rol32(kl.lo & lhs.lo, 1);
   *     rhs.lo ^= kr.hi | rhs.hi;
   *     lhs.lo ^= kl.hi | lhs.hi;      (uses the updated lhs.hi)
   *     rhs.hi ^= rol32(kr.lo & rhs.lo, 1);   (uses the updated rhs.lo)
   *
   * Clobbers RT0, RT1, RT2 and the flags.
   */
  #define fls(lhs, rhs, kl, kr) \
          movl    (key_table + (kl) * 8)(CTX),    RT0d; \
          andl    lhs ## 0d,                      RT0d; \
          roll    $1,                             RT0d; \
          shlq    $32,                            RT0; \
          xorq    RT0,                            lhs ## 0; \
          movq    (key_table + (kr) * 8)(CTX),    RT1; \
          orq     rhs ## 0,                       RT1; \
          shrq    $32,                            RT1; \
          xorq    RT1,                            rhs ## 0; \
          \
          movq    (key_table + (kl) * 8)(CTX),    RT2; \
          orq     lhs ## 0,                       RT2; \
          shrq    $32,                            RT2; \
          xorq    RT2,                            lhs ## 0; \
          movl    (key_table + (kr) * 8)(CTX),    RT0d; \
          andl    rhs ## 0d,                      RT0d; \
          roll    $1,                             RT0d; \
          shlq    $32,                            RT0; \
          xorq    RT0,                            rhs ## 0;
  /*
   * enc_rounds(base): six rounds in ascending subkey order, using subkeys
   * base + 2 through base + 7 and swapping the roles of RAB and RCD after
   * every round.
   */
  #define enc_rounds(base) \
          roundsm(RAB, (base) + 2, RCD); \
          roundsm(RCD, (base) + 3, RAB); \
          roundsm(RAB, (base) + 4, RCD); \
          roundsm(RCD, (base) + 5, RAB); \
          roundsm(RAB, (base) + 6, RCD); \
          roundsm(RCD, (base) + 7, RAB);
  /* enc_fls(base): fls() step for encryption; kl = base, kr = base + 1. */
  #define enc_fls(base) \
          fls(RAB, RCD, (base) + 0, (base) + 1);
  /*
   * enc_inpack(): loads the 16 input bytes at RIO into RAB0 (bytes 0-7)
   * and RCD0 (bytes 8-15).  Each quadword is byte-swapped and then has its
   * 32-bit halves exchanged by a 32-bit rotate.  The first 8 bytes of the
   * key table are then XORed into RAB0.
   */
  #define enc_inpack() \
          movq    0(RIO),                         RAB0; \
          bswapq                                  RAB0; \
          rolq    $32,                            RAB0; \
          movq    8(RIO),                         RCD0; \
          bswapq                                  RCD0; \
          rorq    $32,                            RCD0; \
          xorq    key_table(CTX),                 RAB0;
  /*
   * enc_outunpack(insn, last): writes the encrypted block to RIO.
   *
   * insn is the mnemonic stem applied to memory: "mov" stores the result,
   * "xor" XORs it into what is already there.  last is a register holding
   * the index, in 8-byte units, of the subkey XORed into RCD0 first.
   * RCD0 supplies output bytes 0-7 and RAB0 bytes 8-15; both have their
   * 32-bit halves exchanged and are byte-swapped before being written.
   */
  #define enc_outunpack(insn, last) \
          xorq    key_table(CTX, last, 8),        RCD0; \
          rorq    $32,                            RCD0; \
          bswapq                                  RCD0; \
          insn ## q RCD0,                         0(RIO); \
          rolq    $32,                            RAB0; \
          bswapq                                  RAB0; \
          insn ## q RAB0,                         8(RIO);
  /*
   * dec_rounds(base): six rounds in descending subkey order, using subkeys
   * base + 7 down to base + 2 and swapping the roles of RAB and RCD after
   * every round.
   */
  #define dec_rounds(base) \
          roundsm(RAB, (base) + 7, RCD); \
          roundsm(RCD, (base) + 6, RAB); \
          roundsm(RAB, (base) + 5, RCD); \
          roundsm(RCD, (base) + 4, RAB); \
          roundsm(RAB, (base) + 3, RCD); \
          roundsm(RCD, (base) + 2, RAB);
  /* dec_fls(base): fls() step for decryption; kl = base + 1, kr = base. */
  #define dec_fls(base) \
          fls(RAB, RCD, (base) + 1, (base) + 0);
  /*
   * dec_inpack(last): loads the 16 input bytes at RIO into RAB0 (bytes
   * 0-7) and RCD0 (bytes 8-15), byte-swapping each quadword and exchanging
   * its 32-bit halves.  The subkey at index last (a register, counted in
   * 8-byte units) is then XORed into RAB0.
   */
  #define dec_inpack(last) \
          movq    0(RIO),                         RAB0; \
          bswapq                                  RAB0; \
          rolq    $32,                            RAB0; \
          movq    8(RIO),                         RCD0; \
          bswapq                                  RCD0; \
          rorq    $32,                            RCD0; \
          xorq    key_table(CTX, last, 8),        RAB0;
  /*
   * dec_outunpack(): XORs the first 8 bytes of the key table into RCD0,
   * then stores RCD0 as output bytes 0-7 and RAB0 as bytes 8-15 at RIO.
   * Both values have their 32-bit halves exchanged and are byte-swapped
   * before being stored.
   */
  #define dec_outunpack() \
          xorq    key_table(CTX),                 RCD0; \
          rorq    $32,                            RCD0; \
          bswapq                                  RCD0; \
          movq    RCD0,                           0(RIO); \
          rolq    $32,                            RAB0; \
          bswapq                                  RAB0; \
          movq    RAB0,                           8(RIO);
  /*
   * __camellia_enc_blk: encrypt one 16-byte block.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src
   *      %rcx: bool xor (only the low byte is tested)
   *
   * Runs 18 rounds when key_length is 16 and 24 rounds otherwise.  With
   * xor zero the result is stored to dst; with xor nonzero it is XORed
   * into the 16 bytes already at dst.
   *
   * Clobbers %rax, %rcx, %rsi, %r8-%r11 and the flags.  %rbp is used as
   * scratch (RT1); its entry value is parked in RRBP and restored before
   * each ret.
   */
  .global __camellia_enc_blk;
  .type   __camellia_enc_blk,@function;

  __camellia_enc_blk:
          movq    %rbp, RRBP;

          /* Move the arguments out of registers the rounds overwrite. */
          movq    %rcx, RXOR;
          movq    %rsi, RDST;
          movq    %rdx, RIO;

          enc_inpack();

          enc_rounds(0);
          enc_fls(8);
          enc_rounds(8);
          enc_fls(16);
          enc_rounds(16);
          movl    $24, RT1d;      /* max */

          /* Same operand width as the key_length test in camellia_dec_blk. */
          cmpl    $16, key_length(CTX);
          je      .L__enc_done;

          enc_fls(24);
          enc_rounds(24);
          movl    $32, RT1d;      /* max */

  .L__enc_done:
          testb   RXORbl, RXORbl;
          movq    RDST, RIO;      /* mov leaves the flags alone for jnz */
          jnz     .L__enc_xor;

          enc_outunpack(mov, RT1);

          movq    RRBP, %rbp;
          ret;

  .L__enc_xor:
          enc_outunpack(xor, RT1);

          movq    RRBP, %rbp;
          ret;

  .size   __camellia_enc_blk, . - __camellia_enc_blk;
  /*
   * camellia_dec_blk: decrypt one 16-byte block.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src
   *
   * Runs 18 rounds when key_length is 16 and 24 rounds otherwise, then
   * stores the result to dst.
   *
   * Clobbers %rax, %rcx, %rsi, %r8-%r11 and the flags.  %rbp is used as
   * scratch (RT1); its entry value is parked in RRBP and restored before
   * ret.
   */
  .global camellia_dec_blk;
  .type   camellia_dec_blk,@function;

  camellia_dec_blk:
          /* max = (key_length == 16) ? 24 : 32; movl keeps the cmpl flags. */
          cmpl    $16, key_length(CTX);
          movl    $32, RT2d;
          movl    $24, RXORd;
          cmovel  RXORd, RT2d;    /* max */

          movq    %rbp, RRBP;
          movq    %rsi, RDST;
          movq    %rdx, RIO;

          dec_inpack(RT2);

          /* max must be tested here: the rounds overwrite RT2. */
          cmpb    $24, RT2bl;
          je      .L__dec_rounds16;

          dec_rounds(24);
          dec_fls(24);

  .L__dec_rounds16:
          dec_rounds(16);
          dec_fls(16);
          dec_rounds(8);
          dec_fls(8);
          dec_rounds(0);

          movq    RDST, RIO;

          dec_outunpack();

          movq    RRBP, %rbp;
          ret;

  .size   camellia_dec_blk, . - camellia_dec_blk;
  230. /**********************************************************************
  231. 2-way camellia
  232. **********************************************************************/
  /*
   * roundsm2(x, k, y): one round on two blocks at once.
   *
   * Loads 8-byte subkey number k into RT2.  For the second block the
   * subkey is XORed into y ## 1 straight away and all eight lookups on
   * x ## 1 accumulate directly into y ## 1.  For the first block the
   * lookups on x ## 0 alternate between y ## 0 and RT2, and RT2 is XORed
   * into y ## 0 afterwards.  x ## 0 and x ## 1 each go through four 16-bit
   * rotations and so end with the values they started with.
   * Clobbers RT0, RT1, RT2 and the flags.
   */
  #define roundsm2(x, k, y) \
          movq    (key_table + (k) * 8)(CTX),     RT2; \
          xorq    RT2,                            y ## 1; \
          \
          xor2ror16(sp00444404, sp03303033, RT0, RT1, x ## 0, y ## 0); \
          xor2ror16(sp22000222, sp10011110, RT0, RT1, x ## 0, RT2); \
          xor2ror16(sp11101110, sp44044404, RT0, RT1, x ## 0, y ## 0); \
          xor2ror16(sp30333033, sp02220222, RT0, RT1, x ## 0, RT2); \
          \
          xor2ror16(sp00444404, sp03303033, RT0, RT1, x ## 1, y ## 1); \
          xorq    RT2,                            y ## 0; \
          xor2ror16(sp22000222, sp10011110, RT0, RT1, x ## 1, y ## 1); \
          xor2ror16(sp11101110, sp44044404, RT0, RT1, x ## 1, y ## 1); \
          xor2ror16(sp30333033, sp02220222, RT0, RT1, x ## 1, y ## 1);
  /*
   * fls2(lhs, rhs, kl, kr): the fls() computation applied to two blocks,
   * with the work for block 0 and block 1 interleaved.  For each block n,
   * with "lo" and "hi" the low and high 32 bits of a 64-bit value:
   *
   *     lhs_n.hi ^= rol32(kl.lo & lhs_n.lo, 1);
   *     rhs_n.lo ^= kr.hi | rhs_n.hi;
   *     lhs_n.lo ^= kl.hi | lhs_n.hi;          (updated lhs_n.hi)
   *     rhs_n.hi ^= rol32(kr.lo & rhs_n.lo, 1);        (updated rhs_n.lo)
   *
   * Clobbers RT0, RT1, RT2 and the flags.
   */
  #define fls2(lhs, rhs, kl, kr) \
          /* block 0, first half */ \
          movl    (key_table + (kl) * 8)(CTX),    RT0d; \
          andl    lhs ## 0d,                      RT0d; \
          roll    $1,                             RT0d; \
          shlq    $32,                            RT0; \
          xorq    RT0,                            lhs ## 0; \
          movq    (key_table + (kr) * 8)(CTX),    RT1; \
          orq     rhs ## 0,                       RT1; \
          shrq    $32,                            RT1; \
          xorq    RT1,                            rhs ## 0; \
          \
          /* block 1, first half */ \
          movl    (key_table + (kl) * 8)(CTX),    RT2d; \
          andl    lhs ## 1d,                      RT2d; \
          roll    $1,                             RT2d; \
          shlq    $32,                            RT2; \
          xorq    RT2,                            lhs ## 1; \
          movq    (key_table + (kr) * 8)(CTX),    RT0; \
          orq     rhs ## 1,                       RT0; \
          shrq    $32,                            RT0; \
          xorq    RT0,                            rhs ## 1; \
          \
          /* block 0, second half */ \
          movq    (key_table + (kl) * 8)(CTX),    RT1; \
          orq     lhs ## 0,                       RT1; \
          shrq    $32,                            RT1; \
          xorq    RT1,                            lhs ## 0; \
          movl    (key_table + (kr) * 8)(CTX),    RT2d; \
          andl    rhs ## 0d,                      RT2d; \
          roll    $1,                             RT2d; \
          shlq    $32,                            RT2; \
          xorq    RT2,                            rhs ## 0; \
          \
          /* block 1, second half */ \
          movq    (key_table + (kl) * 8)(CTX),    RT0; \
          orq     lhs ## 1,                       RT0; \
          shrq    $32,                            RT0; \
          xorq    RT0,                            lhs ## 1; \
          movl    (key_table + (kr) * 8)(CTX),    RT1d; \
          andl    rhs ## 1d,                      RT1d; \
          roll    $1,                             RT1d; \
          shlq    $32,                            RT1; \
          xorq    RT1,                            rhs ## 1;
  /*
   * enc_rounds2(base): six two-block rounds in ascending subkey order,
   * using subkeys base + 2 through base + 7 and swapping the roles of RAB
   * and RCD after every round.
   */
  #define enc_rounds2(base) \
          roundsm2(RAB, (base) + 2, RCD); \
          roundsm2(RCD, (base) + 3, RAB); \
          roundsm2(RAB, (base) + 4, RCD); \
          roundsm2(RCD, (base) + 5, RAB); \
          roundsm2(RAB, (base) + 6, RCD); \
          roundsm2(RCD, (base) + 7, RAB);
  /* enc_fls2(base): fls2() step for encryption; kl = base, kr = base + 1. */
  #define enc_fls2(base) \
          fls2(RAB, RCD, (base) + 0, (base) + 1);
  /*
   * enc_inpack2(): loads 32 input bytes at RIO as two blocks.  Bytes 0-7
   * go to RAB0, 8-15 to RCD0, 16-23 to RAB1 and 24-31 to RCD1; each
   * quadword is byte-swapped and has its 32-bit halves exchanged.  The
   * first 8 bytes of the key table are XORed into RAB0 and RAB1.
   */
  #define enc_inpack2() \
          movq    0(RIO),                         RAB0; \
          bswapq                                  RAB0; \
          rorq    $32,                            RAB0; \
          movq    8(RIO),                         RCD0; \
          bswapq                                  RCD0; \
          rolq    $32,                            RCD0; \
          xorq    key_table(CTX),                 RAB0; \
          \
          movq    16(RIO),                        RAB1; \
          bswapq                                  RAB1; \
          rorq    $32,                            RAB1; \
          movq    24(RIO),                        RCD1; \
          bswapq                                  RCD1; \
          rolq    $32,                            RCD1; \
          xorq    key_table(CTX),                 RAB1;
  /*
   * enc_outunpack2(insn, last): writes two encrypted blocks to RIO.
   *
   * insn is the mnemonic stem applied to memory ("mov" stores, "xor" XORs
   * into the existing contents).  last is a register holding the index,
   * in 8-byte units, of the subkey XORed into RCD0 and RCD1.
   * Output bytes 0-7 come from RCD0, 8-15 from RAB0, 16-23 from RCD1 and
   * 24-31 from RAB1, each with its 32-bit halves exchanged and then
   * byte-swapped.
   */
  #define enc_outunpack2(insn, last) \
          xorq    key_table(CTX, last, 8),        RCD0; \
          rolq    $32,                            RCD0; \
          bswapq                                  RCD0; \
          insn ## q RCD0,                         0(RIO); \
          rorq    $32,                            RAB0; \
          bswapq                                  RAB0; \
          insn ## q RAB0,                         8(RIO); \
          \
          xorq    key_table(CTX, last, 8),        RCD1; \
          rolq    $32,                            RCD1; \
          bswapq                                  RCD1; \
          insn ## q RCD1,                         16(RIO); \
          rorq    $32,                            RAB1; \
          bswapq                                  RAB1; \
          insn ## q RAB1,                         24(RIO);
  /*
   * dec_rounds2(base): six two-block rounds in descending subkey order,
   * using subkeys base + 7 down to base + 2 and swapping the roles of RAB
   * and RCD after every round.
   */
  #define dec_rounds2(base) \
          roundsm2(RAB, (base) + 7, RCD); \
          roundsm2(RCD, (base) + 6, RAB); \
          roundsm2(RAB, (base) + 5, RCD); \
          roundsm2(RCD, (base) + 4, RAB); \
          roundsm2(RAB, (base) + 3, RCD); \
          roundsm2(RCD, (base) + 2, RAB);
  /* dec_fls2(base): fls2() step for decryption; kl = base + 1, kr = base. */
  #define dec_fls2(base) \
          fls2(RAB, RCD, (base) + 1, (base) + 0);
  /*
   * dec_inpack2(last): loads 32 input bytes at RIO as two blocks.  Bytes
   * 0-7 go to RAB0, 8-15 to RCD0, 16-23 to RAB1 and 24-31 to RCD1; each
   * quadword is byte-swapped and has its 32-bit halves exchanged.  The
   * subkey at index last (a register, counted in 8-byte units) is XORed
   * into RAB0 and RAB1.
   */
  #define dec_inpack2(last) \
          movq    0(RIO),                         RAB0; \
          bswapq                                  RAB0; \
          rorq    $32,                            RAB0; \
          movq    8(RIO),                         RCD0; \
          bswapq                                  RCD0; \
          rolq    $32,                            RCD0; \
          xorq    key_table(CTX, last, 8),        RAB0; \
          \
          movq    16(RIO),                        RAB1; \
          bswapq                                  RAB1; \
          rorq    $32,                            RAB1; \
          movq    24(RIO),                        RCD1; \
          bswapq                                  RCD1; \
          rolq    $32,                            RCD1; \
          xorq    key_table(CTX, last, 8),        RAB1;
  /*
   * dec_outunpack2(): XORs the first 8 bytes of the key table into RCD0
   * and RCD1, then stores two blocks at RIO.  Output bytes 0-7 come from
   * RCD0, 8-15 from RAB0, 16-23 from RCD1 and 24-31 from RAB1, each with
   * its 32-bit halves exchanged and then byte-swapped.
   */
  #define dec_outunpack2() \
          xorq    key_table(CTX),                 RCD0; \
          rolq    $32,                            RCD0; \
          bswapq                                  RCD0; \
          movq    RCD0,                           0(RIO); \
          rorq    $32,                            RAB0; \
          bswapq                                  RAB0; \
          movq    RAB0,                           8(RIO); \
          \
          xorq    key_table(CTX),                 RCD1; \
          rolq    $32,                            RCD1; \
          bswapq                                  RCD1; \
          movq    RCD1,                           16(RIO); \
          rorq    $32,                            RAB1; \
          bswapq                                  RAB1; \
          movq    RAB1,                           24(RIO);
  /*
   * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src
   *      %rcx: bool xor (only the low byte is tested)
   *
   * Runs 18 rounds when key_length is 16 and 24 rounds otherwise.  With
   * xor zero the 32 result bytes are stored to dst; with xor nonzero they
   * are XORed into the 32 bytes already at dst.
   *
   * Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.  %rbx (RAB1)
   * is saved on the stack; %rbp (RT1) is parked in RRBP.  Both are
   * restored before each ret.
   */
  .global __camellia_enc_blk_2way;
  .type   __camellia_enc_blk_2way,@function;

  __camellia_enc_blk_2way:
          pushq   %rbx;

          movq    %rbp, RRBP;
          /* Move the arguments out of registers the rounds overwrite. */
          movq    %rcx, RXOR;
          movq    %rsi, RDST;
          movq    %rdx, RIO;

          enc_inpack2();

          enc_rounds2(0);
          enc_fls2(8);
          enc_rounds2(8);
          enc_fls2(16);
          enc_rounds2(16);
          movl    $24, RT2d;      /* max */

          /* Same operand width as the test in camellia_dec_blk_2way. */
          cmpl    $16, key_length(CTX);
          je      .L__enc2_done;

          enc_fls2(24);
          enc_rounds2(24);
          movl    $32, RT2d;      /* max */

  .L__enc2_done:
          testb   RXORbl, RXORbl;
          movq    RDST, RIO;      /* mov leaves the flags alone for jnz */
          jnz     .L__enc2_xor;

          enc_outunpack2(mov, RT2);

          movq    RRBP, %rbp;
          popq    %rbx;
          ret;

  .L__enc2_xor:
          enc_outunpack2(xor, RT2);

          movq    RRBP, %rbp;
          popq    %rbx;
          ret;

  .size   __camellia_enc_blk_2way, . - __camellia_enc_blk_2way;
  /*
   * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src
   *
   * Runs 18 rounds when key_length is 16 and 24 rounds otherwise, then
   * stores the 32 result bytes to dst.
   *
   * Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.  %rbx (RAB1)
   * is parked in RXOR and %rbp (RT1) in RRBP; both are restored before
   * ret.
   */
  .global camellia_dec_blk_2way;
  .type   camellia_dec_blk_2way,@function;

  camellia_dec_blk_2way:
          /* max = (key_length == 16) ? 24 : 32; movl keeps the cmpl flags. */
          cmpl    $16, key_length(CTX);
          movl    $32, RT2d;
          movl    $24, RXORd;
          cmovel  RXORd, RT2d;    /* max */

          /* RXOR is free again once max is chosen, so it can hold %rbx. */
          movq    %rbx, RXOR;
          movq    %rbp, RRBP;
          movq    %rsi, RDST;
          movq    %rdx, RIO;

          dec_inpack2(RT2);

          /* max must be tested here: the rounds overwrite RT2. */
          cmpb    $24, RT2bl;
          je      .L__dec2_rounds16;

          dec_rounds2(24);
          dec_fls2(24);

  .L__dec2_rounds16:
          dec_rounds2(16);
          dec_fls2(16);
          dec_rounds2(8);
          dec_fls2(8);
          dec_rounds2(0);

          movq    RDST, RIO;

          dec_outunpack2();

          movq    RRBP, %rbp;
          movq    RXOR, %rbx;
          ret;

  .size   camellia_dec_blk_2way, . - camellia_dec_blk_2way;