123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521 |
- /*
- * Camellia Cipher Algorithm (x86_64)
- *
- * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
- */
- /* Source name recorded for the assembler; all code below is placed in .text. */
- .file "camellia-x86_64-asm_64.S"
- .text
- /*
-  * Lookup tables that are defined outside this file. Each one is declared
-  * .extern and given a short alias. xor2ror16() reads them with a
-  * zero-extended byte as index and a scale factor of 8.
-  */
- .extern camellia_sp10011110;
- #define sp10011110 camellia_sp10011110
- .extern camellia_sp22000222;
- #define sp22000222 camellia_sp22000222
- .extern camellia_sp03303033;
- #define sp03303033 camellia_sp03303033
- .extern camellia_sp00444404;
- #define sp00444404 camellia_sp00444404
- .extern camellia_sp02220222;
- #define sp02220222 camellia_sp02220222
- .extern camellia_sp30333033;
- #define sp30333033 camellia_sp30333033
- .extern camellia_sp44044404;
- #define sp44044404 camellia_sp44044404
- .extern camellia_sp11101110;
- #define sp11101110 camellia_sp11101110
- /*
-  * struct camellia_ctx, as seen from this file: key_table is addressed at
-  * byte offset 0 and key_length at byte offset CAMELLIA_TABLE_BYTE_LEN.
-  */
- #define CAMELLIA_TABLE_BYTE_LEN 272
- #define key_table 0
- #define key_length CAMELLIA_TABLE_BYTE_LEN
- /*
-  * Register roles. The suffixes name views of the same register:
-  * none = 64-bit, d = low 32 bits, bl = lowest byte, bh = second-lowest byte.
-  */
- /* Context pointer; stays in the first-argument register throughout. */
- #define CTX %rdi
- /*
-  * Current input/output pointer. It shares %rsi with scratch RT0, so it is
-  * only valid until the first round macro runs; the functions reload it
-  * from RDST before writing the result.
-  */
- #define RIO %rsi
- #define RIOd %esi
- /* Block state, lane 0: AB half in %rax, CD half in %rcx. */
- #define RAB0 %rax
- #define RAB0d %eax
- #define RAB0bl %al
- #define RAB0bh %ah
- #define RCD0 %rcx
- #define RCD0d %ecx
- #define RCD0bl %cl
- #define RCD0bh %ch
- /* Block state, lane 1 (2-way code only): AB half in %rbx, CD half in %rdx. */
- #define RAB1 %rbx
- #define RAB1d %ebx
- #define RAB1bl %bl
- #define RAB1bh %bh
- #define RCD1 %rdx
- #define RCD1d %edx
- #define RCD1bl %dl
- #define RCD1bh %dh
- /*
-  * Scratch registers used inside the round and fls macros. RT1 is %rbp,
-  * which is why every function first copies %rbp into RRBP.
-  */
- #define RT0 %rsi
- #define RT0d %esi
- #define RT1 %rbp
- #define RT1d %ebp
- #define RT2 %r8
- #define RT2d %r8d
- #define RT2bl %r8b
- /* Values parked for the duration of a function. */
- #define RXOR %r9
- #define RXORd %r9d
- #define RXORbl %r9b
- #define RRBP %r10
- #define RDST %r11
- /*
-  * xor2ror16 - fold two table lookups into an accumulator.
-  *
-  *   acc ^= tab_lo[lowest byte of src] ^ tab_hi[second-lowest byte of src]
-  *
-  * Both tables are read as 64-bit entries (index scaled by 8). src is
-  * rotated right by 16 bits afterwards, so a following invocation consumes
-  * the next two bytes. idx_hi and idx_lo are overwritten with the
-  * zero-extended byte indices.
-  */
- #define xor2ror16(tab_lo, tab_hi, idx_hi, idx_lo, src, acc) \
-     movzbl src ## bl, idx_lo ## d; \
-     movzbl src ## bh, idx_hi ## d; \
-     rorq $16, src; \
-     xorq tab_lo(, idx_lo, 8), acc; \
-     xorq tab_hi(, idx_hi, 8), acc;
- /**********************************************************************
- 1-way camellia
- **********************************************************************/
- /*
-  * roundsm - one round on lane 0.
-  *
-  * Loads the 64-bit key_table entry number `skey` into RT2, then feeds all
-  * eight bytes of src0 through xor2ror16(). Half of the lookups accumulate
-  * directly into dst0, the other half into RT2, which is finally XORed into
-  * dst0. The four 16-bit rotations add up to 64, so src0 leaves the macro
-  * with its original value. Uses RT0, RT1 and RT2 as scratch.
-  */
- #define roundsm(src, skey, dst) \
-     movq (key_table + (skey) * 8)(CTX), RT2; \
-     /* bytes 0..7 of src0, two per step */ \
-     xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 0, dst ## 0); \
-     xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 0, RT2); \
-     xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 0, dst ## 0); \
-     xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 0, RT2); \
-     /* merge the key-seeded accumulator */ \
-     xorq RT2, dst ## 0;
- /*
-  * fls - key-dependent mixing of both halves of lane 0 (presumably the
-  * FL / FL^-1 layer of Camellia; confirm against the specification).
-  *
-  * With K[n] the 64-bit key_table entry n and lo()/hi() the low/high
-  * 32 bits of a 64-bit value, the steps are, in this order:
-  *   hi(lhs0) ^= rol32(lo(lhs0) & lo(K[kl_idx]), 1)
-  *   lo(rhs0) ^= hi(rhs0) | hi(K[kr_idx])
-  *   lo(lhs0) ^= hi(lhs0) | hi(K[kl_idx])
-  *   hi(rhs0) ^= rol32(lo(rhs0) & lo(K[kr_idx]), 1)
-  * Each step sees the result of the earlier ones. Uses RT0, RT1 and RT2
-  * as scratch.
-  */
- #define fls(lhs, rhs, kl_idx, kr_idx) \
-     movl (key_table + (kl_idx) * 8)(CTX), RT0d; \
-     andl lhs ## 0d, RT0d; \
-     roll $1, RT0d; \
-     shlq $32, RT0; \
-     xorq RT0, lhs ## 0; \
-     movq (key_table + (kr_idx) * 8)(CTX), RT1; \
-     orq rhs ## 0, RT1; \
-     shrq $32, RT1; \
-     xorq RT1, rhs ## 0; \
-     /* second pair of steps, on the updated halves */ \
-     movq (key_table + (kl_idx) * 8)(CTX), RT2; \
-     orq lhs ## 0, RT2; \
-     shrq $32, RT2; \
-     xorq RT2, lhs ## 0; \
-     movl (key_table + (kr_idx) * 8)(CTX), RT0d; \
-     andl rhs ## 0d, RT0d; \
-     roll $1, RT0d; \
-     shlq $32, RT0; \
-     xorq RT0, rhs ## 0;
- /*
-  * enc_rounds - six consecutive rounds for encryption, using key_table
-  * entries base+2 .. base+7 in ascending order. The AB and CD halves swap
-  * source/destination roles on every round.
-  */
- #define enc_rounds(base) \
-     roundsm(RAB, base + 2, RCD); \
-     roundsm(RCD, base + 3, RAB); \
-     roundsm(RAB, base + 4, RCD); \
-     roundsm(RCD, base + 5, RAB); \
-     roundsm(RAB, base + 6, RCD); \
-     roundsm(RCD, base + 7, RAB);
- /*
-  * enc_fls - fls step for encryption: key_table entry `base` is applied to
-  * the AB half and entry base+1 to the CD half.
-  */
- #define enc_fls(base) \
-     fls(RAB, RCD, base + 0, base + 1);
- /*
-  * enc_inpack - load one 16-byte block from RIO into lane 0.
-  *
-  * Each 8-byte half is byte-swapped and has its 32-bit words exchanged
-  * (rotation by 32). key_table entry 0 is then XORed into the AB half.
-  */
- #define enc_inpack() \
-     /* bytes 0..7 -> AB */ \
-     movq (RIO), RAB0; \
-     bswapq RAB0; \
-     rolq $32, RAB0; \
-     /* bytes 8..15 -> CD */ \
-     movq 8(RIO), RCD0; \
-     bswapq RCD0; \
-     rorq $32, RCD0; \
-     /* first key entry */ \
-     xorq key_table(CTX), RAB0;
- /*
-  * enc_outunpack - write lane 0 back to the 16 bytes at RIO.
-  *
-  * insn: instruction stem, `mov` to store or `xor` to XOR into memory
-  *       (the q suffix is appended here).
-  * last: register holding the index of the final key_table entry, which is
-  *       XORed into the CD half.
-  * The halves leave in swapped order: CD goes to bytes 0..7 and AB to
-  * bytes 8..15, each after a 32-bit word exchange and a byte swap.
-  */
- #define enc_outunpack(insn, last) \
-     xorq key_table(CTX, last, 8), RCD0; \
-     rorq $32, RCD0; \
-     bswapq RCD0; \
-     insn ## q RCD0, (RIO); \
-     /* second half */ \
-     rolq $32, RAB0; \
-     bswapq RAB0; \
-     insn ## q RAB0, 8(RIO);
- /*
-  * dec_rounds - six consecutive rounds for decryption, using key_table
-  * entries base+7 down to base+2, i.e. the reverse of enc_rounds().
-  */
- #define dec_rounds(base) \
-     roundsm(RAB, base + 7, RCD); \
-     roundsm(RCD, base + 6, RAB); \
-     roundsm(RAB, base + 5, RCD); \
-     roundsm(RCD, base + 4, RAB); \
-     roundsm(RAB, base + 3, RCD); \
-     roundsm(RCD, base + 2, RAB);
- /*
-  * dec_fls - fls step for decryption: the key indices are exchanged
-  * relative to enc_fls(), so entry base+1 goes with AB and entry `base`
-  * with CD.
-  */
- #define dec_fls(base) \
-     fls(RAB, RCD, base + 1, base + 0);
- /*
-  * dec_inpack - load one 16-byte block from RIO into lane 0 for decryption.
-  *
-  * Same byte swap and 32-bit word exchange as enc_inpack(), but the AB half
-  * is XORed with the key_table entry whose index is held in register
-  * `last` rather than with entry 0.
-  */
- #define dec_inpack(last) \
-     /* bytes 0..7 -> AB */ \
-     movq (RIO), RAB0; \
-     bswapq RAB0; \
-     rolq $32, RAB0; \
-     /* bytes 8..15 -> CD */ \
-     movq 8(RIO), RCD0; \
-     bswapq RCD0; \
-     rorq $32, RCD0; \
-     /* final key entry comes first when decrypting */ \
-     xorq key_table(CTX, last, 8), RAB0;
- /*
-  * dec_outunpack - store lane 0 to the 16 bytes at RIO after decryption.
-  *
-  * key_table entry 0 is XORed into the CD half. CD is written to bytes
-  * 0..7 and AB to bytes 8..15, each after a 32-bit word exchange and a
-  * byte swap. Always stores; there is no XOR-into-memory variant.
-  */
- #define dec_outunpack() \
-     xorq key_table(CTX), RCD0; \
-     rorq $32, RCD0; \
-     bswapq RCD0; \
-     movq RCD0, (RIO); \
-     /* second half */ \
-     rolq $32, RAB0; \
-     bswapq RAB0; \
-     movq RAB0, 8(RIO);
- /*
-  * __camellia_enc_blk - encrypt one 16-byte block.
-  *
-  * input:
-  *   %rdi: ctx, CTX
-  *   %rsi: dst
-  *   %rdx: src
-  *   %rcx: bool xor; only the low byte is tested. Zero stores the result
-  *         to dst, non-zero XORs the result into dst.
-  *
-  * A key_length of 16 runs three groups of rounds and finishes with
-  * key_table entry 24; any other key_length runs a fourth group and
-  * finishes with entry 32.
-  *
-  * %rbp is used as scratch (RT1) by the round macros, so it is kept in
-  * RRBP for the duration and restored before each ret. No stack is used.
-  */
- .global __camellia_enc_blk;
- .type __camellia_enc_blk,@function;
- __camellia_enc_blk:
-     movq %rbp, RRBP;
-     /* %rcx and %rsi are reused as RCD0 and RT0: move the arguments away. */
-     movq %rcx, RXOR;
-     movq %rsi, RDST;
-     movq %rdx, RIO;
-     enc_inpack();
-     enc_rounds(0);
-     enc_fls(8);
-     enc_rounds(8);
-     enc_fls(16);
-     enc_rounds(16);
-     movl $24, RT1d; /* max */
-     cmpb $16, key_length(CTX);
-     je .L__enc_done;
-     enc_fls(24);
-     enc_rounds(24);
-     movl $32, RT1d; /* max */
- .L__enc_done:
-     testb RXORbl, RXORbl;
-     /* RIO was clobbered as RT0; movq leaves the flags from testb intact. */
-     movq RDST, RIO;
-     jnz .L__enc_xor;
-     enc_outunpack(mov, RT1);
-     movq RRBP, %rbp;
-     ret;
- .L__enc_xor:
-     enc_outunpack(xor, RT1);
-     movq RRBP, %rbp;
-     ret;
- .size __camellia_enc_blk, .-__camellia_enc_blk;
- /*
-  * camellia_dec_blk - decrypt one 16-byte block.
-  *
-  * input:
-  *   %rdi: ctx, CTX
-  *   %rsi: dst
-  *   %rdx: src
-  *
-  * RT2 holds the index of the last key_table entry: 24 when key_length
-  * is 16, otherwise 32. With 24 the first group of rounds is skipped.
-  *
-  * %rbp is used as scratch (RT1) by the round macros, so it is kept in
-  * RRBP for the duration and restored before ret. No stack is used.
-  */
- .global camellia_dec_blk;
- .type camellia_dec_blk,@function;
- camellia_dec_blk:
-     cmpl $16, key_length(CTX);
-     movl $32, RT2d;
-     /* cmov needs a register source; RXORd is free here and holds the 24. */
-     movl $24, RXORd;
-     cmovel RXORd, RT2d; /* max */
-     movq %rbp, RRBP;
-     movq %rsi, RDST;
-     movq %rdx, RIO;
-     dec_inpack(RT2);
-     /* Must be tested now: the round macros overwrite RT2. */
-     cmpb $24, RT2bl;
-     je .L__dec_rounds16;
-     dec_rounds(24);
-     dec_fls(24);
- .L__dec_rounds16:
-     dec_rounds(16);
-     dec_fls(16);
-     dec_rounds(8);
-     dec_fls(8);
-     dec_rounds(0);
-     /* RIO was clobbered as RT0; point it at the destination. */
-     movq RDST, RIO;
-     dec_outunpack();
-     movq RRBP, %rbp;
-     ret;
- .size camellia_dec_blk, .-camellia_dec_blk;
- /**********************************************************************
- 2-way camellia
- **********************************************************************/
- /*
-  * roundsm2 - one round on both lanes with the same key_table entry.
-  *
-  * Lane 0 is processed as in roundsm(): half of the lookups go straight
-  * into dst0, the other half into RT2 (seeded with the key entry), and RT2
-  * is XORed into dst0 once lane 0's lookups are done. Lane 1 gets the key
-  * entry XORed into dst1 up front and accumulates all of its lookups
-  * directly into dst1. Both src0 and src1 are rotated by 64 bits in total
-  * and therefore keep their values. Uses RT0, RT1 and RT2 as scratch.
-  */
- #define roundsm2(src, skey, dst) \
-     movq (key_table + (skey) * 8)(CTX), RT2; \
-     xorq RT2, dst ## 1; \
-     /* lane 0 */ \
-     xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 0, dst ## 0); \
-     xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 0, RT2); \
-     xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 0, dst ## 0); \
-     xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 0, RT2); \
-     /* lane 1, with lane 0's merge slotted in after the first step */ \
-     xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 1, dst ## 1); \
-     xorq RT2, dst ## 0; \
-     xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 1, dst ## 1); \
-     xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 1, dst ## 1); \
-     xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 1, dst ## 1);
- /*
-  * fls2 - the fls() mixing applied to both lanes.
-  *
-  * With K[n] the 64-bit key_table entry n and lo()/hi() the low/high
-  * 32 bits, each lane n (0 and 1) goes through
-  *   hi(lhs_n) ^= rol32(lo(lhs_n) & lo(K[kl_idx]), 1)
-  *   lo(rhs_n) ^= hi(rhs_n) | hi(K[kr_idx])
-  *   lo(lhs_n) ^= hi(lhs_n) | hi(K[kl_idx])
-  *   hi(rhs_n) ^= rol32(lo(rhs_n) & lo(K[kr_idx]), 1)
-  * The first two steps are done for lane 0, then lane 1, followed by the
-  * last two steps for lane 0, then lane 1. The scratch registers RT0, RT1
-  * and RT2 are rotated between the groups.
-  */
- #define fls2(lhs, rhs, kl_idx, kr_idx) \
-     /* lane 0, steps 1 and 2 */ \
-     movl (key_table + (kl_idx) * 8)(CTX), RT0d; \
-     andl lhs ## 0d, RT0d; \
-     roll $1, RT0d; \
-     shlq $32, RT0; \
-     xorq RT0, lhs ## 0; \
-     movq (key_table + (kr_idx) * 8)(CTX), RT1; \
-     orq rhs ## 0, RT1; \
-     shrq $32, RT1; \
-     xorq RT1, rhs ## 0; \
-     /* lane 1, steps 1 and 2 */ \
-     movl (key_table + (kl_idx) * 8)(CTX), RT2d; \
-     andl lhs ## 1d, RT2d; \
-     roll $1, RT2d; \
-     shlq $32, RT2; \
-     xorq RT2, lhs ## 1; \
-     movq (key_table + (kr_idx) * 8)(CTX), RT0; \
-     orq rhs ## 1, RT0; \
-     shrq $32, RT0; \
-     xorq RT0, rhs ## 1; \
-     /* lane 0, steps 3 and 4 */ \
-     movq (key_table + (kl_idx) * 8)(CTX), RT1; \
-     orq lhs ## 0, RT1; \
-     shrq $32, RT1; \
-     xorq RT1, lhs ## 0; \
-     movl (key_table + (kr_idx) * 8)(CTX), RT2d; \
-     andl rhs ## 0d, RT2d; \
-     roll $1, RT2d; \
-     shlq $32, RT2; \
-     xorq RT2, rhs ## 0; \
-     /* lane 1, steps 3 and 4 */ \
-     movq (key_table + (kl_idx) * 8)(CTX), RT0; \
-     orq lhs ## 1, RT0; \
-     shrq $32, RT0; \
-     xorq RT0, lhs ## 1; \
-     movl (key_table + (kr_idx) * 8)(CTX), RT1d; \
-     andl rhs ## 1d, RT1d; \
-     roll $1, RT1d; \
-     shlq $32, RT1; \
-     xorq RT1, rhs ## 1;
- /*
-  * enc_rounds2 - six consecutive two-lane rounds for encryption, using
-  * key_table entries base+2 .. base+7 in ascending order.
-  */
- #define enc_rounds2(base) \
-     roundsm2(RAB, base + 2, RCD); \
-     roundsm2(RCD, base + 3, RAB); \
-     roundsm2(RAB, base + 4, RCD); \
-     roundsm2(RCD, base + 5, RAB); \
-     roundsm2(RAB, base + 6, RCD); \
-     roundsm2(RCD, base + 7, RAB);
- /*
-  * enc_fls2 - two-lane fls step for encryption: key_table entry `base` is
-  * applied to the AB halves and entry base+1 to the CD halves.
-  */
- #define enc_fls2(base) \
-     fls2(RAB, RCD, base + 0, base + 1);
- /*
-  * enc_inpack2 - load two consecutive 16-byte blocks from RIO.
-  *
-  * Bytes 0..15 fill lane 0 and bytes 16..31 fill lane 1. Every 8-byte half
-  * is byte-swapped and has its 32-bit words exchanged; key_table entry 0
-  * is XORed into the AB half of each lane.
-  */
- #define enc_inpack2() \
-     /* lane 0 */ \
-     movq (RIO), RAB0; \
-     bswapq RAB0; \
-     rorq $32, RAB0; \
-     movq 8(RIO), RCD0; \
-     bswapq RCD0; \
-     rolq $32, RCD0; \
-     xorq key_table(CTX), RAB0; \
-     /* lane 1 */ \
-     movq 16(RIO), RAB1; \
-     bswapq RAB1; \
-     rorq $32, RAB1; \
-     movq 24(RIO), RCD1; \
-     bswapq RCD1; \
-     rolq $32, RCD1; \
-     xorq key_table(CTX), RAB1;
- /*
-  * enc_outunpack2 - write both lanes back to the 32 bytes at RIO.
-  *
-  * insn: instruction stem, `mov` to store or `xor` to XOR into memory
-  *       (the q suffix is appended here).
-  * last: register holding the index of the final key_table entry, which is
-  *       XORed into the CD half of each lane.
-  * Within each lane CD is written before AB: lane 0 covers bytes 0..15,
-  * lane 1 bytes 16..31.
-  */
- #define enc_outunpack2(insn, last) \
-     /* lane 0 */ \
-     xorq key_table(CTX, last, 8), RCD0; \
-     rolq $32, RCD0; \
-     bswapq RCD0; \
-     insn ## q RCD0, (RIO); \
-     rorq $32, RAB0; \
-     bswapq RAB0; \
-     insn ## q RAB0, 8(RIO); \
-     /* lane 1 */ \
-     xorq key_table(CTX, last, 8), RCD1; \
-     rolq $32, RCD1; \
-     bswapq RCD1; \
-     insn ## q RCD1, 16(RIO); \
-     rorq $32, RAB1; \
-     bswapq RAB1; \
-     insn ## q RAB1, 24(RIO);
- /*
-  * dec_rounds2 - six consecutive two-lane rounds for decryption, using
-  * key_table entries base+7 down to base+2.
-  */
- #define dec_rounds2(base) \
-     roundsm2(RAB, base + 7, RCD); \
-     roundsm2(RCD, base + 6, RAB); \
-     roundsm2(RAB, base + 5, RCD); \
-     roundsm2(RCD, base + 4, RAB); \
-     roundsm2(RAB, base + 3, RCD); \
-     roundsm2(RCD, base + 2, RAB);
- /*
-  * dec_fls2 - two-lane fls step for decryption: the key indices are
-  * exchanged relative to enc_fls2(), so entry base+1 goes with AB and
-  * entry `base` with CD.
-  */
- #define dec_fls2(base) \
-     fls2(RAB, RCD, base + 1, base + 0);
- /*
-  * dec_inpack2 - load two consecutive 16-byte blocks from RIO for
-  * decryption.
-  *
-  * Same layout handling as enc_inpack2(), but the AB half of each lane is
-  * XORed with the key_table entry whose index is held in register `last`.
-  */
- #define dec_inpack2(last) \
-     /* lane 0 */ \
-     movq (RIO), RAB0; \
-     bswapq RAB0; \
-     rorq $32, RAB0; \
-     movq 8(RIO), RCD0; \
-     bswapq RCD0; \
-     rolq $32, RCD0; \
-     xorq key_table(CTX, last, 8), RAB0; \
-     /* lane 1 */ \
-     movq 16(RIO), RAB1; \
-     bswapq RAB1; \
-     rorq $32, RAB1; \
-     movq 24(RIO), RCD1; \
-     bswapq RCD1; \
-     rolq $32, RCD1; \
-     xorq key_table(CTX, last, 8), RAB1;
- /*
-  * dec_outunpack2 - store both lanes to the 32 bytes at RIO after
-  * decryption.
-  *
-  * key_table entry 0 is XORed into the CD half of each lane. Within each
-  * lane CD is written before AB: lane 0 covers bytes 0..15, lane 1 bytes
-  * 16..31. Always stores; there is no XOR-into-memory variant.
-  */
- #define dec_outunpack2() \
-     /* lane 0 */ \
-     xorq key_table(CTX), RCD0; \
-     rolq $32, RCD0; \
-     bswapq RCD0; \
-     movq RCD0, (RIO); \
-     rorq $32, RAB0; \
-     bswapq RAB0; \
-     movq RAB0, 8(RIO); \
-     /* lane 1 */ \
-     xorq key_table(CTX), RCD1; \
-     rolq $32, RCD1; \
-     bswapq RCD1; \
-     movq RCD1, 16(RIO); \
-     rorq $32, RAB1; \
-     bswapq RAB1; \
-     movq RAB1, 24(RIO);
- /*
-  * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks.
-  *
-  * input:
-  *   %rdi: ctx, CTX
-  *   %rsi: dst (32 bytes are written)
-  *   %rdx: src (32 bytes are read)
-  *   %rcx: bool xor; only the low byte is tested. Zero stores the result
-  *         to dst, non-zero XORs the result into dst.
-  *
-  * A key_length of 16 runs three groups of rounds and finishes with
-  * key_table entry 24; any other key_length runs a fourth group and
-  * finishes with entry 32.
-  *
-  * %rbx carries lane 1 (RAB1) and is saved on the stack. %rbp is used as
-  * scratch (RT1) and is kept in RRBP. Both are restored before each ret.
-  */
- .global __camellia_enc_blk_2way;
- .type __camellia_enc_blk_2way,@function;
- __camellia_enc_blk_2way:
-     pushq %rbx;
-     movq %rbp, RRBP;
-     /* %rcx, %rdx and %rsi are reused as RCD0, RCD1 and RT0. */
-     movq %rcx, RXOR;
-     movq %rsi, RDST;
-     movq %rdx, RIO;
-     enc_inpack2();
-     enc_rounds2(0);
-     enc_fls2(8);
-     enc_rounds2(8);
-     enc_fls2(16);
-     enc_rounds2(16);
-     movl $24, RT2d; /* max */
-     cmpb $16, key_length(CTX);
-     je .L__enc2_done;
-     enc_fls2(24);
-     enc_rounds2(24);
-     movl $32, RT2d; /* max */
- .L__enc2_done:
-     testb RXORbl, RXORbl;
-     /* RIO was clobbered as RT0; movq leaves the flags from testb intact. */
-     movq RDST, RIO;
-     jnz .L__enc2_xor;
-     enc_outunpack2(mov, RT2);
-     movq RRBP, %rbp;
-     popq %rbx;
-     ret;
- .L__enc2_xor:
-     enc_outunpack2(xor, RT2);
-     movq RRBP, %rbp;
-     popq %rbx;
-     ret;
- .size __camellia_enc_blk_2way, .-__camellia_enc_blk_2way;
- /*
-  * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks.
-  *
-  * input:
-  *   %rdi: ctx, CTX
-  *   %rsi: dst (32 bytes are written)
-  *   %rdx: src (32 bytes are read)
-  *
-  * RT2 holds the index of the last key_table entry: 24 when key_length
-  * is 16, otherwise 32. With 24 the first group of rounds is skipped.
-  *
-  * %rbx carries lane 1 (RAB1) and is kept in RXOR, which is free because
-  * this function has no xor argument. %rbp is used as scratch (RT1) and is
-  * kept in RRBP. Both are restored before ret. No stack is used.
-  */
- .global camellia_dec_blk_2way;
- .type camellia_dec_blk_2way,@function;
- camellia_dec_blk_2way:
-     cmpl $16, key_length(CTX);
-     movl $32, RT2d;
-     /* cmov needs a register source; RXORd holds the 24 until it is reused below. */
-     movl $24, RXORd;
-     cmovel RXORd, RT2d; /* max */
-     movq %rbx, RXOR;
-     movq %rbp, RRBP;
-     movq %rsi, RDST;
-     movq %rdx, RIO;
-     dec_inpack2(RT2);
-     /* Must be tested now: the round macros overwrite RT2. */
-     cmpb $24, RT2bl;
-     je .L__dec2_rounds16;
-     dec_rounds2(24);
-     dec_fls2(24);
- .L__dec2_rounds16:
-     dec_rounds2(16);
-     dec_fls2(16);
-     dec_rounds2(8);
-     dec_fls2(8);
-     dec_rounds2(0);
-     /* RIO was clobbered as RT0; point it at the destination. */
-     movq RDST, RIO;
-     dec_outunpack2();
-     movq RRBP, %rbp;
-     movq RXOR, %rbx;
-     ret;
- .size camellia_dec_blk_2way, .-camellia_dec_blk_2way;
|