123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317 |
- /*
- * Twofish Cipher 3-way parallel algorithm (x86_64)
- *
- * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
- */
- .file "twofish-x86_64-asm-3way.S"
- .text
- /* structure of crypto context
- *
- * Byte offsets, relative to CTX, of the context fields this file reads.
- * Each s-table is indexed by one byte scaled by 4 (see do16bit_ror) and
- * the tables sit 1024 bytes apart, i.e. 256 entries of 4 bytes each.
- * w and k are addressed in 4-byte steps (w+4*m, k+4*(2*(n))); w spans
- * 32 bytes before k starts.
- * NOTE(review): presumably these mirror the layout of the C-side twofish
- * context structure - confirm against its definition.
- */
- #define s0 0 /* lookup table 0 */
- #define s1 1024 /* lookup table 1 */
- #define s2 2048 /* lookup table 2 */
- #define s3 3072 /* lookup table 3 */
- #define w 4096 /* 32-bit words xored into the data by inpack3/outunpack3 */
- #define k 4128 /* 32-bit words added by enc_round_end/dec_round_end */
- /**********************************************************************
- 3-way twofish
- Register aliases. Three 16-byte blocks are processed side by side; the
- numeric suffix 0/1/2 of an alias selects the block it belongs to.
- RABn and RCDn each hold 8 bytes (two 32-bit words) of block n, RXn and
- RYn receive the table lookup results for block n, RT0/RT1 are scratch.
- The d, bl and bh aliases are the 32-bit, lowest-byte and second-byte
- views of the same registers; the macros below reach them by pasting the
- suffix onto the 64-bit alias name, so macro arguments must be alias
- names (RAB0, RX0, ...) rather than raw register names.
- **********************************************************************/
- #define CTX %rdi /* context pointer, never modified in this file */
- #define RIO %rdx /* src on entry, reloaded with dst before output; same register as RT0 */
- #define RAB0 %rax
- #define RAB1 %rbx
- #define RAB2 %rcx
- #define RAB0d %eax
- #define RAB1d %ebx
- #define RAB2d %ecx
- #define RAB0bh %ah /* bh/bl aliases are defined for RAB only, so only RAB can feed do16bit_ror */
- #define RAB1bh %bh
- #define RAB2bh %ch
- #define RAB0bl %al
- #define RAB1bl %bl
- #define RAB2bl %cl
- #define RCD0 %r8
- #define RCD1 %r9
- #define RCD2 %r10
- #define RCD0d %r8d
- #define RCD1d %r9d
- #define RCD2d %r10d
- #define RX0 %rbp /* %rbp is used as a plain working register here */
- #define RX1 %r11
- #define RX2 %r12
- #define RX0d %ebp
- #define RX1d %r11d
- #define RX2d %r12d
- #define RY0 %r13
- #define RY1 %r14
- #define RY2 %r15
- #define RY0d %r13d
- #define RY1d %r14d
- #define RY2d %r15d
- #define RT0 %rdx /* shares %rdx with RIO: the I/O pointer does not survive the rounds */
- #define RT1 %rsi /* %rsi carries dst on entry; the entry points push it before the rounds */
- #define RT0d %edx
- #define RT1d %esi
- #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) /*
- * Look up the two lowest bytes of ab in the tables T0 and T1, combine the
- * two 32-bit entries into dst, and rotate ab right by rot bits.
- *
- * rot: rotate count applied to the 64-bit register ab
- * op1, op2: mnemonic stems (mov or xor at the call sites); an l suffix is
- *   appended. op1 applies the T0 entry to dst, op2 the T1 entry
- * T0, T1: table offsets from CTX, indexed by a byte scaled by 4
- * tmp1, tmp2: alias names of 64-bit scratch registers, clobbered. tmp2
- *   may name the same register as dst: it is consumed as an index by the
- *   very instruction that first writes dst
- * ab: alias name of a source register that has bl/bh aliases
- * dst: alias name of the destination; only its 32-bit d view is written
- *
- * Both bytes are extracted before the rotate, so the lookups always use
- * the value ab had on entry.
- */ \
- movzbl ab ## bl, tmp2 ## d; /* tmp2 = bits 0..7 of ab, zero-extended */ \
- movzbl ab ## bh, tmp1 ## d; /* tmp1 = bits 8..15 of ab, zero-extended */ \
- rorq $(rot), ab; /* expose other bytes of ab for the next invocation */ \
- op1##l T0(CTX, tmp2, 4), dst ## d; /* apply 32-bit entry T0[tmp2] to dst with op1 */ \
- op2##l T1(CTX, tmp1, 4), dst ## d; /* apply 32-bit entry T1[tmp1] to dst with op2 */
- /*
- * Combined G1 & G2 function. Reordered with help of rotates to have moves
- * at beginning.
- *
- * For every block n (0..2) this computes
- *   x##n = Tx0[l0] ^ Tx1[l1] ^ Tx2[l2] ^ Tx3[l3]
- *   y##n = Ty1[h0] ^ Ty2[h1] ^ Ty3[h2] ^ Ty0[h3]
- * where l0..l3 and h0..h3 are the bytes, least significant first, of the
- * low and the high 32-bit half of ab##n.
- *
- * The rotate counts (32 + 48 + 32 + 16) add up to 128, a multiple of 64,
- * so ab##n has its entry value again when it is exchanged with cd##n at
- * the end. After the macro the name ab therefore refers to the former cd
- * contents and cd to the former ab contents.
- *
- * The first pass starts x and y with mov, so they need no initialisation.
- * RT0 and RT1 are clobbered.
- */
- #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
- /* G1,1 && G2,1: bytes 0 and 1 of each half; x and y double as index scratch */ \
- do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
- do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
- \
- do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
- do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
- \
- do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
- do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
- \
- /* G1,2 && G2,2: bytes 2 and 3 of each half, xored into x and y */ \
- do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
- do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
- xchgq cd ## 0, ab ## 0; /* swap the halves of block 0 between ab and cd */ \
- \
- do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
- do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
- xchgq cd ## 1, ab ## 1; /* same for block 1 */ \
- \
- do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
- do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
- xchgq cd ## 2, ab ## 2; /* same for block 2 */
- #define enc_round_end(ab, x, y, n) /*
- * Finish encryption round n for one block.
- *
- * ab: alias name of the 64-bit register to update in place
- * x, y: alias names of the registers holding this block's g1g2_3
- *   results; both are clobbered
- * n: round number; selects the 32-bit words at k+4*(2n) and k+4*(2n+1),
- *   written K0 and K1 below
- *
- * In terms of the values on entry (32-bit arithmetic):
- *   low(ab)  = ror32(low(ab) ^ (x + y + K0), 1)
- *   high(ab) = rol32(high(ab), 1) ^ (x + 2*y + K1)
- */ \
- addl y ## d, x ## d; /* x = x + y */ \
- addl x ## d, y ## d; /* y = x + 2*y of the entry values */ \
- addl k+4*(2*(n))(CTX), x ## d; \
- xorl ab ## d, x ## d; /* fold the low half of ab into x */ \
- addl k+4*(2*(n)+1)(CTX), y ## d; \
- shrq $32, ab; /* low half is now in x; bring the high half down */ \
- roll $1, ab ## d; \
- xorl y ## d, ab ## d; \
- shlq $32, ab; /* put the new high half back, low half becomes zero */ \
- rorl $1, x ## d; /* 32-bit write also clears bits 32..63 of x */ \
- orq x, ab; /* merge the new low half */
- #define dec_round_end(ba, x, y, n) /*
- * Finish decryption round n for one block.
- *
- * ba: alias name of the 64-bit register to update in place
- * x, y: alias names of the registers holding this block's g1g2_3
- *   results; both are clobbered
- * n: round number; selects the 32-bit words at k+4*(2n) and k+4*(2n+1),
- *   written K0 and K1 below
- *
- * In terms of the values on entry (32-bit arithmetic):
- *   low(ba)  = ror32(low(ba) ^ (x + 2*y + K1), 1)
- *   high(ba) = rol32(high(ba), 1) ^ (x + y + K0)
- */ \
- addl y ## d, x ## d; /* x = x + y */ \
- addl x ## d, y ## d; /* y = x + 2*y of the entry values */ \
- addl k+4*(2*(n))(CTX), x ## d; \
- addl k+4*(2*(n)+1)(CTX), y ## d; \
- xorl ba ## d, y ## d; /* fold the low half of ba into y */ \
- shrq $32, ba; /* low half is now in y; bring the high half down */ \
- roll $1, ba ## d; \
- xorl x ## d, ba ## d; \
- shlq $32, ba; /* put the new high half back, low half becomes zero */ \
- rorl $1, y ## d; /* 32-bit write also clears bits 32..63 of y */ \
- orq y, ba; /* merge the new low half */
- #define encrypt_round3(ab, cd, n) /*
- * Encryption round n on all three blocks.
- *
- * ab, cd: alias prefixes (RAB/RCD) without the block number
- * n: round number, passed on to enc_round_end
- *
- * g1g2_3 fills RX from the low half of ab using s0,s1,s2,s3 for bytes
- * 0..3 and RY from the high half using s1,s2,s3,s0, then exchanges ab
- * and cd. The enc_round_end calls below therefore update the register
- * contents that were in cd when this macro was entered.
- */ \
- g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
- \
- enc_round_end(ab ## 0, RX0, RY0, n); \
- enc_round_end(ab ## 1, RX1, RY1, n); \
- enc_round_end(ab ## 2, RX2, RY2, n);
- #define decrypt_round3(ba, dc, n) /*
- * Decryption round n on all three blocks.
- *
- * ba, dc: alias prefixes (RAB/RCD) without the block number
- * n: round number, passed on to dec_round_end
- *
- * RY and RX are handed to g1g2_3 in swapped positions: RY is filled from
- * the low half of ba using s1,s2,s3,s0 for bytes 0..3 and RX from the
- * high half using s0,s1,s2,s3. g1g2_3 then exchanges ba and dc, so the
- * dec_round_end calls below update the register contents that were in
- * dc when this macro was entered.
- */ \
- g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
- \
- dec_round_end(ba ## 0, RX0, RY0, n); \
- dec_round_end(ba ## 1, RX1, RY1, n); \
- dec_round_end(ba ## 2, RX2, RY2, n);
- #define encrypt_cycle3(ab, cd, n) /*
- * Two consecutive encryption rounds, 2n and 2n+1, on all three blocks.
- * Each round exchanges ab and cd once, so after the pair the register
- * assignment is the same as on entry.
- * n is expanded without parentheses; pass a plain number.
- */ \
- encrypt_round3(ab, cd, n*2); \
- encrypt_round3(ab, cd, (n*2)+1);
- #define decrypt_cycle3(ba, dc, n) /*
- * Two consecutive decryption rounds on all three blocks, in descending
- * order: round 2n+1 first, then round 2n. Each round exchanges ba and dc
- * once, so after the pair the register assignment is the same as on entry.
- * n is expanded without parentheses; pass a plain number.
- */ \
- decrypt_round3(ba, dc, (n*2)+1); \
- decrypt_round3(ba, dc, (n*2));
- #define inpack3(in, n, xy, m) /*
- * Load 8 bytes from each of three consecutive 16-byte blocks and xor
- * them with 8 bytes of the w area.
- *
- * in: register holding the input pointer
- * n: offset inside each block, in 32-bit words (byte offset 4*n)
- * xy: alias prefix; xy0, xy1, xy2 receive blocks 0, 1, 2
- * m: offset into the w area in 32-bit words; expanded without
- *   parentheses, so pass a plain number
- */ \
- movq 4*(n)(in), xy ## 0; /* block 0 starts at byte 0 */ \
- xorq w+4*m(CTX), xy ## 0; \
- \
- movq 4*(4+(n))(in), xy ## 1; /* block 1 starts at byte 16 */ \
- xorq w+4*m(CTX), xy ## 1; \
- \
- movq 4*(8+(n))(in), xy ## 2; /* block 2 starts at byte 32 */ \
- xorq w+4*m(CTX), xy ## 2;
- #define outunpack3(op, out, n, xy, m) /*
- * Xor xy0..xy2 with 8 bytes of the w area and write them to three
- * consecutive 16-byte blocks. The registers are modified in place.
- *
- * op: mnemonic stem, a q suffix is appended. With mov the result is
- *   stored; with xor it is xored into the memory already at out
- * out: register holding the output pointer
- * n: offset inside each block, in 32-bit words (byte offset 4*n)
- * xy: alias prefix; xy0, xy1, xy2 belong to blocks 0, 1, 2
- * m: offset into the w area in 32-bit words; expanded without
- *   parentheses, so pass a plain number
- */ \
- xorq w+4*m(CTX), xy ## 0; \
- op ## q xy ## 0, 4*(n)(out); /* block 0 starts at byte 0 */ \
- \
- xorq w+4*m(CTX), xy ## 1; \
- op ## q xy ## 1, 4*(4+(n))(out); /* block 1 starts at byte 16 */ \
- \
- xorq w+4*m(CTX), xy ## 2; \
- op ## q xy ## 2, 4*(8+(n))(out); /* block 2 starts at byte 32 */
- #define inpack_enc3() /*
- * Load three blocks from RIO for encryption: words 0-1 of each block go
- * to RAB xored with w words 0-1, words 2-3 go to RCD xored with w
- * words 2-3.
- */ \
- inpack3(RIO, 0, RAB, 0); \
- inpack3(RIO, 2, RCD, 2);
- #define outunpack_enc3(op) /*
- * Write three encrypted blocks through RIO. The halves leave in the
- * opposite positions to where inpack_enc3 loaded them: RAB is xored with
- * w words 6-7 and becomes words 2-3 of each block, RCD is xored with w
- * words 4-5 and becomes words 0-1.
- * op: mov to store the result, xor to xor it into the destination.
- */ \
- outunpack3(op, RIO, 2, RAB, 6); \
- outunpack3(op, RIO, 0, RCD, 4);
- #define inpack_dec3() /*
- * Load three blocks from RIO for decryption: words 0-1 of each block go
- * to RAB xored with w words 4-5, words 2-3 go to RCD xored with w
- * words 6-7. Every register is then rotated by 32 bits, which swaps its
- * two 32-bit halves.
- */ \
- inpack3(RIO, 0, RAB, 4); \
- rorq $32, RAB0; \
- rorq $32, RAB1; \
- rorq $32, RAB2; \
- inpack3(RIO, 2, RCD, 6); \
- rorq $32, RCD0; \
- rorq $32, RCD1; \
- rorq $32, RCD2;
- #define outunpack_dec3() /*
- * Store three decrypted blocks through RIO. Each register first has its
- * two 32-bit halves swapped back by a 32-bit rotate. RCD is then xored
- * with w words 0-1 and stored as words 0-1 of each block, RAB is xored
- * with w words 2-3 and stored as words 2-3. The store is always a plain
- * mov; there is no xor-into-destination variant.
- */ \
- rorq $32, RCD0; \
- rorq $32, RCD1; \
- rorq $32, RCD2; \
- outunpack3(mov, RIO, 0, RCD, 0); \
- rorq $32, RAB0; \
- rorq $32, RAB1; \
- rorq $32, RAB2; \
- outunpack3(mov, RIO, 2, RAB, 2);
- .align 8
- .global __twofish_enc_blk_3way
- .type __twofish_enc_blk_3way,@function;
- __twofish_enc_blk_3way:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src, RIO
- * %rcx: bool, if true: xor output
- *
- * Encrypts the three consecutive 16-byte blocks at src (48 bytes). The
- * result is stored to dst, or xored into the 48 bytes already at dst when
- * the bool is non-zero. No return value is set up.
- *
- * Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags. %rbx, %rbp and
- * %r12-%r15 are used as working registers and restored before returning,
- * so %rbp does not act as a frame pointer inside this function.
- */
- pushq %r15;
- pushq %r14;
- pushq %r13;
- pushq %r12;
- pushq %rbp;
- pushq %rbx;
- pushq %rcx; /* bool xor: %rcx is RAB2 during the rounds */
- pushq %rsi; /* dst: %rsi is RT1 during the rounds */
- inpack_enc3();
- encrypt_cycle3(RAB, RCD, 0);
- encrypt_cycle3(RAB, RCD, 1);
- encrypt_cycle3(RAB, RCD, 2);
- encrypt_cycle3(RAB, RCD, 3);
- encrypt_cycle3(RAB, RCD, 4);
- encrypt_cycle3(RAB, RCD, 5);
- encrypt_cycle3(RAB, RCD, 6);
- encrypt_cycle3(RAB, RCD, 7);
- popq RIO; /* dst */
- popq %rbp; /* bool xor; RX0 is no longer needed after the last round */
- testb %bpl, %bpl; /* only the low byte carries the bool */
- jnz .L__enc_xor3;
- /* common case on the fall-through path: plain store */
- outunpack_enc3(mov);
- popq %rbx;
- popq %rbp;
- popq %r12;
- popq %r13;
- popq %r14;
- popq %r15;
- ret;
- /* assembler-local label: keeps the branch target out of the symbol table */
- .L__enc_xor3:
- outunpack_enc3(xor);
- popq %rbx;
- popq %rbp;
- popq %r12;
- popq %r13;
- popq %r14;
- popq %r15;
- ret;
- .size __twofish_enc_blk_3way,.-__twofish_enc_blk_3way
- .align 8
- .global twofish_dec_blk_3way
- .type twofish_dec_blk_3way,@function;
- twofish_dec_blk_3way:
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src, RIO
- *
- * Decrypts the three consecutive 16-byte blocks at src (48 bytes) and
- * stores the result to dst. No return value is set up.
- *
- * Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags. %rbx, %rbp and
- * %r12-%r15 are used as working registers and restored before returning,
- * so %rbp does not act as a frame pointer inside this function.
- */
- pushq %r15;
- pushq %r14;
- pushq %r13;
- pushq %r12;
- pushq %rbp;
- pushq %rbx;
- pushq %rsi; /* dst: %rsi is RT1 during the rounds */
- inpack_dec3();
- /* cycles run in descending order, mirroring the encryption path */
- decrypt_cycle3(RAB, RCD, 7);
- decrypt_cycle3(RAB, RCD, 6);
- decrypt_cycle3(RAB, RCD, 5);
- decrypt_cycle3(RAB, RCD, 4);
- decrypt_cycle3(RAB, RCD, 3);
- decrypt_cycle3(RAB, RCD, 2);
- decrypt_cycle3(RAB, RCD, 1);
- decrypt_cycle3(RAB, RCD, 0);
- popq RIO; /* dst */
- outunpack_dec3();
- popq %rbx;
- popq %rbp;
- popq %r12;
- popq %r13;
- popq %r14;
- popq %r15;
- ret;
- .size twofish_dec_blk_3way,.-twofish_dec_blk_3way
|