twofish-x86_64-asm_64-3way.S 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. /*
  2. * Twofish Cipher 3-way parallel algorithm (x86_64)
  3. *
  4. * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  19. * USA
  20. *
  21. */
  /* Debug/symbol-table file name; kept identical to this file's real name. */
  .file "twofish-x86_64-asm_64-3way.S"

  /* Everything that follows is code. */
  .text
  /*
   * Byte offsets of the fields inside the crypto context addressed by CTX.
   *
   * s0..s3 are lookup tables: they are indexed with a byte value scaled by 4
   * and lie 1024 bytes apart.  w and k are arrays of 32-bit words, addressed
   * as w+4*i and k+4*i.
   */
  #define s0      0
  #define s1      1024
  #define s2      2048
  #define s3      3072
  #define w       4096
  #define k       4128
  31. /**********************************************************************
  32. 3-way twofish
  33. **********************************************************************/
  /*
   * Register allocation.  A trailing 0/1/2 selects one of the three blocks
   * that are processed in parallel; the suffixes d, bl and bh name the
   * 32-bit, low-byte and second-byte views of the same register.
   */

  /* Context pointer and in/out block pointer. */
  #define CTX     %rdi
  #define RIO     %rdx

  /*
   * Working halves of each block.  The RAB registers are the ones that get
   * byte-sliced, so they need registers that have an addressable second
   * byte (%ah, %bh, %ch).
   */
  #define RAB0    %rax
  #define RAB0d   %eax
  #define RAB0bl  %al
  #define RAB0bh  %ah

  #define RAB1    %rbx
  #define RAB1d   %ebx
  #define RAB1bl  %bl
  #define RAB1bh  %bh

  #define RAB2    %rcx
  #define RAB2d   %ecx
  #define RAB2bl  %cl
  #define RAB2bh  %ch

  #define RCD0    %r8
  #define RCD0d   %r8d

  #define RCD1    %r9
  #define RCD1d   %r9d

  #define RCD2    %r10
  #define RCD2d   %r10d

  /* Per-block accumulators for the two table-lookup results. */
  #define RX0     %rbp
  #define RX0d    %ebp

  #define RX1     %r11
  #define RX1d    %r11d

  #define RX2     %r12
  #define RX2d    %r12d

  #define RY0     %r13
  #define RY0d    %r13d

  #define RY1     %r14
  #define RY1d    %r14d

  #define RY2     %r15
  #define RY2d    %r15d

  /*
   * Scratch index registers.  RT0 is the same register as RIO (%rdx), so the
   * I/O pointer is not preserved once the round macros have run.
   */
  #define RT0     %rdx
  #define RT0d    %edx

  #define RT1     %rsi
  #define RT1d    %esi
  /*
   * do16bit_ror: use the two lowest bytes of a 64-bit register as indices into
   * two tables of 32-bit words inside the context, then rotate the register.
   *
   *   bits      - right-rotate count applied to src after the bytes are read
   *   opfirst   - instruction stem (e.g. mov or xor) combining the tab_lo entry
   *               into acc; an "l" suffix is appended
   *   opsecond  - instruction stem combining the tab_hi entry into acc
   *   tab_lo    - context offset of the table indexed by byte 0 of src
   *   tab_hi    - context offset of the table indexed by byte 1 of src
   *   idx_hi    - scratch register receiving byte 1 of src
   *   idx_lo    - scratch register receiving byte 0 of src
   *   src       - 64-bit source register, left rotated right by `bits`
   *   acc       - register whose 32-bit view collects the result
   *
   * idx_lo may be the same register as acc: the index is consumed by the very
   * instruction that first writes acc.
   */
  #define do16bit_ror(bits, opfirst, opsecond, tab_lo, tab_hi, idx_hi, idx_lo, src, acc) \
          movzbl src ## bl,                       idx_lo ## d; \
          movzbl src ## bh,                       idx_hi ## d; \
          rorq $(bits),                           src; \
          opfirst ## l tab_lo(CTX, idx_lo, 4),    acc ## d; \
          opsecond ## l tab_hi(CTX, idx_hi, 4),   acc ## d;
  /*
   * g1g2_3: combined G1 and G2 table lookups for three blocks.  The work is
   * reordered, with the help of rotates, so that the lookups that initialise
   * the accumulators with a plain move come first.
   *
   *   cur       - register family (0/1/2 appended) supplying the lookup bytes
   *   nxt       - register family exchanged with cur at the end
   *   tx0..tx3  - tables applied to bytes 0..3 of cur, accumulated in gx
   *   ty0..ty3  - tables applied to bytes 7, 4, 5, 6 of cur respectively,
   *               accumulated in gy
   *   gx, gy    - accumulator register families (0/1/2 appended)
   *
   * Each cur register is rotated by 32 + 48 + 32 + 16 = 128 bits in total, a
   * multiple of 64, so it holds its original value again when it is exchanged
   * with nxt.  RT0 and RT1 are clobbered.
   */
  #define g1g2_3(cur, nxt, tx0, tx1, tx2, tx3, ty0, ty1, ty2, ty3, gx, gy) \
          /* first half: bytes 0,1 start gx; bytes 4,5 start gy */ \
          do16bit_ror(32, mov, xor, tx0, tx1, RT0, gx ## 0, cur ## 0, gx ## 0); \
          do16bit_ror(48, mov, xor, ty1, ty2, RT0, gy ## 0, cur ## 0, gy ## 0); \
          \
          do16bit_ror(32, mov, xor, tx0, tx1, RT0, gx ## 1, cur ## 1, gx ## 1); \
          do16bit_ror(48, mov, xor, ty1, ty2, RT0, gy ## 1, cur ## 1, gy ## 1); \
          \
          do16bit_ror(32, mov, xor, tx0, tx1, RT0, gx ## 2, cur ## 2, gx ## 2); \
          do16bit_ror(48, mov, xor, ty1, ty2, RT0, gy ## 2, cur ## 2, gy ## 2); \
          \
          /* second half: bytes 2,3 finish gx; bytes 6,7 finish gy */ \
          do16bit_ror(32, xor, xor, tx2, tx3, RT0, RT1, cur ## 0, gx ## 0); \
          do16bit_ror(16, xor, xor, ty3, ty0, RT0, RT1, cur ## 0, gy ## 0); \
          xchgq nxt ## 0,                         cur ## 0; \
          \
          do16bit_ror(32, xor, xor, tx2, tx3, RT0, RT1, cur ## 1, gx ## 1); \
          do16bit_ror(16, xor, xor, ty3, ty0, RT0, RT1, cur ## 1, gy ## 1); \
          xchgq nxt ## 1,                         cur ## 1; \
          \
          do16bit_ror(32, xor, xor, tx2, tx3, RT0, RT1, cur ## 2, gx ## 2); \
          do16bit_ror(16, xor, xor, ty3, ty0, RT0, RT1, cur ## 2, gy ## 2); \
          xchgq nxt ## 2,                         cur ## 2;
  /*
   * enc_round_end: fold the two lookup results into one 64-bit block half at
   * the end of an encryption round.
   *
   *   half  - 64-bit register updated in place
   *   f0    - first lookup result (clobbered)
   *   f1    - second lookup result (clobbered)
   *   rnd   - round number; selects key words k[2*rnd] and k[2*rnd+1]
   *
   * f0 and f1 are first added into each other and offset by their key words.
   * The low word of `half` is XORed with f0 and then rotated right by one;
   * the high word is rotated left by one and then XORed with f1.
   */
  #define enc_round_end(half, f0, f1, rnd) \
          addl f1 ## d,                           f0 ## d; \
          addl f0 ## d,                           f1 ## d; \
          addl k+4*(2*(rnd))(CTX),                f0 ## d; \
          xorl half ## d,                         f0 ## d; \
          addl k+4*(2*(rnd)+1)(CTX),              f1 ## d; \
          /* bring the high word down so 32-bit ops can reach it */ \
          shrq $32,                               half; \
          roll $1,                                half ## d; \
          xorl f1 ## d,                           half ## d; \
          /* move it back up and merge the new low word underneath */ \
          shlq $32,                               half; \
          rorl $1,                                f0 ## d; \
          orq f0,                                 half;
  /*
   * dec_round_end: fold the two lookup results into one 64-bit block half at
   * the end of a decryption round.
   *
   *   half  - 64-bit register updated in place
   *   f0    - first lookup result (clobbered)
   *   f1    - second lookup result (clobbered)
   *   rnd   - round number; selects key words k[2*rnd] and k[2*rnd+1]
   *
   * f0 and f1 are first added into each other and offset by their key words.
   * The low word of `half` is XORed with f1 and then rotated right by one;
   * the high word is rotated left by one and then XORed with f0.
   */
  #define dec_round_end(half, f0, f1, rnd) \
          addl f1 ## d,                           f0 ## d; \
          addl f0 ## d,                           f1 ## d; \
          addl k+4*(2*(rnd))(CTX),                f0 ## d; \
          addl k+4*(2*(rnd)+1)(CTX),              f1 ## d; \
          xorl half ## d,                         f1 ## d; \
          /* bring the high word down so 32-bit ops can reach it */ \
          shrq $32,                               half; \
          roll $1,                                half ## d; \
          xorl f0 ## d,                           half ## d; \
          /* move it back up and merge the new low word underneath */ \
          shlq $32,                               half; \
          rorl $1,                                f1 ## d; \
          orq f1,                                 half;
  /*
   * encrypt_round3: one encryption round over all three blocks.
   *
   *   cur  - register family feeding the table lookups
   *   nxt  - register family exchanged with cur inside g1g2_3
   *   rnd  - round number, forwarded to enc_round_end for key selection
   *
   * Because g1g2_3 ends by exchanging the two families, the registers named
   * cur ## 0..2 that enc_round_end updates hold the other half of each block.
   */
  #define encrypt_round3(cur, nxt, rnd) \
          g1g2_3(cur, nxt, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
          \
          enc_round_end(cur ## 0, RX0, RY0, rnd); \
          enc_round_end(cur ## 1, RX1, RY1, rnd); \
          enc_round_end(cur ## 2, RX2, RY2, rnd);
  /*
   * decrypt_round3: one decryption round over all three blocks.
   *
   *   cur  - register family feeding the table lookups
   *   nxt  - register family exchanged with cur inside g1g2_3
   *   rnd  - round number, forwarded to dec_round_end for key selection
   *
   * Compared with the encryption round, g1g2_3 is invoked with the RX/RY
   * accumulators swapped and with a rotated table order.
   */
  #define decrypt_round3(cur, nxt, rnd) \
          g1g2_3(cur, nxt, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
          \
          dec_round_end(cur ## 0, RX0, RY0, rnd); \
          dec_round_end(cur ## 1, RX1, RY1, rnd); \
          dec_round_end(cur ## 2, RX2, RY2, rnd);
  /*
   * encrypt_cycle3: two consecutive encryption rounds, 2*n and 2*n+1, over all
   * three blocks.
   *
   *   ab, cd  - register families passed through to encrypt_round3
   *   n       - cycle number; parenthesised on use so that an expression
   *             argument keeps its intended precedence
   */
  #define encrypt_cycle3(ab, cd, n) \
          encrypt_round3(ab, cd, (n)*2); \
          encrypt_round3(ab, cd, ((n)*2)+1);
  /*
   * decrypt_cycle3: two consecutive decryption rounds over all three blocks,
   * in descending order: round 2*n+1 first, then round 2*n.
   *
   *   ba, dc  - register families passed through to decrypt_round3
   *   n       - cycle number; parenthesised on use so that an expression
   *             argument keeps its intended precedence
   */
  #define decrypt_cycle3(ba, dc, n) \
          decrypt_round3(ba, dc, ((n)*2)+1); \
          decrypt_round3(ba, dc, ((n)*2));
  /*
   * inpack3: load one 64-bit half of each of the three blocks and XOR it with
   * 64 bits of the context's w array.
   *
   *   in  - register holding the input pointer
   *   n   - 32-bit word offset of the half inside a block; consecutive blocks
   *         start 4 words (16 bytes) apart
   *   xy  - destination register family (0/1/2 appended)
   *   m   - 32-bit word index into w; the same 64 bits are applied to all
   *         three blocks
   *
   * Both n and m are parenthesised on use so that expression arguments keep
   * their intended precedence.
   */
  #define inpack3(in, n, xy, m) \
          movq 4*(n)(in),                 xy ## 0; \
          xorq w+4*(m)(CTX),              xy ## 0; \
          \
          movq 4*(4+(n))(in),             xy ## 1; \
          xorq w+4*(m)(CTX),              xy ## 1; \
          \
          movq 4*(8+(n))(in),             xy ## 2; \
          xorq w+4*(m)(CTX),              xy ## 2;
  /*
   * outunpack3: XOR one 64-bit half of each of the three blocks with 64 bits
   * of the context's w array and write it out.
   *
   *   op   - instruction stem used for the write ("q" is appended): mov
   *          stores the value, xor combines it with what is already in memory
   *   out  - register holding the output pointer
   *   n    - 32-bit word offset of the half inside a block; consecutive
   *          blocks start 4 words (16 bytes) apart
   *   xy   - source register family (0/1/2 appended); modified in place
   *   m    - 32-bit word index into w; the same 64 bits are applied to all
   *          three blocks
   *
   * Both n and m are parenthesised on use so that expression arguments keep
   * their intended precedence.
   */
  #define outunpack3(op, out, n, xy, m) \
          xorq w+4*(m)(CTX),              xy ## 0; \
          op ## q xy ## 0,                4*(n)(out); \
          \
          xorq w+4*(m)(CTX),              xy ## 1; \
          op ## q xy ## 1,                4*(4+(n))(out); \
          \
          xorq w+4*(m)(CTX),              xy ## 2; \
          op ## q xy ## 2,                4*(8+(n))(out);
  /*
   * inpack_enc3: load three blocks from RIO for encryption.
   * RAB receives words 0-1 of each block XORed with w[0..1];
   * RCD receives words 2-3 of each block XORed with w[2..3].
   */
  #define inpack_enc3() \
          inpack3(RIO, 0, RAB, 0); \
          inpack3(RIO, 2, RCD, 2);
  /*
   * outunpack_enc3: write three encrypted blocks to RIO.
   * RAB is XORed with w[6..7] and becomes words 2-3 of each output block;
   * RCD is XORed with w[4..5] and becomes words 0-1.
   *
   *   store  - mov to overwrite the output, xor to XOR into it
   */
  #define outunpack_enc3(store) \
          outunpack3(store, RIO, 2, RAB, 6); \
          outunpack3(store, RIO, 0, RCD, 4);
  /*
   * inpack_dec3: load three blocks from RIO for decryption.
   * RAB receives words 0-1 of each block XORed with w[4..5];
   * RCD receives words 2-3 of each block XORed with w[6..7].
   * Every loaded register then has its two 32-bit words swapped.
   */
  #define inpack_dec3() \
          inpack3(RIO, 0, RAB, 4); \
          rorq $32,                       RAB0; \
          rorq $32,                       RAB1; \
          rorq $32,                       RAB2; \
          \
          inpack3(RIO, 2, RCD, 6); \
          rorq $32,                       RCD0; \
          rorq $32,                       RCD1; \
          rorq $32,                       RCD2;
  /*
   * outunpack_dec3: write three decrypted blocks to RIO.
   * Each register first has its two 32-bit words swapped back.  RCD is then
   * XORed with w[0..1] and stored as words 0-1 of each output block; RAB is
   * XORed with w[2..3] and stored as words 2-3.
   */
  #define outunpack_dec3() \
          rorq $32,                       RCD0; \
          rorq $32,                       RCD1; \
          rorq $32,                       RCD2; \
          outunpack3(mov, RIO, 0, RCD, 0); \
          \
          rorq $32,                       RAB0; \
          rorq $32,                       RAB1; \
          rorq $32,                       RAB2; \
          outunpack3(mov, RIO, 2, RAB, 2);
  /*
   * __twofish_enc_blk_3way: encrypt three consecutive 16-byte blocks.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src, RIO
   *      %rcx: bool, if true: xor output
   *
   * 48 bytes are read from src.  The result is stored to dst, or XORed into
   * the 48 bytes already at dst when the bool is non-zero (only its low byte
   * is tested).
   *
   * %rbx, %rbp and %r12-%r15 are saved and restored; %rax, %rcx, %rdx, %rsi
   * and %r8-%r11 are clobbered.  No other function is called.
   */
  .align 8
  .global __twofish_enc_blk_3way
  .type   __twofish_enc_blk_3way,@function;

  __twofish_enc_blk_3way:
          pushq %r15;
          pushq %r14;
          pushq %r13;
          pushq %r12;
          pushq %rbp;
          pushq %rbx;

          /*
           * %rcx and %rsi are reused by the round macros (RAB2 and RT1), so
           * the flag and the destination pointer are parked on the stack.
           */
          pushq %rcx; /* bool xor */
          pushq %rsi; /* dst */

          inpack_enc3();

          /* 8 cycles of 2 rounds each */
          encrypt_cycle3(RAB, RCD, 0);
          encrypt_cycle3(RAB, RCD, 1);
          encrypt_cycle3(RAB, RCD, 2);
          encrypt_cycle3(RAB, RCD, 3);
          encrypt_cycle3(RAB, RCD, 4);
          encrypt_cycle3(RAB, RCD, 5);
          encrypt_cycle3(RAB, RCD, 6);
          encrypt_cycle3(RAB, RCD, 7);

          popq RIO; /* dst */
          /* %rbp is free again here; its saved value is restored below */
          popq %rbp; /* bool xor */

          testb %bpl, %bpl;
          jnz .L__enc_xor3;

          /* common case: plain store, returns without a taken branch */
          outunpack_enc3(mov);

          popq %rbx;
          popq %rbp;
          popq %r12;
          popq %r13;
          popq %r14;
          popq %r15;
          ret;

  /* assembler-local label: kept out of the object's symbol table */
  .L__enc_xor3:
          outunpack_enc3(xor);

          popq %rbx;
          popq %rbp;
          popq %r12;
          popq %r13;
          popq %r14;
          popq %r15;
          ret;

  .size __twofish_enc_blk_3way, . - __twofish_enc_blk_3way
  /*
   * twofish_dec_blk_3way: decrypt three consecutive 16-byte blocks.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src, RIO
   *
   * 48 bytes are read from src and 48 bytes are stored to dst.
   *
   * %rbx, %rbp and %r12-%r15 are saved and restored; %rax, %rcx, %rdx, %rsi
   * and %r8-%r11 are clobbered.  No other function is called.
   */
  .align 8
  .global twofish_dec_blk_3way
  .type   twofish_dec_blk_3way,@function;

  twofish_dec_blk_3way:
          pushq %r15;
          pushq %r14;
          pushq %r13;
          pushq %r12;
          pushq %rbp;
          pushq %rbx;

          /* %rsi is reused as RT1 by the round macros, so park dst */
          pushq %rsi; /* dst */

          inpack_dec3();

          /* 8 cycles of 2 rounds each, in descending order */
          decrypt_cycle3(RAB, RCD, 7);
          decrypt_cycle3(RAB, RCD, 6);
          decrypt_cycle3(RAB, RCD, 5);
          decrypt_cycle3(RAB, RCD, 4);
          decrypt_cycle3(RAB, RCD, 3);
          decrypt_cycle3(RAB, RCD, 2);
          decrypt_cycle3(RAB, RCD, 1);
          decrypt_cycle3(RAB, RCD, 0);

          popq RIO; /* dst */

          outunpack_dec3();

          popq %rbx;
          popq %rbp;
          popq %r12;
          popq %r13;
          popq %r14;
          popq %r15;
          ret;

  .size twofish_dec_blk_3way, . - twofish_dec_blk_3way