blowfish-x86_64-asm_64.S 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. /*
  2. * Blowfish Cipher Algorithm (x86_64)
  3. *
  4. * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  19. * USA
  20. *
  21. */
  .file "blowfish-x86_64-asm.S"
  .text

  /*
   * Byte offsets into the crypto context that CTX points at.
   *
   * As these offsets lay it out, the context starts with 16 + 2 32-bit
   * words at `p`, followed by four consecutive tables of 256 32-bit words
   * each (s0, s1, s2, s3).
   */
  #define BF_WORD_BYTES   4
  #define BF_P_WORDS      (16 + 2)
  #define BF_SBOX_WORDS   256

  #define p       0
  #define s0      ((BF_P_WORDS + (0 * BF_SBOX_WORDS)) * BF_WORD_BYTES)
  #define s1      ((BF_P_WORDS + (1 * BF_SBOX_WORDS)) * BF_WORD_BYTES)
  #define s2      ((BF_P_WORDS + (2 * BF_SBOX_WORDS)) * BF_WORD_BYTES)
  #define s3      ((BF_P_WORDS + (3 * BF_SBOX_WORDS)) * BF_WORD_BYTES)
  /*
   * Register assignments.
   *
   * Suffixes: none = 64-bit, d = 32-bit, bl = bits 7..0, bh = bits 15..8.
   */

  /* Pointer registers: crypto context and current I/O buffer. */
  #define CTX     %rdi
  #define RIO     %rsi

  /* Block registers: one 64-bit block each (only RX0 is used by the 1-way code). */
  #define RX0     %rax
  #define RX0d    %eax
  #define RX0bl   %al
  #define RX0bh   %ah

  #define RX1     %rbx
  #define RX1d    %ebx
  #define RX1bl   %bl
  #define RX1bh   %bh

  #define RX2     %rcx
  #define RX2d    %ecx
  #define RX2bl   %cl
  #define RX2bh   %ch

  #define RX3     %rdx
  #define RX3d    %edx
  #define RX3bl   %dl
  #define RX3bh   %dh

  /*
   * Temporaries.  RT1 is the same physical register as RIO (%rsi), so any
   * macro that writes RT1 destroys the I/O pointer; callers reload RIO
   * afterwards.
   */
  #define RT0     %rbp
  #define RT0d    %ebp

  #define RT1     %rsi
  #define RT1d    %esi

  #define RT2     %r8
  #define RT2d    %r8d

  #define RT3     %r9
  #define RT3d    %r9d

  /* Holds the preloaded round key in the 4-way code. */
  #define RKEY    %r10
  /***********************************************************************
   * 1-way blowfish
   ***********************************************************************/

  /*
   * F(): one Feistel step on the 64-bit block in RX0.
   *
   * The low 32 bits of RX0 are split into four bytes that index the tables:
   *
   *   t = ((s0[bits 31..24] + s1[bits 23..16]) ^ s2[bits 15..8]) + s3[bits 7..0]
   *
   * computed with 32-bit arithmetic.  The two 32-bit halves of RX0 are then
   * swapped and t (zero-extended to 64 bits) is xored into the result.
   *
   * Clobbers: RT0, RT1 (which is also RIO), RT2, flags.
   * The high-byte views (RX0bh) are only ever moved into RT0d/RT1d.
   */
  #define F() \
          rorq    $16, RX0;                       /* expose bits 31..16 in the low word */ \
          movzbl  RX0bl, RT1d;                    /* bits 23..16 */ \
          movzbl  RX0bh, RT0d;                    /* bits 31..24 */ \
          rolq    $16, RX0;                       /* undo the rotate */ \
          movl    s0(CTX,RT0,4), RT0d; \
          addl    s1(CTX,RT1,4), RT0d; \
          movzbl  RX0bl, RT2d;                    /* bits 7..0 */ \
          movzbl  RX0bh, RT1d;                    /* bits 15..8 */ \
          rolq    $32, RX0;                       /* swap the 32-bit halves */ \
          xorl    s2(CTX,RT1,4), RT0d; \
          addl    s3(CTX,RT2,4), RT0d; \
          xorq    RT0, RX0;
  /*
   * add_roundkey_enc(n): xor the 8 bytes at p + 4*n (two consecutive 32-bit
   * key words, index n and n+1) into RX0.
   */
  #define add_roundkey_enc(n) \
          xorq    p+4*(n)(CTX), RX0;

  /*
   * round_enc(n): mix in key words n and n+1, then run two F() steps.
   * Clobbers whatever F() clobbers.
   */
  #define round_enc(n) \
          xorq    p+4*(n)(CTX), RX0; \
          F(); \
          F();
  /*
   * add_roundkey_dec(n): load the 8 bytes at p + 4*(n-1) (key words n-1 and
   * n), swap the two 32-bit halves so that word n ends up in the low half,
   * and xor the result into RX0.
   *
   * The argument is parenthesized so that an expression argument is scaled
   * as a whole, matching preload_roundkey_dec().
   *
   * Clobbers: RT0, flags.
   */
  #define add_roundkey_dec(n) \
          movq    p+4*((n)-1)(CTX), RT0; \
          rorq    $32, RT0; \
          xorq    RT0, RX0;

  /*
   * round_dec(n): mix in key words n and n-1, then run two F() steps.
   *
   * The final line must NOT end in a continuation backslash: without a
   * blank line after it, a trailing backslash would splice the following
   * #define into this macro's body.
   */
  #define round_dec(n) \
          add_roundkey_dec(n); \
          \
          F(); \
          F();
  /*
   * read_block(): load 8 bytes from (RIO) into RX0 so that the low 32 bits
   * hold the first 4 bytes read as a big-endian word and the high 32 bits
   * hold the second 4 bytes read as a big-endian word.
   */
  #define read_block() \
          movq    (RIO), RX0; \
          rorq    $32, RX0; \
          bswapq  RX0;

  /*
   * write_block(): store RX0 to (RIO) as one 64-bit big-endian value.
   * RX0 is left byte-swapped.
   */
  #define write_block() \
          bswapq  RX0; \
          movq    RX0, (RIO);

  /*
   * xor_block(): like write_block(), but xor the byte-swapped value into
   * the 8 bytes already at (RIO) instead of overwriting them.
   */
  #define xor_block() \
          bswapq  RX0; \
          xorq    RX0, (RIO);
  .align 8
  .global __blowfish_enc_blk
  .type   __blowfish_enc_blk,@function;

  /*
   * __blowfish_enc_blk: run the 1-way encryption rounds on one 8-byte block.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src
   *      %rcx: bool, if true: xor output into dst instead of storing it
   *
   * Register notes:
   *      %rbp (RT0) is used as a temporary and is parked in %r11 meanwhile.
   *      %rsi (RIO/RT1) is overwritten by F(), so dst is parked in %r10.
   *      %rcx is not touched by the 1-way macros, so %cl is still the
   *      caller's flag when it is tested after the rounds.
   */
  __blowfish_enc_blk:
          movq    %rbp, %r11;
          movq    %rsi, %r10;
          movq    %rdx, RIO;

          read_block();

          round_enc(0);
          round_enc(2);
          round_enc(4);
          round_enc(6);
          round_enc(8);
          round_enc(10);
          round_enc(12);
          round_enc(14);
          add_roundkey_enc(16);

          movq    %r11, %rbp;
          movq    %r10, RIO;              /* RIO = dst */

          test    %cl, %cl;
          jnz     .L__enc_xor;

          write_block();
          ret;

  .L__enc_xor:                            /* assembler-local: not emitted as a symbol */
          xor_block();
          ret;

  .size   __blowfish_enc_blk, .-__blowfish_enc_blk
  .align 8
  .global blowfish_dec_blk
  .type   blowfish_dec_blk,@function;

  /*
   * blowfish_dec_blk: run the 1-way decryption rounds on one 8-byte block
   * and store the result to dst.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src
   *
   * Register notes:
   *      %rbp (RT0) is used as a temporary and is parked in %r11 meanwhile.
   *      %rsi (RIO/RT1) is overwritten by F(), so dst is parked in %r10.
   */
  blowfish_dec_blk:
          movq    %rbp, %r11;
          movq    %rsi, %r10;
          movq    %rdx, RIO;

          read_block();

          round_dec(17);
          round_dec(15);
          round_dec(13);
          round_dec(11);
          round_dec(9);
          round_dec(7);
          round_dec(5);
          round_dec(3);
          add_roundkey_dec(1);

          movq    %r10, RIO;              /* RIO = dst */
          write_block();

          movq    %r11, %rbp;
          ret;

  .size   blowfish_dec_blk, .-blowfish_dec_blk
  /**********************************************************************
    4-way blowfish, four blocks parallel
   **********************************************************************/

  /*
   * F4(x): the same Feistel step as F(), applied to block register x.
   *
   * All four index bytes are extracted first (two 16-bit rotates, which
   * together also swap the 32-bit halves of x), then the table lookups are
   * combined as ((s0 + s1) ^ s2) + s3 and xored into x.
   *
   * Original author's note: slower when used alone/1-way, but faster when
   * used parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
   *
   * Clobbers: RT0, RT1 (which is also RIO), RT2, RT3, flags.
   * The high-byte views (x ## bh) are only ever moved into RT0d/RT1d.
   */
  #define F4(x) \
          movzbl  x ## bl, RT3d;                  /* bits 7..0   -> s3 index */ \
          movzbl  x ## bh, RT1d;                  /* bits 15..8  -> s2 index */ \
          rorq    $16, x; \
          movzbl  x ## bl, RT2d;                  /* bits 23..16 -> s1 index */ \
          movzbl  x ## bh, RT0d;                  /* bits 31..24 -> s0 index */ \
          rorq    $16, x;                         /* halves now swapped */ \
          movl    s0(CTX,RT0,4), RT0d; \
          addl    s1(CTX,RT2,4), RT0d; \
          xorl    s2(CTX,RT1,4), RT0d; \
          addl    s3(CTX,RT3,4), RT0d; \
          xorq    RT0, x;
  /*
   * add_preloaded_roundkey4(): xor the 64-bit value currently held in RKEY
   * into each of the four block registers.  RKEY itself is not modified.
   */
  #define add_preloaded_roundkey4() \
          xorq    RKEY, RX0; \
          xorq    RKEY, RX1; \
          xorq    RKEY, RX2; \
          xorq    RKEY, RX3;
  /* preload_roundkey_enc(n): RKEY = the 8 bytes at p + 4*n (key words n, n+1). */
  #define preload_roundkey_enc(n) \
          movq    p+4*(n)(CTX), RKEY;

  /*
   * add_roundkey_enc4(n): apply the key already sitting in RKEY to all four
   * blocks, then fetch the key for index n + 2.  Note that n only selects
   * the NEXT key; the key applied is whatever was preloaded before.
   */
  #define add_roundkey_enc4(n) \
          add_preloaded_roundkey4(); \
          preload_roundkey_enc(n + 2);

  /*
   * round_enc4(n): apply the preloaded key, preload the key for n + 2, then
   * run two F4() steps on each of the four blocks (step one on all blocks
   * first, then step two on all blocks).
   */
  #define round_enc4(n) \
          add_preloaded_roundkey4(); \
          preload_roundkey_enc(n + 2); \
          \
          F4(RX0); \
          F4(RX1); \
          F4(RX2); \
          F4(RX3); \
          \
          F4(RX0); \
          F4(RX1); \
          F4(RX2); \
          F4(RX3);
  /*
   * preload_roundkey_dec(n): RKEY = the 8 bytes at p + 4*(n-1) (key words
   * n-1 and n) with the two 32-bit halves swapped, so word n is in the low
   * half.
   */
  #define preload_roundkey_dec(n) \
          movq    p+4*((n)-1)(CTX), RKEY; \
          rorq    $32, RKEY;

  /*
   * add_roundkey_dec4(n): apply the key already sitting in RKEY to all four
   * blocks, then fetch the key for index n - 2.  Note that n only selects
   * the NEXT key; the key applied is whatever was preloaded before.
   */
  #define add_roundkey_dec4(n) \
          add_preloaded_roundkey4(); \
          preload_roundkey_dec(n - 2);

  /*
   * round_dec4(n): apply the preloaded key, preload the key for n - 2, then
   * run two F4() steps on each of the four blocks (step one on all blocks
   * first, then step two on all blocks).
   */
  #define round_dec4(n) \
          add_preloaded_roundkey4(); \
          preload_roundkey_dec(n - 2); \
          \
          F4(RX0); \
          F4(RX1); \
          F4(RX2); \
          F4(RX3); \
          \
          F4(RX0); \
          F4(RX1); \
          F4(RX2); \
          F4(RX3);
  /*
   * read_block4(): load four consecutive 8-byte blocks from RIO (offsets
   * 0, 8, 16, 24) into RX0..RX3.  Each block gets the same treatment as in
   * read_block(): swap the 32-bit halves, then reverse all eight bytes.
   */
  #define read_block4() \
          movq    (RIO), RX0; \
          movq    8(RIO), RX1; \
          movq    16(RIO), RX2; \
          movq    24(RIO), RX3; \
          \
          rorq    $32, RX0; \
          rorq    $32, RX1; \
          rorq    $32, RX2; \
          rorq    $32, RX3; \
          \
          bswapq  RX0; \
          bswapq  RX1; \
          bswapq  RX2; \
          bswapq  RX3;

  /*
   * write_block4(): byte-reverse RX0..RX3 and store them to RIO at offsets
   * 0, 8, 16, 24.  The registers are left byte-swapped.
   */
  #define write_block4() \
          bswapq  RX0; \
          bswapq  RX1; \
          bswapq  RX2; \
          bswapq  RX3; \
          \
          movq    RX0, (RIO); \
          movq    RX1, 8(RIO); \
          movq    RX2, 16(RIO); \
          movq    RX3, 24(RIO);

  /*
   * xor_block4(): like write_block4(), but xor each byte-reversed register
   * into the 8 bytes already present at its offset instead of overwriting.
   */
  #define xor_block4() \
          bswapq  RX0; \
          bswapq  RX1; \
          bswapq  RX2; \
          bswapq  RX3; \
          \
          xorq    RX0, (RIO); \
          xorq    RX1, 8(RIO); \
          xorq    RX2, 16(RIO); \
          xorq    RX3, 24(RIO);
  .align 8
  .global __blowfish_enc_blk_4way
  .type   __blowfish_enc_blk_4way,@function;

  /*
   * __blowfish_enc_blk_4way: run the encryption rounds on four consecutive
   * 8-byte blocks (32 bytes read from src, 32 bytes written to dst).
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src
   *      %rcx: bool, if true: xor output into dst instead of storing it
   *
   * Register notes:
   *      %rbp (RT0) and %rbx (RX1) are saved on the stack and restored
   *      before returning.
   *      %rcx doubles as block register RX2, so the xor flag is pushed on
   *      entry and popped into %rbp after the rounds, where it is tested.
   *      %rsi (RIO/RT1) is overwritten by F4(), so dst is parked in %r11
   *      (%r10 is taken by RKEY).
   */
  __blowfish_enc_blk_4way:
          pushq   %rbp;
          pushq   %rbx;
          pushq   %rcx;                   /* xor flag, reloaded below */

          preload_roundkey_enc(0);

          movq    %rsi, %r11;
          movq    %rdx, RIO;

          read_block4();

          round_enc4(0);
          round_enc4(2);
          round_enc4(4);
          round_enc4(6);
          round_enc4(8);
          round_enc4(10);
          round_enc4(12);
          round_enc4(14);
          add_preloaded_roundkey4();      /* key preloaded by round_enc4(14) */

          popq    %rbp;                   /* %rbp = saved xor flag */
          movq    %r11, RIO;              /* RIO = dst */

          test    %bpl, %bpl;
          jnz     .L__enc_xor4;

          write_block4();

          popq    %rbx;
          popq    %rbp;
          ret;

  .L__enc_xor4:                           /* assembler-local: not emitted as a symbol */
          xor_block4();

          popq    %rbx;
          popq    %rbp;
          ret;

  .size   __blowfish_enc_blk_4way, .-__blowfish_enc_blk_4way
  .align 8
  .global blowfish_dec_blk_4way
  .type   blowfish_dec_blk_4way,@function;

  /*
   * blowfish_dec_blk_4way: run the decryption rounds on four consecutive
   * 8-byte blocks (32 bytes read from src, 32 bytes written to dst).
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src
   *
   * Register notes:
   *      %rbp (RT0) and %rbx (RX1) are saved on the stack and restored
   *      before returning.
   *      %rsi (RIO/RT1) is overwritten by F4(), so dst is parked in %r11
   *      (%r10 is taken by RKEY).
   */
  blowfish_dec_blk_4way:
          pushq   %rbp;
          pushq   %rbx;

          preload_roundkey_dec(17);

          movq    %rsi, %r11;
          movq    %rdx, RIO;

          read_block4();

          round_dec4(17);
          round_dec4(15);
          round_dec4(13);
          round_dec4(11);
          round_dec4(9);
          round_dec4(7);
          round_dec4(5);
          round_dec4(3);
          add_preloaded_roundkey4();      /* key preloaded by round_dec4(3) */

          movq    %r11, RIO;              /* RIO = dst */
          write_block4();

          popq    %rbx;
          popq    %rbp;
          ret;

  .size   blowfish_dec_blk_4way, .-blowfish_dec_blk_4way