/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */
.file "serpent-sse2-i586-asm_32.S"
.text

#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16
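
/*
 * The defines above are the %esp-relative offsets of the C arguments:
 * i586 uses the cdecl convention, so with the return address at
 * 0(%esp) the first argument lives at 4(%esp), the second at 8(%esp),
 * and so on.
 */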

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/
#define CTX %edx

#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

#define RT0 %xmm5
#define RT1 %xmm6

#define RNOT %xmm7
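
/*
 * get_key(i, j, t) loads the 32-bit subkey word k[4*i + j] from the
 * key schedule reached through CTX and broadcasts it to all four dword
 * lanes of t (pshufd $0 replicates lane 0), so one subkey word is
 * applied to the same word of all four blocks at once.  K() xors the
 * four subkey words of round i into the four state registers.
 */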
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4, x0; \
	pxor RT0, x1; \
	pxor RT1, x2; \
	get_key(i, 3, x4); \
	pxor x4, x3;
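
/*
 * LK() is the Serpent linear transformation followed by the key mixing
 * for round i.  SSE2 has no rotate instruction, so every rotate is
 * built from a pslld/psrld pair combined with por.  As a rough scalar
 * sketch (rol32 meaning a 32-bit left rotate; illustration only, not
 * code taken from this file):
 *
 *	x0 = rol32(x0, 13);  x2 = rol32(x2, 3);
 *	x1 ^= x0 ^ x2;       x3 ^= x2 ^ (x0 << 3);
 *	x1 = rol32(x1, 1);   x3 = rol32(x3, 7);
 *	x0 ^= x1 ^ x3;       x2 ^= x3 ^ (x1 << 7);
 *	x0 = rol32(x0, 5);   x2 = rol32(x2, 22);
 *	x0 ^= k[4*i];  x1 ^= k[4*i+1];  x2 ^= k[4*i+2];  x3 ^= k[4*i+3];
 */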
#define LK(x0, x1, x2, x3, x4, i) \
	movdqa x0, x4; \
	pslld $13, x0; \
	psrld $(32 - 13), x4; \
	por x4, x0; \
	pxor x0, x1; \
	movdqa x2, x4; \
	pslld $3, x2; \
	psrld $(32 - 3), x4; \
	por x4, x2; \
	pxor x2, x1; \
	movdqa x1, x4; \
	pslld $1, x1; \
	psrld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x2, x3; \
	pxor x4, x3; \
	movdqa x3, x4; \
	pslld $7, x3; \
	psrld $(32 - 7), x4; \
	por x4, x3; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x3, x0; \
	pxor x3, x2; \
	pxor x4, x2; \
	movdqa x0, x4; \
	get_key(i, 1, RT0); \
	pxor RT0, x1; \
	get_key(i, 3, RT0); \
	pxor RT0, x3; \
	pslld $5, x0; \
	psrld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	pslld $22, x2; \
	psrld $(32 - 22), x4; \
	por x4, x2; \
	get_key(i, 0, RT0); \
	pxor RT0, x0; \
	get_key(i, 2, RT0); \
	pxor RT0, x2;
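
/*
 * KL() is the decryption-side counterpart: it first mixes the round-i
 * subkey via K() and then undoes the linear transformation of LK()
 * (the same rotates and shifts, inverted and applied in reverse order).
 */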
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	movdqa x0, x4; \
	psrld $5, x0; \
	pslld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	psrld $22, x2; \
	pslld $(32 - 22), x4; \
	por x4, x2; \
	pxor x3, x2; \
	pxor x3, x0; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x4, x2; \
	movdqa x1, x4; \
	psrld $1, x1; \
	pslld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x3, x4; \
	psrld $7, x3; \
	pslld $(32 - 7), x4; \
	por x4, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x4, x3; \
	movdqa x0, x4; \
	psrld $13, x0; \
	pslld $(32 - 13), x4; \
	por x4, x0; \
	pxor x2, x1; \
	pxor x2, x3; \
	movdqa x2, x4; \
	psrld $3, x2; \
	pslld $(32 - 3), x4; \
	por x4, x2;
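
/*
 * S0..S7 below are the eight Serpent S-boxes expressed as sequences of
 * bitwise operations on whole xmm registers, so each invocation
 * evaluates the S-box for every bit position of all four blocks in
 * parallel.  x4 is scratch, and RNOT (all ones) turns pxor into a
 * bitwise NOT.  The output lands in a different register assignment
 * than the input, which is why the round sequences below pass the
 * registers in varying order.
 */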
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2; \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

#define S1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1; \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

#define S3(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

#define S4(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2; \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

#define S5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4; \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

#define S6(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1; \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0; \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;
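
/*
 * SI0..SI7 are the corresponding inverse S-boxes, used by the
 * decryption routine at the end of this file.
 */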
#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0; \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

#define SI1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2; \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

#define SI2(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2; \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

#define SI3(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1; \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

#define SI4(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2; \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

#define SI6(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3; \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0; \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;
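
/*
 * transpose_4x4 treats x0..x3 as a 4x4 matrix of 32-bit words and
 * transposes it in place: afterwards register n holds word n of each
 * of the four blocks, which is the layout the S-box and LK/KL macros
 * operate on.
 */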
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;
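
/*
 * read_blocks loads four 16-byte blocks with unaligned loads and
 * transposes them into the parallel layout; write_blocks transposes
 * back and stores.  xor_blocks is write_blocks except that the result
 * is xored into the destination instead of overwriting it, which is
 * what the arg_xor path of the encryption routine uses.
 */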
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

.align 8
.global __serpent_enc_blk_4way
.type __serpent_enc_blk_4way,@function;

__serpent_enc_blk_4way:
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */
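
	/*
	 * RNOT is set to all ones so the S-box macros can use pxor as a
	 * bitwise NOT.  K(..., 0) applies the first subkey; each round then
	 * runs one bitsliced S-box (cycling through S0..S7 four times) and
	 * LK(), which performs the linear transform and mixes in the next
	 * round's subkey.  The last round ends with a plain K(..., 32).
	 */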
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);

	movl arg_dst(%esp), %eax;

	cmpb $0, arg_xor(%esp);
	jnz __enc_xor4;

	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	ret;

__enc_xor4:
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	ret;

.align 8
.global serpent_dec_blk_4way
.type serpent_dec_blk_4way,@function;

serpent_dec_blk_4way:
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */
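
	/*
	 * Decryption walks the rounds backwards: K(..., 32) removes the
	 * final subkey, then each step applies an inverse S-box (SI7 down
	 * to SI0, four times) followed by KL(), which mixes in that round's
	 * subkey and inverts the linear transform, finishing with a plain
	 * K(..., 0).
	 */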
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);

	movl arg_dst(%esp), %eax;
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	ret;