#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cycles per processed byte on
# Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
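
# The four arrays above are the rotate/shift amounts of the SHA-256
# Sigma/sigma functions (FIPS 180-4), which the round code below open-codes
# with ror/lsr.  A minimal Perl sketch, for reference only and never called
# by this generator:
#
#   sub ROR    { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff }
#   sub Sigma0 { my $x=shift; ROR($x,2)^ROR($x,13)^ROR($x,22) }	# @Sigma0
#   sub Sigma1 { my $x=shift; ROR($x,6)^ROR($x,11)^ROR($x,25) }	# @Sigma1
#   sub sigma0 { my $x=shift; ROR($x,7)^ROR($x,18)^($x>>3)    }	# @sigma0
#   sub sigma1 { my $x=shift; ROR($x,17)^ROR($x,19)^($x>>10)  }	# @sigma1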
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr $t1,[$inp],#4 @ $i
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
	rev $t1,$t1
# endif
#else
	@ ldrb $t1,[$inp,#3] @ $i
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	ldrb $t2,[$inp,#2]
	ldrb $t0,[$inp,#1]
	orr $t1,$t1,$t2,lsl#8
	ldrb $t2,[$inp],#4
	orr $t1,$t1,$t0,lsl#16
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr $t1,$t1,$t2,lsl#24
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
	ldr $t2,[$Ktbl],#4 @ *K256++
	add $h,$h,$t1 @ h+=X[i]
	str $t1,[sp,#`$i%16`*4]
	eor $t1,$f,$g
	add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
	and $t1,$t1,$e
	add $h,$h,$t2 @ h+=K256[i]
	eor $t1,$t1,$g @ Ch(e,f,g)
	eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
	and $t2,$t2,#0xff
	cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4 @ prefetch
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t2,$a,$b @ a^b, b^c in next round
#else
	ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
	eor $t2,$a,$b @ a^b, b^c in next round
	ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
	eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
	and $t3,$t3,$t2 @ (b^c)&=(a^b)
	add $d,$d,$h @ d+=h
	eor $t3,$t3,$b @ Maj(a,b,c)
	add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
	@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
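
# For reference, each BODY_00_15 invocation emits one standard SHA-256 round
# (a sketch in plain notation, not generated code):
#
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#   T2 = Sigma0(a) + Maj(a,b,c)
#   d += T1;  h = T1 + T2
#
# Ch(e,f,g) is computed as ((f^g)&e)^g, and Maj(a,b,c) as ((a^b)&(b^c))^b,
# with the (a^b)&(b^c) part deferred by one round (hence the
# "h+=Maj(a,b,c) from the past" additions and the $t2/$t3 swap).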
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
	@ ldr $t4,[sp,#`($i+14)%16`*4]
	mov $t0,$t1,ror#$sigma0[0]
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	mov $t2,$t4,ror#$sigma1[0]
	eor $t0,$t0,$t1,ror#$sigma0[1]
	eor $t2,$t2,$t4,ror#$sigma1[1]
	eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
	ldr $t1,[sp,#`($i+0)%16`*4]
	eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
	ldr $t4,[sp,#`($i+9)%16`*4]
	add $t2,$t2,$t0
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
	add $t1,$t1,$t2
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
	add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
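
# BODY_16_XX first extends the message schedule, then falls through to the
# common round code above; in standard notation (reference only):
#
#   X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
#
# The schedule is kept as a 16-word ring buffer on the stack, hence the
# "%16" indexing of the [sp,#...] slots.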
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
# define adrl adr
.thumb
# else
.code 32
# endif
#endif

.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
.align 5

.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
	sub r3,pc,#8 @ sha256_block_data_order
#else
	adr r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr r12,.LOPENSSL_armcap
	ldr r12,[r3,r12] @ OPENSSL_armcap_P
	tst r12,#ARMV8_SHA256
	bne .LARMv8
	tst r12,#ARMV7_NEON
	bne .LNEON
#endif
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
	stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub $Ktbl,r3,#256+32 @ K256
	sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t3,$B,$C @ magic
	eor $t2,$t2,$t2
___

for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
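
# Only rounds 16..31 are emitted; at run time .Lrounds_16_xx is taken three
# times to cover rounds 16..63.  The `$i==31` check in BODY_00_15 spots the
# last K256 entry (low byte 0xf2) so the epilogue below knows when to fall
# out of the loop.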
$code.=<<___;
#if __ARM_ARCH__>=7
	ite eq @ Thumb2 thing, sanity check in ARM
#endif
	ldreq $t3,[sp,#16*4] @ pull ctx
	bne .Lrounds_16_xx
	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t0,[$t3,#0]
	ldr $t1,[$t3,#4]
	ldr $t2,[$t3,#8]
	add $A,$A,$t0
	ldr $t0,[$t3,#12]
	add $B,$B,$t1
	ldr $t1,[$t3,#16]
	add $C,$C,$t2
	ldr $t2,[$t3,#20]
	add $D,$D,$t0
	ldr $t0,[$t3,#24]
	add $E,$E,$t1
	ldr $t1,[$t3,#28]
	add $F,$F,$t2
	ldr $inp,[sp,#17*4] @ pull inp
	ldr $t2,[sp,#18*4] @ pull inp+len
	add $G,$G,$t0
	add $H,$H,$t1
	stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp $inp,$t2
	sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
	bne .Loop
	add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r11,pc}
#else
	ldmia sp!,{r4-r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___

######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
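
# AUTOLOAD lets the NEON code below be written as Perl calls; for example
# &vext_8($T0,@X[0],@X[1],4) appends "\tvext.8\tq8,q0,q1,#4\n" to $code
# (the underscore becomes '.', and a purely numeric last argument gets a
# '#' prefix).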
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
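
# Xupdate emits the NEON message-schedule update for the next four X words
# (sigma0/sigma1 built from vshr/vsli/veor on whole vectors) and, between
# those vector instructions, interleaves the four scalar rounds supplied by
# the $body closure, one snippet at a time via eval(shift(@insns)).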
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
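
# Xpreload covers rounds where no schedule update is needed (the last 16 of
# a block): it loads the next K256 vector, byte-swaps the freshly loaded
# input block and pre-adds the constants, again interleaved with four
# scalar rounds.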
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add ($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor ($t1,$f,$g)',
	'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add ($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and ($t1,$t1,$e)',
	'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor ($t1,$t1,$g)',			# Ch(e,f,g)
	'&add ($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor ($t2,$a,$b)',			# a^b, b^c in next round
	'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add ($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
	'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
	'&ldr ($t1,"[sp,#64]") if ($j==31)',
	'&and ($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add ($d,$d,$h)',			# d+=h
	'&add ($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor ($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
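
# body_00_15 returns one scalar round broken into a list of Perl snippet
# strings, so that Xupdate/Xpreload can dole them out one instruction at a
# time between NEON ops.  $j counts rounds; @V is rotated and $t2/$t3 are
# swapped at the end of each round, mirroring the integer-only code above.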
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
	stmdb sp!,{r4-r12,lr}
	sub $H,sp,#16*4+16
	adrl $Ktbl,K256
	bic $H,$H,#15 @ align for 128-bit stores
	mov $t2,sp
	mov sp,$H @ alloca
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
	vld1.8 {@X[0]},[$inp]!
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	vld1.32 {$T0},[$Ktbl,:128]!
	vld1.32 {$T1},[$Ktbl,:128]!
	vld1.32 {$T2},[$Ktbl,:128]!
	vld1.32 {$T3},[$Ktbl,:128]!
	vrev32.8 @X[0],@X[0] @ yes, even on
	str $ctx,[sp,#64]
	vrev32.8 @X[1],@X[1] @ big-endian
	str $inp,[sp,#68]
	mov $Xfer,sp
	vrev32.8 @X[2],@X[2]
	str $len,[sp,#72]
	vrev32.8 @X[3],@X[3]
	str $t2,[sp,#76] @ save original sp
	vadd.i32 $T0,$T0,@X[0]
	vadd.i32 $T1,$T1,@X[1]
	vst1.32 {$T0},[$Xfer,:128]!
	vadd.i32 $T2,$T2,@X[2]
	vst1.32 {$T1},[$Xfer,:128]!
	vadd.i32 $T3,$T3,@X[3]
	vst1.32 {$T2},[$Xfer,:128]!
	vst1.32 {$T3},[$Xfer,:128]!
	ldmia $ctx,{$A-$H}
	sub $Xfer,$Xfer,#64
	ldr $t1,[sp,#0]
	eor $t2,$t2,$t2
	eor $t3,$B,$C
	b .L_00_48

.align 4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq $t1,#0 @ check for K256 terminator
	ldr $t1,[sp,#0]
	sub $Xfer,$Xfer,#64
	bne .L_00_48
	ldr $inp,[sp,#68]
	ldr $t0,[sp,#72]
	sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
	teq $inp,$t0
	it eq
	subeq $inp,$inp,#64 @ avoid SEGV
	vld1.8 {@X[0]},[$inp]! @ load next input block
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	it ne
	strne $inp,[sp,#68]
	mov $Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr $t0,[$t1,#0]
	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t2,[$t1,#4]
	ldr $t3,[$t1,#8]
	ldr $t4,[$t1,#12]
	add $A,$A,$t0 @ accumulate
	ldr $t0,[$t1,#16]
	add $B,$B,$t2
	ldr $t2,[$t1,#20]
	add $C,$C,$t3
	ldr $t3,[$t1,#24]
	add $D,$D,$t4
	ldr $t4,[$t1,#28]
	add $E,$E,$t0
	str $A,[$t1],#4
	add $F,$F,$t2
	str $B,[$t1],#4
	add $G,$G,$t3
	str $C,[$t1],#4
	add $H,$H,$t4
	str $D,[$t1],#4
	stmia $t1,{$E-$H}
	ittte ne
	movne $Xfer,sp
	ldrne $t1,[sp,#0]
	eorne $t2,$t2,$t2
	ldreq sp,[sp,#76] @ restore original sp
	itt ne
	eorne $t3,$B,$C
	bne .L_00_48
	ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
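
# The ARMv8 path uses the SHA-256 crypto extensions: sha256su0/sha256su1
# update the message schedule in @MSG, while sha256h/sha256h2 advance the
# $ABCD/$EFGH halves of the state, four rounds per instruction pair.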
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# ifdef __thumb2__
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif

.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr $Ktbl,.LARMv8
	sub $Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl $Ktbl,K256
# endif
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
.Loop_v8:
	vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
	vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
	vld1.32 {$W0},[$Ktbl]!
	vrev32.8 @MSG[0],@MSG[0]
	vrev32.8 @MSG[1],@MSG[1]
	vrev32.8 @MSG[2],@MSG[2]
	vrev32.8 @MSG[3],@MSG[3]
	vmov $ABCD_SAVE,$ABCD @ offload
	vmov $EFGH_SAVE,$EFGH
	teq $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	sha256su0 @MSG[0],@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0
	sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0
	vld1.32 {$W0},[$Ktbl]!
	vadd.i32 $W1,$W1,@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1
	vld1.32 {$W1},[$Ktbl]
	vadd.i32 $W0,$W0,@MSG[2]
	sub $Ktbl,$Ktbl,#256-16 @ rewind
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0
	vadd.i32 $W1,$W1,@MSG[3]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1
	vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
	vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
	it ne
	bne .Loop_v8
	vst1.32 {$ABCD,$EFGH},[$ctx]
	ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{ my %opcode = (
	"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
	"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );

  sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian; the
	    # correct solution would be the .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
  }
}
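
# Tracing unsha256() above: "sha256h q0,q1,q12" (i.e. sha256h $ABCD,$EFGH,$W0)
# gives $word = 0xf3020c68 and is emitted as
#   INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
# so the raw opcode assembles even with toolchains that lack the SHA-2
# mnemonics; INST() rearranges the bytes for Thumb-2 encodings (see its
# definition above).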
foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/geo;
	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
	s/\bret\b/bx lr/go or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
	print $_,"\n";
}

close STDOUT; # enforce flush