  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. #
  8. # Specific modes and adaptation for Linux kernel by Ard Biesheuvel
  9. # <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
  10. # granted.
  11. # ====================================================================
  12. # Bit-sliced AES for ARM NEON
  13. #
  14. # February 2012.
  15. #
  16. # This implementation is a direct adaptation of the bsaes-x86_64 module
  17. # for ARM NEON, except that this module is endian-neutral [in the sense
  18. # that it can be compiled for either endianness] courtesy of vld1.8's
  19. # neutrality. The initial version doesn't implement an interface to
  20. # OpenSSL, only low-level primitives and unsupported entry points, just
  21. # enough to collect performance results, which for the Cortex-A8 core are:
  22. #
  23. # encrypt 19.5 cycles per byte processed with 128-bit key
  24. # decrypt 22.1 cycles per byte processed with 128-bit key
  25. # key conv. 440 cycles per 128-bit key/0.18 of 8x block
  26. #
  27. # Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
  28. # which is [much] worse than anticipated (for further details see
  29. # http://www.openssl.org/~appro/Snapdragon-S4.html).
  30. #
  31. # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
  32. # manages in 20.0 cycles].
  33. #
  34. # When comparing to x86_64 results keep in mind that NEON unit is
  35. # [mostly] single-issue and thus can't [fully] benefit from
  36. # instruction-level parallelism. And when comparing to aes-armv4
  37. # results keep in mind key schedule conversion overhead (see
  38. # bsaes-x86_64.pl for further details)...
  39. #
  40. # <appro@openssl.org>
  41. # April-August 2013
  42. #
  43. # Add CBC, CTR and XTS subroutines, adapt for kernel use.
  44. #
  45. # <ard.biesheuvel@linaro.org>
  46. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  47. open STDOUT,">$output";
  48. my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
  49. my @XMM=map("q$_",(0..15));
  50. {
  51. my ($key,$rounds,$const)=("r4","r5","r6");
  52. sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
  53. sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
  54. sub Sbox {
  55. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  56. # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
  57. my @b=@_[0..7];
  58. my @t=@_[8..11];
  59. my @s=@_[12..15];
  60. &InBasisChange (@b);
  61. &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
  62. &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
  63. }
  64. sub InBasisChange {
  65. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  66. # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
  67. my @b=@_[0..7];
  68. $code.=<<___;
  69. veor @b[2], @b[2], @b[1]
  70. veor @b[5], @b[5], @b[6]
  71. veor @b[3], @b[3], @b[0]
  72. veor @b[6], @b[6], @b[2]
  73. veor @b[5], @b[5], @b[0]
  74. veor @b[6], @b[6], @b[3]
  75. veor @b[3], @b[3], @b[7]
  76. veor @b[7], @b[7], @b[5]
  77. veor @b[3], @b[3], @b[4]
  78. veor @b[4], @b[4], @b[5]
  79. veor @b[2], @b[2], @b[7]
  80. veor @b[3], @b[3], @b[1]
  81. veor @b[1], @b[1], @b[5]
  82. ___
  83. }
  84. sub OutBasisChange {
  85. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  86. # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
  87. my @b=@_[0..7];
  88. $code.=<<___;
  89. veor @b[0], @b[0], @b[6]
  90. veor @b[1], @b[1], @b[4]
  91. veor @b[4], @b[4], @b[6]
  92. veor @b[2], @b[2], @b[0]
  93. veor @b[6], @b[6], @b[1]
  94. veor @b[1], @b[1], @b[5]
  95. veor @b[5], @b[5], @b[3]
  96. veor @b[3], @b[3], @b[7]
  97. veor @b[7], @b[7], @b[5]
  98. veor @b[2], @b[2], @b[5]
  99. veor @b[4], @b[4], @b[7]
  100. ___
  101. }
  102. sub InvSbox {
  103. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  104. # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
  105. my @b=@_[0..7];
  106. my @t=@_[8..11];
  107. my @s=@_[12..15];
  108. &InvInBasisChange (@b);
  109. &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
  110. &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
  111. }
  112. sub InvInBasisChange { # OutBasisChange in reverse (with twist)
  113. my @b=@_[5,1,2,6,3,7,0,4];
  114. $code.=<<___
  115. veor @b[1], @b[1], @b[7]
  116. veor @b[4], @b[4], @b[7]
  117. veor @b[7], @b[7], @b[5]
  118. veor @b[1], @b[1], @b[3]
  119. veor @b[2], @b[2], @b[5]
  120. veor @b[3], @b[3], @b[7]
  121. veor @b[6], @b[6], @b[1]
  122. veor @b[2], @b[2], @b[0]
  123. veor @b[5], @b[5], @b[3]
  124. veor @b[4], @b[4], @b[6]
  125. veor @b[0], @b[0], @b[6]
  126. veor @b[1], @b[1], @b[4]
  127. ___
  128. }
  129. sub InvOutBasisChange { # InBasisChange in reverse
  130. my @b=@_[2,5,7,3,6,1,0,4];
  131. $code.=<<___;
  132. veor @b[1], @b[1], @b[5]
  133. veor @b[2], @b[2], @b[7]
  134. veor @b[3], @b[3], @b[1]
  135. veor @b[4], @b[4], @b[5]
  136. veor @b[7], @b[7], @b[5]
  137. veor @b[3], @b[3], @b[4]
  138. veor @b[5], @b[5], @b[0]
  139. veor @b[3], @b[3], @b[7]
  140. veor @b[6], @b[6], @b[2]
  141. veor @b[2], @b[2], @b[1]
  142. veor @b[6], @b[6], @b[3]
  143. veor @b[3], @b[3], @b[0]
  144. veor @b[5], @b[5], @b[6]
  145. ___
  146. }
  147. sub Mul_GF4 {
  148. #;*************************************************************
  149. #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
  150. #;*************************************************************
  151. my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
  152. $code.=<<___;
  153. veor $t0, $y0, $y1
  154. vand $t0, $t0, $x0
  155. veor $x0, $x0, $x1
  156. vand $t1, $x1, $y0
  157. vand $x0, $x0, $y1
  158. veor $x1, $t1, $t0
  159. veor $x0, $x0, $t1
  160. ___
  161. }
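# The four-AND/four-XOR sequence emitted above is easiest to sanity-check in
# scalar form.  The sketch below reflects how I read the formula: it assumes
# the two-bit GF(4) operands are kept in the normal basis {W, W^2}
# (W^2 = W + 1), with $x0/$y0 carrying the W^2 coefficient and $x1/$y1 the W
# coefficient, and cross-checks the straight-line code against a reference
# multiplication table.  Illustration only; not invoked by the generator.
sub _mul_gf4_crosscheck {
	my @mul = ([0,0,0,0],[0,1,2,3],[0,2,3,1],[0,3,1,2]);	# GF(4), polynomial basis
	for my $x0 (0,1) { for my $x1 (0,1) { for my $y0 (0,1) { for my $y1 (0,1) {
		my $t0  = ($y0 ^ $y1) & $x0;			# the veor/vand pair above
		my $t1  = $x1 & $y0;
		my $nx1 = $t1 ^ $t0;				# new $x1
		my $nx0 = (($x0 ^ $x1) & $y1) ^ $t1;		# new $x0
		my $a   = ($x0 ? 3 : 0) ^ ($x1 ? 2 : 0);	# to polynomial basis
		my $b   = ($y0 ? 3 : 0) ^ ($y1 ? 2 : 0);
		my $r   = ($nx0 ? 3 : 0) ^ ($nx1 ? 2 : 0);
		return 0 if $r != $mul[$a][$b];
	}}}}
	return 1;
}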
  162. sub Mul_GF4_N { # not used, see next subroutine
  163. # multiply and scale by N
  164. my ($x0,$x1,$y0,$y1,$t0)=@_;
  165. $code.=<<___;
  166. veor $t0, $y0, $y1
  167. vand $t0, $t0, $x0
  168. veor $x0, $x0, $x1
  169. vand $x1, $x1, $y0
  170. vand $x0, $x0, $y1
  171. veor $x1, $x1, $x0
  172. veor $x0, $x0, $t0
  173. ___
  174. }
  175. sub Mul_GF4_N_GF4 {
  176. # interleaved Mul_GF4_N and Mul_GF4
  177. my ($x0,$x1,$y0,$y1,$t0,
  178. $x2,$x3,$y2,$y3,$t1)=@_;
  179. $code.=<<___;
  180. veor $t0, $y0, $y1
  181. veor $t1, $y2, $y3
  182. vand $t0, $t0, $x0
  183. vand $t1, $t1, $x2
  184. veor $x0, $x0, $x1
  185. veor $x2, $x2, $x3
  186. vand $x1, $x1, $y0
  187. vand $x3, $x3, $y2
  188. vand $x0, $x0, $y1
  189. vand $x2, $x2, $y3
  190. veor $x1, $x1, $x0
  191. veor $x2, $x2, $x3
  192. veor $x0, $x0, $t0
  193. veor $x3, $x3, $t1
  194. ___
  195. }
  196. sub Mul_GF16_2 {
  197. my @x=@_[0..7];
  198. my @y=@_[8..11];
  199. my @t=@_[12..15];
  200. $code.=<<___;
  201. veor @t[0], @x[0], @x[2]
  202. veor @t[1], @x[1], @x[3]
  203. ___
  204. &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
  205. $code.=<<___;
  206. veor @y[0], @y[0], @y[2]
  207. veor @y[1], @y[1], @y[3]
  208. ___
  209. Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  210. @x[2], @x[3], @y[2], @y[3], @t[2]);
  211. $code.=<<___;
  212. veor @x[0], @x[0], @t[0]
  213. veor @x[2], @x[2], @t[0]
  214. veor @x[1], @x[1], @t[1]
  215. veor @x[3], @x[3], @t[1]
  216. veor @t[0], @x[4], @x[6]
  217. veor @t[1], @x[5], @x[7]
  218. ___
  219. &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  220. @x[6], @x[7], @y[2], @y[3], @t[2]);
  221. $code.=<<___;
  222. veor @y[0], @y[0], @y[2]
  223. veor @y[1], @y[1], @y[3]
  224. ___
  225. &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
  226. $code.=<<___;
  227. veor @x[4], @x[4], @t[0]
  228. veor @x[6], @x[6], @t[0]
  229. veor @x[5], @x[5], @t[1]
  230. veor @x[7], @x[7], @t[1]
  231. ___
  232. }
  233. sub Inv_GF256 {
  234. #;********************************************************************
  235. #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
  236. #;********************************************************************
  237. my @x=@_[0..7];
  238. my @t=@_[8..11];
  239. my @s=@_[12..15];
  240. # direct optimizations from hardware
  241. $code.=<<___;
  242. veor @t[3], @x[4], @x[6]
  243. veor @t[2], @x[5], @x[7]
  244. veor @t[1], @x[1], @x[3]
  245. veor @s[1], @x[7], @x[6]
  246. vmov @t[0], @t[2]
  247. veor @s[0], @x[0], @x[2]
  248. vorr @t[2], @t[2], @t[1]
  249. veor @s[3], @t[3], @t[0]
  250. vand @s[2], @t[3], @s[0]
  251. vorr @t[3], @t[3], @s[0]
  252. veor @s[0], @s[0], @t[1]
  253. vand @t[0], @t[0], @t[1]
  254. veor @t[1], @x[3], @x[2]
  255. vand @s[3], @s[3], @s[0]
  256. vand @s[1], @s[1], @t[1]
  257. veor @t[1], @x[4], @x[5]
  258. veor @s[0], @x[1], @x[0]
  259. veor @t[3], @t[3], @s[1]
  260. veor @t[2], @t[2], @s[1]
  261. vand @s[1], @t[1], @s[0]
  262. vorr @t[1], @t[1], @s[0]
  263. veor @t[3], @t[3], @s[3]
  264. veor @t[0], @t[0], @s[1]
  265. veor @t[2], @t[2], @s[2]
  266. veor @t[1], @t[1], @s[3]
  267. veor @t[0], @t[0], @s[2]
  268. vand @s[0], @x[7], @x[3]
  269. veor @t[1], @t[1], @s[2]
  270. vand @s[1], @x[6], @x[2]
  271. vand @s[2], @x[5], @x[1]
  272. vorr @s[3], @x[4], @x[0]
  273. veor @t[3], @t[3], @s[0]
  274. veor @t[1], @t[1], @s[2]
  275. veor @t[0], @t[0], @s[3]
  276. veor @t[2], @t[2], @s[1]
  277. @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
  278. @ new smaller inversion
  279. vand @s[2], @t[3], @t[1]
  280. vmov @s[0], @t[0]
  281. veor @s[1], @t[2], @s[2]
  282. veor @s[3], @t[0], @s[2]
  283. veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
  284. vbsl @s[1], @t[1], @t[0]
  285. vbsl @s[3], @t[3], @t[2]
  286. veor @t[3], @t[3], @t[2]
  287. vbsl @s[0], @s[1], @s[2]
  288. vbsl @t[0], @s[2], @s[1]
  289. vand @s[2], @s[0], @s[3]
  290. veor @t[1], @t[1], @t[0]
  291. veor @s[2], @s[2], @t[3]
  292. ___
  293. # output in s3, s2, s1, t1
  294. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
  295. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
  296. &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
  297. ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
  298. }
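# Inv_GF256 is the expensive part of the S-box: the multiplicative inverse in
# GF(2^8), computed above as a compact boolean circuit on the basis-changed
# bits.  As plain math, independent of the basis-change wrappers, the same map
# is x -> x^254; a deliberately slow scalar reference in the usual AES
# polynomial basis, shown as a sketch only and not invoked by the generator:
sub _gf256_inverse_reference {
	my ($x) = @_;				# 0..255; 0 maps to 0
	my $gmul = sub {			# GF(2^8) multiply, AES polynomial 0x11b
		my ($a, $b) = @_;
		my $p = 0;
		for (1..8) {
			$p ^= $a if $b & 1;
			my $hi = $a & 0x80;
			$a = ($a << 1) & 0xff;
			$a ^= 0x1b if $hi;
			$b >>= 1;
		}
		$p;
	};
	my $r = 1;
	$r = $gmul->($r, $x) for 1..254;	# x^254 = x^-1 for x != 0
	return $r;
}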
  299. # AES linear components
  300. sub ShiftRows {
  301. my @x=@_[0..7];
  302. my @t=@_[8..11];
  303. my $mask=pop;
  304. $code.=<<___;
  305. vldmia $key!, {@t[0]-@t[3]}
  306. veor @t[0], @t[0], @x[0]
  307. veor @t[1], @t[1], @x[1]
  308. vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
  309. vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
  310. vldmia $key!, {@t[0]}
  311. veor @t[2], @t[2], @x[2]
  312. vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
  313. vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
  314. vldmia $key!, {@t[1]}
  315. veor @t[3], @t[3], @x[3]
  316. vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
  317. vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
  318. vldmia $key!, {@t[2]}
  319. vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
  320. vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
  321. vldmia $key!, {@t[3]}
  322. veor @t[0], @t[0], @x[4]
  323. veor @t[1], @t[1], @x[5]
  324. vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
  325. vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
  326. veor @t[2], @t[2], @x[6]
  327. vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
  328. vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
  329. veor @t[3], @t[3], @x[7]
  330. vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
  331. vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
  332. vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
  333. vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
  334. ___
  335. }
  336. sub MixColumns {
  337. # modified to emit output in order suitable for feeding back to aesenc[last]
  338. my @x=@_[0..7];
  339. my @t=@_[8..15];
  340. my $inv=@_[16]; # optional
  341. $code.=<<___;
  342. vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
  343. vext.8 @t[1], @x[1], @x[1], #12
  344. veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
  345. vext.8 @t[2], @x[2], @x[2], #12
  346. veor @x[1], @x[1], @t[1]
  347. vext.8 @t[3], @x[3], @x[3], #12
  348. veor @x[2], @x[2], @t[2]
  349. vext.8 @t[4], @x[4], @x[4], #12
  350. veor @x[3], @x[3], @t[3]
  351. vext.8 @t[5], @x[5], @x[5], #12
  352. veor @x[4], @x[4], @t[4]
  353. vext.8 @t[6], @x[6], @x[6], #12
  354. veor @x[5], @x[5], @t[5]
  355. vext.8 @t[7], @x[7], @x[7], #12
  356. veor @x[6], @x[6], @t[6]
  357. veor @t[1], @t[1], @x[0]
  358. veor @x[7], @x[7], @t[7]
  359. vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64
  360. veor @t[2], @t[2], @x[1]
  361. veor @t[0], @t[0], @x[7]
  362. veor @t[1], @t[1], @x[7]
  363. vext.8 @x[1], @x[1], @x[1], #8
  364. veor @t[5], @t[5], @x[4]
  365. veor @x[0], @x[0], @t[0]
  366. veor @t[6], @t[6], @x[5]
  367. veor @x[1], @x[1], @t[1]
  368. vext.8 @t[0], @x[4], @x[4], #8
  369. veor @t[4], @t[4], @x[3]
  370. vext.8 @t[1], @x[5], @x[5], #8
  371. veor @t[7], @t[7], @x[6]
  372. vext.8 @x[4], @x[3], @x[3], #8
  373. veor @t[3], @t[3], @x[2]
  374. vext.8 @x[5], @x[7], @x[7], #8
  375. veor @t[4], @t[4], @x[7]
  376. vext.8 @x[3], @x[6], @x[6], #8
  377. veor @t[3], @t[3], @x[7]
  378. vext.8 @x[6], @x[2], @x[2], #8
  379. veor @x[7], @t[1], @t[5]
  380. ___
  381. $code.=<<___ if (!$inv);
  382. veor @x[2], @t[0], @t[4]
  383. veor @x[4], @x[4], @t[3]
  384. veor @x[5], @x[5], @t[7]
  385. veor @x[3], @x[3], @t[6]
  386. @ vmov @x[2], @t[0]
  387. veor @x[6], @x[6], @t[2]
  388. @ vmov @x[7], @t[1]
  389. ___
  390. $code.=<<___ if ($inv);
  391. veor @t[3], @t[3], @x[4]
  392. veor @x[5], @x[5], @t[7]
  393. veor @x[2], @x[3], @t[6]
  394. veor @x[3], @t[0], @t[4]
  395. veor @x[4], @x[6], @t[2]
  396. vmov @x[6], @t[3]
  397. @ vmov @x[7], @t[1]
  398. ___
  399. }
  400. sub InvMixColumns_orig {
  401. my @x=@_[0..7];
  402. my @t=@_[8..15];
  403. $code.=<<___;
  404. @ multiplication by 0x0e
  405. vext.8 @t[7], @x[7], @x[7], #12
  406. vmov @t[2], @x[2]
  407. veor @x[2], @x[2], @x[5] @ 2 5
  408. veor @x[7], @x[7], @x[5] @ 7 5
  409. vext.8 @t[0], @x[0], @x[0], #12
  410. vmov @t[5], @x[5]
  411. veor @x[5], @x[5], @x[0] @ 5 0 [1]
  412. veor @x[0], @x[0], @x[1] @ 0 1
  413. vext.8 @t[1], @x[1], @x[1], #12
  414. veor @x[1], @x[1], @x[2] @ 1 25
  415. veor @x[0], @x[0], @x[6] @ 01 6 [2]
  416. vext.8 @t[3], @x[3], @x[3], #12
  417. veor @x[1], @x[1], @x[3] @ 125 3 [4]
  418. veor @x[2], @x[2], @x[0] @ 25 016 [3]
  419. veor @x[3], @x[3], @x[7] @ 3 75
  420. veor @x[7], @x[7], @x[6] @ 75 6 [0]
  421. vext.8 @t[6], @x[6], @x[6], #12
  422. vmov @t[4], @x[4]
  423. veor @x[6], @x[6], @x[4] @ 6 4
  424. veor @x[4], @x[4], @x[3] @ 4 375 [6]
  425. veor @x[3], @x[3], @x[7] @ 375 756=36
  426. veor @x[6], @x[6], @t[5] @ 64 5 [7]
  427. veor @x[3], @x[3], @t[2] @ 36 2
  428. vext.8 @t[5], @t[5], @t[5], #12
  429. veor @x[3], @x[3], @t[4] @ 362 4 [5]
  430. ___
  431. my @y = @x[7,5,0,2,1,3,4,6];
  432. $code.=<<___;
  433. @ multiplication by 0x0b
  434. veor @y[1], @y[1], @y[0]
  435. veor @y[0], @y[0], @t[0]
  436. vext.8 @t[2], @t[2], @t[2], #12
  437. veor @y[1], @y[1], @t[1]
  438. veor @y[0], @y[0], @t[5]
  439. vext.8 @t[4], @t[4], @t[4], #12
  440. veor @y[1], @y[1], @t[6]
  441. veor @y[0], @y[0], @t[7]
  442. veor @t[7], @t[7], @t[6] @ clobber t[7]
  443. veor @y[3], @y[3], @t[0]
  444. veor @y[1], @y[1], @y[0]
  445. vext.8 @t[0], @t[0], @t[0], #12
  446. veor @y[2], @y[2], @t[1]
  447. veor @y[4], @y[4], @t[1]
  448. vext.8 @t[1], @t[1], @t[1], #12
  449. veor @y[2], @y[2], @t[2]
  450. veor @y[3], @y[3], @t[2]
  451. veor @y[5], @y[5], @t[2]
  452. veor @y[2], @y[2], @t[7]
  453. vext.8 @t[2], @t[2], @t[2], #12
  454. veor @y[3], @y[3], @t[3]
  455. veor @y[6], @y[6], @t[3]
  456. veor @y[4], @y[4], @t[3]
  457. veor @y[7], @y[7], @t[4]
  458. vext.8 @t[3], @t[3], @t[3], #12
  459. veor @y[5], @y[5], @t[4]
  460. veor @y[7], @y[7], @t[7]
  461. veor @t[7], @t[7], @t[5] @ clobber t[7] even more
  462. veor @y[3], @y[3], @t[5]
  463. veor @y[4], @y[4], @t[4]
  464. veor @y[5], @y[5], @t[7]
  465. vext.8 @t[4], @t[4], @t[4], #12
  466. veor @y[6], @y[6], @t[7]
  467. veor @y[4], @y[4], @t[7]
  468. veor @t[7], @t[7], @t[5]
  469. vext.8 @t[5], @t[5], @t[5], #12
  470. @ multiplication by 0x0d
  471. veor @y[4], @y[4], @y[7]
  472. veor @t[7], @t[7], @t[6] @ restore t[7]
  473. veor @y[7], @y[7], @t[4]
  474. vext.8 @t[6], @t[6], @t[6], #12
  475. veor @y[2], @y[2], @t[0]
  476. veor @y[7], @y[7], @t[5]
  477. vext.8 @t[7], @t[7], @t[7], #12
  478. veor @y[2], @y[2], @t[2]
  479. veor @y[3], @y[3], @y[1]
  480. veor @y[1], @y[1], @t[1]
  481. veor @y[0], @y[0], @t[0]
  482. veor @y[3], @y[3], @t[0]
  483. veor @y[1], @y[1], @t[5]
  484. veor @y[0], @y[0], @t[5]
  485. vext.8 @t[0], @t[0], @t[0], #12
  486. veor @y[1], @y[1], @t[7]
  487. veor @y[0], @y[0], @t[6]
  488. veor @y[3], @y[3], @y[1]
  489. veor @y[4], @y[4], @t[1]
  490. vext.8 @t[1], @t[1], @t[1], #12
  491. veor @y[7], @y[7], @t[7]
  492. veor @y[4], @y[4], @t[2]
  493. veor @y[5], @y[5], @t[2]
  494. veor @y[2], @y[2], @t[6]
  495. veor @t[6], @t[6], @t[3] @ clobber t[6]
  496. vext.8 @t[2], @t[2], @t[2], #12
  497. veor @y[4], @y[4], @y[7]
  498. veor @y[3], @y[3], @t[6]
  499. veor @y[6], @y[6], @t[6]
  500. veor @y[5], @y[5], @t[5]
  501. vext.8 @t[5], @t[5], @t[5], #12
  502. veor @y[6], @y[6], @t[4]
  503. vext.8 @t[4], @t[4], @t[4], #12
  504. veor @y[5], @y[5], @t[6]
  505. veor @y[6], @y[6], @t[7]
  506. vext.8 @t[7], @t[7], @t[7], #12
  507. veor @t[6], @t[6], @t[3] @ restore t[6]
  508. vext.8 @t[3], @t[3], @t[3], #12
  509. @ multiplication by 0x09
  510. veor @y[4], @y[4], @y[1]
  511. veor @t[1], @t[1], @y[1] @ t[1]=y[1]
  512. veor @t[0], @t[0], @t[5] @ clobber t[0]
  513. vext.8 @t[6], @t[6], @t[6], #12
  514. veor @t[1], @t[1], @t[5]
  515. veor @y[3], @y[3], @t[0]
  516. veor @t[0], @t[0], @y[0] @ t[0]=y[0]
  517. veor @t[1], @t[1], @t[6]
  518. veor @t[6], @t[6], @t[7] @ clobber t[6]
  519. veor @y[4], @y[4], @t[1]
  520. veor @y[7], @y[7], @t[4]
  521. veor @y[6], @y[6], @t[3]
  522. veor @y[5], @y[5], @t[2]
  523. veor @t[4], @t[4], @y[4] @ t[4]=y[4]
  524. veor @t[3], @t[3], @y[3] @ t[3]=y[3]
  525. veor @t[5], @t[5], @y[5] @ t[5]=y[5]
  526. veor @t[2], @t[2], @y[2] @ t[2]=y[2]
  527. veor @t[3], @t[3], @t[7]
  528. veor @XMM[5], @t[5], @t[6]
  529. veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
  530. veor @XMM[2], @t[2], @t[6]
  531. veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
  532. vmov @XMM[0], @t[0]
  533. vmov @XMM[1], @t[1]
  534. @ vmov @XMM[2], @t[2]
  535. vmov @XMM[3], @t[3]
  536. vmov @XMM[4], @t[4]
  537. @ vmov @XMM[5], @t[5]
  538. @ vmov @XMM[6], @t[6]
  539. @ vmov @XMM[7], @t[7]
  540. ___
  541. }
  542. sub InvMixColumns {
  543. my @x=@_[0..7];
  544. my @t=@_[8..15];
  545. # Thanks to Jussi Kivilinna for providing pointer to
  546. #
  547. # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
  548. # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
  549. # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
  550. # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
  551. $code.=<<___;
  552. @ multiplication by 0x05-0x00-0x04-0x00
  553. vext.8 @t[0], @x[0], @x[0], #8
  554. vext.8 @t[6], @x[6], @x[6], #8
  555. vext.8 @t[7], @x[7], @x[7], #8
  556. veor @t[0], @t[0], @x[0]
  557. vext.8 @t[1], @x[1], @x[1], #8
  558. veor @t[6], @t[6], @x[6]
  559. vext.8 @t[2], @x[2], @x[2], #8
  560. veor @t[7], @t[7], @x[7]
  561. vext.8 @t[3], @x[3], @x[3], #8
  562. veor @t[1], @t[1], @x[1]
  563. vext.8 @t[4], @x[4], @x[4], #8
  564. veor @t[2], @t[2], @x[2]
  565. vext.8 @t[5], @x[5], @x[5], #8
  566. veor @t[3], @t[3], @x[3]
  567. veor @t[4], @t[4], @x[4]
  568. veor @t[5], @t[5], @x[5]
  569. veor @x[0], @x[0], @t[6]
  570. veor @x[1], @x[1], @t[6]
  571. veor @x[2], @x[2], @t[0]
  572. veor @x[4], @x[4], @t[2]
  573. veor @x[3], @x[3], @t[1]
  574. veor @x[1], @x[1], @t[7]
  575. veor @x[2], @x[2], @t[7]
  576. veor @x[4], @x[4], @t[6]
  577. veor @x[5], @x[5], @t[3]
  578. veor @x[3], @x[3], @t[6]
  579. veor @x[6], @x[6], @t[4]
  580. veor @x[4], @x[4], @t[7]
  581. veor @x[5], @x[5], @t[7]
  582. veor @x[7], @x[7], @t[5]
  583. ___
  584. &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
  585. }
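# The 0x05-0x00-0x04-0x00 pre-multiplication above relies on the identity
# quoted in the comment: InvMixColumns = MixColumns x circ(05,00,04,00) over
# GF(2^8).  A minimal brute-force check of that identity, phrased as
# multiplication of the corresponding polynomials modulo x^4+1; a sketch for
# reference only, not invoked by the generator.
sub _invmix_factorisation_holds {
	my $gmul = sub {			# GF(2^8) multiply, AES polynomial 0x11b
		my ($a, $b) = @_;
		my $p = 0;
		for (1..8) {
			$p ^= $a if $b & 1;
			my $hi = $a & 0x80;
			$a = ($a << 1) & 0xff;
			$a ^= 0x1b if $hi;
			$b >>= 1;
		}
		$p;
	};
	# coefficient arrays indexed by power of x
	my @mix    = (0x02, 0x01, 0x01, 0x03);	# MixColumns:    {03}x^3+{01}x^2+{01}x+{02}
	my @post   = (0x05, 0x00, 0x04, 0x00);	# extra factor:  {04}x^2+{05}
	my @invmix = (0x0e, 0x09, 0x0d, 0x0b);	# InvMixColumns: {0b}x^3+{0d}x^2+{09}x+{0e}
	my @prod   = (0, 0, 0, 0);
	for my $i (0..3) {
		for my $j (0..3) {
			$prod[($i + $j) % 4] ^= $gmul->($mix[$i], $post[$j]);
		}
	}
	return "@prod" eq "@invmix";
}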
  586. sub swapmove {
  587. my ($a,$b,$n,$mask,$t)=@_;
  588. $code.=<<___;
  589. vshr.u64 $t, $b, #$n
  590. veor $t, $t, $a
  591. vand $t, $t, $mask
  592. veor $a, $a, $t
  593. vshl.u64 $t, $t, #$n
  594. veor $b, $b, $t
  595. ___
  596. }
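# swapmove() emits the classic SWAPMOVE primitive: for every set bit of $mask,
# bit i of $a is exchanged with bit i+$n of $b.  A scalar sketch of the same
# identity (assuming 64-bit integer Perl; the NEON code applies it per 64-bit
# lane).  Three such passes with masks 0x55../0x33../0x0f.. and n = 1, 2, 4,
# as wired up in bitslice() below, regroup the eight inputs by bit index.
# Not invoked by the generator.
sub _swapmove_scalar {
	my ($a, $b, $n, $mask) = @_;
	my $t = (($b >> $n) ^ $a) & $mask;	# bits of $b that must migrate into $a
	return ($a ^ $t, $b ^ ($t << $n));
}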
  597. sub swapmove2x {
  598. my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
  599. $code.=<<___;
  600. vshr.u64 $t0, $b0, #$n
  601. vshr.u64 $t1, $b1, #$n
  602. veor $t0, $t0, $a0
  603. veor $t1, $t1, $a1
  604. vand $t0, $t0, $mask
  605. vand $t1, $t1, $mask
  606. veor $a0, $a0, $t0
  607. vshl.u64 $t0, $t0, #$n
  608. veor $a1, $a1, $t1
  609. vshl.u64 $t1, $t1, #$n
  610. veor $b0, $b0, $t0
  611. veor $b1, $b1, $t1
  612. ___
  613. }
  614. sub bitslice {
  615. my @x=reverse(@_[0..7]);
  616. my ($t0,$t1,$t2,$t3)=@_[8..11];
  617. $code.=<<___;
  618. vmov.i8 $t0,#0x55 @ compose .LBS0
  619. vmov.i8 $t1,#0x33 @ compose .LBS1
  620. ___
  621. &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
  622. &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  623. $code.=<<___;
  624. vmov.i8 $t0,#0x0f @ compose .LBS2
  625. ___
  626. &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
  627. &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  628. &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
  629. &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
  630. }
  631. $code.=<<___;
  632. #ifndef __KERNEL__
  633. # include "arm_arch.h"
  634. # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
  635. # define VFP_ABI_POP vldmia sp!,{d8-d15}
  636. # define VFP_ABI_FRAME 0x40
  637. #else
  638. # define VFP_ABI_PUSH
  639. # define VFP_ABI_POP
  640. # define VFP_ABI_FRAME 0
  641. # define BSAES_ASM_EXTENDED_KEY
  642. # define XTS_CHAIN_TWEAK
  643. # define __ARM_ARCH__ 7
  644. #endif
  645. #ifdef __thumb__
  646. # define adrl adr
  647. #endif
  648. #if __ARM_ARCH__>=7
  649. .text
  650. .syntax unified @ ARMv7-capable assembler is expected to handle this
  651. #ifdef __thumb2__
  652. .thumb
  653. #else
  654. .code 32
  655. #endif
  656. .fpu neon
  657. .type _bsaes_decrypt8,%function
  658. .align 4
  659. _bsaes_decrypt8:
  660. adr $const,_bsaes_decrypt8
  661. vldmia $key!, {@XMM[9]} @ round 0 key
  662. add $const,$const,#.LM0ISR-_bsaes_decrypt8
  663. vldmia $const!, {@XMM[8]} @ .LM0ISR
  664. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  665. veor @XMM[11], @XMM[1], @XMM[9]
  666. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  667. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  668. veor @XMM[12], @XMM[2], @XMM[9]
  669. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  670. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  671. veor @XMM[13], @XMM[3], @XMM[9]
  672. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  673. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  674. veor @XMM[14], @XMM[4], @XMM[9]
  675. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  676. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  677. veor @XMM[15], @XMM[5], @XMM[9]
  678. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  679. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  680. veor @XMM[10], @XMM[6], @XMM[9]
  681. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  682. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  683. veor @XMM[11], @XMM[7], @XMM[9]
  684. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  685. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  686. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  687. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  688. ___
  689. &bitslice (@XMM[0..7, 8..11]);
  690. $code.=<<___;
  691. sub $rounds,$rounds,#1
  692. b .Ldec_sbox
  693. .align 4
  694. .Ldec_loop:
  695. ___
  696. &ShiftRows (@XMM[0..7, 8..12]);
  697. $code.=".Ldec_sbox:\n";
  698. &InvSbox (@XMM[0..7, 8..15]);
  699. $code.=<<___;
  700. subs $rounds,$rounds,#1
  701. bcc .Ldec_done
  702. ___
  703. &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
  704. $code.=<<___;
  705. vldmia $const, {@XMM[12]} @ .LISR
  706. ite eq @ Thumb2 thing, sanity check in ARM
  707. addeq $const,$const,#0x10
  708. bne .Ldec_loop
  709. vldmia $const, {@XMM[12]} @ .LISRM0
  710. b .Ldec_loop
  711. .align 4
  712. .Ldec_done:
  713. ___
  714. &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
  715. $code.=<<___;
  716. vldmia $key, {@XMM[8]} @ last round key
  717. veor @XMM[6], @XMM[6], @XMM[8]
  718. veor @XMM[4], @XMM[4], @XMM[8]
  719. veor @XMM[2], @XMM[2], @XMM[8]
  720. veor @XMM[7], @XMM[7], @XMM[8]
  721. veor @XMM[3], @XMM[3], @XMM[8]
  722. veor @XMM[5], @XMM[5], @XMM[8]
  723. veor @XMM[0], @XMM[0], @XMM[8]
  724. veor @XMM[1], @XMM[1], @XMM[8]
  725. bx lr
  726. .size _bsaes_decrypt8,.-_bsaes_decrypt8
  727. .type _bsaes_const,%object
  728. .align 6
  729. _bsaes_const:
  730. .LM0ISR: @ InvShiftRows constants
  731. .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
  732. .LISR:
  733. .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
  734. .LISRM0:
  735. .quad 0x01040b0e0205080f, 0x0306090c00070a0d
  736. .LM0SR: @ ShiftRows constants
  737. .quad 0x0a0e02060f03070b, 0x0004080c05090d01
  738. .LSR:
  739. .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
  740. .LSRM0:
  741. .quad 0x0304090e00050a0f, 0x01060b0c0207080d
  742. .LM0:
  743. .quad 0x02060a0e03070b0f, 0x0004080c0105090d
  744. .LREVM0SR:
  745. .quad 0x090d01050c000408, 0x03070b0f060a0e02
  746. .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
  747. .align 6
  748. .size _bsaes_const,.-_bsaes_const
  749. .type _bsaes_encrypt8,%function
  750. .align 4
  751. _bsaes_encrypt8:
  752. adr $const,_bsaes_encrypt8
  753. vldmia $key!, {@XMM[9]} @ round 0 key
  754. sub $const,$const,#_bsaes_encrypt8-.LM0SR
  755. vldmia $const!, {@XMM[8]} @ .LM0SR
  756. _bsaes_encrypt8_alt:
  757. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  758. veor @XMM[11], @XMM[1], @XMM[9]
  759. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  760. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  761. veor @XMM[12], @XMM[2], @XMM[9]
  762. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  763. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  764. veor @XMM[13], @XMM[3], @XMM[9]
  765. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  766. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  767. veor @XMM[14], @XMM[4], @XMM[9]
  768. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  769. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  770. veor @XMM[15], @XMM[5], @XMM[9]
  771. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  772. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  773. veor @XMM[10], @XMM[6], @XMM[9]
  774. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  775. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  776. veor @XMM[11], @XMM[7], @XMM[9]
  777. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  778. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  779. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  780. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  781. _bsaes_encrypt8_bitslice:
  782. ___
  783. &bitslice (@XMM[0..7, 8..11]);
  784. $code.=<<___;
  785. sub $rounds,$rounds,#1
  786. b .Lenc_sbox
  787. .align 4
  788. .Lenc_loop:
  789. ___
  790. &ShiftRows (@XMM[0..7, 8..12]);
  791. $code.=".Lenc_sbox:\n";
  792. &Sbox (@XMM[0..7, 8..15]);
  793. $code.=<<___;
  794. subs $rounds,$rounds,#1
  795. bcc .Lenc_done
  796. ___
  797. &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
  798. $code.=<<___;
  799. vldmia $const, {@XMM[12]} @ .LSR
  800. ite eq @ Thumb2 thing, sanity check in ARM
  801. addeq $const,$const,#0x10
  802. bne .Lenc_loop
  803. vldmia $const, {@XMM[12]} @ .LSRM0
  804. b .Lenc_loop
  805. .align 4
  806. .Lenc_done:
  807. ___
  808. # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
  809. &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
  810. $code.=<<___;
  811. vldmia $key, {@XMM[8]} @ last round key
  812. veor @XMM[4], @XMM[4], @XMM[8]
  813. veor @XMM[6], @XMM[6], @XMM[8]
  814. veor @XMM[3], @XMM[3], @XMM[8]
  815. veor @XMM[7], @XMM[7], @XMM[8]
  816. veor @XMM[2], @XMM[2], @XMM[8]
  817. veor @XMM[5], @XMM[5], @XMM[8]
  818. veor @XMM[0], @XMM[0], @XMM[8]
  819. veor @XMM[1], @XMM[1], @XMM[8]
  820. bx lr
  821. .size _bsaes_encrypt8,.-_bsaes_encrypt8
  822. ___
  823. }
  824. {
  825. my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
  826. sub bitslice_key {
  827. my @x=reverse(@_[0..7]);
  828. my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
  829. &swapmove (@x[0,1],1,$bs0,$t2,$t3);
  830. $code.=<<___;
  831. @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
  832. vmov @x[2], @x[0]
  833. vmov @x[3], @x[1]
  834. ___
  835. #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  836. &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
  837. $code.=<<___;
  838. @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  839. vmov @x[4], @x[0]
  840. vmov @x[6], @x[2]
  841. vmov @x[5], @x[1]
  842. vmov @x[7], @x[3]
  843. ___
  844. &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
  845. &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
  846. }
  847. $code.=<<___;
  848. .type _bsaes_key_convert,%function
  849. .align 4
  850. _bsaes_key_convert:
  851. adr $const,_bsaes_key_convert
  852. vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
  853. sub $const,$const,#_bsaes_key_convert-.LM0
  854. vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
  855. vmov.i8 @XMM[8], #0x01 @ bit masks
  856. vmov.i8 @XMM[9], #0x02
  857. vmov.i8 @XMM[10], #0x04
  858. vmov.i8 @XMM[11], #0x08
  859. vmov.i8 @XMM[12], #0x10
  860. vmov.i8 @XMM[13], #0x20
  861. vldmia $const, {@XMM[14]} @ .LM0
  862. #ifdef __ARMEL__
  863. vrev32.8 @XMM[7], @XMM[7]
  864. vrev32.8 @XMM[15], @XMM[15]
  865. #endif
  866. sub $rounds,$rounds,#1
  867. vstmia $out!, {@XMM[7]} @ save round 0 key
  868. b .Lkey_loop
  869. .align 4
  870. .Lkey_loop:
  871. vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
  872. vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
  873. vmov.i8 @XMM[6], #0x40
  874. vmov.i8 @XMM[15], #0x80
  875. vtst.8 @XMM[0], @XMM[7], @XMM[8]
  876. vtst.8 @XMM[1], @XMM[7], @XMM[9]
  877. vtst.8 @XMM[2], @XMM[7], @XMM[10]
  878. vtst.8 @XMM[3], @XMM[7], @XMM[11]
  879. vtst.8 @XMM[4], @XMM[7], @XMM[12]
  880. vtst.8 @XMM[5], @XMM[7], @XMM[13]
  881. vtst.8 @XMM[6], @XMM[7], @XMM[6]
  882. vtst.8 @XMM[7], @XMM[7], @XMM[15]
  883. vld1.8 {@XMM[15]}, [$inp]! @ load next round key
  884. vmvn @XMM[0], @XMM[0] @ "pnot"
  885. vmvn @XMM[1], @XMM[1]
  886. vmvn @XMM[5], @XMM[5]
  887. vmvn @XMM[6], @XMM[6]
  888. #ifdef __ARMEL__
  889. vrev32.8 @XMM[15], @XMM[15]
  890. #endif
  891. subs $rounds,$rounds,#1
  892. vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
  893. bne .Lkey_loop
  894. vmov.i8 @XMM[7],#0x63 @ compose .L63
  895. @ don't save last round key
  896. bx lr
  897. .size _bsaes_key_convert,.-_bsaes_key_convert
  898. ___
  899. }
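# _bsaes_key_convert turns every byte of a round key into eight 0x00/0xff mask
# bytes (the vtst.8 against 0x01..0x80) and then complements the masks for bit
# positions 0, 1, 5 and 6 (the four vmvn).  0x63 -- the constant composed into
# @XMM[7] at the end of the routine -- has exactly those bits set, so my
# reading is that the inversion folds the S-box affine constant into the
# converted key.  A scalar sketch of the per-byte step, shown for reference
# only and not invoked by the generator:
sub _byte_to_bitsliced_masks {
	my ($byte) = @_;
	my @m = map { ($byte >> $_) & 1 ? 0xff : 0x00 } 0..7;	# one mask per bit
	$m[$_] ^= 0xff for (0, 1, 5, 6);			# 0x63 = bits 0,1,5,6
	return @m;
}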
  900. if (0) { # the following four functions are an unsupported interface
  901. # used for benchmarking...
  902. $code.=<<___;
  903. .globl bsaes_enc_key_convert
  904. .type bsaes_enc_key_convert,%function
  905. .align 4
  906. bsaes_enc_key_convert:
  907. stmdb sp!,{r4-r6,lr}
  908. vstmdb sp!,{d8-d15} @ ABI specification says so
  909. ldr r5,[$inp,#240] @ pass rounds
  910. mov r4,$inp @ pass key
  911. mov r12,$out @ pass key schedule
  912. bl _bsaes_key_convert
  913. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  914. vstmia r12, {@XMM[7]} @ save last round key
  915. vldmia sp!,{d8-d15}
  916. ldmia sp!,{r4-r6,pc}
  917. .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
  918. .globl bsaes_encrypt_128
  919. .type bsaes_encrypt_128,%function
  920. .align 4
  921. bsaes_encrypt_128:
  922. stmdb sp!,{r4-r6,lr}
  923. vstmdb sp!,{d8-d15} @ ABI specification says so
  924. .Lenc128_loop:
  925. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  926. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  927. mov r4,$key @ pass the key
  928. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  929. mov r5,#10 @ pass rounds
  930. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  931. bl _bsaes_encrypt8
  932. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  933. vst1.8 {@XMM[4]}, [$out]!
  934. vst1.8 {@XMM[6]}, [$out]!
  935. vst1.8 {@XMM[3]}, [$out]!
  936. vst1.8 {@XMM[7]}, [$out]!
  937. vst1.8 {@XMM[2]}, [$out]!
  938. subs $len,$len,#0x80
  939. vst1.8 {@XMM[5]}, [$out]!
  940. bhi .Lenc128_loop
  941. vldmia sp!,{d8-d15}
  942. ldmia sp!,{r4-r6,pc}
  943. .size bsaes_encrypt_128,.-bsaes_encrypt_128
  944. .globl bsaes_dec_key_convert
  945. .type bsaes_dec_key_convert,%function
  946. .align 4
  947. bsaes_dec_key_convert:
  948. stmdb sp!,{r4-r6,lr}
  949. vstmdb sp!,{d8-d15} @ ABI specification says so
  950. ldr r5,[$inp,#240] @ pass rounds
  951. mov r4,$inp @ pass key
  952. mov r12,$out @ pass key schedule
  953. bl _bsaes_key_convert
  954. vldmia $out, {@XMM[6]}
  955. vstmia r12, {@XMM[15]} @ save last round key
  956. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  957. vstmia $out, {@XMM[7]}
  958. vldmia sp!,{d8-d15}
  959. ldmia sp!,{r4-r6,pc}
  960. .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
  961. .globl bsaes_decrypt_128
  962. .type bsaes_decrypt_128,%function
  963. .align 4
  964. bsaes_decrypt_128:
  965. stmdb sp!,{r4-r6,lr}
  966. vstmdb sp!,{d8-d15} @ ABI specification says so
  967. .Ldec128_loop:
  968. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  969. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  970. mov r4,$key @ pass the key
  971. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  972. mov r5,#10 @ pass rounds
  973. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  974. bl _bsaes_decrypt8
  975. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  976. vst1.8 {@XMM[6]}, [$out]!
  977. vst1.8 {@XMM[4]}, [$out]!
  978. vst1.8 {@XMM[2]}, [$out]!
  979. vst1.8 {@XMM[7]}, [$out]!
  980. vst1.8 {@XMM[3]}, [$out]!
  981. subs $len,$len,#0x80
  982. vst1.8 {@XMM[5]}, [$out]!
  983. bhi .Ldec128_loop
  984. vldmia sp!,{d8-d15}
  985. ldmia sp!,{r4-r6,pc}
  986. .size bsaes_decrypt_128,.-bsaes_decrypt_128
  987. ___
  988. }
  989. {
  990. my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
  991. my ($keysched)=("sp");
  992. $code.=<<___;
  993. .extern AES_cbc_encrypt
  994. .extern AES_decrypt
  995. .global bsaes_cbc_encrypt
  996. .type bsaes_cbc_encrypt,%function
  997. .align 5
  998. bsaes_cbc_encrypt:
  999. #ifndef __KERNEL__
  1000. cmp $len, #128
  1001. #ifndef __thumb__
  1002. blo AES_cbc_encrypt
  1003. #else
  1004. bhs 1f
  1005. b AES_cbc_encrypt
  1006. 1:
  1007. #endif
  1008. #endif
  1009. @ it is up to the caller to make sure we are called with enc == 0
  1010. mov ip, sp
  1011. stmdb sp!, {r4-r10, lr}
  1012. VFP_ABI_PUSH
  1013. ldr $ivp, [ip] @ IV is 1st arg on the stack
  1014. mov $len, $len, lsr#4 @ len in 16 byte blocks
  1015. sub sp, #0x10 @ scratch space to carry over the IV
  1016. mov $fp, sp @ save sp
  1017. ldr $rounds, [$key, #240] @ get # of rounds
  1018. #ifndef BSAES_ASM_EXTENDED_KEY
  1019. @ allocate the key schedule on the stack
  1020. sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
  1021. add r12, #`128-32` @ size of bit-sliced key schedule
  1022. @ populate the key schedule
  1023. mov r4, $key @ pass key
  1024. mov r5, $rounds @ pass # of rounds
  1025. mov sp, r12 @ sp is $keysched
  1026. bl _bsaes_key_convert
  1027. vldmia $keysched, {@XMM[6]}
  1028. vstmia r12, {@XMM[15]} @ save last round key
  1029. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1030. vstmia $keysched, {@XMM[7]}
  1031. #else
  1032. ldr r12, [$key, #244]
  1033. eors r12, #1
  1034. beq 0f
  1035. @ populate the key schedule
  1036. str r12, [$key, #244]
  1037. mov r4, $key @ pass key
  1038. mov r5, $rounds @ pass # of rounds
  1039. add r12, $key, #248 @ pass key schedule
  1040. bl _bsaes_key_convert
  1041. add r4, $key, #248
  1042. vldmia r4, {@XMM[6]}
  1043. vstmia r12, {@XMM[15]} @ save last round key
  1044. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1045. vstmia r4, {@XMM[7]}
  1046. .align 2
  1047. 0:
  1048. #endif
  1049. vld1.8 {@XMM[15]}, [$ivp] @ load IV
  1050. b .Lcbc_dec_loop
  1051. .align 4
  1052. .Lcbc_dec_loop:
  1053. subs $len, $len, #0x8
  1054. bmi .Lcbc_dec_loop_finish
  1055. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  1056. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  1057. #ifndef BSAES_ASM_EXTENDED_KEY
  1058. mov r4, $keysched @ pass the key
  1059. #else
  1060. add r4, $key, #248
  1061. #endif
  1062. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  1063. mov r5, $rounds
  1064. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
  1065. sub $inp, $inp, #0x60
  1066. vstmia $fp, {@XMM[15]} @ put aside IV
  1067. bl _bsaes_decrypt8
  1068. vldmia $fp, {@XMM[14]} @ reload IV
  1069. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1070. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1071. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1072. veor @XMM[1], @XMM[1], @XMM[8]
  1073. veor @XMM[6], @XMM[6], @XMM[9]
  1074. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1075. veor @XMM[4], @XMM[4], @XMM[10]
  1076. veor @XMM[2], @XMM[2], @XMM[11]
  1077. vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
  1078. veor @XMM[7], @XMM[7], @XMM[12]
  1079. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1080. veor @XMM[3], @XMM[3], @XMM[13]
  1081. vst1.8 {@XMM[6]}, [$out]!
  1082. veor @XMM[5], @XMM[5], @XMM[14]
  1083. vst1.8 {@XMM[4]}, [$out]!
  1084. vst1.8 {@XMM[2]}, [$out]!
  1085. vst1.8 {@XMM[7]}, [$out]!
  1086. vst1.8 {@XMM[3]}, [$out]!
  1087. vst1.8 {@XMM[5]}, [$out]!
  1088. b .Lcbc_dec_loop
  1089. .Lcbc_dec_loop_finish:
  1090. adds $len, $len, #8
  1091. beq .Lcbc_dec_done
  1092. vld1.8 {@XMM[0]}, [$inp]! @ load input
  1093. cmp $len, #2
  1094. blo .Lcbc_dec_one
  1095. vld1.8 {@XMM[1]}, [$inp]!
  1096. #ifndef BSAES_ASM_EXTENDED_KEY
  1097. mov r4, $keysched @ pass the key
  1098. #else
  1099. add r4, $key, #248
  1100. #endif
  1101. mov r5, $rounds
  1102. vstmia $fp, {@XMM[15]} @ put aside IV
  1103. beq .Lcbc_dec_two
  1104. vld1.8 {@XMM[2]}, [$inp]!
  1105. cmp $len, #4
  1106. blo .Lcbc_dec_three
  1107. vld1.8 {@XMM[3]}, [$inp]!
  1108. beq .Lcbc_dec_four
  1109. vld1.8 {@XMM[4]}, [$inp]!
  1110. cmp $len, #6
  1111. blo .Lcbc_dec_five
  1112. vld1.8 {@XMM[5]}, [$inp]!
  1113. beq .Lcbc_dec_six
  1114. vld1.8 {@XMM[6]}, [$inp]!
  1115. sub $inp, $inp, #0x70
  1116. bl _bsaes_decrypt8
  1117. vldmia $fp, {@XMM[14]} @ reload IV
  1118. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1119. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1120. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1121. veor @XMM[1], @XMM[1], @XMM[8]
  1122. veor @XMM[6], @XMM[6], @XMM[9]
  1123. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1124. veor @XMM[4], @XMM[4], @XMM[10]
  1125. veor @XMM[2], @XMM[2], @XMM[11]
  1126. vld1.8 {@XMM[15]}, [$inp]!
  1127. veor @XMM[7], @XMM[7], @XMM[12]
  1128. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1129. veor @XMM[3], @XMM[3], @XMM[13]
  1130. vst1.8 {@XMM[6]}, [$out]!
  1131. vst1.8 {@XMM[4]}, [$out]!
  1132. vst1.8 {@XMM[2]}, [$out]!
  1133. vst1.8 {@XMM[7]}, [$out]!
  1134. vst1.8 {@XMM[3]}, [$out]!
  1135. b .Lcbc_dec_done
  1136. .align 4
  1137. .Lcbc_dec_six:
  1138. sub $inp, $inp, #0x60
  1139. bl _bsaes_decrypt8
  1140. vldmia $fp,{@XMM[14]} @ reload IV
  1141. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1142. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1143. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1144. veor @XMM[1], @XMM[1], @XMM[8]
  1145. veor @XMM[6], @XMM[6], @XMM[9]
  1146. vld1.8 {@XMM[12]}, [$inp]!
  1147. veor @XMM[4], @XMM[4], @XMM[10]
  1148. veor @XMM[2], @XMM[2], @XMM[11]
  1149. vld1.8 {@XMM[15]}, [$inp]!
  1150. veor @XMM[7], @XMM[7], @XMM[12]
  1151. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1152. vst1.8 {@XMM[6]}, [$out]!
  1153. vst1.8 {@XMM[4]}, [$out]!
  1154. vst1.8 {@XMM[2]}, [$out]!
  1155. vst1.8 {@XMM[7]}, [$out]!
  1156. b .Lcbc_dec_done
  1157. .align 4
  1158. .Lcbc_dec_five:
  1159. sub $inp, $inp, #0x50
  1160. bl _bsaes_decrypt8
  1161. vldmia $fp, {@XMM[14]} @ reload IV
  1162. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1163. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1164. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1165. veor @XMM[1], @XMM[1], @XMM[8]
  1166. veor @XMM[6], @XMM[6], @XMM[9]
  1167. vld1.8 {@XMM[15]}, [$inp]!
  1168. veor @XMM[4], @XMM[4], @XMM[10]
  1169. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1170. veor @XMM[2], @XMM[2], @XMM[11]
  1171. vst1.8 {@XMM[6]}, [$out]!
  1172. vst1.8 {@XMM[4]}, [$out]!
  1173. vst1.8 {@XMM[2]}, [$out]!
  1174. b .Lcbc_dec_done
  1175. .align 4
  1176. .Lcbc_dec_four:
  1177. sub $inp, $inp, #0x40
  1178. bl _bsaes_decrypt8
  1179. vldmia $fp, {@XMM[14]} @ reload IV
  1180. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1181. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1182. vld1.8 {@XMM[10]}, [$inp]!
  1183. veor @XMM[1], @XMM[1], @XMM[8]
  1184. veor @XMM[6], @XMM[6], @XMM[9]
  1185. vld1.8 {@XMM[15]}, [$inp]!
  1186. veor @XMM[4], @XMM[4], @XMM[10]
  1187. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1188. vst1.8 {@XMM[6]}, [$out]!
  1189. vst1.8 {@XMM[4]}, [$out]!
  1190. b .Lcbc_dec_done
  1191. .align 4
  1192. .Lcbc_dec_three:
  1193. sub $inp, $inp, #0x30
  1194. bl _bsaes_decrypt8
  1195. vldmia $fp, {@XMM[14]} @ reload IV
  1196. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1197. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1198. vld1.8 {@XMM[15]}, [$inp]!
  1199. veor @XMM[1], @XMM[1], @XMM[8]
  1200. veor @XMM[6], @XMM[6], @XMM[9]
  1201. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1202. vst1.8 {@XMM[6]}, [$out]!
  1203. b .Lcbc_dec_done
  1204. .align 4
  1205. .Lcbc_dec_two:
  1206. sub $inp, $inp, #0x20
  1207. bl _bsaes_decrypt8
  1208. vldmia $fp, {@XMM[14]} @ reload IV
  1209. vld1.8 {@XMM[8]}, [$inp]! @ reload input
  1210. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1211. vld1.8 {@XMM[15]}, [$inp]! @ reload input
  1212. veor @XMM[1], @XMM[1], @XMM[8]
  1213. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1214. b .Lcbc_dec_done
  1215. .align 4
  1216. .Lcbc_dec_one:
  1217. sub $inp, $inp, #0x10
  1218. mov $rounds, $out @ save original out pointer
  1219. mov $out, $fp @ use the iv scratch space as out buffer
  1220. mov r2, $key
  1221. vmov @XMM[4],@XMM[15] @ just in case ensure that IV
  1222. vmov @XMM[5],@XMM[0] @ and input are preserved
  1223. bl AES_decrypt
  1224. vld1.8 {@XMM[0]}, [$fp,:64] @ load result
  1225. veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
  1226. vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
  1227. vst1.8 {@XMM[0]}, [$rounds] @ write output
  1228. .Lcbc_dec_done:
  1229. #ifndef BSAES_ASM_EXTENDED_KEY
  1230. vmov.i32 q0, #0
  1231. vmov.i32 q1, #0
  1232. .Lcbc_dec_bzero: @ wipe key schedule [if any]
  1233. vstmia $keysched!, {q0-q1}
  1234. cmp $keysched, $fp
  1235. bne .Lcbc_dec_bzero
  1236. #endif
  1237. mov sp, $fp
  1238. add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
  1239. vst1.8 {@XMM[15]}, [$ivp] @ return IV
  1240. VFP_ABI_POP
  1241. ldmia sp!, {r4-r10, pc}
  1242. .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
  1243. ___
  1244. }
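# The routine above is decrypt-only and processes up to eight blocks per
# iteration; the chaining itself is just P[i] = D(C[i]) ^ C[i-1], with the IV
# standing in for C[-1].  A minimal scalar sketch of that dataflow, assuming a
# hypothetical &$decrypt_block callback that maps one 16-byte string to
# another; not invoked by the generator.
sub _cbc_decrypt_sketch {
	my ($decrypt_block, $iv, @ciphertext) = @_;	# 16-byte strings
	my @plaintext;
	my $prev = $iv;
	for my $c (@ciphertext) {
		push @plaintext, $decrypt_block->($c) ^ $prev;	# bitwise xor of the strings
		$prev = $c;					# chain the *ciphertext* block
	}
	return @plaintext;
}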
  1245. {
  1246. my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
  1247. my $const = "r6"; # shared with _bsaes_encrypt8_alt
  1248. my $keysched = "sp";
  1249. $code.=<<___;
  1250. .extern AES_encrypt
  1251. .global bsaes_ctr32_encrypt_blocks
  1252. .type bsaes_ctr32_encrypt_blocks,%function
  1253. .align 5
  1254. bsaes_ctr32_encrypt_blocks:
  1255. cmp $len, #8 @ use plain AES for
  1256. blo .Lctr_enc_short @ small sizes
  1257. mov ip, sp
  1258. stmdb sp!, {r4-r10, lr}
  1259. VFP_ABI_PUSH
  1260. ldr $ctr, [ip] @ ctr is 1st arg on the stack
  1261. sub sp, sp, #0x10 @ scratch space to carry over the ctr
  1262. mov $fp, sp @ save sp
  1263. ldr $rounds, [$key, #240] @ get # of rounds
  1264. #ifndef BSAES_ASM_EXTENDED_KEY
  1265. @ allocate the key schedule on the stack
  1266. sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
  1267. add r12, #`128-32` @ size of bit-sliced key schedule
  1268. @ populate the key schedule
  1269. mov r4, $key @ pass key
  1270. mov r5, $rounds @ pass # of rounds
  1271. mov sp, r12 @ sp is $keysched
  1272. bl _bsaes_key_convert
  1273. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  1274. vstmia r12, {@XMM[7]} @ save last round key
  1275. vld1.8 {@XMM[0]}, [$ctr] @ load counter
  1276. add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
  1277. vldmia $keysched, {@XMM[4]} @ load round0 key
  1278. #else
  1279. ldr r12, [$key, #244]
  1280. eors r12, #1
  1281. beq 0f
  1282. @ populate the key schedule
  1283. str r12, [$key, #244]
  1284. mov r4, $key @ pass key
  1285. mov r5, $rounds @ pass # of rounds
  1286. add r12, $key, #248 @ pass key schedule
  1287. bl _bsaes_key_convert
  1288. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  1289. vstmia r12, {@XMM[7]} @ save last round key
  1290. .align 2
  1291. 0: add r12, $key, #248
  1292. vld1.8 {@XMM[0]}, [$ctr] @ load counter
  1293. adrl $ctr, .LREVM0SR @ borrow $ctr
  1294. vldmia r12, {@XMM[4]} @ load round0 key
  1295. sub sp, #0x10 @ place for adjusted round0 key
  1296. #endif
  1297. vmov.i32 @XMM[8],#1 @ compose 1<<96
  1298. veor @XMM[9],@XMM[9],@XMM[9]
  1299. vrev32.8 @XMM[0],@XMM[0]
  1300. vext.8 @XMM[8],@XMM[9],@XMM[8],#4
  1301. vrev32.8 @XMM[4],@XMM[4]
  1302. vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
  1303. vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
  1304. b .Lctr_enc_loop
  1305. .align 4
  1306. .Lctr_enc_loop:
  1307. vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
  1308. vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
  1309. vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
  1310. vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
  1311. vadd.u32 @XMM[4], @XMM[1], @XMM[10]
  1312. vadd.u32 @XMM[5], @XMM[2], @XMM[10]
  1313. vadd.u32 @XMM[6], @XMM[3], @XMM[10]
  1314. vadd.u32 @XMM[7], @XMM[4], @XMM[10]
  1315. vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
  1316. @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
  1317. @ to flip byte order in 32-bit counter
  1318. vldmia $keysched, {@XMM[9]} @ load round0 key
  1319. #ifndef BSAES_ASM_EXTENDED_KEY
  1320. add r4, $keysched, #0x10 @ pass next round key
  1321. #else
  1322. add r4, $key, #`248+16`
  1323. #endif
  1324. vldmia $ctr, {@XMM[8]} @ .LREVM0SR
  1325. mov r5, $rounds @ pass rounds
  1326. vstmia $fp, {@XMM[10]} @ save next counter
  1327. sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
  1328. bl _bsaes_encrypt8_alt
  1329. subs $len, $len, #8
  1330. blo .Lctr_enc_loop_done
	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
	veor @XMM[0], @XMM[8]
	veor @XMM[1], @XMM[9]
	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
	veor @XMM[4], @XMM[10]
	veor @XMM[6], @XMM[11]
	vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
	veor @XMM[3], @XMM[12]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
	veor @XMM[7], @XMM[13]
	veor @XMM[2], @XMM[14]
	vst1.8 {@XMM[4]}, [$out]!
	veor @XMM[5], @XMM[15]
	vst1.8 {@XMM[6]}, [$out]!
	vmov.i32 @XMM[8], #1 @ compose 1<<96
	vst1.8 {@XMM[3]}, [$out]!
	veor @XMM[9], @XMM[9], @XMM[9]
	vst1.8 {@XMM[7]}, [$out]!
	vext.8 @XMM[8], @XMM[9], @XMM[8], #4
	vst1.8 {@XMM[2]}, [$out]!
	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
	vst1.8 {@XMM[5]}, [$out]!
	vldmia $fp, {@XMM[0]} @ load counter
	bne .Lctr_enc_loop
	b .Lctr_enc_done
.align 4
.Lctr_enc_loop_done:
	add $len, $len, #8
	vld1.8 {@XMM[8]}, [$inp]! @ load input
	veor @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [$out]! @ write output
	cmp $len, #2
	blo .Lctr_enc_done
	vld1.8 {@XMM[9]}, [$inp]!
	veor @XMM[1], @XMM[9]
	vst1.8 {@XMM[1]}, [$out]!
	beq .Lctr_enc_done
	vld1.8 {@XMM[10]}, [$inp]!
	veor @XMM[4], @XMM[10]
	vst1.8 {@XMM[4]}, [$out]!
	cmp $len, #4
	blo .Lctr_enc_done
	vld1.8 {@XMM[11]}, [$inp]!
	veor @XMM[6], @XMM[11]
	vst1.8 {@XMM[6]}, [$out]!
	beq .Lctr_enc_done
	vld1.8 {@XMM[12]}, [$inp]!
	veor @XMM[3], @XMM[12]
	vst1.8 {@XMM[3]}, [$out]!
	cmp $len, #6
	blo .Lctr_enc_done
	vld1.8 {@XMM[13]}, [$inp]!
	veor @XMM[7], @XMM[13]
	vst1.8 {@XMM[7]}, [$out]!
	beq .Lctr_enc_done
	vld1.8 {@XMM[14]}, [$inp]
	veor @XMM[2], @XMM[14]
	vst1.8 {@XMM[2]}, [$out]!
.Lctr_enc_done:
	vmov.i32 q0, #0
	vmov.i32 q1, #0
#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero: @ wipe key schedule [if any]
	vstmia $keysched!, {q0-q1}
	cmp $keysched, $fp
	bne .Lctr_enc_bzero
#else
	vstmia $keysched, {q0-q1}
#endif
	mov sp, $fp
	add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
	VFP_ABI_POP
	ldmia sp!, {r4-r10, pc} @ return
.align 4
.Lctr_enc_short:
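	@ tail path: encrypt the counter one block at a time with
	@ AES_encrypt and xor the result into the input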
	ldr ip, [sp] @ ctr pointer is passed on stack
	stmdb sp!, {r4-r8, lr}
	mov r4, $inp @ copy arguments
	mov r5, $out
	mov r6, $len
	mov r7, $key
	ldr r8, [ip, #12] @ load counter LSW
	vld1.8 {@XMM[1]}, [ip] @ load whole counter value
#ifdef __ARMEL__
	rev r8, r8
#endif
	sub sp, sp, #0x10
	vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
	sub sp, sp, #0x10
.Lctr_enc_short_loop:
	add r0, sp, #0x10 @ input counter value
	mov r1, sp @ output on the stack
	mov r2, r7 @ key
	bl AES_encrypt
	vld1.8 {@XMM[0]}, [r4]! @ load input
	vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
	add r8, r8, #1
#ifdef __ARMEL__
	rev r0, r8
	str r0, [sp, #0x1c] @ next counter value
#else
	str r8, [sp, #0x1c] @ next counter value
#endif
	veor @XMM[0],@XMM[0],@XMM[1]
	vst1.8 {@XMM[0]}, [r5]! @ store output
	subs r6, r6, #1
	bne .Lctr_enc_short_loop
	vmov.i32 q0, #0
	vmov.i32 q1, #0
	vstmia sp!, {q0-q1}
	ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
{
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
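# key2 is used only to encrypt iv[] into the initial tweak; key1 drives
# the actual data en-/decryption.  (With XTS_CHAIN_TWEAK defined the
# tweak is instead passed in by reference and the next tweak is written
# back on return.)  A caller is expected to look roughly like the sketch
# below; the variable names are illustrative only:
#
#	AES_KEY key1, key2;
#	AES_set_encrypt_key(data_key, 128, &key1);
#	AES_set_encrypt_key(tweak_key, 128, &key2);
#	bsaes_xts_encrypt(in, out, len, &key1, &key2, iv);
#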
my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
my $const="r6"; # returned by _bsaes_key_convert
my $twmask=@XMM[5];
my @T=@XMM[6..7];
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,%function
.align 4
bsaes_xts_encrypt:
	mov ip, sp
	stmdb sp!, {r4-r10, lr} @ 0x20
	VFP_ABI_PUSH
	mov r6, sp @ future $fp
	mov $inp, r0
	mov $out, r1
	mov $len, r2
	mov $key, r3
	sub r0, sp, #0x10 @ 0x10
	bic r0, #0xf @ align at 16 bytes
	mov sp, r0
#ifdef XTS_CHAIN_TWEAK
	ldr r0, [ip] @ pointer to input tweak
#else
	@ generate initial tweak
	ldr r0, [ip, #4] @ iv[]
	mov r1, sp
	ldr r2, [ip, #0] @ key2
	bl AES_encrypt
	mov r0,sp @ pointer to initial tweak
#endif
	ldr $rounds, [$key, #240] @ get # of rounds
	mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
	@ add r12, #`128-32` @ size of bit-sliced key schedule
	sub r12, #`32+16` @ place for tweak[9]
	@ populate the key schedule
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	mov sp, r12
	add r12, #0x90 @ pass key schedule
	bl _bsaes_key_convert
	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
	vstmia r12, {@XMM[7]} @ save last round key
#else
	ldr r12, [$key, #244]
	eors r12, #1
	beq 0f
	str r12, [$key, #244]
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	add r12, $key, #248 @ pass key schedule
	bl _bsaes_key_convert
	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
	vstmia r12, {@XMM[7]}
.align 2
0:	sub sp, #0x90 @ place for tweak[9]
#endif
	vld1.8 {@XMM[8]}, [r0] @ initial tweak
	adr $magic, .Lxts_magic
	subs $len, #0x80
	blo .Lxts_enc_short
	b .Lxts_enc_loop
.align 4
.Lxts_enc_loop:
	vldmia $magic, {$twmask} @ load XTS magic
	vshr.s64 @T[0], @XMM[8], #63
	mov r0, sp
	vand @T[0], @T[0], $twmask
___
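# The loop below emits code that doubles the tweak in GF(2^128) once per
# block (multiplication by x, with the 0x87 wrap-around from .Lxts_magic),
# stores the per-block tweaks to the stack frame and interleaves the loads
# of the next inputs.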
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64 {@XMM[$i-1]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64 @T[1], @XMM[$i], #63
	veor @XMM[$i], @XMM[$i], @T[0]
	vand @T[1], @T[1], $twmask
___
	@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
	vst1.64 {@XMM[15]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor @XMM[8], @XMM[8], @T[0]
	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
	veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[6], @XMM[6], @XMM[14]
	mov r5, $rounds @ pass rounds
	veor @XMM[7], @XMM[7], @XMM[15]
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
	veor @XMM[10], @XMM[3], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	veor @XMM[12], @XMM[2], @XMM[14]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	veor @XMM[13], @XMM[5], @XMM[15]
	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	subs $len, #0x80
	bpl .Lxts_enc_loop
.Lxts_enc_short:
	adds $len, #0x70
	bmi .Lxts_enc_done
	vldmia $magic, {$twmask} @ load XTS magic
	vshr.s64 @T[0], @XMM[8], #63
	mov r0, sp
	vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64 {@XMM[$i-1]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64 @T[1], @XMM[$i], #63
	veor @XMM[$i], @XMM[$i], @T[0]
	vand @T[1], @T[1], $twmask
___
	@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8 {@XMM[$i-10]}, [$inp]!
	subs $len, #0x10
	bmi .Lxts_enc_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	sub $len, #0x10
	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
	vld1.8 {@XMM[6]}, [$inp]!
	veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[6], @XMM[6], @XMM[14]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	vld1.64 {@XMM[14]}, [r0,:128]!
	veor @XMM[10], @XMM[3], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	veor @XMM[12], @XMM[2], @XMM[14]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	vst1.8 {@XMM[12]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_6:
	vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
	veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[5], @XMM[5], @XMM[13]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	veor @XMM[10], @XMM[3], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
@ put this in range for both ARM and Thumb mode adr instructions
.align 5
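@ tweak doubling constants: a carry out of the low 64 bits propagates as 1
@ into the high half, while a carry out of bit 127 wraps around as 0x87
@ (x^7+x^2+x+1), i.e. multiplication by x in GF(2^128)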
.Lxts_magic:
	.quad 1, 0x87
.align 5
.Lxts_enc_5:
	vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
	veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[4], @XMM[4], @XMM[12]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	veor @XMM[10], @XMM[3], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	vst1.8 {@XMM[10]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_4:
	vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
	veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[3], @XMM[3], @XMM[11]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_3:
	vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
	veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[2], @XMM[2], @XMM[10]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64 {@XMM[10]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	vst1.8 {@XMM[8]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_2:
	vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
	veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[1], @XMM[1], @XMM[9]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_1:
	mov r0, sp
	veor @XMM[0], @XMM[8]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	mov r4, $fp @ preserve fp
	bl AES_encrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [$out]!
	mov $fp, r4
	vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_enc_done:
#ifndef XTS_CHAIN_TWEAK
	adds $len, #0x10
	beq .Lxts_enc_ret
	sub r6, $out, #0x10
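	@ ciphertext stealing: swap the tail plaintext bytes into the last
	@ complete ciphertext block, then re-encrypt that block with the
	@ final tweak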
.Lxts_enc_steal:
	ldrb r0, [$inp], #1
	ldrb r1, [$out, #-0x10]
	strb r0, [$out, #-0x10]
	strb r1, [$out], #1
	subs $len, #1
	bhi .Lxts_enc_steal
	vld1.8 {@XMM[0]}, [r6]
	mov r0, sp
	veor @XMM[0], @XMM[0], @XMM[8]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	mov r4, $fp @ preserve fp
	bl AES_encrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [r6]
	mov $fp, r4
#endif
.Lxts_enc_ret:
	bic r0, $fp, #0xf
	vmov.i32 q0, #0
	vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_enc_bzero: @ wipe key schedule [if any]
	vstmia sp!, {q0-q1}
	cmp sp, r0
	bne .Lxts_enc_bzero
	mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
	vst1.8 {@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,%function
.align 4
bsaes_xts_decrypt:
	mov ip, sp
	stmdb sp!, {r4-r10, lr} @ 0x20
	VFP_ABI_PUSH
	mov r6, sp @ future $fp
	mov $inp, r0
	mov $out, r1
	mov $len, r2
	mov $key, r3
	sub r0, sp, #0x10 @ 0x10
	bic r0, #0xf @ align at 16 bytes
	mov sp, r0
#ifdef XTS_CHAIN_TWEAK
	ldr r0, [ip] @ pointer to input tweak
#else
	@ generate initial tweak
	ldr r0, [ip, #4] @ iv[]
	mov r1, sp
	ldr r2, [ip, #0] @ key2
	bl AES_encrypt
	mov r0, sp @ pointer to initial tweak
#endif
	ldr $rounds, [$key, #240] @ get # of rounds
	mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
	@ add r12, #`128-32` @ size of bit-sliced key schedule
	sub r12, #`32+16` @ place for tweak[9]
	@ populate the key schedule
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	mov sp, r12
	add r12, #0x90 @ pass key schedule
	bl _bsaes_key_convert
	add r4, sp, #0x90
	vldmia r4, {@XMM[6]}
	vstmia r12, {@XMM[15]} @ save last round key
	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
	vstmia r4, {@XMM[7]}
#else
	ldr r12, [$key, #244]
	eors r12, #1
	beq 0f
	str r12, [$key, #244]
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	add r12, $key, #248 @ pass key schedule
	bl _bsaes_key_convert
	add r4, $key, #248
	vldmia r4, {@XMM[6]}
	vstmia r12, {@XMM[15]} @ save last round key
	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
	vstmia r4, {@XMM[7]}
.align 2
0:	sub sp, #0x90 @ place for tweak[9]
#endif
	vld1.8 {@XMM[8]}, [r0] @ initial tweak
	adr $magic, .Lxts_magic
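	@ for a partial tail the last complete block must be decrypted with
	@ the following tweak (ciphertext stealing), so hold it back from
	@ the bulk loop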
	tst $len, #0xf @ if not multiple of 16
	it ne @ Thumb2 thing, sanity check in ARM
	subne $len, #0x10 @ subtract another 16 bytes
	subs $len, #0x80
	blo .Lxts_dec_short
	b .Lxts_dec_loop
.align 4
.Lxts_dec_loop:
	vldmia $magic, {$twmask} @ load XTS magic
	vshr.s64 @T[0], @XMM[8], #63
	mov r0, sp
	vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64 {@XMM[$i-1]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64 @T[1], @XMM[$i], #63
	veor @XMM[$i], @XMM[$i], @T[0]
	vand @T[1], @T[1], $twmask
___
	@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
	vst1.64 {@XMM[15]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor @XMM[8], @XMM[8], @T[0]
	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
	veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[6], @XMM[6], @XMM[14]
	mov r5, $rounds @ pass rounds
	veor @XMM[7], @XMM[7], @XMM[15]
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
	veor @XMM[10], @XMM[2], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	veor @XMM[12], @XMM[3], @XMM[14]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	veor @XMM[13], @XMM[5], @XMM[15]
	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	subs $len, #0x80
	bpl .Lxts_dec_loop
.Lxts_dec_short:
	adds $len, #0x70
	bmi .Lxts_dec_done
	vldmia $magic, {$twmask} @ load XTS magic
	vshr.s64 @T[0], @XMM[8], #63
	mov r0, sp
	vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64 {@XMM[$i-1]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64 @T[1], @XMM[$i], #63
	veor @XMM[$i], @XMM[$i], @T[0]
	vand @T[1], @T[1], $twmask
___
	@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8 {@XMM[$i-10]}, [$inp]!
	subs $len, #0x10
	bmi .Lxts_dec_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	sub $len, #0x10
	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
	vld1.8 {@XMM[6]}, [$inp]!
	veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[6], @XMM[6], @XMM[14]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	vld1.64 {@XMM[14]}, [r0,:128]!
	veor @XMM[10], @XMM[2], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	veor @XMM[12], @XMM[3], @XMM[14]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	vst1.8 {@XMM[12]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_6:
	vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
	veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[5], @XMM[5], @XMM[13]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	veor @XMM[10], @XMM[2], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_5:
	vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
	veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[4], @XMM[4], @XMM[12]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	veor @XMM[10], @XMM[2], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	vst1.8 {@XMM[10]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_4:
	vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
	veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[3], @XMM[3], @XMM[11]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_3:
	vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
	veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[2], @XMM[2], @XMM[10]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64 {@XMM[10]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	vst1.8 {@XMM[8]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_2:
	vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
	veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[1], @XMM[1], @XMM[9]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_1:
	mov r0, sp
	veor @XMM[0], @XMM[8]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	mov r4, $fp @ preserve fp
	mov r5, $magic @ preserve magic
	bl AES_decrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [$out]!
	mov $fp, r4
	mov $magic, r5
	vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_dec_done:
#ifndef XTS_CHAIN_TWEAK
	adds $len, #0x10
	beq .Lxts_dec_ret
	@ calculate one round of extra tweak for the stolen ciphertext
	vldmia $magic, {$twmask}
	vshr.s64 @XMM[6], @XMM[8], #63
	vand @XMM[6], @XMM[6], $twmask
	vadd.u64 @XMM[9], @XMM[8], @XMM[8]
	vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
	veor @XMM[9], @XMM[9], @XMM[6]
	@ perform the final decryption with the last tweak value
	vld1.8 {@XMM[0]}, [$inp]!
	mov r0, sp
	veor @XMM[0], @XMM[0], @XMM[9]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	mov r4, $fp @ preserve fp
	bl AES_decrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[9]
	vst1.8 {@XMM[0]}, [$out]
	mov r6, $out
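	@ splice the remaining ciphertext bytes into the block just written
	@ and emit the displaced plaintext bytes as the final partial block,
	@ then decrypt that block once more with the previous tweak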
.Lxts_dec_steal:
	ldrb r1, [$out]
	ldrb r0, [$inp], #1
	strb r1, [$out, #0x10]
	strb r0, [$out], #1
	subs $len, #1
	bhi .Lxts_dec_steal
	vld1.8 {@XMM[0]}, [r6]
	mov r0, sp
	veor @XMM[0], @XMM[8]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	bl AES_decrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [r6]
	mov $fp, r4
#endif
.Lxts_dec_ret:
	bic r0, $fp, #0xf
	vmov.i32 q0, #0
	vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_dec_bzero: @ wipe key schedule [if any]
	vstmia sp!, {q0-q1}
	cmp sp, r0
	bne .Lxts_dec_bzero
	mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
	vst1.8 {@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
#endif
___
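# resolve the remaining `...` arithmetic (e.g. `248+16`) at generation time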
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
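# copy the leading comment block of this script into the output, turning
# '#' comments into assembler '@' comments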
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
print $code;
close STDOUT;