  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. #
  8. # Specific modes and adaptation for Linux kernel by Ard Biesheuvel
  9. # <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
  10. # granted.
  11. # ====================================================================
  12. # Bit-sliced AES for ARM NEON
  13. #
  14. # February 2012.
  15. #
  16. # This implementation is direct adaptation of bsaes-x86_64 module for
  17. # ARM NEON. Except that this module is endian-neutral [in the sense that
  18. # it can be compiled for either endianness] by courtesy of vld1.8's
  19. # neutrality. Initial version doesn't implement interface to OpenSSL,
  20. # only low-level primitives and unsupported entry points, just enough
  21. # to collect performance results, which for Cortex-A8 core are:
  22. #
  23. # encrypt 19.5 cycles per byte processed with 128-bit key
  24. # decrypt 22.1 cycles per byte processed with 128-bit key
  25. # key conv. 440 cycles per 128-bit key/0.18 of 8x block
  26. #
  27. # Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
  28. # which is [much] worse than anticipated (for further details see
  29. # http://www.openssl.org/~appro/Snapdragon-S4.html).
  30. #
  31. # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
  32. # manages in 20.0 cycles].
  33. #
  34. # When comparing to x86_64 results keep in mind that NEON unit is
  35. # [mostly] single-issue and thus can't [fully] benefit from
  36. # instruction-level parallelism. And when comparing to aes-armv4
  37. # results keep in mind key schedule conversion overhead (see
  38. # bsaes-x86_64.pl for further details)...
  39. #
  40. # <appro@openssl.org>
  41. # April-August 2013
  42. #
  43. # Add CBC, CTR and XTS subroutines, adapt for kernel use.
  44. #
  45. # <ard.biesheuvel@linaro.org>
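#
# The statements below pick the first command-line argument that looks like
# a filename and send all generated assembly to it, e.g.
#
#	perl bsaes-armv7.pl bsaes-armv7.S
#
# r0-r3 are the AAPCS argument registers carrying (inp,out,len,key), and
# q0-q15 (aliased as @XMM below) form the NEON working set of the generated
# code.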
  46. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  47. open STDOUT,">$output";
  48. my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
  49. my @XMM=map("q$_",(0..15));
  50. {
  51. my ($key,$rounds,$const)=("r4","r5","r6");
  52. sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
  53. sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
  54. sub Sbox {
  55. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  56. # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
  57. my @b=@_[0..7];
  58. my @t=@_[8..11];
  59. my @s=@_[12..15];
  60. &InBasisChange (@b);
  61. &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
  62. &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
  63. }
  64. sub InBasisChange {
  65. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  66. # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
  67. my @b=@_[0..7];
  68. $code.=<<___;
  69. veor @b[2], @b[2], @b[1]
  70. veor @b[5], @b[5], @b[6]
  71. veor @b[3], @b[3], @b[0]
  72. veor @b[6], @b[6], @b[2]
  73. veor @b[5], @b[5], @b[0]
  74. veor @b[6], @b[6], @b[3]
  75. veor @b[3], @b[3], @b[7]
  76. veor @b[7], @b[7], @b[5]
  77. veor @b[3], @b[3], @b[4]
  78. veor @b[4], @b[4], @b[5]
  79. veor @b[2], @b[2], @b[7]
  80. veor @b[3], @b[3], @b[1]
  81. veor @b[1], @b[1], @b[5]
  82. ___
  83. }
  84. sub OutBasisChange {
  85. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  86. # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
  87. my @b=@_[0..7];
  88. $code.=<<___;
  89. veor @b[0], @b[0], @b[6]
  90. veor @b[1], @b[1], @b[4]
  91. veor @b[4], @b[4], @b[6]
  92. veor @b[2], @b[2], @b[0]
  93. veor @b[6], @b[6], @b[1]
  94. veor @b[1], @b[1], @b[5]
  95. veor @b[5], @b[5], @b[3]
  96. veor @b[3], @b[3], @b[7]
  97. veor @b[7], @b[7], @b[5]
  98. veor @b[2], @b[2], @b[5]
  99. veor @b[4], @b[4], @b[7]
  100. ___
  101. }
  102. sub InvSbox {
  103. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  104. # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
  105. my @b=@_[0..7];
  106. my @t=@_[8..11];
  107. my @s=@_[12..15];
  108. &InvInBasisChange (@b);
  109. &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
  110. &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
  111. }
  112. sub InvInBasisChange { # OutBasisChange in reverse (with twist)
  113. my @b=@_[5,1,2,6,3,7,0,4];
  114. $code.=<<___
  115. veor @b[1], @b[1], @b[7]
  116. veor @b[4], @b[4], @b[7]
  117. veor @b[7], @b[7], @b[5]
  118. veor @b[1], @b[1], @b[3]
  119. veor @b[2], @b[2], @b[5]
  120. veor @b[3], @b[3], @b[7]
  121. veor @b[6], @b[6], @b[1]
  122. veor @b[2], @b[2], @b[0]
  123. veor @b[5], @b[5], @b[3]
  124. veor @b[4], @b[4], @b[6]
  125. veor @b[0], @b[0], @b[6]
  126. veor @b[1], @b[1], @b[4]
  127. ___
  128. }
  129. sub InvOutBasisChange { # InBasisChange in reverse
  130. my @b=@_[2,5,7,3,6,1,0,4];
  131. $code.=<<___;
  132. veor @b[1], @b[1], @b[5]
  133. veor @b[2], @b[2], @b[7]
  134. veor @b[3], @b[3], @b[1]
  135. veor @b[4], @b[4], @b[5]
  136. veor @b[7], @b[7], @b[5]
  137. veor @b[3], @b[3], @b[4]
  138. veor @b[5], @b[5], @b[0]
  139. veor @b[3], @b[3], @b[7]
  140. veor @b[6], @b[6], @b[2]
  141. veor @b[2], @b[2], @b[1]
  142. veor @b[6], @b[6], @b[3]
  143. veor @b[3], @b[3], @b[0]
  144. veor @b[5], @b[5], @b[6]
  145. ___
  146. }
  147. sub Mul_GF4 {
  148. #;*************************************************************
  149. #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
  150. #;*************************************************************
  151. my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
  152. $code.=<<___;
  153. veor $t0, $y0, $y1
  154. vand $t0, $t0, $x0
  155. veor $x0, $x0, $x1
  156. vand $t1, $x1, $y0
  157. vand $x0, $x0, $y1
  158. veor $x1, $t1, $t0
  159. veor $x0, $x0, $t1
  160. ___
  161. }
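# Illustrative scalar model of the sequence above (not used by the
# generator): Mul_GF4 multiplies two bit-sliced GF(2^2) elements, and the
# sub below mirrors its veor/vand data flow one-to-one on plain Perl
# integers, each variable standing for one bit-slice.  The name
# Mul_GF4_model is an addition for exposition only.
sub Mul_GF4_model {
	my ($x0,$x1,$y0,$y1)=@_;
	my $t0 = $y0 ^ $y1;	# veor $t0, $y0, $y1
	$t0 &= $x0;		# vand $t0, $t0, $x0
	$x0 ^= $x1;		# veor $x0, $x0, $x1
	my $t1 = $x1 & $y0;	# vand $t1, $x1, $y0
	$x0 &= $y1;		# vand $x0, $x0, $y1
	$x1  = $t1 ^ $t0;	# veor $x1, $t1, $t0
	$x0 ^= $t1;		# veor $x0, $x0, $t1
	return ($x0,$x1);	# product, in the same bit-sliced encoding
}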
  162. sub Mul_GF4_N { # not used, see next subroutine
  163. # multiply and scale by N
  164. my ($x0,$x1,$y0,$y1,$t0)=@_;
  165. $code.=<<___;
  166. veor $t0, $y0, $y1
  167. vand $t0, $t0, $x0
  168. veor $x0, $x0, $x1
  169. vand $x1, $x1, $y0
  170. vand $x0, $x0, $y1
  171. veor $x1, $x1, $x0
  172. veor $x0, $x0, $t0
  173. ___
  174. }
  175. sub Mul_GF4_N_GF4 {
  176. # interleaved Mul_GF4_N and Mul_GF4
  177. my ($x0,$x1,$y0,$y1,$t0,
  178. $x2,$x3,$y2,$y3,$t1)=@_;
  179. $code.=<<___;
  180. veor $t0, $y0, $y1
  181. veor $t1, $y2, $y3
  182. vand $t0, $t0, $x0
  183. vand $t1, $t1, $x2
  184. veor $x0, $x0, $x1
  185. veor $x2, $x2, $x3
  186. vand $x1, $x1, $y0
  187. vand $x3, $x3, $y2
  188. vand $x0, $x0, $y1
  189. vand $x2, $x2, $y3
  190. veor $x1, $x1, $x0
  191. veor $x2, $x2, $x3
  192. veor $x0, $x0, $t0
  193. veor $x3, $x3, $t1
  194. ___
  195. }
  196. sub Mul_GF16_2 {
  197. my @x=@_[0..7];
  198. my @y=@_[8..11];
  199. my @t=@_[12..15];
  200. $code.=<<___;
  201. veor @t[0], @x[0], @x[2]
  202. veor @t[1], @x[1], @x[3]
  203. ___
  204. &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
  205. $code.=<<___;
  206. veor @y[0], @y[0], @y[2]
  207. veor @y[1], @y[1], @y[3]
  208. ___
  209. Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  210. @x[2], @x[3], @y[2], @y[3], @t[2]);
  211. $code.=<<___;
  212. veor @x[0], @x[0], @t[0]
  213. veor @x[2], @x[2], @t[0]
  214. veor @x[1], @x[1], @t[1]
  215. veor @x[3], @x[3], @t[1]
  216. veor @t[0], @x[4], @x[6]
  217. veor @t[1], @x[5], @x[7]
  218. ___
  219. &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  220. @x[6], @x[7], @y[2], @y[3], @t[2]);
  221. $code.=<<___;
  222. veor @y[0], @y[0], @y[2]
  223. veor @y[1], @y[1], @y[3]
  224. ___
  225. &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
  226. $code.=<<___;
  227. veor @x[4], @x[4], @t[0]
  228. veor @x[6], @x[6], @t[0]
  229. veor @x[5], @x[5], @t[1]
  230. veor @x[7], @x[7], @t[1]
  231. ___
  232. }
  233. sub Inv_GF256 {
  234. #;********************************************************************
  235. #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
  236. #;********************************************************************
  237. my @x=@_[0..7];
  238. my @t=@_[8..11];
  239. my @s=@_[12..15];
  240. # direct optimizations from hardware
  241. $code.=<<___;
  242. veor @t[3], @x[4], @x[6]
  243. veor @t[2], @x[5], @x[7]
  244. veor @t[1], @x[1], @x[3]
  245. veor @s[1], @x[7], @x[6]
  246. vmov @t[0], @t[2]
  247. veor @s[0], @x[0], @x[2]
  248. vorr @t[2], @t[2], @t[1]
  249. veor @s[3], @t[3], @t[0]
  250. vand @s[2], @t[3], @s[0]
  251. vorr @t[3], @t[3], @s[0]
  252. veor @s[0], @s[0], @t[1]
  253. vand @t[0], @t[0], @t[1]
  254. veor @t[1], @x[3], @x[2]
  255. vand @s[3], @s[3], @s[0]
  256. vand @s[1], @s[1], @t[1]
  257. veor @t[1], @x[4], @x[5]
  258. veor @s[0], @x[1], @x[0]
  259. veor @t[3], @t[3], @s[1]
  260. veor @t[2], @t[2], @s[1]
  261. vand @s[1], @t[1], @s[0]
  262. vorr @t[1], @t[1], @s[0]
  263. veor @t[3], @t[3], @s[3]
  264. veor @t[0], @t[0], @s[1]
  265. veor @t[2], @t[2], @s[2]
  266. veor @t[1], @t[1], @s[3]
  267. veor @t[0], @t[0], @s[2]
  268. vand @s[0], @x[7], @x[3]
  269. veor @t[1], @t[1], @s[2]
  270. vand @s[1], @x[6], @x[2]
  271. vand @s[2], @x[5], @x[1]
  272. vorr @s[3], @x[4], @x[0]
  273. veor @t[3], @t[3], @s[0]
  274. veor @t[1], @t[1], @s[2]
  275. veor @t[0], @t[0], @s[3]
  276. veor @t[2], @t[2], @s[1]
  277. @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
  278. @ new smaller inversion
  279. vand @s[2], @t[3], @t[1]
  280. vmov @s[0], @t[0]
  281. veor @s[1], @t[2], @s[2]
  282. veor @s[3], @t[0], @s[2]
  283. veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
  284. vbsl @s[1], @t[1], @t[0]
  285. vbsl @s[3], @t[3], @t[2]
  286. veor @t[3], @t[3], @t[2]
  287. vbsl @s[0], @s[1], @s[2]
  288. vbsl @t[0], @s[2], @s[1]
  289. vand @s[2], @s[0], @s[3]
  290. veor @t[1], @t[1], @t[0]
  291. veor @s[2], @s[2], @t[3]
  292. ___
  293. # output in s3, s2, s1, t1
  294. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
  295. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
  296. &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
  297. ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
  298. }
  299. # AES linear components
  300. sub ShiftRows {
  301. my @x=@_[0..7];
  302. my @t=@_[8..11];
  303. my $mask=pop;
  304. $code.=<<___;
  305. vldmia $key!, {@t[0]-@t[3]}
  306. veor @t[0], @t[0], @x[0]
  307. veor @t[1], @t[1], @x[1]
  308. vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
  309. vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
  310. vldmia $key!, {@t[0]}
  311. veor @t[2], @t[2], @x[2]
  312. vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
  313. vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
  314. vldmia $key!, {@t[1]}
  315. veor @t[3], @t[3], @x[3]
  316. vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
  317. vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
  318. vldmia $key!, {@t[2]}
  319. vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
  320. vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
  321. vldmia $key!, {@t[3]}
  322. veor @t[0], @t[0], @x[4]
  323. veor @t[1], @t[1], @x[5]
  324. vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
  325. vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
  326. veor @t[2], @t[2], @x[6]
  327. vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
  328. vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
  329. veor @t[3], @t[3], @x[7]
  330. vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
  331. vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
  332. vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
  333. vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
  334. ___
  335. }
  336. sub MixColumns {
  337. # modified to emit output in order suitable for feeding back to aesenc[last]
  338. my @x=@_[0..7];
  339. my @t=@_[8..15];
  340. my $inv=@_[16]; # optional
  341. $code.=<<___;
  342. vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
  343. vext.8 @t[1], @x[1], @x[1], #12
  344. veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
  345. vext.8 @t[2], @x[2], @x[2], #12
  346. veor @x[1], @x[1], @t[1]
  347. vext.8 @t[3], @x[3], @x[3], #12
  348. veor @x[2], @x[2], @t[2]
  349. vext.8 @t[4], @x[4], @x[4], #12
  350. veor @x[3], @x[3], @t[3]
  351. vext.8 @t[5], @x[5], @x[5], #12
  352. veor @x[4], @x[4], @t[4]
  353. vext.8 @t[6], @x[6], @x[6], #12
  354. veor @x[5], @x[5], @t[5]
  355. vext.8 @t[7], @x[7], @x[7], #12
  356. veor @x[6], @x[6], @t[6]
  357. veor @t[1], @t[1], @x[0]
  358. veor @x[7], @x[7], @t[7]
  359. vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64
  360. veor @t[2], @t[2], @x[1]
  361. veor @t[0], @t[0], @x[7]
  362. veor @t[1], @t[1], @x[7]
  363. vext.8 @x[1], @x[1], @x[1], #8
  364. veor @t[5], @t[5], @x[4]
  365. veor @x[0], @x[0], @t[0]
  366. veor @t[6], @t[6], @x[5]
  367. veor @x[1], @x[1], @t[1]
  368. vext.8 @t[0], @x[4], @x[4], #8
  369. veor @t[4], @t[4], @x[3]
  370. vext.8 @t[1], @x[5], @x[5], #8
  371. veor @t[7], @t[7], @x[6]
  372. vext.8 @x[4], @x[3], @x[3], #8
  373. veor @t[3], @t[3], @x[2]
  374. vext.8 @x[5], @x[7], @x[7], #8
  375. veor @t[4], @t[4], @x[7]
  376. vext.8 @x[3], @x[6], @x[6], #8
  377. veor @t[3], @t[3], @x[7]
  378. vext.8 @x[6], @x[2], @x[2], #8
  379. veor @x[7], @t[1], @t[5]
  380. ___
  381. $code.=<<___ if (!$inv);
  382. veor @x[2], @t[0], @t[4]
  383. veor @x[4], @x[4], @t[3]
  384. veor @x[5], @x[5], @t[7]
  385. veor @x[3], @x[3], @t[6]
  386. @ vmov @x[2], @t[0]
  387. veor @x[6], @x[6], @t[2]
  388. @ vmov @x[7], @t[1]
  389. ___
  390. $code.=<<___ if ($inv);
  391. veor @t[3], @t[3], @x[4]
  392. veor @x[5], @x[5], @t[7]
  393. veor @x[2], @x[3], @t[6]
  394. veor @x[3], @t[0], @t[4]
  395. veor @x[4], @x[6], @t[2]
  396. vmov @x[6], @t[3]
  397. @ vmov @x[7], @t[1]
  398. ___
  399. }
  400. sub InvMixColumns_orig {
  401. my @x=@_[0..7];
  402. my @t=@_[8..15];
  403. $code.=<<___;
  404. @ multiplication by 0x0e
  405. vext.8 @t[7], @x[7], @x[7], #12
  406. vmov @t[2], @x[2]
  407. veor @x[2], @x[2], @x[5] @ 2 5
  408. veor @x[7], @x[7], @x[5] @ 7 5
  409. vext.8 @t[0], @x[0], @x[0], #12
  410. vmov @t[5], @x[5]
  411. veor @x[5], @x[5], @x[0] @ 5 0 [1]
  412. veor @x[0], @x[0], @x[1] @ 0 1
  413. vext.8 @t[1], @x[1], @x[1], #12
  414. veor @x[1], @x[1], @x[2] @ 1 25
  415. veor @x[0], @x[0], @x[6] @ 01 6 [2]
  416. vext.8 @t[3], @x[3], @x[3], #12
  417. veor @x[1], @x[1], @x[3] @ 125 3 [4]
  418. veor @x[2], @x[2], @x[0] @ 25 016 [3]
  419. veor @x[3], @x[3], @x[7] @ 3 75
  420. veor @x[7], @x[7], @x[6] @ 75 6 [0]
  421. vext.8 @t[6], @x[6], @x[6], #12
  422. vmov @t[4], @x[4]
  423. veor @x[6], @x[6], @x[4] @ 6 4
  424. veor @x[4], @x[4], @x[3] @ 4 375 [6]
  425. veor @x[3], @x[3], @x[7] @ 375 756=36
  426. veor @x[6], @x[6], @t[5] @ 64 5 [7]
  427. veor @x[3], @x[3], @t[2] @ 36 2
  428. vext.8 @t[5], @t[5], @t[5], #12
  429. veor @x[3], @x[3], @t[4] @ 362 4 [5]
  430. ___
  431. my @y = @x[7,5,0,2,1,3,4,6];
  432. $code.=<<___;
  433. @ multiplication by 0x0b
  434. veor @y[1], @y[1], @y[0]
  435. veor @y[0], @y[0], @t[0]
  436. vext.8 @t[2], @t[2], @t[2], #12
  437. veor @y[1], @y[1], @t[1]
  438. veor @y[0], @y[0], @t[5]
  439. vext.8 @t[4], @t[4], @t[4], #12
  440. veor @y[1], @y[1], @t[6]
  441. veor @y[0], @y[0], @t[7]
  442. veor @t[7], @t[7], @t[6] @ clobber t[7]
  443. veor @y[3], @y[3], @t[0]
  444. veor @y[1], @y[1], @y[0]
  445. vext.8 @t[0], @t[0], @t[0], #12
  446. veor @y[2], @y[2], @t[1]
  447. veor @y[4], @y[4], @t[1]
  448. vext.8 @t[1], @t[1], @t[1], #12
  449. veor @y[2], @y[2], @t[2]
  450. veor @y[3], @y[3], @t[2]
  451. veor @y[5], @y[5], @t[2]
  452. veor @y[2], @y[2], @t[7]
  453. vext.8 @t[2], @t[2], @t[2], #12
  454. veor @y[3], @y[3], @t[3]
  455. veor @y[6], @y[6], @t[3]
  456. veor @y[4], @y[4], @t[3]
  457. veor @y[7], @y[7], @t[4]
  458. vext.8 @t[3], @t[3], @t[3], #12
  459. veor @y[5], @y[5], @t[4]
  460. veor @y[7], @y[7], @t[7]
  461. veor @t[7], @t[7], @t[5] @ clobber t[7] even more
  462. veor @y[3], @y[3], @t[5]
  463. veor @y[4], @y[4], @t[4]
  464. veor @y[5], @y[5], @t[7]
  465. vext.8 @t[4], @t[4], @t[4], #12
  466. veor @y[6], @y[6], @t[7]
  467. veor @y[4], @y[4], @t[7]
  468. veor @t[7], @t[7], @t[5]
  469. vext.8 @t[5], @t[5], @t[5], #12
  470. @ multiplication by 0x0d
  471. veor @y[4], @y[4], @y[7]
  472. veor @t[7], @t[7], @t[6] @ restore t[7]
  473. veor @y[7], @y[7], @t[4]
  474. vext.8 @t[6], @t[6], @t[6], #12
  475. veor @y[2], @y[2], @t[0]
  476. veor @y[7], @y[7], @t[5]
  477. vext.8 @t[7], @t[7], @t[7], #12
  478. veor @y[2], @y[2], @t[2]
  479. veor @y[3], @y[3], @y[1]
  480. veor @y[1], @y[1], @t[1]
  481. veor @y[0], @y[0], @t[0]
  482. veor @y[3], @y[3], @t[0]
  483. veor @y[1], @y[1], @t[5]
  484. veor @y[0], @y[0], @t[5]
  485. vext.8 @t[0], @t[0], @t[0], #12
  486. veor @y[1], @y[1], @t[7]
  487. veor @y[0], @y[0], @t[6]
  488. veor @y[3], @y[3], @y[1]
  489. veor @y[4], @y[4], @t[1]
  490. vext.8 @t[1], @t[1], @t[1], #12
  491. veor @y[7], @y[7], @t[7]
  492. veor @y[4], @y[4], @t[2]
  493. veor @y[5], @y[5], @t[2]
  494. veor @y[2], @y[2], @t[6]
  495. veor @t[6], @t[6], @t[3] @ clobber t[6]
  496. vext.8 @t[2], @t[2], @t[2], #12
  497. veor @y[4], @y[4], @y[7]
  498. veor @y[3], @y[3], @t[6]
  499. veor @y[6], @y[6], @t[6]
  500. veor @y[5], @y[5], @t[5]
  501. vext.8 @t[5], @t[5], @t[5], #12
  502. veor @y[6], @y[6], @t[4]
  503. vext.8 @t[4], @t[4], @t[4], #12
  504. veor @y[5], @y[5], @t[6]
  505. veor @y[6], @y[6], @t[7]
  506. vext.8 @t[7], @t[7], @t[7], #12
  507. veor @t[6], @t[6], @t[3] @ restore t[6]
  508. vext.8 @t[3], @t[3], @t[3], #12
  509. @ multiplication by 0x09
  510. veor @y[4], @y[4], @y[1]
  511. veor @t[1], @t[1], @y[1] @ t[1]=y[1]
  512. veor @t[0], @t[0], @t[5] @ clobber t[0]
  513. vext.8 @t[6], @t[6], @t[6], #12
  514. veor @t[1], @t[1], @t[5]
  515. veor @y[3], @y[3], @t[0]
  516. veor @t[0], @t[0], @y[0] @ t[0]=y[0]
  517. veor @t[1], @t[1], @t[6]
  518. veor @t[6], @t[6], @t[7] @ clobber t[6]
  519. veor @y[4], @y[4], @t[1]
  520. veor @y[7], @y[7], @t[4]
  521. veor @y[6], @y[6], @t[3]
  522. veor @y[5], @y[5], @t[2]
  523. veor @t[4], @t[4], @y[4] @ t[4]=y[4]
  524. veor @t[3], @t[3], @y[3] @ t[3]=y[3]
  525. veor @t[5], @t[5], @y[5] @ t[5]=y[5]
  526. veor @t[2], @t[2], @y[2] @ t[2]=y[2]
  527. veor @t[3], @t[3], @t[7]
  528. veor @XMM[5], @t[5], @t[6]
  529. veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
  530. veor @XMM[2], @t[2], @t[6]
  531. veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
  532. vmov @XMM[0], @t[0]
  533. vmov @XMM[1], @t[1]
  534. @ vmov @XMM[2], @t[2]
  535. vmov @XMM[3], @t[3]
  536. vmov @XMM[4], @t[4]
  537. @ vmov @XMM[5], @t[5]
  538. @ vmov @XMM[6], @t[6]
  539. @ vmov @XMM[7], @t[7]
  540. ___
  541. }
  542. sub InvMixColumns {
  543. my @x=@_[0..7];
  544. my @t=@_[8..15];
  545. # Thanks to Jussi Kivilinna for providing pointer to
  546. #
  547. # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
  548. # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
  549. # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
  550. # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
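#
# (a standalone GF(2^8) check of this factorization follows the subroutine)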
  551. $code.=<<___;
  552. @ multiplication by 0x05-0x00-0x04-0x00
  553. vext.8 @t[0], @x[0], @x[0], #8
  554. vext.8 @t[6], @x[6], @x[6], #8
  555. vext.8 @t[7], @x[7], @x[7], #8
  556. veor @t[0], @t[0], @x[0]
  557. vext.8 @t[1], @x[1], @x[1], #8
  558. veor @t[6], @t[6], @x[6]
  559. vext.8 @t[2], @x[2], @x[2], #8
  560. veor @t[7], @t[7], @x[7]
  561. vext.8 @t[3], @x[3], @x[3], #8
  562. veor @t[1], @t[1], @x[1]
  563. vext.8 @t[4], @x[4], @x[4], #8
  564. veor @t[2], @t[2], @x[2]
  565. vext.8 @t[5], @x[5], @x[5], #8
  566. veor @t[3], @t[3], @x[3]
  567. veor @t[4], @t[4], @x[4]
  568. veor @t[5], @t[5], @x[5]
  569. veor @x[0], @x[0], @t[6]
  570. veor @x[1], @x[1], @t[6]
  571. veor @x[2], @x[2], @t[0]
  572. veor @x[4], @x[4], @t[2]
  573. veor @x[3], @x[3], @t[1]
  574. veor @x[1], @x[1], @t[7]
  575. veor @x[2], @x[2], @t[7]
  576. veor @x[4], @x[4], @t[6]
  577. veor @x[5], @x[5], @t[3]
  578. veor @x[3], @x[3], @t[6]
  579. veor @x[6], @x[6], @t[4]
  580. veor @x[4], @x[4], @t[7]
  581. veor @x[5], @x[5], @t[7]
  582. veor @x[7], @x[7], @t[5]
  583. ___
  584. &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
  585. }
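# Standalone sanity check of the factorization quoted in the comment above
# (illustrative only, never called by the generator; helper names are
# additions): over GF(2^8) modulo the AES polynomial 0x11b, the circulant
# InvMixColumns matrix circ(0e,0b,0d,09) equals the MixColumns matrix
# circ(02,03,01,01) times circ(05,00,04,00).
sub _gf256_mul {			# GF(2^8) multiply, AES polynomial 0x11b
	my ($a,$b)=@_;
	my $p=0;
	for (1..8) {
		$p ^= $a if ($b & 1);
		$b >>= 1;
		$a <<= 1;
		$a ^= 0x11b if ($a & 0x100);
	}
	return $p;
}
sub _check_invmix_factorization {
	my @m =(0x02,0x03,0x01,0x01);	# first row of MixColumns
	my @s =(0x05,0x00,0x04,0x00);	# first row of the claimed factor
	my @im=(0x0e,0x0b,0x0d,0x09);	# first row of InvMixColumns
	for my $i (0..3) {		# entry (i,j) of circ(r) is r[(j-i)%4]
		for my $j (0..3) {
			my $acc=0;
			foreach my $k (0..3) {
				$acc ^= _gf256_mul($m[($k-$i)%4],$s[($j-$k)%4]);
			}
			return 0 if ($acc != $im[($j-$i)%4]);
		}
	}
	return 1;
}
# e.g.:  die "InvMixColumns factorization broken" unless _check_invmix_factorization();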
  586. sub swapmove {
  587. my ($a,$b,$n,$mask,$t)=@_;
  588. $code.=<<___;
  589. vshr.u64 $t, $b, #$n
  590. veor $t, $t, $a
  591. vand $t, $t, $mask
  592. veor $a, $a, $t
  593. vshl.u64 $t, $t, #$n
  594. veor $b, $b, $t
  595. ___
  596. }
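# Scalar model of swapmove (illustrative, not used by the generator): it
# exchanges the bits of $a selected by $mask with the bits of $b selected
# by ($mask << $n) -- the classic delta-swap used to transpose a bit
# matrix spread across several registers.  The name swapmove_model is an
# addition for exposition only.
sub swapmove_model {
	my ($a,$b,$n,$mask)=@_;
	my $t = (($b >> $n) ^ $a) & $mask;	# vshr.u64/veor/vand
	$a ^= $t;				# veor $a, $a, $t
	$b ^= $t << $n;				# vshl.u64/veor
	return ($a,$b);
}
# e.g. swapmove_model($a,$b,1,0x55..55) swaps the even-indexed bits of $a
# with the odd-indexed bits of $b, which is what the .LBS0 pass of
# bitslice() below does lane-wise.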
  597. sub swapmove2x {
  598. my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
  599. $code.=<<___;
  600. vshr.u64 $t0, $b0, #$n
  601. vshr.u64 $t1, $b1, #$n
  602. veor $t0, $t0, $a0
  603. veor $t1, $t1, $a1
  604. vand $t0, $t0, $mask
  605. vand $t1, $t1, $mask
  606. veor $a0, $a0, $t0
  607. vshl.u64 $t0, $t0, #$n
  608. veor $a1, $a1, $t1
  609. vshl.u64 $t1, $t1, #$n
  610. veor $b0, $b0, $t0
  611. veor $b1, $b1, $t1
  612. ___
  613. }
  614. sub bitslice {
  615. my @x=reverse(@_[0..7]);
  616. my ($t0,$t1,$t2,$t3)=@_[8..11];
  617. $code.=<<___;
  618. vmov.i8 $t0,#0x55 @ compose .LBS0
  619. vmov.i8 $t1,#0x33 @ compose .LBS1
  620. ___
  621. &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
  622. &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  623. $code.=<<___;
  624. vmov.i8 $t0,#0x0f @ compose .LBS2
  625. ___
  626. &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
  627. &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  628. &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
  629. &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
  630. }
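# Note that the same routine is used in both directions: bitslice() is
# invoked at the top of _bsaes_encrypt8/_bsaes_decrypt8 to enter the
# bit-sliced representation and again at .Lenc_done/.Ldec_done to leave it,
# the 0x55/0x33/0x0f masks with shifts 1/2/4 being the usual constants for
# an in-register bit-matrix transposition.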
  631. $code.=<<___;
  632. #ifndef __KERNEL__
  633. # include "arm_arch.h"
  634. # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
  635. # define VFP_ABI_POP vldmia sp!,{d8-d15}
  636. # define VFP_ABI_FRAME 0x40
  637. #else
  638. # define VFP_ABI_PUSH
  639. # define VFP_ABI_POP
  640. # define VFP_ABI_FRAME 0
  641. # define BSAES_ASM_EXTENDED_KEY
  642. # define XTS_CHAIN_TWEAK
  643. # define __ARM_ARCH__ __LINUX_ARM_ARCH__
  644. # define __ARM_MAX_ARCH__ 7
  645. #endif
  646. #ifdef __thumb__
  647. # define adrl adr
  648. #endif
  649. #if __ARM_MAX_ARCH__>=7
  650. .arch armv7-a
  651. .fpu neon
  652. .text
  653. .syntax unified @ ARMv7-capable assembler is expected to handle this
  654. #ifdef __thumb2__
  655. .thumb
  656. #else
  657. .code 32
  658. #endif
  659. .type _bsaes_decrypt8,%function
  660. .align 4
  661. _bsaes_decrypt8:
  662. adr $const,_bsaes_decrypt8
  663. vldmia $key!, {@XMM[9]} @ round 0 key
  664. add $const,$const,#.LM0ISR-_bsaes_decrypt8
  665. vldmia $const!, {@XMM[8]} @ .LM0ISR
  666. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  667. veor @XMM[11], @XMM[1], @XMM[9]
  668. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  669. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  670. veor @XMM[12], @XMM[2], @XMM[9]
  671. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  672. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  673. veor @XMM[13], @XMM[3], @XMM[9]
  674. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  675. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  676. veor @XMM[14], @XMM[4], @XMM[9]
  677. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  678. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  679. veor @XMM[15], @XMM[5], @XMM[9]
  680. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  681. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  682. veor @XMM[10], @XMM[6], @XMM[9]
  683. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  684. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  685. veor @XMM[11], @XMM[7], @XMM[9]
  686. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  687. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  688. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  689. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  690. ___
  691. &bitslice (@XMM[0..7, 8..11]);
  692. $code.=<<___;
  693. sub $rounds,$rounds,#1
  694. b .Ldec_sbox
  695. .align 4
  696. .Ldec_loop:
  697. ___
  698. &ShiftRows (@XMM[0..7, 8..12]);
  699. $code.=".Ldec_sbox:\n";
  700. &InvSbox (@XMM[0..7, 8..15]);
  701. $code.=<<___;
  702. subs $rounds,$rounds,#1
  703. bcc .Ldec_done
  704. ___
  705. &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
  706. $code.=<<___;
  707. vldmia $const, {@XMM[12]} @ .LISR
  708. ite eq @ Thumb2 thing, sanity check in ARM
  709. addeq $const,$const,#0x10
  710. bne .Ldec_loop
  711. vldmia $const, {@XMM[12]} @ .LISRM0
  712. b .Ldec_loop
  713. .align 4
  714. .Ldec_done:
  715. ___
  716. &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
  717. $code.=<<___;
  718. vldmia $key, {@XMM[8]} @ last round key
  719. veor @XMM[6], @XMM[6], @XMM[8]
  720. veor @XMM[4], @XMM[4], @XMM[8]
  721. veor @XMM[2], @XMM[2], @XMM[8]
  722. veor @XMM[7], @XMM[7], @XMM[8]
  723. veor @XMM[3], @XMM[3], @XMM[8]
  724. veor @XMM[5], @XMM[5], @XMM[8]
  725. veor @XMM[0], @XMM[0], @XMM[8]
  726. veor @XMM[1], @XMM[1], @XMM[8]
  727. bx lr
  728. .size _bsaes_decrypt8,.-_bsaes_decrypt8
  729. .type _bsaes_const,%object
  730. .align 6
  731. _bsaes_const:
  732. .LM0ISR: @ InvShiftRows constants
  733. .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
  734. .LISR:
  735. .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
  736. .LISRM0:
  737. .quad 0x01040b0e0205080f, 0x0306090c00070a0d
  738. .LM0SR: @ ShiftRows constants
  739. .quad 0x0a0e02060f03070b, 0x0004080c05090d01
  740. .LSR:
  741. .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
  742. .LSRM0:
  743. .quad 0x0304090e00050a0f, 0x01060b0c0207080d
  744. .LM0:
  745. .quad 0x02060a0e03070b0f, 0x0004080c0105090d
  746. .LREVM0SR:
  747. .quad 0x090d01050c000408, 0x03070b0f060a0e02
  748. .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
  749. .align 6
  750. .size _bsaes_const,.-_bsaes_const
  751. .type _bsaes_encrypt8,%function
  752. .align 4
  753. _bsaes_encrypt8:
  754. adr $const,_bsaes_encrypt8
  755. vldmia $key!, {@XMM[9]} @ round 0 key
  756. sub $const,$const,#_bsaes_encrypt8-.LM0SR
  757. vldmia $const!, {@XMM[8]} @ .LM0SR
  758. _bsaes_encrypt8_alt:
  759. veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
  760. veor @XMM[11], @XMM[1], @XMM[9]
  761. vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  762. vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  763. veor @XMM[12], @XMM[2], @XMM[9]
  764. vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  765. vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  766. veor @XMM[13], @XMM[3], @XMM[9]
  767. vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
  768. vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
  769. veor @XMM[14], @XMM[4], @XMM[9]
  770. vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
  771. vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
  772. veor @XMM[15], @XMM[5], @XMM[9]
  773. vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
  774. vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
  775. veor @XMM[10], @XMM[6], @XMM[9]
  776. vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
  777. vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
  778. veor @XMM[11], @XMM[7], @XMM[9]
  779. vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
  780. vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
  781. vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
  782. vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
  783. _bsaes_encrypt8_bitslice:
  784. ___
  785. &bitslice (@XMM[0..7, 8..11]);
  786. $code.=<<___;
  787. sub $rounds,$rounds,#1
  788. b .Lenc_sbox
  789. .align 4
  790. .Lenc_loop:
  791. ___
  792. &ShiftRows (@XMM[0..7, 8..12]);
  793. $code.=".Lenc_sbox:\n";
  794. &Sbox (@XMM[0..7, 8..15]);
  795. $code.=<<___;
  796. subs $rounds,$rounds,#1
  797. bcc .Lenc_done
  798. ___
  799. &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
  800. $code.=<<___;
  801. vldmia $const, {@XMM[12]} @ .LSR
  802. ite eq @ Thumb2 thing, sanity check in ARM
  803. addeq $const,$const,#0x10
  804. bne .Lenc_loop
  805. vldmia $const, {@XMM[12]} @ .LSRM0
  806. b .Lenc_loop
  807. .align 4
  808. .Lenc_done:
  809. ___
  810. # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
  811. &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
  812. $code.=<<___;
  813. vldmia $key, {@XMM[8]} @ last round key
  814. veor @XMM[4], @XMM[4], @XMM[8]
  815. veor @XMM[6], @XMM[6], @XMM[8]
  816. veor @XMM[3], @XMM[3], @XMM[8]
  817. veor @XMM[7], @XMM[7], @XMM[8]
  818. veor @XMM[2], @XMM[2], @XMM[8]
  819. veor @XMM[5], @XMM[5], @XMM[8]
  820. veor @XMM[0], @XMM[0], @XMM[8]
  821. veor @XMM[1], @XMM[1], @XMM[8]
  822. bx lr
  823. .size _bsaes_encrypt8,.-_bsaes_encrypt8
  824. ___
  825. }
  826. {
  827. my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
  828. sub bitslice_key {
  829. my @x=reverse(@_[0..7]);
  830. my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
  831. &swapmove (@x[0,1],1,$bs0,$t2,$t3);
  832. $code.=<<___;
  833. @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
  834. vmov @x[2], @x[0]
  835. vmov @x[3], @x[1]
  836. ___
  837. #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  838. &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
  839. $code.=<<___;
  840. @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  841. vmov @x[4], @x[0]
  842. vmov @x[6], @x[2]
  843. vmov @x[5], @x[1]
  844. vmov @x[7], @x[3]
  845. ___
  846. &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
  847. &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
  848. }
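# bitslice_key() is the reduced form of bitslice() meant for round-key
# material, where the eight input registers hold copies of the same block;
# that is why several swapmove passes above degenerate into plain vmov's
# (the original calls are kept as comments).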
  849. $code.=<<___;
  850. .type _bsaes_key_convert,%function
  851. .align 4
  852. _bsaes_key_convert:
  853. adr $const,_bsaes_key_convert
  854. vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
  855. sub $const,$const,#_bsaes_key_convert-.LM0
  856. vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
  857. vmov.i8 @XMM[8], #0x01 @ bit masks
  858. vmov.i8 @XMM[9], #0x02
  859. vmov.i8 @XMM[10], #0x04
  860. vmov.i8 @XMM[11], #0x08
  861. vmov.i8 @XMM[12], #0x10
  862. vmov.i8 @XMM[13], #0x20
  863. vldmia $const, {@XMM[14]} @ .LM0
  864. #ifdef __ARMEL__
  865. vrev32.8 @XMM[7], @XMM[7]
  866. vrev32.8 @XMM[15], @XMM[15]
  867. #endif
  868. sub $rounds,$rounds,#1
  869. vstmia $out!, {@XMM[7]} @ save round 0 key
  870. b .Lkey_loop
  871. .align 4
  872. .Lkey_loop:
  873. vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
  874. vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
  875. vmov.i8 @XMM[6], #0x40
  876. vmov.i8 @XMM[15], #0x80
  877. vtst.8 @XMM[0], @XMM[7], @XMM[8]
  878. vtst.8 @XMM[1], @XMM[7], @XMM[9]
  879. vtst.8 @XMM[2], @XMM[7], @XMM[10]
  880. vtst.8 @XMM[3], @XMM[7], @XMM[11]
  881. vtst.8 @XMM[4], @XMM[7], @XMM[12]
  882. vtst.8 @XMM[5], @XMM[7], @XMM[13]
  883. vtst.8 @XMM[6], @XMM[7], @XMM[6]
  884. vtst.8 @XMM[7], @XMM[7], @XMM[15]
  885. vld1.8 {@XMM[15]}, [$inp]! @ load next round key
  886. vmvn @XMM[0], @XMM[0] @ "pnot"
  887. vmvn @XMM[1], @XMM[1]
  888. vmvn @XMM[5], @XMM[5]
  889. vmvn @XMM[6], @XMM[6]
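@ bits 0, 1, 5 and 6 are exactly the bits set in 0x63, so complementing
@ these four slices xors the S-box affine constant into every key byte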
  890. #ifdef __ARMEL__
  891. vrev32.8 @XMM[15], @XMM[15]
  892. #endif
  893. subs $rounds,$rounds,#1
  894. vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
  895. bne .Lkey_loop
  896. vmov.i8 @XMM[7],#0x63 @ compose .L63
  897. @ don't save last round key
  898. bx lr
  899. .size _bsaes_key_convert,.-_bsaes_key_convert
  900. ___
  901. }
  902. if (0) { # following four functions are unsupported interface
  903. # used for benchmarking...
  904. $code.=<<___;
  905. .globl bsaes_enc_key_convert
  906. .type bsaes_enc_key_convert,%function
  907. .align 4
  908. bsaes_enc_key_convert:
  909. stmdb sp!,{r4-r6,lr}
  910. vstmdb sp!,{d8-d15} @ ABI specification says so
  911. ldr r5,[$inp,#240] @ pass rounds
  912. mov r4,$inp @ pass key
  913. mov r12,$out @ pass key schedule
  914. bl _bsaes_key_convert
  915. veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
  916. vstmia r12, {@XMM[7]} @ save last round key
  917. vldmia sp!,{d8-d15}
  918. ldmia sp!,{r4-r6,pc}
  919. .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
  920. .globl bsaes_encrypt_128
  921. .type bsaes_encrypt_128,%function
  922. .align 4
  923. bsaes_encrypt_128:
  924. stmdb sp!,{r4-r6,lr}
  925. vstmdb sp!,{d8-d15} @ ABI specification says so
  926. .Lenc128_loop:
  927. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  928. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  929. mov r4,$key @ pass the key
  930. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  931. mov r5,#10 @ pass rounds
  932. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  933. bl _bsaes_encrypt8
  934. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  935. vst1.8 {@XMM[4]}, [$out]!
  936. vst1.8 {@XMM[6]}, [$out]!
  937. vst1.8 {@XMM[3]}, [$out]!
  938. vst1.8 {@XMM[7]}, [$out]!
  939. vst1.8 {@XMM[2]}, [$out]!
  940. subs $len,$len,#0x80
  941. vst1.8 {@XMM[5]}, [$out]!
  942. bhi .Lenc128_loop
  943. vldmia sp!,{d8-d15}
  944. ldmia sp!,{r4-r6,pc}
  945. .size bsaes_encrypt_128,.-bsaes_encrypt_128
  946. .globl bsaes_dec_key_convert
  947. .type bsaes_dec_key_convert,%function
  948. .align 4
  949. bsaes_dec_key_convert:
  950. stmdb sp!,{r4-r6,lr}
  951. vstmdb sp!,{d8-d15} @ ABI specification says so
  952. ldr r5,[$inp,#240] @ pass rounds
  953. mov r4,$inp @ pass key
  954. mov r12,$out @ pass key schedule
  955. bl _bsaes_key_convert
  956. vldmia $out, {@XMM[6]}
  957. vstmia r12, {@XMM[15]} @ save last round key
  958. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  959. vstmia $out, {@XMM[7]}
  960. vldmia sp!,{d8-d15}
  961. ldmia sp!,{r4-r6,pc}
  962. .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
  963. .globl bsaes_decrypt_128
  964. .type bsaes_decrypt_128,%function
  965. .align 4
  966. bsaes_decrypt_128:
  967. stmdb sp!,{r4-r6,lr}
  968. vstmdb sp!,{d8-d15} @ ABI specification says so
  969. .Ldec128_loop:
  970. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  971. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  972. mov r4,$key @ pass the key
  973. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  974. mov r5,#10 @ pass rounds
  975. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
  976. bl _bsaes_decrypt8
  977. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  978. vst1.8 {@XMM[6]}, [$out]!
  979. vst1.8 {@XMM[4]}, [$out]!
  980. vst1.8 {@XMM[2]}, [$out]!
  981. vst1.8 {@XMM[7]}, [$out]!
  982. vst1.8 {@XMM[3]}, [$out]!
  983. subs $len,$len,#0x80
  984. vst1.8 {@XMM[5]}, [$out]!
  985. bhi .Ldec128_loop
  986. vldmia sp!,{d8-d15}
  987. ldmia sp!,{r4-r6,pc}
  988. .size bsaes_decrypt_128,.-bsaes_decrypt_128
  989. ___
  990. }
  991. {
  992. my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
  993. my ($keysched)=("sp");
  994. $code.=<<___;
  995. .extern AES_cbc_encrypt
  996. .extern AES_decrypt
  997. .global bsaes_cbc_encrypt
  998. .type bsaes_cbc_encrypt,%function
  999. .align 5
  1000. bsaes_cbc_encrypt:
  1001. #ifndef __KERNEL__
  1002. cmp $len, #128
  1003. #ifndef __thumb__
  1004. blo AES_cbc_encrypt
  1005. #else
  1006. bhs 1f
  1007. b AES_cbc_encrypt
  1008. 1:
  1009. #endif
  1010. #endif
  1011. @ it is up to the caller to make sure we are called with enc == 0
  1012. mov ip, sp
  1013. stmdb sp!, {r4-r10, lr}
  1014. VFP_ABI_PUSH
  1015. ldr $ivp, [ip] @ IV is 1st arg on the stack
  1016. mov $len, $len, lsr#4 @ len in 16 byte blocks
  1017. sub sp, #0x10 @ scratch space to carry over the IV
  1018. mov $fp, sp @ save sp
  1019. ldr $rounds, [$key, #240] @ get # of rounds
  1020. #ifndef BSAES_ASM_EXTENDED_KEY
  1021. @ allocate the key schedule on the stack
  1022. sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
  1023. add r12, #`128-32` @ size of bit-sliced key schedule
  1024. @ populate the key schedule
  1025. mov r4, $key @ pass key
  1026. mov r5, $rounds @ pass # of rounds
  1027. mov sp, r12 @ sp is $keysched
  1028. bl _bsaes_key_convert
  1029. vldmia $keysched, {@XMM[6]}
  1030. vstmia r12, {@XMM[15]} @ save last round key
  1031. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1032. vstmia $keysched, {@XMM[7]}
  1033. #else
  1034. ldr r12, [$key, #244]
  1035. eors r12, #1
  1036. beq 0f
  1037. @ populate the key schedule
  1038. str r12, [$key, #244]
  1039. mov r4, $key @ pass key
  1040. mov r5, $rounds @ pass # of rounds
  1041. add r12, $key, #248 @ pass key schedule
  1042. bl _bsaes_key_convert
  1043. add r4, $key, #248
  1044. vldmia r4, {@XMM[6]}
  1045. vstmia r12, {@XMM[15]} @ save last round key
  1046. veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
  1047. vstmia r4, {@XMM[7]}
  1048. .align 2
  1049. 0:
  1050. #endif
  1051. vld1.8 {@XMM[15]}, [$ivp] @ load IV
  1052. b .Lcbc_dec_loop
  1053. .align 4
  1054. .Lcbc_dec_loop:
  1055. subs $len, $len, #0x8
  1056. bmi .Lcbc_dec_loop_finish
  1057. vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
  1058. vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
  1059. #ifndef BSAES_ASM_EXTENDED_KEY
  1060. mov r4, $keysched @ pass the key
  1061. #else
  1062. add r4, $key, #248
  1063. #endif
  1064. vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
  1065. mov r5, $rounds
  1066. vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
  1067. sub $inp, $inp, #0x60
  1068. vstmia $fp, {@XMM[15]} @ put aside IV
  1069. bl _bsaes_decrypt8
  1070. vldmia $fp, {@XMM[14]} @ reload IV
  1071. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1072. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1073. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1074. veor @XMM[1], @XMM[1], @XMM[8]
  1075. veor @XMM[6], @XMM[6], @XMM[9]
  1076. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1077. veor @XMM[4], @XMM[4], @XMM[10]
  1078. veor @XMM[2], @XMM[2], @XMM[11]
  1079. vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
  1080. veor @XMM[7], @XMM[7], @XMM[12]
  1081. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1082. veor @XMM[3], @XMM[3], @XMM[13]
  1083. vst1.8 {@XMM[6]}, [$out]!
  1084. veor @XMM[5], @XMM[5], @XMM[14]
  1085. vst1.8 {@XMM[4]}, [$out]!
  1086. vst1.8 {@XMM[2]}, [$out]!
  1087. vst1.8 {@XMM[7]}, [$out]!
  1088. vst1.8 {@XMM[3]}, [$out]!
  1089. vst1.8 {@XMM[5]}, [$out]!
  1090. b .Lcbc_dec_loop
  1091. .Lcbc_dec_loop_finish:
  1092. adds $len, $len, #8
  1093. beq .Lcbc_dec_done
  1094. vld1.8 {@XMM[0]}, [$inp]! @ load input
  1095. cmp $len, #2
  1096. blo .Lcbc_dec_one
  1097. vld1.8 {@XMM[1]}, [$inp]!
  1098. #ifndef BSAES_ASM_EXTENDED_KEY
  1099. mov r4, $keysched @ pass the key
  1100. #else
  1101. add r4, $key, #248
  1102. #endif
  1103. mov r5, $rounds
  1104. vstmia $fp, {@XMM[15]} @ put aside IV
  1105. beq .Lcbc_dec_two
  1106. vld1.8 {@XMM[2]}, [$inp]!
  1107. cmp $len, #4
  1108. blo .Lcbc_dec_three
  1109. vld1.8 {@XMM[3]}, [$inp]!
  1110. beq .Lcbc_dec_four
  1111. vld1.8 {@XMM[4]}, [$inp]!
  1112. cmp $len, #6
  1113. blo .Lcbc_dec_five
  1114. vld1.8 {@XMM[5]}, [$inp]!
  1115. beq .Lcbc_dec_six
  1116. vld1.8 {@XMM[6]}, [$inp]!
  1117. sub $inp, $inp, #0x70
  1118. bl _bsaes_decrypt8
  1119. vldmia $fp, {@XMM[14]} @ reload IV
  1120. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1121. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1122. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1123. veor @XMM[1], @XMM[1], @XMM[8]
  1124. veor @XMM[6], @XMM[6], @XMM[9]
  1125. vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
  1126. veor @XMM[4], @XMM[4], @XMM[10]
  1127. veor @XMM[2], @XMM[2], @XMM[11]
  1128. vld1.8 {@XMM[15]}, [$inp]!
  1129. veor @XMM[7], @XMM[7], @XMM[12]
  1130. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1131. veor @XMM[3], @XMM[3], @XMM[13]
  1132. vst1.8 {@XMM[6]}, [$out]!
  1133. vst1.8 {@XMM[4]}, [$out]!
  1134. vst1.8 {@XMM[2]}, [$out]!
  1135. vst1.8 {@XMM[7]}, [$out]!
  1136. vst1.8 {@XMM[3]}, [$out]!
  1137. b .Lcbc_dec_done
  1138. .align 4
  1139. .Lcbc_dec_six:
  1140. sub $inp, $inp, #0x60
  1141. bl _bsaes_decrypt8
  1142. vldmia $fp,{@XMM[14]} @ reload IV
  1143. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1144. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1145. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1146. veor @XMM[1], @XMM[1], @XMM[8]
  1147. veor @XMM[6], @XMM[6], @XMM[9]
  1148. vld1.8 {@XMM[12]}, [$inp]!
  1149. veor @XMM[4], @XMM[4], @XMM[10]
  1150. veor @XMM[2], @XMM[2], @XMM[11]
  1151. vld1.8 {@XMM[15]}, [$inp]!
  1152. veor @XMM[7], @XMM[7], @XMM[12]
  1153. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1154. vst1.8 {@XMM[6]}, [$out]!
  1155. vst1.8 {@XMM[4]}, [$out]!
  1156. vst1.8 {@XMM[2]}, [$out]!
  1157. vst1.8 {@XMM[7]}, [$out]!
  1158. b .Lcbc_dec_done
  1159. .align 4
  1160. .Lcbc_dec_five:
  1161. sub $inp, $inp, #0x50
  1162. bl _bsaes_decrypt8
  1163. vldmia $fp, {@XMM[14]} @ reload IV
  1164. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1165. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1166. vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
  1167. veor @XMM[1], @XMM[1], @XMM[8]
  1168. veor @XMM[6], @XMM[6], @XMM[9]
  1169. vld1.8 {@XMM[15]}, [$inp]!
  1170. veor @XMM[4], @XMM[4], @XMM[10]
  1171. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1172. veor @XMM[2], @XMM[2], @XMM[11]
  1173. vst1.8 {@XMM[6]}, [$out]!
  1174. vst1.8 {@XMM[4]}, [$out]!
  1175. vst1.8 {@XMM[2]}, [$out]!
  1176. b .Lcbc_dec_done
  1177. .align 4
  1178. .Lcbc_dec_four:
  1179. sub $inp, $inp, #0x40
  1180. bl _bsaes_decrypt8
  1181. vldmia $fp, {@XMM[14]} @ reload IV
  1182. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1183. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1184. vld1.8 {@XMM[10]}, [$inp]!
  1185. veor @XMM[1], @XMM[1], @XMM[8]
  1186. veor @XMM[6], @XMM[6], @XMM[9]
  1187. vld1.8 {@XMM[15]}, [$inp]!
  1188. veor @XMM[4], @XMM[4], @XMM[10]
  1189. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1190. vst1.8 {@XMM[6]}, [$out]!
  1191. vst1.8 {@XMM[4]}, [$out]!
  1192. b .Lcbc_dec_done
  1193. .align 4
  1194. .Lcbc_dec_three:
  1195. sub $inp, $inp, #0x30
  1196. bl _bsaes_decrypt8
  1197. vldmia $fp, {@XMM[14]} @ reload IV
  1198. vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
  1199. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1200. vld1.8 {@XMM[15]}, [$inp]!
  1201. veor @XMM[1], @XMM[1], @XMM[8]
  1202. veor @XMM[6], @XMM[6], @XMM[9]
  1203. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1204. vst1.8 {@XMM[6]}, [$out]!
  1205. b .Lcbc_dec_done
  1206. .align 4
  1207. .Lcbc_dec_two:
  1208. sub $inp, $inp, #0x20
  1209. bl _bsaes_decrypt8
  1210. vldmia $fp, {@XMM[14]} @ reload IV
  1211. vld1.8 {@XMM[8]}, [$inp]! @ reload input
  1212. veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
  1213. vld1.8 {@XMM[15]}, [$inp]! @ reload input
  1214. veor @XMM[1], @XMM[1], @XMM[8]
  1215. vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
  1216. b .Lcbc_dec_done
  1217. .align 4
  1218. .Lcbc_dec_one:
  1219. sub $inp, $inp, #0x10
  1220. mov $rounds, $out @ save original out pointer
  1221. mov $out, $fp @ use the iv scratch space as out buffer
  1222. mov r2, $key
  1223. vmov @XMM[4],@XMM[15] @ just in case ensure that IV
  1224. vmov @XMM[5],@XMM[0] @ and input are preserved
  1225. bl AES_decrypt
  1226. vld1.8 {@XMM[0]}, [$fp,:64] @ load result
  1227. veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
  1228. vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
  1229. vst1.8 {@XMM[0]}, [$rounds] @ write output
  1230. .Lcbc_dec_done:
  1231. #ifndef BSAES_ASM_EXTENDED_KEY
  1232. vmov.i32 q0, #0
  1233. vmov.i32 q1, #0
  1234. .Lcbc_dec_bzero: @ wipe key schedule [if any]
  1235. vstmia $keysched!, {q0-q1}
  1236. cmp $keysched, $fp
  1237. bne .Lcbc_dec_bzero
  1238. #endif
  1239. mov sp, $fp
  1240. add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
  1241. vst1.8 {@XMM[15]}, [$ivp] @ return IV
  1242. VFP_ABI_POP
  1243. ldmia sp!, {r4-r10, pc}
  1244. .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
  1245. ___
  1246. }
  1247. {
  1248. my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
  1249. my $const = "r6"; # shared with _bsaes_encrypt8_alt
  1250. my $keysched = "sp";
  1251. $code.=<<___;
  1252. .extern AES_encrypt
  1253. .global bsaes_ctr32_encrypt_blocks
  1254. .type bsaes_ctr32_encrypt_blocks,%function
  1255. .align 5
  1256. bsaes_ctr32_encrypt_blocks:
  1257. cmp $len, #8 @ use plain AES for
  1258. blo .Lctr_enc_short @ small sizes
  1259. mov ip, sp
  1260. stmdb sp!, {r4-r10, lr}
  1261. VFP_ABI_PUSH
  1262. ldr $ctr, [ip] @ ctr is 1st arg on the stack
  1263. sub sp, sp, #0x10 @ scratch space to carry over the ctr
  1264. mov $fp, sp @ save sp
  1265. ldr $rounds, [$key, #240] @ get # of rounds
  1266. #ifndef BSAES_ASM_EXTENDED_KEY
  1267. @ allocate the key schedule on the stack
  1268. sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
  1269. add r12, #`128-32` @ size of bit-sliced key schedule
  1270. @ populate the key schedule
  1271. mov r4, $key @ pass key
  1272. mov r5, $rounds @ pass # of rounds
  1273. mov sp, r12 @ sp is $keysched
  1274. bl _bsaes_key_convert
	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
	vstmia r12, {@XMM[7]} @ save last round key
	vld1.8 {@XMM[0]}, [$ctr] @ load counter
	add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
	vldmia $keysched, {@XMM[4]} @ load round0 key
#else
	ldr r12, [$key, #244]
	eors r12, #1
	beq 0f
	@ populate the key schedule
	str r12, [$key, #244]
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	add r12, $key, #248 @ pass key schedule
	bl _bsaes_key_convert
	veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
	vstmia r12, {@XMM[7]} @ save last round key
.align 2
0:	add r12, $key, #248
	vld1.8 {@XMM[0]}, [$ctr] @ load counter
	adrl $ctr, .LREVM0SR @ borrow $ctr
	vldmia r12, {@XMM[4]} @ load round0 key
	sub sp, #0x10 @ place for adjusted round0 key
#endif
	vmov.i32 @XMM[8],#1 @ compose 1<<96
	veor @XMM[9],@XMM[9],@XMM[9]
	vrev32.8 @XMM[0],@XMM[0]
	vext.8 @XMM[8],@XMM[9],@XMM[8],#4
	vrev32.8 @XMM[4],@XMM[4]
	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
	vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
	b .Lctr_enc_loop
.align 4
.Lctr_enc_loop:
	vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
	vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
	vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
	vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
	vadd.u32 @XMM[4], @XMM[1], @XMM[10]
	vadd.u32 @XMM[5], @XMM[2], @XMM[10]
	vadd.u32 @XMM[6], @XMM[3], @XMM[10]
	vadd.u32 @XMM[7], @XMM[4], @XMM[10]
	vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
	@ to flip byte order in 32-bit counter
	vldmia $keysched, {@XMM[9]} @ load round0 key
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, $keysched, #0x10 @ pass next round key
#else
	add r4, $key, #`248+16`
#endif
	vldmia $ctr, {@XMM[8]} @ .LREVM0SR
	mov r5, $rounds @ pass rounds
	vstmia $fp, {@XMM[10]} @ save next counter
	sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
	bl _bsaes_encrypt8_alt
	subs $len, $len, #8
	blo .Lctr_enc_loop_done
	vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
	vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
	veor @XMM[0], @XMM[8]
	veor @XMM[1], @XMM[9]
	vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
	veor @XMM[4], @XMM[10]
	veor @XMM[6], @XMM[11]
	vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
	veor @XMM[3], @XMM[12]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
	veor @XMM[7], @XMM[13]
	veor @XMM[2], @XMM[14]
	vst1.8 {@XMM[4]}, [$out]!
	veor @XMM[5], @XMM[15]
	vst1.8 {@XMM[6]}, [$out]!
	vmov.i32 @XMM[8], #1 @ compose 1<<96
	vst1.8 {@XMM[3]}, [$out]!
	veor @XMM[9], @XMM[9], @XMM[9]
	vst1.8 {@XMM[7]}, [$out]!
	vext.8 @XMM[8], @XMM[9], @XMM[8], #4
	vst1.8 {@XMM[2]}, [$out]!
	vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
	vst1.8 {@XMM[5]}, [$out]!
	vldmia $fp, {@XMM[0]} @ load counter
	bne .Lctr_enc_loop
	b .Lctr_enc_done
.align 4
.Lctr_enc_loop_done:
	add $len, $len, #8
	vld1.8 {@XMM[8]}, [$inp]! @ load input
	veor @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [$out]! @ write output
	cmp $len, #2
	blo .Lctr_enc_done
	vld1.8 {@XMM[9]}, [$inp]!
	veor @XMM[1], @XMM[9]
	vst1.8 {@XMM[1]}, [$out]!
	beq .Lctr_enc_done
	vld1.8 {@XMM[10]}, [$inp]!
	veor @XMM[4], @XMM[10]
	vst1.8 {@XMM[4]}, [$out]!
	cmp $len, #4
	blo .Lctr_enc_done
	vld1.8 {@XMM[11]}, [$inp]!
	veor @XMM[6], @XMM[11]
	vst1.8 {@XMM[6]}, [$out]!
	beq .Lctr_enc_done
	vld1.8 {@XMM[12]}, [$inp]!
	veor @XMM[3], @XMM[12]
	vst1.8 {@XMM[3]}, [$out]!
	cmp $len, #6
	blo .Lctr_enc_done
	vld1.8 {@XMM[13]}, [$inp]!
	veor @XMM[7], @XMM[13]
	vst1.8 {@XMM[7]}, [$out]!
	beq .Lctr_enc_done
	vld1.8 {@XMM[14]}, [$inp]
	veor @XMM[2], @XMM[14]
	vst1.8 {@XMM[2]}, [$out]!
.Lctr_enc_done:
	vmov.i32 q0, #0
	vmov.i32 q1, #0
#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero:	@ wipe key schedule [if any]
	vstmia $keysched!, {q0-q1}
	cmp $keysched, $fp
	bne .Lctr_enc_bzero
#else
	vstmia $keysched, {q0-q1}
#endif
	mov sp, $fp
	add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
	VFP_ABI_POP
	ldmia sp!, {r4-r10, pc} @ return
.align 4
.Lctr_enc_short:
	ldr ip, [sp] @ ctr pointer is passed on stack
	stmdb sp!, {r4-r8, lr}
	mov r4, $inp @ copy arguments
	mov r5, $out
	mov r6, $len
	mov r7, $key
	ldr r8, [ip, #12] @ load counter LSW
	vld1.8 {@XMM[1]}, [ip] @ load whole counter value
#ifdef __ARMEL__
	rev r8, r8
#endif
	sub sp, sp, #0x10
	vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
	sub sp, sp, #0x10
.Lctr_enc_short_loop:
	add r0, sp, #0x10 @ input counter value
	mov r1, sp @ output on the stack
	mov r2, r7 @ key
	bl AES_encrypt
	vld1.8 {@XMM[0]}, [r4]! @ load input
	vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
	add r8, r8, #1
#ifdef __ARMEL__
	rev r0, r8
	str r0, [sp, #0x1c] @ next counter value
#else
	str r8, [sp, #0x1c] @ next counter value
#endif
	veor @XMM[0],@XMM[0],@XMM[1]
	vst1.8 {@XMM[0]}, [r5]! @ store output
	subs r6, r6, #1
	bne .Lctr_enc_short_loop
	vmov.i32 q0, #0
	vmov.i32 q1, #0
	vstmia sp!, {q0-q1}
	ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
{
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
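# As a point of reference, a minimal C sketch of how these two entry
# points might be driven (illustrative only, not part of this file;
# the 128-bit keys, 64-byte buffers, zero IV and key-setup calls below
# are assumptions for the example):
#
#	#include <openssl/aes.h>
#
#	unsigned char k1[16], k2[16], iv[16] = {0};
#	char in[64], out[64];
#	AES_KEY ek1, ek2, dk1;
#
#	AES_set_encrypt_key(k1, 128, &ek1);	/* key1: data key */
#	AES_set_encrypt_key(k2, 128, &ek2);	/* key2 encrypts iv[] into the initial tweak */
#	bsaes_xts_encrypt(in, out, sizeof(in), &ek1, &ek2, iv);
#
#	AES_set_decrypt_key(k1, 128, &dk1);	/* decrypt path takes an inverse schedule for key1 */
#	bsaes_xts_decrypt(out, in, sizeof(out), &dk1, &ek2, iv);
#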
my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
my $const="r6"; # returned by _bsaes_key_convert
my $twmask=@XMM[5];
my @T=@XMM[6..7];
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,%function
.align 4
bsaes_xts_encrypt:
	mov ip, sp
	stmdb sp!, {r4-r10, lr} @ 0x20
	VFP_ABI_PUSH
	mov r6, sp @ future $fp
	mov $inp, r0
	mov $out, r1
	mov $len, r2
	mov $key, r3
	sub r0, sp, #0x10 @ 0x10
	bic r0, #0xf @ align at 16 bytes
	mov sp, r0
#ifdef XTS_CHAIN_TWEAK
	ldr r0, [ip] @ pointer to input tweak
#else
	@ generate initial tweak
	ldr r0, [ip, #4] @ iv[]
	mov r1, sp
	ldr r2, [ip, #0] @ key2
	bl AES_encrypt
	mov r0,sp @ pointer to initial tweak
#endif
	ldr $rounds, [$key, #240] @ get # of rounds
	mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
	@ add r12, #`128-32` @ size of bit-sliced key schedule
	sub r12, #`32+16` @ place for tweak[9]
	@ populate the key schedule
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	mov sp, r12
	add r12, #0x90 @ pass key schedule
	bl _bsaes_key_convert
	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
	vstmia r12, {@XMM[7]} @ save last round key
#else
	ldr r12, [$key, #244]
	eors r12, #1
	beq 0f
	str r12, [$key, #244]
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	add r12, $key, #248 @ pass key schedule
	bl _bsaes_key_convert
	veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
	vstmia r12, {@XMM[7]}
.align 2
0:	sub sp, #0x90 @ place for tweak[9]
#endif
	vld1.8 {@XMM[8]}, [r0] @ initial tweak
	adr $magic, .Lxts_magic
	subs $len, #0x80
	blo .Lxts_enc_short
	b .Lxts_enc_loop
.align 4
.Lxts_enc_loop:
	vldmia $magic, {$twmask} @ load XTS magic
	vshr.s64 @T[0], @XMM[8], #63
	mov r0, sp
	vand @T[0], @T[0], $twmask
___
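# The loop below unrolls the per-block XTS tweak update in NEON, using
# the {1, 0x87} mask kept at .Lxts_magic. For reference, one update is
# equivalent to the following C sketch (illustrative only, not part of
# the build; it assumes the 128-bit tweak sits in two little-endian
# 64-bit halves, t[0] low and t[1] high):
#
#	#include <stdint.h>
#
#	static void xts_tweak_double(uint64_t t[2])
#	{
#		uint64_t carry_lo = t[0] >> 63;	/* bit that moves into the high half */
#		uint64_t carry_hi = t[1] >> 63;	/* bit that selects the 0x87 reduction */
#
#		t[1] = (t[1] << 1) | carry_lo;
#		t[0] = (t[0] << 1) ^ (carry_hi * 0x87);
#	}
#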
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64 {@XMM[$i-1]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64 @T[1], @XMM[$i], #63
	veor @XMM[$i], @XMM[$i], @T[0]
	vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
	vst1.64 {@XMM[15]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor @XMM[8], @XMM[8], @T[0]
	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
	veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[6], @XMM[6], @XMM[14]
	mov r5, $rounds @ pass rounds
	veor @XMM[7], @XMM[7], @XMM[15]
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
	veor @XMM[10], @XMM[3], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	veor @XMM[12], @XMM[2], @XMM[14]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	veor @XMM[13], @XMM[5], @XMM[15]
	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	subs $len, #0x80
	bpl .Lxts_enc_loop
.Lxts_enc_short:
	adds $len, #0x70
	bmi .Lxts_enc_done
	vldmia $magic, {$twmask} @ load XTS magic
	vshr.s64 @T[0], @XMM[8], #63
	mov r0, sp
	vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64 {@XMM[$i-1]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64 @T[1], @XMM[$i], #63
	veor @XMM[$i], @XMM[$i], @T[0]
	vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8 {@XMM[$i-10]}, [$inp]!
	subs $len, #0x10
	bmi .Lxts_enc_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	sub $len, #0x10
	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
	vld1.8 {@XMM[6]}, [$inp]!
	veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[6], @XMM[6], @XMM[14]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	vld1.64 {@XMM[14]}, [r0,:128]!
	veor @XMM[10], @XMM[3], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	veor @XMM[12], @XMM[2], @XMM[14]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	vst1.8 {@XMM[12]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_6:
	vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
	veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[5], @XMM[5], @XMM[13]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	veor @XMM[10], @XMM[3], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
@ put this in range for both ARM and Thumb mode adr instructions
.align 5
.Lxts_magic:
.quad 1, 0x87
.align 5
.Lxts_enc_5:
	vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
	veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[4], @XMM[4], @XMM[12]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	veor @XMM[10], @XMM[3], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	vst1.8 {@XMM[10]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_4:
	vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
	veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[3], @XMM[3], @XMM[11]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[6], @XMM[11]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_3:
	vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
	veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[2], @XMM[2], @XMM[10]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64 {@XMM[10]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[4], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	vst1.8 {@XMM[8]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_2:
	vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
	veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[1], @XMM[1], @XMM[9]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_encrypt8
	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_enc_done
.align 4
.Lxts_enc_1:
	mov r0, sp
	veor @XMM[0], @XMM[8]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	mov r4, $fp @ preserve fp
	bl AES_encrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [$out]!
	mov $fp, r4
	vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_enc_done:
#ifndef XTS_CHAIN_TWEAK
	adds $len, #0x10
	beq .Lxts_enc_ret
	sub r6, $out, #0x10
.Lxts_enc_steal:
	ldrb r0, [$inp], #1
	ldrb r1, [$out, #-0x10]
	strb r0, [$out, #-0x10]
	strb r1, [$out], #1
	subs $len, #1
	bhi .Lxts_enc_steal
	vld1.8 {@XMM[0]}, [r6]
	mov r0, sp
	veor @XMM[0], @XMM[0], @XMM[8]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	mov r4, $fp @ preserve fp
	bl AES_encrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [r6]
	mov $fp, r4
#endif
.Lxts_enc_ret:
	bic r0, $fp, #0xf
	vmov.i32 q0, #0
	vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_enc_bzero:	@ wipe key schedule [if any]
	vstmia sp!, {q0-q1}
	cmp sp, r0
	bne .Lxts_enc_bzero
	mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
	vst1.8 {@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,%function
.align 4
bsaes_xts_decrypt:
	mov ip, sp
	stmdb sp!, {r4-r10, lr} @ 0x20
	VFP_ABI_PUSH
	mov r6, sp @ future $fp
	mov $inp, r0
	mov $out, r1
	mov $len, r2
	mov $key, r3
	sub r0, sp, #0x10 @ 0x10
	bic r0, #0xf @ align at 16 bytes
	mov sp, r0
#ifdef XTS_CHAIN_TWEAK
	ldr r0, [ip] @ pointer to input tweak
#else
	@ generate initial tweak
	ldr r0, [ip, #4] @ iv[]
	mov r1, sp
	ldr r2, [ip, #0] @ key2
	bl AES_encrypt
	mov r0, sp @ pointer to initial tweak
#endif
	ldr $rounds, [$key, #240] @ get # of rounds
	mov $fp, r6
#ifndef BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
	@ add r12, #`128-32` @ size of bit-sliced key schedule
	sub r12, #`32+16` @ place for tweak[9]
	@ populate the key schedule
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	mov sp, r12
	add r12, #0x90 @ pass key schedule
	bl _bsaes_key_convert
	add r4, sp, #0x90
	vldmia r4, {@XMM[6]}
	vstmia r12, {@XMM[15]} @ save last round key
	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
	vstmia r4, {@XMM[7]}
#else
	ldr r12, [$key, #244]
	eors r12, #1
	beq 0f
	str r12, [$key, #244]
	mov r4, $key @ pass key
	mov r5, $rounds @ pass # of rounds
	add r12, $key, #248 @ pass key schedule
	bl _bsaes_key_convert
	add r4, $key, #248
	vldmia r4, {@XMM[6]}
	vstmia r12, {@XMM[15]} @ save last round key
	veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
	vstmia r4, {@XMM[7]}
.align 2
0:	sub sp, #0x90 @ place for tweak[9]
#endif
	vld1.8 {@XMM[8]}, [r0] @ initial tweak
	adr $magic, .Lxts_magic
#ifndef XTS_CHAIN_TWEAK
	tst $len, #0xf @ if not multiple of 16
	it ne @ Thumb2 thing, sanity check in ARM
	subne $len, #0x10 @ subtract another 16 bytes
#endif
	subs $len, #0x80
	blo .Lxts_dec_short
	b .Lxts_dec_loop
.align 4
.Lxts_dec_loop:
	vldmia $magic, {$twmask} @ load XTS magic
	vshr.s64 @T[0], @XMM[8], #63
	mov r0, sp
	vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64 {@XMM[$i-1]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64 @T[1], @XMM[$i], #63
	veor @XMM[$i], @XMM[$i], @T[0]
	vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8 {@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	vadd.u64 @XMM[8], @XMM[15], @XMM[15]
	vst1.64 {@XMM[15]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor @XMM[8], @XMM[8], @T[0]
	vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
	veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[6], @XMM[6], @XMM[14]
	mov r5, $rounds @ pass rounds
	veor @XMM[7], @XMM[7], @XMM[15]
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
	veor @XMM[10], @XMM[2], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	veor @XMM[12], @XMM[3], @XMM[14]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	veor @XMM[13], @XMM[5], @XMM[15]
	vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	subs $len, #0x80
	bpl .Lxts_dec_loop
.Lxts_dec_short:
	adds $len, #0x70
	bmi .Lxts_dec_done
	vldmia $magic, {$twmask} @ load XTS magic
	vshr.s64 @T[0], @XMM[8], #63
	mov r0, sp
	vand @T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64 {@XMM[$i-1]}, [r0,:128]!
	vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64 @T[1], @XMM[$i], #63
	veor @XMM[$i], @XMM[$i], @T[0]
	vand @T[1], @T[1], $twmask
___
@T=reverse(@T);
$code.=<<___ if ($i>=10);
	vld1.8 {@XMM[$i-10]}, [$inp]!
	subs $len, #0x10
	bmi .Lxts_dec_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	sub $len, #0x10
	vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
	vld1.8 {@XMM[6]}, [$inp]!
	veor @XMM[5], @XMM[5], @XMM[13]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[6], @XMM[6], @XMM[14]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	vld1.64 {@XMM[14]}, [r0,:128]!
	veor @XMM[10], @XMM[2], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	veor @XMM[12], @XMM[3], @XMM[14]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	vst1.8 {@XMM[12]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_6:
	vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
	veor @XMM[4], @XMM[4], @XMM[12]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[5], @XMM[5], @XMM[13]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	veor @XMM[10], @XMM[2], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	veor @XMM[11], @XMM[7], @XMM[13]
	vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_5:
	vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
	veor @XMM[3], @XMM[3], @XMM[11]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[4], @XMM[4], @XMM[12]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	vld1.64 {@XMM[12]}, [r0,:128]!
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	veor @XMM[10], @XMM[2], @XMM[12]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	vst1.8 {@XMM[10]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_4:
	vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
	veor @XMM[2], @XMM[2], @XMM[10]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[3], @XMM[3], @XMM[11]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	veor @XMM[9], @XMM[4], @XMM[11]
	vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_3:
	vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
	veor @XMM[1], @XMM[1], @XMM[9]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[2], @XMM[2], @XMM[10]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64 {@XMM[10]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	veor @XMM[8], @XMM[6], @XMM[10]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	vst1.8 {@XMM[8]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_2:
	vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
	veor @XMM[0], @XMM[0], @XMM[8]
#ifndef BSAES_ASM_EXTENDED_KEY
	add r4, sp, #0x90 @ pass key schedule
#else
	add r4, $key, #248 @ pass key schedule
#endif
	veor @XMM[1], @XMM[1], @XMM[9]
	mov r5, $rounds @ pass rounds
	mov r0, sp
	bl _bsaes_decrypt8
	vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
	veor @XMM[0], @XMM[0], @XMM[ 8]
	veor @XMM[1], @XMM[1], @XMM[ 9]
	vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
	vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
	b .Lxts_dec_done
.align 4
.Lxts_dec_1:
	mov r0, sp
	veor @XMM[0], @XMM[8]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	mov r4, $fp @ preserve fp
	mov r5, $magic @ preserve magic
	bl AES_decrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [$out]!
	mov $fp, r4
	mov $magic, r5
	vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_dec_done:
#ifndef XTS_CHAIN_TWEAK
	adds $len, #0x10
	beq .Lxts_dec_ret
	@ calculate one round of extra tweak for the stolen ciphertext
	vldmia $magic, {$twmask}
	vshr.s64 @XMM[6], @XMM[8], #63
	vand @XMM[6], @XMM[6], $twmask
	vadd.u64 @XMM[9], @XMM[8], @XMM[8]
	vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
	veor @XMM[9], @XMM[9], @XMM[6]
	@ perform the final decryption with the last tweak value
	vld1.8 {@XMM[0]}, [$inp]!
	mov r0, sp
	veor @XMM[0], @XMM[0], @XMM[9]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	mov r4, $fp @ preserve fp
	bl AES_decrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[9]
	vst1.8 {@XMM[0]}, [$out]
	mov r6, $out
.Lxts_dec_steal:
	ldrb r1, [$out]
	ldrb r0, [$inp], #1
	strb r1, [$out, #0x10]
	strb r0, [$out], #1
	subs $len, #1
	bhi .Lxts_dec_steal
	vld1.8 {@XMM[0]}, [r6]
	mov r0, sp
	veor @XMM[0], @XMM[8]
	mov r1, sp
	vst1.8 {@XMM[0]}, [sp,:128]
	mov r2, $key
	bl AES_decrypt
	vld1.8 {@XMM[0]}, [sp,:128]
	veor @XMM[0], @XMM[0], @XMM[8]
	vst1.8 {@XMM[0]}, [r6]
	mov $fp, r4
#endif
.Lxts_dec_ret:
	bic r0, $fp, #0xf
	vmov.i32 q0, #0
	vmov.i32 q1, #0
#ifdef XTS_CHAIN_TWEAK
	ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
#endif
.Lxts_dec_bzero:	@ wipe key schedule [if any]
	vstmia sp!, {q0-q1}
	cmp sp, r0
	bne .Lxts_dec_bzero
	mov sp, $fp
#ifdef XTS_CHAIN_TWEAK
	vst1.8 {@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
#endif
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
print $code;
close STDOUT;