aesni-intel_asm.S

  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * Ported x86_64 version to x86:
  24. * Author: Mathias Krause <minipli@googlemail.com>
  25. *
  26. * This program is free software; you can redistribute it and/or modify
  27. * it under the terms of the GNU General Public License as published by
  28. * the Free Software Foundation; either version 2 of the License, or
  29. * (at your option) any later version.
  30. */
  31. #include <linux/linkage.h>
  32. #include <asm/inst.h>
  33. #include <asm/frame.h>
  34. #include <asm/nospec-branch.h>
  35. /*
  36. * The following macros are used to move an (un)aligned 16 byte value to/from
  37. * an XMM register. This can be done for either FP or integer values; for FP use
  38. * movaps (move aligned packed single) and for integer use movdqa (move double quad
  39. * aligned). It doesn't make a performance difference which instruction is used
  40. * since Nehalem (original Core i7) was released. However, the movaps is a byte
  41. * shorter, so that is the one we'll use for now. (same for unaligned).
  42. */
  43. #define MOVADQ movaps
  44. #define MOVUDQ movups
  45. #ifdef __x86_64__
  46. .data
  47. .align 16
  48. .Lgf128mul_x_ble_mask:
  49. .octa 0x00000000000000010000000000000087
  50. POLY: .octa 0xC2000000000000000000000000000001
  51. TWOONE: .octa 0x00000001000000000000000000000001
  52. # order of these constants should not change.
  53. # more specifically, ALL_F should follow SHIFT_MASK,
  54. # and ZERO should follow ALL_F
  55. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  56. MASK1: .octa 0x0000000000000000ffffffffffffffff
  57. MASK2: .octa 0xffffffffffffffff0000000000000000
  58. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  59. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  60. ZERO: .octa 0x00000000000000000000000000000000
  61. ONE: .octa 0x00000000000000000000000000000001
  62. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  63. dec: .octa 0x1
  64. enc: .octa 0x2
  65. .text
  66. #define STACK_OFFSET 8*3
  67. #define HashKey 16*0 // store HashKey <<1 mod poly here
  68. #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
  69. #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
  70. #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
  71. #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
  72. // bits of HashKey <<1 mod poly here
  73. //(for Karatsuba purposes)
  74. #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
  75. // bits of HashKey^2 <<1 mod poly here
  76. // (for Karatsuba purposes)
  77. #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
  78. // bits of HashKey^3 <<1 mod poly here
  79. // (for Karatsuba purposes)
  80. #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
  81. // bits of HashKey^4 <<1 mod poly here
  82. // (for Karatsuba purposes)
  83. #define VARIABLE_OFFSET 16*8
  84. #define arg1 rdi
  85. #define arg2 rsi
  86. #define arg3 rdx
  87. #define arg4 rcx
  88. #define arg5 r8
  89. #define arg6 r9
  90. #define arg7 STACK_OFFSET+8(%r14)
  91. #define arg8 STACK_OFFSET+16(%r14)
  92. #define arg9 STACK_OFFSET+24(%r14)
  93. #define arg10 STACK_OFFSET+32(%r14)
  94. #define keysize 2*15*16(%arg1)
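/*
 * keysize reads the key_length field of the kernel's struct crypto_aes_ctx,
 * which holds two 15*16-byte expanded key tables followed by the key length
 * in bytes, hence the 2*15*16 offset. Minimal C sketch for orientation only;
 * the authoritative definition lives in <crypto/aes.h>:
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// 15 round keys * 16 bytes = 240 bytes
 *		u32 key_dec[60];	// another 240 bytes
 *		u32 key_length;		// 16, 24 or 32 (bytes, not bits)
 *	};
 *
 * With key_length in bytes, the "shr $2; add $5" sequences below turn
 * 16/24/32 into 9/11/13, i.e. the number of AESENC rounds before the
 * final AESENCLAST.
 */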
  95. #endif
  96. #define STATE1 %xmm0
  97. #define STATE2 %xmm4
  98. #define STATE3 %xmm5
  99. #define STATE4 %xmm6
  100. #define STATE STATE1
  101. #define IN1 %xmm1
  102. #define IN2 %xmm7
  103. #define IN3 %xmm8
  104. #define IN4 %xmm9
  105. #define IN IN1
  106. #define KEY %xmm2
  107. #define IV %xmm3
  108. #define BSWAP_MASK %xmm10
  109. #define CTR %xmm11
  110. #define INC %xmm12
  111. #define GF128MUL_MASK %xmm10
  112. #ifdef __x86_64__
  113. #define AREG %rax
  114. #define KEYP %rdi
  115. #define OUTP %rsi
  116. #define UKEYP OUTP
  117. #define INP %rdx
  118. #define LEN %rcx
  119. #define IVP %r8
  120. #define KLEN %r9d
  121. #define T1 %r10
  122. #define TKEYP T1
  123. #define T2 %r11
  124. #define TCTR_LOW T2
  125. #else
  126. #define AREG %eax
  127. #define KEYP %edi
  128. #define OUTP AREG
  129. #define UKEYP OUTP
  130. #define INP %edx
  131. #define LEN %esi
  132. #define IVP %ebp
  133. #define KLEN %ebx
  134. #define T1 %ecx
  135. #define TKEYP T1
  136. #endif
  137. #ifdef __x86_64__
  138. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  139. *
  140. *
  141. * Input: A and B (128-bits each, bit-reflected)
  142. * Output: C = A*B*x mod poly, (i.e. >>1 )
  143. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  144. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  145. *
  146. */
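/*
 * For reference, a minimal (unoptimized) C model of the GF(2^128) product
 * that GHASH_MUL is arranged to produce, following the "right-shift"
 * algorithm of NIST SP 800-38D. The macro itself works on bit-reflected
 * operands with PCLMULQDQ + Karatsuba, which is why the key is passed in as
 * HashKey<<1 mod poly. Names (be128, gf128_mul_ref) are illustrative only,
 * not kernel API; sketch, not part of this build.
 *
 *	#include <stdint.h>
 *	typedef struct { uint64_t hi, lo; } be128;	// big-endian halves
 *
 *	static be128 gf128_mul_ref(be128 x, be128 y)
 *	{
 *		be128 z = { 0, 0 }, v = x;
 *		for (int i = 0; i < 128; i++) {
 *			// bits of y, most significant first
 *			uint64_t bit = (i < 64 ? y.hi >> (63 - i)
 *					       : y.lo >> (127 - i)) & 1;
 *			if (bit) {
 *				z.hi ^= v.hi;
 *				z.lo ^= v.lo;
 *			}
 *			// v = v*x: shift right one bit, reduce with
 *			// R = 0xE1 || 0^120 when a bit falls off the end
 *			uint64_t lsb = v.lo & 1;
 *			v.lo = (v.lo >> 1) | (v.hi << 63);
 *			v.hi >>= 1;
 *			if (lsb)
 *				v.hi ^= 0xE100000000000000ULL;
 *		}
 *		return z;
 *	}
 */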
  147. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  148. movdqa \GH, \TMP1
  149. pshufd $78, \GH, \TMP2
  150. pshufd $78, \HK, \TMP3
  151. pxor \GH, \TMP2 # TMP2 = a1+a0
  152. pxor \HK, \TMP3 # TMP3 = b1+b0
  153. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  154. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  155. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  156. pxor \GH, \TMP2
  157. pxor \TMP1, \TMP2 # TMP2 = a0*b1 + a1*b0 (middle Karatsuba term)
  158. movdqa \TMP2, \TMP3
  159. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  160. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  161. pxor \TMP3, \GH
  162. pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  163. # first phase of the reduction
  164. movdqa \GH, \TMP2
  165. movdqa \GH, \TMP3
  166. movdqa \GH, \TMP4 # copy GH into TMP2, TMP3 and TMP4
  167. # in order to perform
  168. # independent shifts
  169. pslld $31, \TMP2 # packed left shift <<31
  170. pslld $30, \TMP3 # packed left shift <<30
  171. pslld $25, \TMP4 # packed left shift <<25
  172. pxor \TMP3, \TMP2 # xor the shifted versions
  173. pxor \TMP4, \TMP2
  174. movdqa \TMP2, \TMP5
  175. psrldq $4, \TMP5 # right shift TMP5 1 DW
  176. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  177. pxor \TMP2, \GH
  178. # second phase of the reduction
  179. movdqa \GH,\TMP2 # copy GH into TMP2, TMP3 and TMP4
  180. # in order to perform
  181. # independent shifts
  182. movdqa \GH,\TMP3
  183. movdqa \GH,\TMP4
  184. psrld $1,\TMP2 # packed right shift >>1
  185. psrld $2,\TMP3 # packed right shift >>2
  186. psrld $7,\TMP4 # packed right shift >>7
  187. pxor \TMP3,\TMP2 # xor the shifted versions
  188. pxor \TMP4,\TMP2
  189. pxor \TMP5, \TMP2
  190. pxor \TMP2, \GH
  191. pxor \TMP1, \GH # result is in GH
  192. .endm
  193. /*
  194. * if a = number of total plaintext bytes
  195. * b = floor(a/16)
  196. * num_initial_blocks = b mod 4
  197. * encrypt the initial num_initial_blocks blocks and apply ghash on
  198. * the ciphertext
  199. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  200. * are clobbered
  201. * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  202. */
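/*
 * Example: with a = 100 bytes of plaintext, b = floor(100/16) = 6 full blocks,
 * so num_initial_blocks = 6 mod 4 = 2; two blocks are handled by this macro
 * and the remaining four full blocks go through the 4-way parallel loop.
 */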
  203. .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  204. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  205. MOVADQ SHUF_MASK(%rip), %xmm14
  206. mov arg7, %r10 # %r10 = AAD
  207. mov arg8, %r12 # %r12 = aadLen
  208. mov %r12, %r11
  209. pxor %xmm\i, %xmm\i
  210. _get_AAD_loop\num_initial_blocks\operation:
  211. movd (%r10), \TMP1
  212. pslldq $12, \TMP1
  213. psrldq $4, %xmm\i
  214. pxor \TMP1, %xmm\i
  215. add $4, %r10
  216. sub $4, %r12
  217. jne _get_AAD_loop\num_initial_blocks\operation
  218. cmp $16, %r11
  219. je _get_AAD_loop2_done\num_initial_blocks\operation
  220. mov $16, %r12
  221. _get_AAD_loop2\num_initial_blocks\operation:
  222. psrldq $4, %xmm\i
  223. sub $4, %r12
  224. cmp %r11, %r12
  225. jne _get_AAD_loop2\num_initial_blocks\operation
  226. _get_AAD_loop2_done\num_initial_blocks\operation:
  227. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  228. xor %r11, %r11 # initialise the data pointer offset as zero
  229. # start AES for num_initial_blocks blocks
  230. mov %arg5, %rax # %rax = *Y0
  231. movdqu (%rax), \XMM0 # XMM0 = Y0
  232. PSHUFB_XMM %xmm14, \XMM0
  233. .if (\i == 5) || (\i == 6) || (\i == 7)
  234. MOVADQ ONE(%RIP),\TMP1
  235. MOVADQ (%arg1),\TMP2
  236. .irpc index, \i_seq
  237. paddd \TMP1, \XMM0 # INCR Y0
  238. movdqa \XMM0, %xmm\index
  239. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  240. pxor \TMP2, %xmm\index
  241. .endr
  242. lea 0x10(%arg1),%r10
  243. mov keysize,%eax
  244. shr $2,%eax # 128->4, 192->6, 256->8
  245. add $5,%eax # 128->9, 192->11, 256->13
  246. aes_loop_initial_dec\num_initial_blocks:
  247. MOVADQ (%r10),\TMP1
  248. .irpc index, \i_seq
  249. AESENC \TMP1, %xmm\index
  250. .endr
  251. add $16,%r10
  252. sub $1,%eax
  253. jnz aes_loop_initial_dec\num_initial_blocks
  254. MOVADQ (%r10), \TMP1
  255. .irpc index, \i_seq
  256. AESENCLAST \TMP1, %xmm\index # Last Round
  257. .endr
  258. .irpc index, \i_seq
  259. movdqu (%arg3 , %r11, 1), \TMP1
  260. pxor \TMP1, %xmm\index
  261. movdqu %xmm\index, (%arg2 , %r11, 1)
  262. # write back plaintext/ciphertext for num_initial_blocks
  263. add $16, %r11
  264. movdqa \TMP1, %xmm\index
  265. PSHUFB_XMM %xmm14, %xmm\index
  266. # prepare plaintext/ciphertext for GHASH computation
  267. .endr
  268. .endif
  269. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  270. # apply GHASH on num_initial_blocks blocks
  271. .if \i == 5
  272. pxor %xmm5, %xmm6
  273. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  274. pxor %xmm6, %xmm7
  275. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  276. pxor %xmm7, %xmm8
  277. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  278. .elseif \i == 6
  279. pxor %xmm6, %xmm7
  280. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  281. pxor %xmm7, %xmm8
  282. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  283. .elseif \i == 7
  284. pxor %xmm7, %xmm8
  285. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  286. .endif
  287. cmp $64, %r13
  288. jl _initial_blocks_done\num_initial_blocks\operation
  289. # no need for precomputed values
  290. /*
  291. *
  292. * Precomputations for HashKey parallel with encryption of first 4 blocks.
  293. * HashKey_i_k holds the XORed value of the low and high parts of HashKey_i
  294. */
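/*
 * The stored XOR of halves is exactly what Karatsuba needs: writing each
 * operand as a1*x^64 + a0 and b1*x^64 + b0,
 *
 *	a*b = a1*b1*x^128 + [(a1+a0)*(b1+b0) + a1*b1 + a0*b0]*x^64 + a0*b0
 *
 * (all additions are XOR), so keeping (high64 ^ low64) of each HashKey^i
 * avoids recomputing it for every middle-term PCLMULQDQ below.
 */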
  295. MOVADQ ONE(%rip), \TMP1
  296. paddd \TMP1, \XMM0 # INCR Y0
  297. MOVADQ \XMM0, \XMM1
  298. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  299. paddd \TMP1, \XMM0 # INCR Y0
  300. MOVADQ \XMM0, \XMM2
  301. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  302. paddd \TMP1, \XMM0 # INCR Y0
  303. MOVADQ \XMM0, \XMM3
  304. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  305. paddd \TMP1, \XMM0 # INCR Y0
  306. MOVADQ \XMM0, \XMM4
  307. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  308. MOVADQ 0(%arg1),\TMP1
  309. pxor \TMP1, \XMM1
  310. pxor \TMP1, \XMM2
  311. pxor \TMP1, \XMM3
  312. pxor \TMP1, \XMM4
  313. movdqa \TMP3, \TMP5
  314. pshufd $78, \TMP3, \TMP1
  315. pxor \TMP3, \TMP1
  316. movdqa \TMP1, HashKey_k(%rsp)
  317. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  318. # TMP5 = HashKey^2<<1 (mod poly)
  319. movdqa \TMP5, HashKey_2(%rsp)
  320. # HashKey_2 = HashKey^2<<1 (mod poly)
  321. pshufd $78, \TMP5, \TMP1
  322. pxor \TMP5, \TMP1
  323. movdqa \TMP1, HashKey_2_k(%rsp)
  324. .irpc index, 1234 # do 4 rounds
  325. movaps 0x10*\index(%arg1), \TMP1
  326. AESENC \TMP1, \XMM1
  327. AESENC \TMP1, \XMM2
  328. AESENC \TMP1, \XMM3
  329. AESENC \TMP1, \XMM4
  330. .endr
  331. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  332. # TMP5 = HashKey^3<<1 (mod poly)
  333. movdqa \TMP5, HashKey_3(%rsp)
  334. pshufd $78, \TMP5, \TMP1
  335. pxor \TMP5, \TMP1
  336. movdqa \TMP1, HashKey_3_k(%rsp)
  337. .irpc index, 56789 # do next 5 rounds
  338. movaps 0x10*\index(%arg1), \TMP1
  339. AESENC \TMP1, \XMM1
  340. AESENC \TMP1, \XMM2
  341. AESENC \TMP1, \XMM3
  342. AESENC \TMP1, \XMM4
  343. .endr
  344. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  345. # TMP5 = HashKey^4<<1 (mod poly)
  346. movdqa \TMP5, HashKey_4(%rsp)
  347. pshufd $78, \TMP5, \TMP1
  348. pxor \TMP5, \TMP1
  349. movdqa \TMP1, HashKey_4_k(%rsp)
  350. lea 0xa0(%arg1),%r10
  351. mov keysize,%eax
  352. shr $2,%eax # 128->4, 192->6, 256->8
  353. sub $4,%eax # 128->0, 192->2, 256->4
  354. jz aes_loop_pre_dec_done\num_initial_blocks
  355. aes_loop_pre_dec\num_initial_blocks:
  356. MOVADQ (%r10),\TMP2
  357. .irpc index, 1234
  358. AESENC \TMP2, %xmm\index
  359. .endr
  360. add $16,%r10
  361. sub $1,%eax
  362. jnz aes_loop_pre_dec\num_initial_blocks
  363. aes_loop_pre_dec_done\num_initial_blocks:
  364. MOVADQ (%r10), \TMP2
  365. AESENCLAST \TMP2, \XMM1
  366. AESENCLAST \TMP2, \XMM2
  367. AESENCLAST \TMP2, \XMM3
  368. AESENCLAST \TMP2, \XMM4
  369. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  370. pxor \TMP1, \XMM1
  371. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  372. movdqa \TMP1, \XMM1
  373. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  374. pxor \TMP1, \XMM2
  375. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  376. movdqa \TMP1, \XMM2
  377. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  378. pxor \TMP1, \XMM3
  379. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  380. movdqa \TMP1, \XMM3
  381. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  382. pxor \TMP1, \XMM4
  383. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  384. movdqa \TMP1, \XMM4
  385. add $64, %r11
  386. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  387. pxor \XMMDst, \XMM1
  388. # combine GHASHed value with the corresponding ciphertext
  389. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  390. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  391. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  392. _initial_blocks_done\num_initial_blocks\operation:
  393. .endm
  394. /*
  395. * if a = number of total plaintext bytes
  396. * b = floor(a/16)
  397. * num_initial_blocks = b mod 4
  398. * encrypt the initial num_initial_blocks blocks and apply ghash on
  399. * the ciphertext
  400. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  401. * are clobbered
  402. * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  403. */
  404. .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  405. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  406. MOVADQ SHUF_MASK(%rip), %xmm14
  407. mov arg7, %r10 # %r10 = AAD
  408. mov arg8, %r12 # %r12 = aadLen
  409. mov %r12, %r11
  410. pxor %xmm\i, %xmm\i
  411. _get_AAD_loop\num_initial_blocks\operation:
  412. movd (%r10), \TMP1
  413. pslldq $12, \TMP1
  414. psrldq $4, %xmm\i
  415. pxor \TMP1, %xmm\i
  416. add $4, %r10
  417. sub $4, %r12
  418. jne _get_AAD_loop\num_initial_blocks\operation
  419. cmp $16, %r11
  420. je _get_AAD_loop2_done\num_initial_blocks\operation
  421. mov $16, %r12
  422. _get_AAD_loop2\num_initial_blocks\operation:
  423. psrldq $4, %xmm\i
  424. sub $4, %r12
  425. cmp %r11, %r12
  426. jne _get_AAD_loop2\num_initial_blocks\operation
  427. _get_AAD_loop2_done\num_initial_blocks\operation:
  428. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  429. xor %r11, %r11 # initialise the data pointer offset as zero
  430. # start AES for num_initial_blocks blocks
  431. mov %arg5, %rax # %rax = *Y0
  432. movdqu (%rax), \XMM0 # XMM0 = Y0
  433. PSHUFB_XMM %xmm14, \XMM0
  434. .if (\i == 5) || (\i == 6) || (\i == 7)
  435. MOVADQ ONE(%RIP),\TMP1
  436. MOVADQ 0(%arg1),\TMP2
  437. .irpc index, \i_seq
  438. paddd \TMP1, \XMM0 # INCR Y0
  439. MOVADQ \XMM0, %xmm\index
  440. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  441. pxor \TMP2, %xmm\index
  442. .endr
  443. lea 0x10(%arg1),%r10
  444. mov keysize,%eax
  445. shr $2,%eax # 128->4, 192->6, 256->8
  446. add $5,%eax # 128->9, 192->11, 256->13
  447. aes_loop_initial_enc\num_initial_blocks:
  448. MOVADQ (%r10),\TMP1
  449. .irpc index, \i_seq
  450. AESENC \TMP1, %xmm\index
  451. .endr
  452. add $16,%r10
  453. sub $1,%eax
  454. jnz aes_loop_initial_enc\num_initial_blocks
  455. MOVADQ (%r10), \TMP1
  456. .irpc index, \i_seq
  457. AESENCLAST \TMP1, %xmm\index # Last Round
  458. .endr
  459. .irpc index, \i_seq
  460. movdqu (%arg3 , %r11, 1), \TMP1
  461. pxor \TMP1, %xmm\index
  462. movdqu %xmm\index, (%arg2 , %r11, 1)
  463. # write back plaintext/ciphertext for num_initial_blocks
  464. add $16, %r11
  465. PSHUFB_XMM %xmm14, %xmm\index
  466. # prepare plaintext/ciphertext for GHASH computation
  467. .endr
  468. .endif
  469. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  470. # apply GHASH on num_initial_blocks blocks
  471. .if \i == 5
  472. pxor %xmm5, %xmm6
  473. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  474. pxor %xmm6, %xmm7
  475. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  476. pxor %xmm7, %xmm8
  477. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  478. .elseif \i == 6
  479. pxor %xmm6, %xmm7
  480. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  481. pxor %xmm7, %xmm8
  482. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  483. .elseif \i == 7
  484. pxor %xmm7, %xmm8
  485. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  486. .endif
  487. cmp $64, %r13
  488. jl _initial_blocks_done\num_initial_blocks\operation
  489. # no need for precomputed values
  490. /*
  491. *
  492. * Precomputations for HashKey parallel with encryption of first 4 blocks.
  493. * HashKey_i_k holds the XORed value of the low and high parts of HashKey_i
  494. */
  495. MOVADQ ONE(%RIP),\TMP1
  496. paddd \TMP1, \XMM0 # INCR Y0
  497. MOVADQ \XMM0, \XMM1
  498. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  499. paddd \TMP1, \XMM0 # INCR Y0
  500. MOVADQ \XMM0, \XMM2
  501. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  502. paddd \TMP1, \XMM0 # INCR Y0
  503. MOVADQ \XMM0, \XMM3
  504. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  505. paddd \TMP1, \XMM0 # INCR Y0
  506. MOVADQ \XMM0, \XMM4
  507. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  508. MOVADQ 0(%arg1),\TMP1
  509. pxor \TMP1, \XMM1
  510. pxor \TMP1, \XMM2
  511. pxor \TMP1, \XMM3
  512. pxor \TMP1, \XMM4
  513. movdqa \TMP3, \TMP5
  514. pshufd $78, \TMP3, \TMP1
  515. pxor \TMP3, \TMP1
  516. movdqa \TMP1, HashKey_k(%rsp)
  517. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  518. # TMP5 = HashKey^2<<1 (mod poly)
  519. movdqa \TMP5, HashKey_2(%rsp)
  520. # HashKey_2 = HashKey^2<<1 (mod poly)
  521. pshufd $78, \TMP5, \TMP1
  522. pxor \TMP5, \TMP1
  523. movdqa \TMP1, HashKey_2_k(%rsp)
  524. .irpc index, 1234 # do 4 rounds
  525. movaps 0x10*\index(%arg1), \TMP1
  526. AESENC \TMP1, \XMM1
  527. AESENC \TMP1, \XMM2
  528. AESENC \TMP1, \XMM3
  529. AESENC \TMP1, \XMM4
  530. .endr
  531. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  532. # TMP5 = HashKey^3<<1 (mod poly)
  533. movdqa \TMP5, HashKey_3(%rsp)
  534. pshufd $78, \TMP5, \TMP1
  535. pxor \TMP5, \TMP1
  536. movdqa \TMP1, HashKey_3_k(%rsp)
  537. .irpc index, 56789 # do next 5 rounds
  538. movaps 0x10*\index(%arg1), \TMP1
  539. AESENC \TMP1, \XMM1
  540. AESENC \TMP1, \XMM2
  541. AESENC \TMP1, \XMM3
  542. AESENC \TMP1, \XMM4
  543. .endr
  544. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  545. # TMP5 = HashKey^4<<1 (mod poly)
  546. movdqa \TMP5, HashKey_4(%rsp)
  547. pshufd $78, \TMP5, \TMP1
  548. pxor \TMP5, \TMP1
  549. movdqa \TMP1, HashKey_4_k(%rsp)
  550. lea 0xa0(%arg1),%r10
  551. mov keysize,%eax
  552. shr $2,%eax # 128->4, 192->6, 256->8
  553. sub $4,%eax # 128->0, 192->2, 256->4
  554. jz aes_loop_pre_enc_done\num_initial_blocks
  555. aes_loop_pre_enc\num_initial_blocks:
  556. MOVADQ (%r10),\TMP2
  557. .irpc index, 1234
  558. AESENC \TMP2, %xmm\index
  559. .endr
  560. add $16,%r10
  561. sub $1,%eax
  562. jnz aes_loop_pre_enc\num_initial_blocks
  563. aes_loop_pre_enc_done\num_initial_blocks:
  564. MOVADQ (%r10), \TMP2
  565. AESENCLAST \TMP2, \XMM1
  566. AESENCLAST \TMP2, \XMM2
  567. AESENCLAST \TMP2, \XMM3
  568. AESENCLAST \TMP2, \XMM4
  569. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  570. pxor \TMP1, \XMM1
  571. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  572. pxor \TMP1, \XMM2
  573. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  574. pxor \TMP1, \XMM3
  575. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  576. pxor \TMP1, \XMM4
  577. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  578. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  579. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  580. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  581. add $64, %r11
  582. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  583. pxor \XMMDst, \XMM1
  584. # combine GHASHed value with the corresponding ciphertext
  585. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  586. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  587. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  588. _initial_blocks_done\num_initial_blocks\operation:
  589. .endm
  590. /*
  591. * encrypt 4 blocks at a time
  592. * ghash the 4 previously encrypted ciphertext blocks
  593. * arg1, %arg2, %arg3 are used as pointers only, not modified
  594. * %r11 is the data offset value
  595. */
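/*
 * The macro below software-pipelines two independent jobs: the AES rounds for
 * the next four counter blocks are interleaved with the GHASH (PCLMULQDQ)
 * work on the four previously produced ciphertext blocks, so the multiply
 * latency hides behind the AESENC chain. Algebraically, the four-way GHASH
 * step folds the accumulator T and ciphertext blocks C1..C4 as
 *
 *	T' = (T + C1)*H^4 + C2*H^3 + C3*H^2 + C4*H
 *
 * which equals four serial ((T + Ci) * H) steps, using the HashKey^i powers
 * precomputed above.
 */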
  596. .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
  597. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  598. movdqa \XMM1, \XMM5
  599. movdqa \XMM2, \XMM6
  600. movdqa \XMM3, \XMM7
  601. movdqa \XMM4, \XMM8
  602. movdqa SHUF_MASK(%rip), %xmm15
  603. # multiply TMP5 * HashKey using karatsuba
  604. movdqa \XMM5, \TMP4
  605. pshufd $78, \XMM5, \TMP6
  606. pxor \XMM5, \TMP6
  607. paddd ONE(%rip), \XMM0 # INCR CNT
  608. movdqa HashKey_4(%rsp), \TMP5
  609. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  610. movdqa \XMM0, \XMM1
  611. paddd ONE(%rip), \XMM0 # INCR CNT
  612. movdqa \XMM0, \XMM2
  613. paddd ONE(%rip), \XMM0 # INCR CNT
  614. movdqa \XMM0, \XMM3
  615. paddd ONE(%rip), \XMM0 # INCR CNT
  616. movdqa \XMM0, \XMM4
  617. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  618. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  619. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  620. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  621. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  622. pxor (%arg1), \XMM1
  623. pxor (%arg1), \XMM2
  624. pxor (%arg1), \XMM3
  625. pxor (%arg1), \XMM4
  626. movdqa HashKey_4_k(%rsp), \TMP5
  627. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  628. movaps 0x10(%arg1), \TMP1
  629. AESENC \TMP1, \XMM1 # Round 1
  630. AESENC \TMP1, \XMM2
  631. AESENC \TMP1, \XMM3
  632. AESENC \TMP1, \XMM4
  633. movaps 0x20(%arg1), \TMP1
  634. AESENC \TMP1, \XMM1 # Round 2
  635. AESENC \TMP1, \XMM2
  636. AESENC \TMP1, \XMM3
  637. AESENC \TMP1, \XMM4
  638. movdqa \XMM6, \TMP1
  639. pshufd $78, \XMM6, \TMP2
  640. pxor \XMM6, \TMP2
  641. movdqa HashKey_3(%rsp), \TMP5
  642. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  643. movaps 0x30(%arg1), \TMP3
  644. AESENC \TMP3, \XMM1 # Round 3
  645. AESENC \TMP3, \XMM2
  646. AESENC \TMP3, \XMM3
  647. AESENC \TMP3, \XMM4
  648. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  649. movaps 0x40(%arg1), \TMP3
  650. AESENC \TMP3, \XMM1 # Round 4
  651. AESENC \TMP3, \XMM2
  652. AESENC \TMP3, \XMM3
  653. AESENC \TMP3, \XMM4
  654. movdqa HashKey_3_k(%rsp), \TMP5
  655. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  656. movaps 0x50(%arg1), \TMP3
  657. AESENC \TMP3, \XMM1 # Round 5
  658. AESENC \TMP3, \XMM2
  659. AESENC \TMP3, \XMM3
  660. AESENC \TMP3, \XMM4
  661. pxor \TMP1, \TMP4
  662. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  663. pxor \XMM6, \XMM5
  664. pxor \TMP2, \TMP6
  665. movdqa \XMM7, \TMP1
  666. pshufd $78, \XMM7, \TMP2
  667. pxor \XMM7, \TMP2
  668. movdqa HashKey_2(%rsp ), \TMP5
  669. # Multiply TMP5 * HashKey using karatsuba
  670. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  671. movaps 0x60(%arg1), \TMP3
  672. AESENC \TMP3, \XMM1 # Round 6
  673. AESENC \TMP3, \XMM2
  674. AESENC \TMP3, \XMM3
  675. AESENC \TMP3, \XMM4
  676. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  677. movaps 0x70(%arg1), \TMP3
  678. AESENC \TMP3, \XMM1 # Round 7
  679. AESENC \TMP3, \XMM2
  680. AESENC \TMP3, \XMM3
  681. AESENC \TMP3, \XMM4
  682. movdqa HashKey_2_k(%rsp), \TMP5
  683. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  684. movaps 0x80(%arg1), \TMP3
  685. AESENC \TMP3, \XMM1 # Round 8
  686. AESENC \TMP3, \XMM2
  687. AESENC \TMP3, \XMM3
  688. AESENC \TMP3, \XMM4
  689. pxor \TMP1, \TMP4
  690. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  691. pxor \XMM7, \XMM5
  692. pxor \TMP2, \TMP6
  693. # Multiply XMM8 * HashKey
  694. # XMM8 and TMP5 hold the values for the two operands
  695. movdqa \XMM8, \TMP1
  696. pshufd $78, \XMM8, \TMP2
  697. pxor \XMM8, \TMP2
  698. movdqa HashKey(%rsp), \TMP5
  699. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  700. movaps 0x90(%arg1), \TMP3
  701. AESENC \TMP3, \XMM1 # Round 9
  702. AESENC \TMP3, \XMM2
  703. AESENC \TMP3, \XMM3
  704. AESENC \TMP3, \XMM4
  705. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  706. lea 0xa0(%arg1),%r10
  707. mov keysize,%eax
  708. shr $2,%eax # 128->4, 192->6, 256->8
  709. sub $4,%eax # 128->0, 192->2, 256->4
  710. jz aes_loop_par_enc_done
  711. aes_loop_par_enc:
  712. MOVADQ (%r10),\TMP3
  713. .irpc index, 1234
  714. AESENC \TMP3, %xmm\index
  715. .endr
  716. add $16,%r10
  717. sub $1,%eax
  718. jnz aes_loop_par_enc
  719. aes_loop_par_enc_done:
  720. MOVADQ (%r10), \TMP3
  721. AESENCLAST \TMP3, \XMM1 # Round 10
  722. AESENCLAST \TMP3, \XMM2
  723. AESENCLAST \TMP3, \XMM3
  724. AESENCLAST \TMP3, \XMM4
  725. movdqa HashKey_k(%rsp), \TMP5
  726. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  727. movdqu (%arg3,%r11,1), \TMP3
  728. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  729. movdqu 16(%arg3,%r11,1), \TMP3
  730. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  731. movdqu 32(%arg3,%r11,1), \TMP3
  732. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  733. movdqu 48(%arg3,%r11,1), \TMP3
  734. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  735. movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
  736. movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
  737. movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
  738. movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
  739. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  740. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  741. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  742. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  743. pxor \TMP4, \TMP1
  744. pxor \XMM8, \XMM5
  745. pxor \TMP6, \TMP2
  746. pxor \TMP1, \TMP2
  747. pxor \XMM5, \TMP2
  748. movdqa \TMP2, \TMP3
  749. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  750. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  751. pxor \TMP3, \XMM5
  752. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  753. # first phase of reduction
  754. movdqa \XMM5, \TMP2
  755. movdqa \XMM5, \TMP3
  756. movdqa \XMM5, \TMP4
  757. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  758. pslld $31, \TMP2 # packed left shift << 31
  759. pslld $30, \TMP3 # packed left shift << 30
  760. pslld $25, \TMP4 # packed left shift << 25
  761. pxor \TMP3, \TMP2 # xor the shifted versions
  762. pxor \TMP4, \TMP2
  763. movdqa \TMP2, \TMP5
  764. psrldq $4, \TMP5 # right shift T5 1 DW
  765. pslldq $12, \TMP2 # left shift T2 3 DWs
  766. pxor \TMP2, \XMM5
  767. # second phase of reduction
  768. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  769. movdqa \XMM5,\TMP3
  770. movdqa \XMM5,\TMP4
  771. psrld $1, \TMP2 # packed right shift >>1
  772. psrld $2, \TMP3 # packed right shift >>2
  773. psrld $7, \TMP4 # packed right shift >>7
  774. pxor \TMP3,\TMP2 # xor the shifted versions
  775. pxor \TMP4,\TMP2
  776. pxor \TMP5, \TMP2
  777. pxor \TMP2, \XMM5
  778. pxor \TMP1, \XMM5 # result is in XMM5
  779. pxor \XMM5, \XMM1
  780. .endm
  781. /*
  782. * decrypt 4 blocks at a time
  783. * ghash the 4 previously decrypted ciphertext blocks
  784. * arg1, %arg2, %arg3 are used as pointers only, not modified
  785. * %r11 is the data offset value
  786. */
  787. .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
  788. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  789. movdqa \XMM1, \XMM5
  790. movdqa \XMM2, \XMM6
  791. movdqa \XMM3, \XMM7
  792. movdqa \XMM4, \XMM8
  793. movdqa SHUF_MASK(%rip), %xmm15
  794. # multiply TMP5 * HashKey using karatsuba
  795. movdqa \XMM5, \TMP4
  796. pshufd $78, \XMM5, \TMP6
  797. pxor \XMM5, \TMP6
  798. paddd ONE(%rip), \XMM0 # INCR CNT
  799. movdqa HashKey_4(%rsp), \TMP5
  800. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  801. movdqa \XMM0, \XMM1
  802. paddd ONE(%rip), \XMM0 # INCR CNT
  803. movdqa \XMM0, \XMM2
  804. paddd ONE(%rip), \XMM0 # INCR CNT
  805. movdqa \XMM0, \XMM3
  806. paddd ONE(%rip), \XMM0 # INCR CNT
  807. movdqa \XMM0, \XMM4
  808. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  809. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  810. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  811. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  812. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  813. pxor (%arg1), \XMM1
  814. pxor (%arg1), \XMM2
  815. pxor (%arg1), \XMM3
  816. pxor (%arg1), \XMM4
  817. movdqa HashKey_4_k(%rsp), \TMP5
  818. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  819. movaps 0x10(%arg1), \TMP1
  820. AESENC \TMP1, \XMM1 # Round 1
  821. AESENC \TMP1, \XMM2
  822. AESENC \TMP1, \XMM3
  823. AESENC \TMP1, \XMM4
  824. movaps 0x20(%arg1), \TMP1
  825. AESENC \TMP1, \XMM1 # Round 2
  826. AESENC \TMP1, \XMM2
  827. AESENC \TMP1, \XMM3
  828. AESENC \TMP1, \XMM4
  829. movdqa \XMM6, \TMP1
  830. pshufd $78, \XMM6, \TMP2
  831. pxor \XMM6, \TMP2
  832. movdqa HashKey_3(%rsp), \TMP5
  833. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  834. movaps 0x30(%arg1), \TMP3
  835. AESENC \TMP3, \XMM1 # Round 3
  836. AESENC \TMP3, \XMM2
  837. AESENC \TMP3, \XMM3
  838. AESENC \TMP3, \XMM4
  839. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  840. movaps 0x40(%arg1), \TMP3
  841. AESENC \TMP3, \XMM1 # Round 4
  842. AESENC \TMP3, \XMM2
  843. AESENC \TMP3, \XMM3
  844. AESENC \TMP3, \XMM4
  845. movdqa HashKey_3_k(%rsp), \TMP5
  846. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  847. movaps 0x50(%arg1), \TMP3
  848. AESENC \TMP3, \XMM1 # Round 5
  849. AESENC \TMP3, \XMM2
  850. AESENC \TMP3, \XMM3
  851. AESENC \TMP3, \XMM4
  852. pxor \TMP1, \TMP4
  853. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  854. pxor \XMM6, \XMM5
  855. pxor \TMP2, \TMP6
  856. movdqa \XMM7, \TMP1
  857. pshufd $78, \XMM7, \TMP2
  858. pxor \XMM7, \TMP2
  859. movdqa HashKey_2(%rsp ), \TMP5
  860. # Multiply TMP5 * HashKey using karatsuba
  861. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  862. movaps 0x60(%arg1), \TMP3
  863. AESENC \TMP3, \XMM1 # Round 6
  864. AESENC \TMP3, \XMM2
  865. AESENC \TMP3, \XMM3
  866. AESENC \TMP3, \XMM4
  867. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  868. movaps 0x70(%arg1), \TMP3
  869. AESENC \TMP3, \XMM1 # Round 7
  870. AESENC \TMP3, \XMM2
  871. AESENC \TMP3, \XMM3
  872. AESENC \TMP3, \XMM4
  873. movdqa HashKey_2_k(%rsp), \TMP5
  874. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  875. movaps 0x80(%arg1), \TMP3
  876. AESENC \TMP3, \XMM1 # Round 8
  877. AESENC \TMP3, \XMM2
  878. AESENC \TMP3, \XMM3
  879. AESENC \TMP3, \XMM4
  880. pxor \TMP1, \TMP4
  881. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  882. pxor \XMM7, \XMM5
  883. pxor \TMP2, \TMP6
  884. # Multiply XMM8 * HashKey
  885. # XMM8 and TMP5 hold the values for the two operands
  886. movdqa \XMM8, \TMP1
  887. pshufd $78, \XMM8, \TMP2
  888. pxor \XMM8, \TMP2
  889. movdqa HashKey(%rsp), \TMP5
  890. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  891. movaps 0x90(%arg1), \TMP3
  892. AESENC \TMP3, \XMM1 # Round 9
  893. AESENC \TMP3, \XMM2
  894. AESENC \TMP3, \XMM3
  895. AESENC \TMP3, \XMM4
  896. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  897. lea 0xa0(%arg1),%r10
  898. mov keysize,%eax
  899. shr $2,%eax # 128->4, 192->6, 256->8
  900. sub $4,%eax # 128->0, 192->2, 256->4
  901. jz aes_loop_par_dec_done
  902. aes_loop_par_dec:
  903. MOVADQ (%r10),\TMP3
  904. .irpc index, 1234
  905. AESENC \TMP3, %xmm\index
  906. .endr
  907. add $16,%r10
  908. sub $1,%eax
  909. jnz aes_loop_par_dec
  910. aes_loop_par_dec_done:
  911. MOVADQ (%r10), \TMP3
  912. AESENCLAST \TMP3, \XMM1 # last round
  913. AESENCLAST \TMP3, \XMM2
  914. AESENCLAST \TMP3, \XMM3
  915. AESENCLAST \TMP3, \XMM4
  916. movdqa HashKey_k(%rsp), \TMP5
  917. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  918. movdqu (%arg3,%r11,1), \TMP3
  919. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  920. movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
  921. movdqa \TMP3, \XMM1
  922. movdqu 16(%arg3,%r11,1), \TMP3
  923. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  924. movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
  925. movdqa \TMP3, \XMM2
  926. movdqu 32(%arg3,%r11,1), \TMP3
  927. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  928. movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
  929. movdqa \TMP3, \XMM3
  930. movdqu 48(%arg3,%r11,1), \TMP3
  931. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  932. movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
  933. movdqa \TMP3, \XMM4
  934. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  935. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  936. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  937. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  938. pxor \TMP4, \TMP1
  939. pxor \XMM8, \XMM5
  940. pxor \TMP6, \TMP2
  941. pxor \TMP1, \TMP2
  942. pxor \XMM5, \TMP2
  943. movdqa \TMP2, \TMP3
  944. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  945. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  946. pxor \TMP3, \XMM5
  947. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  948. # first phase of reduction
  949. movdqa \XMM5, \TMP2
  950. movdqa \XMM5, \TMP3
  951. movdqa \XMM5, \TMP4
  952. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  953. pslld $31, \TMP2 # packed left shift << 31
  954. pslld $30, \TMP3 # packed left shift << 30
  955. pslld $25, \TMP4 # packed left shift << 25
  956. pxor \TMP3, \TMP2 # xor the shifted versions
  957. pxor \TMP4, \TMP2
  958. movdqa \TMP2, \TMP5
  959. psrldq $4, \TMP5 # right shift T5 1 DW
  960. pslldq $12, \TMP2 # left shift T2 3 DWs
  961. pxor \TMP2, \XMM5
  962. # second phase of reduction
  963. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  964. movdqa \XMM5,\TMP3
  965. movdqa \XMM5,\TMP4
  966. psrld $1, \TMP2 # packed right shift >>1
  967. psrld $2, \TMP3 # packed right shift >>2
  968. psrld $7, \TMP4 # packed right shift >>7
  969. pxor \TMP3,\TMP2 # xor the shifted versions
  970. pxor \TMP4,\TMP2
  971. pxor \TMP5, \TMP2
  972. pxor \TMP2, \XMM5
  973. pxor \TMP1, \XMM5 # result is in XMM5
  974. pxor \XMM5, \XMM1
  975. .endm
  976. /* GHASH the last 4 ciphertext blocks. */
  977. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  978. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  979. # Multiply TMP6 * HashKey (using Karatsuba)
  980. movdqa \XMM1, \TMP6
  981. pshufd $78, \XMM1, \TMP2
  982. pxor \XMM1, \TMP2
  983. movdqa HashKey_4(%rsp), \TMP5
  984. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  985. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  986. movdqa HashKey_4_k(%rsp), \TMP4
  987. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  988. movdqa \XMM1, \XMMDst
  989. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
  990. # Multiply TMP1 * HashKey (using Karatsuba)
  991. movdqa \XMM2, \TMP1
  992. pshufd $78, \XMM2, \TMP2
  993. pxor \XMM2, \TMP2
  994. movdqa HashKey_3(%rsp), \TMP5
  995. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  996. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  997. movdqa HashKey_3_k(%rsp), \TMP4
  998. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  999. pxor \TMP1, \TMP6
  1000. pxor \XMM2, \XMMDst
  1001. pxor \TMP2, \XMM1
  1002. # results accumulated in TMP6, XMMDst, XMM1
  1003. # Multiply TMP1 * HashKey (using Karatsuba)
  1004. movdqa \XMM3, \TMP1
  1005. pshufd $78, \XMM3, \TMP2
  1006. pxor \XMM3, \TMP2
  1007. movdqa HashKey_2(%rsp), \TMP5
  1008. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1009. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  1010. movdqa HashKey_2_k(%rsp), \TMP4
  1011. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1012. pxor \TMP1, \TMP6
  1013. pxor \XMM3, \XMMDst
  1014. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
  1015. # Multiply TMP1 * HashKey (using Karatsuba)
  1016. movdqa \XMM4, \TMP1
  1017. pshufd $78, \XMM4, \TMP2
  1018. pxor \XMM4, \TMP2
  1019. movdqa HashKey(%rsp), \TMP5
  1020. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1021. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  1022. movdqa HashKey_k(%rsp), \TMP4
  1023. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1024. pxor \TMP1, \TMP6
  1025. pxor \XMM4, \XMMDst
  1026. pxor \XMM1, \TMP2
  1027. pxor \TMP6, \TMP2
  1028. pxor \XMMDst, \TMP2
  1029. # middle section of the temp results combined as in karatsuba algorithm
  1030. movdqa \TMP2, \TMP4
  1031. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  1032. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1033. pxor \TMP4, \XMMDst
  1034. pxor \TMP2, \TMP6
  1035. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  1036. # first phase of the reduction
  1037. movdqa \XMMDst, \TMP2
  1038. movdqa \XMMDst, \TMP3
  1039. movdqa \XMMDst, \TMP4
  1040. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
  1041. pslld $31, \TMP2 # packed left shifting << 31
  1042. pslld $30, \TMP3 # packed left shifting << 30
  1043. pslld $25, \TMP4 # packed left shifting << 25
  1044. pxor \TMP3, \TMP2 # xor the shifted versions
  1045. pxor \TMP4, \TMP2
  1046. movdqa \TMP2, \TMP7
  1047. psrldq $4, \TMP7 # right shift TMP7 1 DW
  1048. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  1049. pxor \TMP2, \XMMDst
  1050. # second phase of the reduction
  1051. movdqa \XMMDst, \TMP2
  1052. # make 3 copies of XMMDst for doing 3 shift operations
  1053. movdqa \XMMDst, \TMP3
  1054. movdqa \XMMDst, \TMP4
  1055. psrld $1, \TMP2 # packed right shift >> 1
  1056. psrld $2, \TMP3 # packed right shift >> 2
  1057. psrld $7, \TMP4 # packed right shift >> 7
  1058. pxor \TMP3, \TMP2 # xor the shifted versions
  1059. pxor \TMP4, \TMP2
  1060. pxor \TMP7, \TMP2
  1061. pxor \TMP2, \XMMDst
  1062. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  1063. .endm
  1064. /* Encryption of a single block
  1065. * uses eax & r10
  1066. */
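/*
 * C intrinsics equivalent of ENCRYPT_SINGLE_BLOCK, for reference only
 * (sketch; assumes the expanded round keys are laid out 16 bytes apart as in
 * struct crypto_aes_ctx::key_enc; compile with -maes):
 *
 *	#include <immintrin.h>
 *
 *	static __m128i aes_encrypt_block(const __m128i *rk, int key_length,
 *					 __m128i block)
 *	{
 *		int last = (key_length >> 2) + 6;	// 16/24/32 -> 10/12/14
 *		__m128i b = _mm_xor_si128(block, rk[0]);
 *		for (int r = 1; r < last; r++)
 *			b = _mm_aesenc_si128(b, rk[r]);
 *		return _mm_aesenclast_si128(b, rk[last]);
 *	}
 */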
  1067. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  1068. pxor (%arg1), \XMM0
  1069. mov keysize,%eax
  1070. shr $2,%eax # 128->4, 192->6, 256->8
  1071. add $5,%eax # 128->9, 192->11, 256->13
  1072. lea 16(%arg1), %r10 # get first expanded key address
  1073. _esb_loop_\@:
  1074. MOVADQ (%r10),\TMP1
  1075. AESENC \TMP1,\XMM0
  1076. add $16,%r10
  1077. sub $1,%eax
  1078. jnz _esb_loop_\@
  1079. MOVADQ (%r10),\TMP1
  1080. AESENCLAST \TMP1,\XMM0
  1081. .endm
  1082. /*****************************************************************************
  1083. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1084. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  1085. * const u8 *in, // Ciphertext input
  1086. * u64 plaintext_len, // Length of data in bytes for decryption.
  1087. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1088. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1089. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1090. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1091. * const u8 *aad, // Additional Authentication Data (AAD)
  1092. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1093. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  1094. * // given authentication tag and only return the plaintext if they match.
  1095. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  1096. * // (most likely), 12 or 8.
  1097. *
  1098. * Assumptions:
  1099. *
  1100. * keys:
  1101. * keys are pre-expanded and aligned to 16 bytes. we are using the first
  1102. * set of 11 keys in the data structure void *aes_ctx
  1103. *
  1104. * iv:
  1105. * 0 1 2 3
  1106. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1107. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1108. * | Salt (From the SA) |
  1109. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1110. * | Initialization Vector |
  1111. * | (This is the sequence number from IPSec header) |
  1112. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1113. * | 0x1 |
  1114. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1115. *
  1116. *
  1117. *
  1118. * AAD:
  1119. * AAD padded to 128 bits with 0
  1120. * for example, assume AAD is a u32 vector
  1121. *
  1122. * if AAD is 8 bytes:
  1123. * AAD[3] = {A0, A1};
  1124. * padded AAD in xmm register = {A1 A0 0 0}
  1125. *
  1126. * 0 1 2 3
  1127. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1128. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1129. * | SPI (A1) |
  1130. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1131. * | 32-bit Sequence Number (A0) |
  1132. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1133. * | 0x0 |
  1134. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1135. *
  1136. * AAD Format with 32-bit Sequence Number
  1137. *
  1138. * if AAD is 12 bytes:
  1139. * AAD[3] = {A0, A1, A2};
  1140. * padded AAD in xmm register = {A2 A1 A0 0}
  1141. *
  1142. * 0 1 2 3
  1143. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1144. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1147. * | SPI (A2) |
  1148. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1149. * | 64-bit Extended Sequence Number {A1,A0} |
  1150. * | |
  1151. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1152. * | 0x0 |
  1153. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1154. *
  1155. * AAD Format with 64-bit Extended Sequence Number
  1156. *
  1157. * aadLen:
  1158. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  1159. * The code supports 16 too but for other sizes, the code will fail.
  1160. *
  1161. * TLen:
  1162. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1163. * For other sizes, the code will fail.
  1164. *
  1165. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1166. *
  1167. *****************************************************************************/
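/*
 * Caller's-eye view, mirroring the prototype documented above (illustrative
 * sketch only; in-kernel users reach this through the AEAD API and
 * kernel_fpu_begin()/kernel_fpu_end(), error handling omitted):
 *
 *	u8 iv[16];
 *	memcpy(iv, salt, 4);			// 4-byte salt from the SA
 *	memcpy(iv + 4, esp_iv, 8);		// 8-byte IV from the ESP payload
 *	*(__be32 *)(iv + 12) = cpu_to_be32(1);	// trailing 0x00000001
 *
 *	aesni_gcm_dec(aes_ctx, out, in, plaintext_len,
 *		      iv, hash_subkey, aad, aad_len, auth_tag, 16);
 *	// the computed tag lands in auth_tag; the caller compares it to the
 *	// tag received with the packet
 */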
  1168. ENTRY(aesni_gcm_dec)
  1169. push %r12
  1170. push %r13
  1171. push %r14
  1172. mov %rsp, %r14
  1173. /*
  1174. * states of %xmm registers %xmm6:%xmm15 not saved
  1175. * all %xmm registers are clobbered
  1176. */
  1177. sub $VARIABLE_OFFSET, %rsp
  1178. and $~63, %rsp # align rsp to 64 bytes
  1179. mov %arg6, %r12
  1180. movdqu (%r12), %xmm13 # %xmm13 = HashKey
  1181. movdqa SHUF_MASK(%rip), %xmm2
  1182. PSHUFB_XMM %xmm2, %xmm13
  1183. # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
  1184. movdqa %xmm13, %xmm2
  1185. psllq $1, %xmm13
  1186. psrlq $63, %xmm2
  1187. movdqa %xmm2, %xmm1
  1188. pslldq $8, %xmm2
  1189. psrldq $8, %xmm1
  1190. por %xmm2, %xmm13
  1191. # Reduction
  1192. pshufd $0x24, %xmm1, %xmm2
  1193. pcmpeqd TWOONE(%rip), %xmm2
  1194. pand POLY(%rip), %xmm2
  1195. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
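# In C terms the block above is: carry = H >> 127; H <<= 1; if (carry) H ^= POLY;
# done branch-free: the pcmpeqd/pand pair builds either POLY or all-zeroes from
# the shifted-out bit, so the final xor is unconditional.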
  1196. # Decrypt first few blocks
  1197. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  1198. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1199. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  1200. mov %r13, %r12
  1201. and $(3<<4), %r12
  1202. jz _initial_num_blocks_is_0_decrypt
  1203. cmp $(2<<4), %r12
  1204. jb _initial_num_blocks_is_1_decrypt
  1205. je _initial_num_blocks_is_2_decrypt
  1206. _initial_num_blocks_is_3_decrypt:
  1207. INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1208. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
  1209. sub $48, %r13
  1210. jmp _initial_blocks_decrypted
  1211. _initial_num_blocks_is_2_decrypt:
  1212. INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1213. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
  1214. sub $32, %r13
  1215. jmp _initial_blocks_decrypted
  1216. _initial_num_blocks_is_1_decrypt:
  1217. INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1218. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
  1219. sub $16, %r13
  1220. jmp _initial_blocks_decrypted
  1221. _initial_num_blocks_is_0_decrypt:
  1222. INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1223. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
  1224. _initial_blocks_decrypted:
  1225. cmp $0, %r13
  1226. je _zero_cipher_left_decrypt
  1227. sub $64, %r13
  1228. je _four_cipher_left_decrypt
  1229. _decrypt_by_4:
  1230. GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1231. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
  1232. add $64, %r11
  1233. sub $64, %r13
  1234. jne _decrypt_by_4
  1235. _four_cipher_left_decrypt:
  1236. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1237. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1238. _zero_cipher_left_decrypt:
  1239. mov %arg4, %r13
  1240. and $15, %r13 # %r13 = arg4 (mod 16)
  1241. je _multiple_of_16_bytes_decrypt
  1242. # Handle the last <16 byte block separately
  1243. paddd ONE(%rip), %xmm0 # increment CNT to get Yn
  1244. movdqa SHUF_MASK(%rip), %xmm10
  1245. PSHUFB_XMM %xmm10, %xmm0
  1246. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
  1247. sub $16, %r11
  1248. add %r13, %r11
  1249. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1250. lea SHIFT_MASK+16(%rip), %r12
  1251. sub %r13, %r12
  1252. # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
  1253. # (%r13 is the number of bytes in plaintext mod 16)
  1254. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
  1256. movdqa %xmm1, %xmm2
  1257. pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
  1258. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1259. # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
  1260. pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
  1261. pand %xmm1, %xmm2
  1262. movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm2
  1264. pxor %xmm2, %xmm8
  1265. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1266. # GHASH computation for the last <16 byte block
  1267. sub %r13, %r11
  1268. add $16, %r11
  1269. # output %r13 bytes
  1270. MOVQ_R64_XMM %xmm0, %rax
  1271. cmp $8, %r13
  1272. jle _less_than_8_bytes_left_decrypt
  1273. mov %rax, (%arg2 , %r11, 1)
  1274. add $8, %r11
  1275. psrldq $8, %xmm0
  1276. MOVQ_R64_XMM %xmm0, %rax
  1277. sub $8, %r13
  1278. _less_than_8_bytes_left_decrypt:
  1279. mov %al, (%arg2, %r11, 1)
  1280. add $1, %r11
  1281. shr $8, %rax
  1282. sub $1, %r13
  1283. jne _less_than_8_bytes_left_decrypt
  1284. _multiple_of_16_bytes_decrypt:
mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1286. shl $3, %r12 # convert into number of bits
  1287. movd %r12d, %xmm15 # len(A) in %xmm15
shl $3, %arg4 # len(C) in bits (*8)
  1289. MOVQ_R64_XMM %arg4, %xmm1
  1290. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1291. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
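# Illustrative example (not from the original source): with aadLen = 12
# bytes, len(A) = 96 bits = 0x60; with 64 bytes of ciphertext, len(C) =
# 512 bits = 0x200, so the block folded into the GHASH state below is
# 0x0000000000000060 || 0x0000000000000200.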
  1292. pxor %xmm15, %xmm8
  1293. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1294. # final GHASH computation
  1295. movdqa SHUF_MASK(%rip), %xmm10
  1296. PSHUFB_XMM %xmm10, %xmm8
  1297. mov %arg5, %rax # %rax = *Y0
  1298. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1299. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  1300. pxor %xmm8, %xmm0
  1301. _return_T_decrypt:
  1302. mov arg9, %r10 # %r10 = authTag
  1303. mov arg10, %r11 # %r11 = auth_tag_len
  1304. cmp $16, %r11
  1305. je _T_16_decrypt
  1306. cmp $12, %r11
  1307. je _T_12_decrypt
  1308. _T_8_decrypt:
  1309. MOVQ_R64_XMM %xmm0, %rax
  1310. mov %rax, (%r10)
  1311. jmp _return_T_done_decrypt
  1312. _T_12_decrypt:
  1313. MOVQ_R64_XMM %xmm0, %rax
  1314. mov %rax, (%r10)
  1315. psrldq $8, %xmm0
  1316. movd %xmm0, %eax
  1317. mov %eax, 8(%r10)
  1318. jmp _return_T_done_decrypt
  1319. _T_16_decrypt:
  1320. movdqu %xmm0, (%r10)
  1321. _return_T_done_decrypt:
  1322. mov %r14, %rsp
  1323. pop %r14
  1324. pop %r13
  1325. pop %r12
  1326. ret
  1327. ENDPROC(aesni_gcm_dec)
  1328. /*****************************************************************************
  1329. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1330. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1331. * const u8 *in, // Plaintext input
  1332. * u64 plaintext_len, // Length of data in bytes for encryption.
  1333. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1334. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1335. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1336. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1337. * const u8 *aad, // Additional Authentication Data (AAD)
  1338. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1339. * u8 *auth_tag, // Authenticated Tag output.
  1340. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1341. * // 12 or 8.
  1342. *
  1343. * Assumptions:
  1344. *
  1345. * keys:
* Keys are pre-expanded and aligned to 16 bytes. We are using the
* first set of 11 keys in the data structure void *aes_ctx.
  1348. *
  1349. *
  1350. * iv:
  1351. * 0 1 2 3
  1352. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1353. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1354. * | Salt (From the SA) |
  1355. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1356. * | Initialization Vector |
  1357. * | (This is the sequence number from IPSec header) |
  1358. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1359. * | 0x1 |
  1360. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1361. *
  1362. *
  1363. *
  1364. * AAD:
  1365. * AAD padded to 128 bits with 0
  1366. * for example, assume AAD is a u32 vector
  1367. *
  1368. * if AAD is 8 bytes:
  1369. * AAD[3] = {A0, A1};
  1370. * padded AAD in xmm register = {A1 A0 0 0}
  1371. *
  1372. * 0 1 2 3
  1373. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1374. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1375. * | SPI (A1) |
  1376. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1377. * | 32-bit Sequence Number (A0) |
  1378. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1379. * | 0x0 |
  1380. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1381. *
  1382. * AAD Format with 32-bit Sequence Number
  1383. *
  1384. * if AAD is 12 bytes:
  1385. * AAD[3] = {A0, A1, A2};
  1386. * padded AAD in xmm register = {A2 A1 A0 0}
  1387. *
  1388. * 0 1 2 3
  1389. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1390. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1391. * | SPI (A2) |
  1392. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1393. * | 64-bit Extended Sequence Number {A1,A0} |
  1394. * | |
  1395. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1396. * | 0x0 |
  1397. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1398. *
  1399. * AAD Format with 64-bit Extended Sequence Number
  1400. *
  1401. * aadLen:
* From the definition of the spec, aadLen can only be 8 or 12 bytes.
* The code also supports 16 bytes, but it will fail for any other size.
*
* TLen:
* From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
* The code will fail for any other size.
  1408. *
  1409. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1410. ***************************************************************************/
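/*
* Illustrative sketch (an assumption about the caller, not from this file):
* the 16-byte pre-counter block passed in *iv for RFC4106 ESP can be built as
*
*     u8 j0[16];
*     memcpy(j0, salt, 4);           // 4-byte salt from the SA
*     memcpy(j0 + 4, esp_iv, 8);     // 8-byte IV from the ESP payload
*     j0[12] = 0; j0[13] = 0;
*     j0[14] = 0; j0[15] = 1;        // trailing 32-bit 0x00000001
*
* where "salt" and "esp_iv" are placeholder names for the fields described
* in the iv diagram above.
*/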
  1411. ENTRY(aesni_gcm_enc)
  1412. push %r12
  1413. push %r13
  1414. push %r14
  1415. mov %rsp, %r14
  1416. #
  1417. # states of %xmm registers %xmm6:%xmm15 not saved
  1418. # all %xmm registers are clobbered
  1419. #
  1420. sub $VARIABLE_OFFSET, %rsp
  1421. and $~63, %rsp
  1422. mov %arg6, %r12
  1423. movdqu (%r12), %xmm13
  1424. movdqa SHUF_MASK(%rip), %xmm2
  1425. PSHUFB_XMM %xmm2, %xmm13
  1426. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  1427. movdqa %xmm13, %xmm2
  1428. psllq $1, %xmm13
  1429. psrlq $63, %xmm2
  1430. movdqa %xmm2, %xmm1
  1431. pslldq $8, %xmm2
  1432. psrldq $8, %xmm1
  1433. por %xmm2, %xmm13
  1434. # reduce HashKey<<1
  1435. pshufd $0x24, %xmm1, %xmm2
  1436. pcmpeqd TWOONE(%rip), %xmm2
  1437. pand POLY(%rip), %xmm2
pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1441. and $-16, %r13
  1442. mov %r13, %r12
  1443. # Encrypt first few blocks
  1444. and $(3<<4), %r12
  1445. jz _initial_num_blocks_is_0_encrypt
  1446. cmp $(2<<4), %r12
  1447. jb _initial_num_blocks_is_1_encrypt
  1448. je _initial_num_blocks_is_2_encrypt
  1449. _initial_num_blocks_is_3_encrypt:
  1450. INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1451. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
  1452. sub $48, %r13
  1453. jmp _initial_blocks_encrypted
  1454. _initial_num_blocks_is_2_encrypt:
  1455. INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1456. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
  1457. sub $32, %r13
  1458. jmp _initial_blocks_encrypted
  1459. _initial_num_blocks_is_1_encrypt:
  1460. INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1461. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
  1462. sub $16, %r13
  1463. jmp _initial_blocks_encrypted
  1464. _initial_num_blocks_is_0_encrypt:
  1465. INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1466. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
  1467. _initial_blocks_encrypted:
  1468. # Main loop - Encrypt remaining blocks
  1469. cmp $0, %r13
  1470. je _zero_cipher_left_encrypt
  1471. sub $64, %r13
  1472. je _four_cipher_left_encrypt
  1473. _encrypt_by_4_encrypt:
  1474. GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1475. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
  1476. add $64, %r11
  1477. sub $64, %r13
  1478. jne _encrypt_by_4_encrypt
  1479. _four_cipher_left_encrypt:
  1480. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1481. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1482. _zero_cipher_left_encrypt:
  1483. mov %arg4, %r13
  1484. and $15, %r13 # %r13 = arg4 (mod 16)
  1485. je _multiple_of_16_bytes_encrypt
  1486. # Handle the last <16 Byte block separately
  1487. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  1488. movdqa SHUF_MASK(%rip), %xmm10
  1489. PSHUFB_XMM %xmm10, %xmm0
  1490. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  1491. sub $16, %r11
  1492. add %r13, %r11
movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1494. lea SHIFT_MASK+16(%rip), %r12
  1495. sub %r13, %r12
  1496. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  1497. # (%r13 is the number of bytes in plaintext mod 16)
  1498. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
  1500. pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
  1501. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1502. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  1503. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  1504. movdqa SHUF_MASK(%rip), %xmm10
  1505. PSHUFB_XMM %xmm10,%xmm0
  1506. pxor %xmm0, %xmm8
  1507. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1508. # GHASH computation for the last <16 byte block
  1509. sub %r13, %r11
  1510. add $16, %r11
  1511. movdqa SHUF_MASK(%rip), %xmm10
  1512. PSHUFB_XMM %xmm10, %xmm0
  1513. # shuffle xmm0 back to output as ciphertext
  1514. # Output %r13 bytes
  1515. MOVQ_R64_XMM %xmm0, %rax
  1516. cmp $8, %r13
  1517. jle _less_than_8_bytes_left_encrypt
  1518. mov %rax, (%arg2 , %r11, 1)
  1519. add $8, %r11
  1520. psrldq $8, %xmm0
  1521. MOVQ_R64_XMM %xmm0, %rax
  1522. sub $8, %r13
  1523. _less_than_8_bytes_left_encrypt:
  1524. mov %al, (%arg2, %r11, 1)
  1525. add $1, %r11
  1526. shr $8, %rax
  1527. sub $1, %r13
  1528. jne _less_than_8_bytes_left_encrypt
  1529. _multiple_of_16_bytes_encrypt:
mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1531. shl $3, %r12
  1532. movd %r12d, %xmm15 # len(A) in %xmm15
shl $3, %arg4 # len(C) in bits (*8)
  1534. MOVQ_R64_XMM %arg4, %xmm1
  1535. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1536. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1537. pxor %xmm15, %xmm8
  1538. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1539. # final GHASH computation
  1540. movdqa SHUF_MASK(%rip), %xmm10
  1541. PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
  1542. mov %arg5, %rax # %rax = *Y0
  1543. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1544. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
  1545. pxor %xmm8, %xmm0
  1546. _return_T_encrypt:
  1547. mov arg9, %r10 # %r10 = authTag
  1548. mov arg10, %r11 # %r11 = auth_tag_len
  1549. cmp $16, %r11
  1550. je _T_16_encrypt
  1551. cmp $12, %r11
  1552. je _T_12_encrypt
  1553. _T_8_encrypt:
  1554. MOVQ_R64_XMM %xmm0, %rax
  1555. mov %rax, (%r10)
  1556. jmp _return_T_done_encrypt
  1557. _T_12_encrypt:
  1558. MOVQ_R64_XMM %xmm0, %rax
  1559. mov %rax, (%r10)
  1560. psrldq $8, %xmm0
  1561. movd %xmm0, %eax
  1562. mov %eax, 8(%r10)
  1563. jmp _return_T_done_encrypt
  1564. _T_16_encrypt:
  1565. movdqu %xmm0, (%r10)
  1566. _return_T_done_encrypt:
  1567. mov %r14, %rsp
  1568. pop %r14
  1569. pop %r13
  1570. pop %r12
  1571. ret
  1572. ENDPROC(aesni_gcm_enc)
  1573. #endif
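/*
* The _key_expansion_* helpers below share an internal ABI: %xmm0 (plus
* %xmm2 for 192/256-bit keys) holds the most recently generated round
* key(s), %xmm1 holds the AESKEYGENASSIST result for the current round
* constant, %xmm4 is assumed to be zero (the caller clears it), and TKEYP
* points at the next round-key slot and is advanced past whatever is
* written.
*/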
  1574. .align 4
  1575. _key_expansion_128:
  1576. _key_expansion_256a:
  1577. pshufd $0b11111111, %xmm1, %xmm1
  1578. shufps $0b00010000, %xmm0, %xmm4
  1579. pxor %xmm4, %xmm0
  1580. shufps $0b10001100, %xmm0, %xmm4
  1581. pxor %xmm4, %xmm0
  1582. pxor %xmm1, %xmm0
  1583. movaps %xmm0, (TKEYP)
  1584. add $0x10, TKEYP
  1585. ret
  1586. ENDPROC(_key_expansion_128)
  1587. ENDPROC(_key_expansion_256a)
  1588. .align 4
  1589. _key_expansion_192a:
  1590. pshufd $0b01010101, %xmm1, %xmm1
  1591. shufps $0b00010000, %xmm0, %xmm4
  1592. pxor %xmm4, %xmm0
  1593. shufps $0b10001100, %xmm0, %xmm4
  1594. pxor %xmm4, %xmm0
  1595. pxor %xmm1, %xmm0
  1596. movaps %xmm2, %xmm5
  1597. movaps %xmm2, %xmm6
  1598. pslldq $4, %xmm5
  1599. pshufd $0b11111111, %xmm0, %xmm3
  1600. pxor %xmm3, %xmm2
  1601. pxor %xmm5, %xmm2
  1602. movaps %xmm0, %xmm1
  1603. shufps $0b01000100, %xmm0, %xmm6
  1604. movaps %xmm6, (TKEYP)
  1605. shufps $0b01001110, %xmm2, %xmm1
  1606. movaps %xmm1, 0x10(TKEYP)
  1607. add $0x20, TKEYP
  1608. ret
  1609. ENDPROC(_key_expansion_192a)
  1610. .align 4
  1611. _key_expansion_192b:
  1612. pshufd $0b01010101, %xmm1, %xmm1
  1613. shufps $0b00010000, %xmm0, %xmm4
  1614. pxor %xmm4, %xmm0
  1615. shufps $0b10001100, %xmm0, %xmm4
  1616. pxor %xmm4, %xmm0
  1617. pxor %xmm1, %xmm0
  1618. movaps %xmm2, %xmm5
  1619. pslldq $4, %xmm5
  1620. pshufd $0b11111111, %xmm0, %xmm3
  1621. pxor %xmm3, %xmm2
  1622. pxor %xmm5, %xmm2
  1623. movaps %xmm0, (TKEYP)
  1624. add $0x10, TKEYP
  1625. ret
  1626. ENDPROC(_key_expansion_192b)
  1627. .align 4
  1628. _key_expansion_256b:
  1629. pshufd $0b10101010, %xmm1, %xmm1
  1630. shufps $0b00010000, %xmm2, %xmm4
  1631. pxor %xmm4, %xmm2
  1632. shufps $0b10001100, %xmm2, %xmm4
  1633. pxor %xmm4, %xmm2
  1634. pxor %xmm1, %xmm2
  1635. movaps %xmm2, (TKEYP)
  1636. add $0x10, TKEYP
  1637. ret
  1638. ENDPROC(_key_expansion_256b)
  1639. /*
  1640. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1641. * unsigned int key_len)
  1642. */
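/*
* Minimal usage sketch (illustrative; assumes FPU state is already usable,
* e.g. between kernel_fpu_begin()/kernel_fpu_end()):
*
*     struct crypto_aes_ctx ctx;         // must be 16-byte aligned (movaps)
*     aesni_set_key(&ctx, in_key, 16);   // key_len = 16, 24 or 32 selects
*                                        // AES-128/192/256
*
* Both the encryption and the decryption round keys are expanded into ctx,
* and the return value is 0.
*/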
  1643. ENTRY(aesni_set_key)
  1644. FRAME_BEGIN
  1645. #ifndef __x86_64__
  1646. pushl KEYP
  1647. movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
  1648. movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
  1649. movl (FRAME_OFFSET+16)(%esp), %edx # key_len
  1650. #endif
  1651. movups (UKEYP), %xmm0 # user key (first 16 bytes)
  1652. movaps %xmm0, (KEYP)
  1653. lea 0x10(KEYP), TKEYP # key addr
  1654. movl %edx, 480(KEYP)
  1655. pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
  1656. cmp $24, %dl
  1657. jb .Lenc_key128
  1658. je .Lenc_key192
  1659. movups 0x10(UKEYP), %xmm2 # other user key
  1660. movaps %xmm2, (TKEYP)
  1661. add $0x10, TKEYP
  1662. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1663. call _key_expansion_256a
  1664. AESKEYGENASSIST 0x1 %xmm0 %xmm1
  1665. call _key_expansion_256b
  1666. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1667. call _key_expansion_256a
  1668. AESKEYGENASSIST 0x2 %xmm0 %xmm1
  1669. call _key_expansion_256b
  1670. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1671. call _key_expansion_256a
  1672. AESKEYGENASSIST 0x4 %xmm0 %xmm1
  1673. call _key_expansion_256b
  1674. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1675. call _key_expansion_256a
  1676. AESKEYGENASSIST 0x8 %xmm0 %xmm1
  1677. call _key_expansion_256b
  1678. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1679. call _key_expansion_256a
  1680. AESKEYGENASSIST 0x10 %xmm0 %xmm1
  1681. call _key_expansion_256b
  1682. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1683. call _key_expansion_256a
  1684. AESKEYGENASSIST 0x20 %xmm0 %xmm1
  1685. call _key_expansion_256b
  1686. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1687. call _key_expansion_256a
  1688. jmp .Ldec_key
  1689. .Lenc_key192:
  1690. movq 0x10(UKEYP), %xmm2 # other user key
  1691. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1692. call _key_expansion_192a
  1693. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1694. call _key_expansion_192b
  1695. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1696. call _key_expansion_192a
  1697. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1698. call _key_expansion_192b
  1699. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1700. call _key_expansion_192a
  1701. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1702. call _key_expansion_192b
  1703. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1704. call _key_expansion_192a
  1705. AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
  1706. call _key_expansion_192b
  1707. jmp .Ldec_key
  1708. .Lenc_key128:
  1709. AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
  1710. call _key_expansion_128
  1711. AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
  1712. call _key_expansion_128
  1713. AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
  1714. call _key_expansion_128
  1715. AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
  1716. call _key_expansion_128
  1717. AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
  1718. call _key_expansion_128
  1719. AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
  1720. call _key_expansion_128
  1721. AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
  1722. call _key_expansion_128
  1723. AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
  1724. call _key_expansion_128
  1725. AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
  1726. call _key_expansion_128
  1727. AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
  1728. call _key_expansion_128
  1729. .Ldec_key:
  1730. sub $0x10, TKEYP
  1731. movaps (KEYP), %xmm0
  1732. movaps (TKEYP), %xmm1
  1733. movaps %xmm0, 240(TKEYP)
  1734. movaps %xmm1, 240(KEYP)
  1735. add $0x10, KEYP
  1736. lea 240-16(TKEYP), UKEYP
  1737. .align 4
  1738. .Ldec_key_loop:
  1739. movaps (KEYP), %xmm0
  1740. AESIMC %xmm0 %xmm1
  1741. movaps %xmm1, (UKEYP)
  1742. add $0x10, KEYP
  1743. sub $0x10, UKEYP
  1744. cmp TKEYP, KEYP
  1745. jb .Ldec_key_loop
  1746. xor AREG, AREG
  1747. #ifndef __x86_64__
  1748. popl KEYP
  1749. #endif
  1750. FRAME_END
  1751. ret
  1752. ENDPROC(aesni_set_key)
  1753. /*
  1754. * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1755. */
  1756. ENTRY(aesni_enc)
  1757. FRAME_BEGIN
  1758. #ifndef __x86_64__
  1759. pushl KEYP
  1760. pushl KLEN
  1761. movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
  1762. movl (FRAME_OFFSET+16)(%esp), OUTP # dst
  1763. movl (FRAME_OFFSET+20)(%esp), INP # src
  1764. #endif
  1765. movl 480(KEYP), KLEN # key length
  1766. movups (INP), STATE # input
  1767. call _aesni_enc1
  1768. movups STATE, (OUTP) # output
  1769. #ifndef __x86_64__
  1770. popl KLEN
  1771. popl KEYP
  1772. #endif
  1773. FRAME_END
  1774. ret
  1775. ENDPROC(aesni_enc)
  1776. /*
  1777. * _aesni_enc1: internal ABI
  1778. * input:
  1779. * KEYP: key struct pointer
* KLEN: key length
  1781. * STATE: initial state (input)
  1782. * output:
* STATE: final state (output)
  1784. * changed:
  1785. * KEY
  1786. * TKEYP (T1)
  1787. */
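/*
* KLEN is the key length in bytes (16/24/32) as stored at 480(KEYP); it
* selects 10, 12 or 14 rounds. TKEYP is pre-advanced so the 256- and
* 192-bit paths fall through into the common tail of nine AESENC rounds
* plus the final AESENCLAST (the ten rounds of AES-128).
*/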
  1788. .align 4
  1789. _aesni_enc1:
  1790. movaps (KEYP), KEY # key
  1791. mov KEYP, TKEYP
  1792. pxor KEY, STATE # round 0
  1793. add $0x30, TKEYP
  1794. cmp $24, KLEN
  1795. jb .Lenc128
  1796. lea 0x20(TKEYP), TKEYP
  1797. je .Lenc192
  1798. add $0x20, TKEYP
  1799. movaps -0x60(TKEYP), KEY
  1800. AESENC KEY STATE
  1801. movaps -0x50(TKEYP), KEY
  1802. AESENC KEY STATE
  1803. .align 4
  1804. .Lenc192:
  1805. movaps -0x40(TKEYP), KEY
  1806. AESENC KEY STATE
  1807. movaps -0x30(TKEYP), KEY
  1808. AESENC KEY STATE
  1809. .align 4
  1810. .Lenc128:
  1811. movaps -0x20(TKEYP), KEY
  1812. AESENC KEY STATE
  1813. movaps -0x10(TKEYP), KEY
  1814. AESENC KEY STATE
  1815. movaps (TKEYP), KEY
  1816. AESENC KEY STATE
  1817. movaps 0x10(TKEYP), KEY
  1818. AESENC KEY STATE
  1819. movaps 0x20(TKEYP), KEY
  1820. AESENC KEY STATE
  1821. movaps 0x30(TKEYP), KEY
  1822. AESENC KEY STATE
  1823. movaps 0x40(TKEYP), KEY
  1824. AESENC KEY STATE
  1825. movaps 0x50(TKEYP), KEY
  1826. AESENC KEY STATE
  1827. movaps 0x60(TKEYP), KEY
  1828. AESENC KEY STATE
  1829. movaps 0x70(TKEYP), KEY
  1830. AESENCLAST KEY STATE
  1831. ret
  1832. ENDPROC(_aesni_enc1)
  1833. /*
  1834. * _aesni_enc4: internal ABI
  1835. * input:
  1836. * KEYP: key struct pointer
* KLEN: key length
  1838. * STATE1: initial state (input)
  1839. * STATE2
  1840. * STATE3
  1841. * STATE4
  1842. * output:
* STATE1: final state (output)
  1844. * STATE2
  1845. * STATE3
  1846. * STATE4
  1847. * changed:
  1848. * KEY
  1849. * TKEYP (T1)
  1850. */
  1851. .align 4
  1852. _aesni_enc4:
  1853. movaps (KEYP), KEY # key
  1854. mov KEYP, TKEYP
  1855. pxor KEY, STATE1 # round 0
  1856. pxor KEY, STATE2
  1857. pxor KEY, STATE3
  1858. pxor KEY, STATE4
  1859. add $0x30, TKEYP
  1860. cmp $24, KLEN
  1861. jb .L4enc128
  1862. lea 0x20(TKEYP), TKEYP
  1863. je .L4enc192
  1864. add $0x20, TKEYP
  1865. movaps -0x60(TKEYP), KEY
  1866. AESENC KEY STATE1
  1867. AESENC KEY STATE2
  1868. AESENC KEY STATE3
  1869. AESENC KEY STATE4
  1870. movaps -0x50(TKEYP), KEY
  1871. AESENC KEY STATE1
  1872. AESENC KEY STATE2
  1873. AESENC KEY STATE3
  1874. AESENC KEY STATE4
  1875. #.align 4
  1876. .L4enc192:
  1877. movaps -0x40(TKEYP), KEY
  1878. AESENC KEY STATE1
  1879. AESENC KEY STATE2
  1880. AESENC KEY STATE3
  1881. AESENC KEY STATE4
  1882. movaps -0x30(TKEYP), KEY
  1883. AESENC KEY STATE1
  1884. AESENC KEY STATE2
  1885. AESENC KEY STATE3
  1886. AESENC KEY STATE4
  1887. #.align 4
  1888. .L4enc128:
  1889. movaps -0x20(TKEYP), KEY
  1890. AESENC KEY STATE1
  1891. AESENC KEY STATE2
  1892. AESENC KEY STATE3
  1893. AESENC KEY STATE4
  1894. movaps -0x10(TKEYP), KEY
  1895. AESENC KEY STATE1
  1896. AESENC KEY STATE2
  1897. AESENC KEY STATE3
  1898. AESENC KEY STATE4
  1899. movaps (TKEYP), KEY
  1900. AESENC KEY STATE1
  1901. AESENC KEY STATE2
  1902. AESENC KEY STATE3
  1903. AESENC KEY STATE4
  1904. movaps 0x10(TKEYP), KEY
  1905. AESENC KEY STATE1
  1906. AESENC KEY STATE2
  1907. AESENC KEY STATE3
  1908. AESENC KEY STATE4
  1909. movaps 0x20(TKEYP), KEY
  1910. AESENC KEY STATE1
  1911. AESENC KEY STATE2
  1912. AESENC KEY STATE3
  1913. AESENC KEY STATE4
  1914. movaps 0x30(TKEYP), KEY
  1915. AESENC KEY STATE1
  1916. AESENC KEY STATE2
  1917. AESENC KEY STATE3
  1918. AESENC KEY STATE4
  1919. movaps 0x40(TKEYP), KEY
  1920. AESENC KEY STATE1
  1921. AESENC KEY STATE2
  1922. AESENC KEY STATE3
  1923. AESENC KEY STATE4
  1924. movaps 0x50(TKEYP), KEY
  1925. AESENC KEY STATE1
  1926. AESENC KEY STATE2
  1927. AESENC KEY STATE3
  1928. AESENC KEY STATE4
  1929. movaps 0x60(TKEYP), KEY
  1930. AESENC KEY STATE1
  1931. AESENC KEY STATE2
  1932. AESENC KEY STATE3
  1933. AESENC KEY STATE4
  1934. movaps 0x70(TKEYP), KEY
  1935. AESENCLAST KEY STATE1 # last round
  1936. AESENCLAST KEY STATE2
  1937. AESENCLAST KEY STATE3
  1938. AESENCLAST KEY STATE4
  1939. ret
  1940. ENDPROC(_aesni_enc4)
  1941. /*
  1942. * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1943. */
  1944. ENTRY(aesni_dec)
  1945. FRAME_BEGIN
  1946. #ifndef __x86_64__
  1947. pushl KEYP
  1948. pushl KLEN
  1949. movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
  1950. movl (FRAME_OFFSET+16)(%esp), OUTP # dst
  1951. movl (FRAME_OFFSET+20)(%esp), INP # src
  1952. #endif
  1953. mov 480(KEYP), KLEN # key length
  1954. add $240, KEYP
  1955. movups (INP), STATE # input
  1956. call _aesni_dec1
  1957. movups STATE, (OUTP) #output
  1958. #ifndef __x86_64__
  1959. popl KLEN
  1960. popl KEYP
  1961. #endif
  1962. FRAME_END
  1963. ret
  1964. ENDPROC(aesni_dec)
  1965. /*
  1966. * _aesni_dec1: internal ABI
  1967. * input:
  1968. * KEYP: key struct pointer
  1969. * KLEN: key length
  1970. * STATE: initial state (input)
  1971. * output:
* STATE: final state (output)
  1973. * changed:
  1974. * KEY
  1975. * TKEYP (T1)
  1976. */
  1977. .align 4
  1978. _aesni_dec1:
  1979. movaps (KEYP), KEY # key
  1980. mov KEYP, TKEYP
  1981. pxor KEY, STATE # round 0
  1982. add $0x30, TKEYP
  1983. cmp $24, KLEN
  1984. jb .Ldec128
  1985. lea 0x20(TKEYP), TKEYP
  1986. je .Ldec192
  1987. add $0x20, TKEYP
  1988. movaps -0x60(TKEYP), KEY
  1989. AESDEC KEY STATE
  1990. movaps -0x50(TKEYP), KEY
  1991. AESDEC KEY STATE
  1992. .align 4
  1993. .Ldec192:
  1994. movaps -0x40(TKEYP), KEY
  1995. AESDEC KEY STATE
  1996. movaps -0x30(TKEYP), KEY
  1997. AESDEC KEY STATE
  1998. .align 4
  1999. .Ldec128:
  2000. movaps -0x20(TKEYP), KEY
  2001. AESDEC KEY STATE
  2002. movaps -0x10(TKEYP), KEY
  2003. AESDEC KEY STATE
  2004. movaps (TKEYP), KEY
  2005. AESDEC KEY STATE
  2006. movaps 0x10(TKEYP), KEY
  2007. AESDEC KEY STATE
  2008. movaps 0x20(TKEYP), KEY
  2009. AESDEC KEY STATE
  2010. movaps 0x30(TKEYP), KEY
  2011. AESDEC KEY STATE
  2012. movaps 0x40(TKEYP), KEY
  2013. AESDEC KEY STATE
  2014. movaps 0x50(TKEYP), KEY
  2015. AESDEC KEY STATE
  2016. movaps 0x60(TKEYP), KEY
  2017. AESDEC KEY STATE
  2018. movaps 0x70(TKEYP), KEY
  2019. AESDECLAST KEY STATE
  2020. ret
  2021. ENDPROC(_aesni_dec1)
  2022. /*
  2023. * _aesni_dec4: internal ABI
  2024. * input:
  2025. * KEYP: key struct pointer
  2026. * KLEN: key length
  2027. * STATE1: initial state (input)
  2028. * STATE2
  2029. * STATE3
  2030. * STATE4
  2031. * output:
* STATE1: final state (output)
  2033. * STATE2
  2034. * STATE3
  2035. * STATE4
  2036. * changed:
  2037. * KEY
  2038. * TKEYP (T1)
  2039. */
  2040. .align 4
  2041. _aesni_dec4:
  2042. movaps (KEYP), KEY # key
  2043. mov KEYP, TKEYP
  2044. pxor KEY, STATE1 # round 0
  2045. pxor KEY, STATE2
  2046. pxor KEY, STATE3
  2047. pxor KEY, STATE4
  2048. add $0x30, TKEYP
  2049. cmp $24, KLEN
  2050. jb .L4dec128
  2051. lea 0x20(TKEYP), TKEYP
  2052. je .L4dec192
  2053. add $0x20, TKEYP
  2054. movaps -0x60(TKEYP), KEY
  2055. AESDEC KEY STATE1
  2056. AESDEC KEY STATE2
  2057. AESDEC KEY STATE3
  2058. AESDEC KEY STATE4
  2059. movaps -0x50(TKEYP), KEY
  2060. AESDEC KEY STATE1
  2061. AESDEC KEY STATE2
  2062. AESDEC KEY STATE3
  2063. AESDEC KEY STATE4
  2064. .align 4
  2065. .L4dec192:
  2066. movaps -0x40(TKEYP), KEY
  2067. AESDEC KEY STATE1
  2068. AESDEC KEY STATE2
  2069. AESDEC KEY STATE3
  2070. AESDEC KEY STATE4
  2071. movaps -0x30(TKEYP), KEY
  2072. AESDEC KEY STATE1
  2073. AESDEC KEY STATE2
  2074. AESDEC KEY STATE3
  2075. AESDEC KEY STATE4
  2076. .align 4
  2077. .L4dec128:
  2078. movaps -0x20(TKEYP), KEY
  2079. AESDEC KEY STATE1
  2080. AESDEC KEY STATE2
  2081. AESDEC KEY STATE3
  2082. AESDEC KEY STATE4
  2083. movaps -0x10(TKEYP), KEY
  2084. AESDEC KEY STATE1
  2085. AESDEC KEY STATE2
  2086. AESDEC KEY STATE3
  2087. AESDEC KEY STATE4
  2088. movaps (TKEYP), KEY
  2089. AESDEC KEY STATE1
  2090. AESDEC KEY STATE2
  2091. AESDEC KEY STATE3
  2092. AESDEC KEY STATE4
  2093. movaps 0x10(TKEYP), KEY
  2094. AESDEC KEY STATE1
  2095. AESDEC KEY STATE2
  2096. AESDEC KEY STATE3
  2097. AESDEC KEY STATE4
  2098. movaps 0x20(TKEYP), KEY
  2099. AESDEC KEY STATE1
  2100. AESDEC KEY STATE2
  2101. AESDEC KEY STATE3
  2102. AESDEC KEY STATE4
  2103. movaps 0x30(TKEYP), KEY
  2104. AESDEC KEY STATE1
  2105. AESDEC KEY STATE2
  2106. AESDEC KEY STATE3
  2107. AESDEC KEY STATE4
  2108. movaps 0x40(TKEYP), KEY
  2109. AESDEC KEY STATE1
  2110. AESDEC KEY STATE2
  2111. AESDEC KEY STATE3
  2112. AESDEC KEY STATE4
  2113. movaps 0x50(TKEYP), KEY
  2114. AESDEC KEY STATE1
  2115. AESDEC KEY STATE2
  2116. AESDEC KEY STATE3
  2117. AESDEC KEY STATE4
  2118. movaps 0x60(TKEYP), KEY
  2119. AESDEC KEY STATE1
  2120. AESDEC KEY STATE2
  2121. AESDEC KEY STATE3
  2122. AESDEC KEY STATE4
  2123. movaps 0x70(TKEYP), KEY
  2124. AESDECLAST KEY STATE1 # last round
  2125. AESDECLAST KEY STATE2
  2126. AESDECLAST KEY STATE3
  2127. AESDECLAST KEY STATE4
  2128. ret
  2129. ENDPROC(_aesni_dec4)
  2130. /*
* void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2132. * size_t len)
  2133. */
  2134. ENTRY(aesni_ecb_enc)
  2135. FRAME_BEGIN
  2136. #ifndef __x86_64__
  2137. pushl LEN
  2138. pushl KEYP
  2139. pushl KLEN
  2140. movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
  2141. movl (FRAME_OFFSET+20)(%esp), OUTP # dst
  2142. movl (FRAME_OFFSET+24)(%esp), INP # src
  2143. movl (FRAME_OFFSET+28)(%esp), LEN # len
  2144. #endif
  2145. test LEN, LEN # check length
  2146. jz .Lecb_enc_ret
  2147. mov 480(KEYP), KLEN
  2148. cmp $16, LEN
  2149. jb .Lecb_enc_ret
  2150. cmp $64, LEN
  2151. jb .Lecb_enc_loop1
  2152. .align 4
  2153. .Lecb_enc_loop4:
  2154. movups (INP), STATE1
  2155. movups 0x10(INP), STATE2
  2156. movups 0x20(INP), STATE3
  2157. movups 0x30(INP), STATE4
  2158. call _aesni_enc4
  2159. movups STATE1, (OUTP)
  2160. movups STATE2, 0x10(OUTP)
  2161. movups STATE3, 0x20(OUTP)
  2162. movups STATE4, 0x30(OUTP)
  2163. sub $64, LEN
  2164. add $64, INP
  2165. add $64, OUTP
  2166. cmp $64, LEN
  2167. jge .Lecb_enc_loop4
  2168. cmp $16, LEN
  2169. jb .Lecb_enc_ret
  2170. .align 4
  2171. .Lecb_enc_loop1:
  2172. movups (INP), STATE1
  2173. call _aesni_enc1
  2174. movups STATE1, (OUTP)
  2175. sub $16, LEN
  2176. add $16, INP
  2177. add $16, OUTP
  2178. cmp $16, LEN
  2179. jge .Lecb_enc_loop1
  2180. .Lecb_enc_ret:
  2181. #ifndef __x86_64__
  2182. popl KLEN
  2183. popl KEYP
  2184. popl LEN
  2185. #endif
  2186. FRAME_END
  2187. ret
  2188. ENDPROC(aesni_ecb_enc)
  2189. /*
* void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2191. * size_t len);
  2192. */
  2193. ENTRY(aesni_ecb_dec)
  2194. FRAME_BEGIN
  2195. #ifndef __x86_64__
  2196. pushl LEN
  2197. pushl KEYP
  2198. pushl KLEN
  2199. movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
  2200. movl (FRAME_OFFSET+20)(%esp), OUTP # dst
  2201. movl (FRAME_OFFSET+24)(%esp), INP # src
  2202. movl (FRAME_OFFSET+28)(%esp), LEN # len
  2203. #endif
  2204. test LEN, LEN
  2205. jz .Lecb_dec_ret
  2206. mov 480(KEYP), KLEN
  2207. add $240, KEYP
  2208. cmp $16, LEN
  2209. jb .Lecb_dec_ret
  2210. cmp $64, LEN
  2211. jb .Lecb_dec_loop1
  2212. .align 4
  2213. .Lecb_dec_loop4:
  2214. movups (INP), STATE1
  2215. movups 0x10(INP), STATE2
  2216. movups 0x20(INP), STATE3
  2217. movups 0x30(INP), STATE4
  2218. call _aesni_dec4
  2219. movups STATE1, (OUTP)
  2220. movups STATE2, 0x10(OUTP)
  2221. movups STATE3, 0x20(OUTP)
  2222. movups STATE4, 0x30(OUTP)
  2223. sub $64, LEN
  2224. add $64, INP
  2225. add $64, OUTP
  2226. cmp $64, LEN
  2227. jge .Lecb_dec_loop4
  2228. cmp $16, LEN
  2229. jb .Lecb_dec_ret
  2230. .align 4
  2231. .Lecb_dec_loop1:
  2232. movups (INP), STATE1
  2233. call _aesni_dec1
  2234. movups STATE1, (OUTP)
  2235. sub $16, LEN
  2236. add $16, INP
  2237. add $16, OUTP
  2238. cmp $16, LEN
  2239. jge .Lecb_dec_loop1
  2240. .Lecb_dec_ret:
  2241. #ifndef __x86_64__
  2242. popl KLEN
  2243. popl KEYP
  2244. popl LEN
  2245. #endif
  2246. FRAME_END
  2247. ret
  2248. ENDPROC(aesni_ecb_dec)
  2249. /*
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2251. * size_t len, u8 *iv)
  2252. */
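/*
* CBC encryption is inherently serial: each plaintext block is XORed with
* the previous ciphertext block (initially *iv) before a single-block
* encryption, and the last ciphertext block is written back to *iv so the
* chain can continue across calls.
*/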
  2253. ENTRY(aesni_cbc_enc)
  2254. FRAME_BEGIN
  2255. #ifndef __x86_64__
  2256. pushl IVP
  2257. pushl LEN
  2258. pushl KEYP
  2259. pushl KLEN
  2260. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2261. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2262. movl (FRAME_OFFSET+28)(%esp), INP # src
  2263. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2264. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2265. #endif
  2266. cmp $16, LEN
  2267. jb .Lcbc_enc_ret
  2268. mov 480(KEYP), KLEN
  2269. movups (IVP), STATE # load iv as initial state
  2270. .align 4
  2271. .Lcbc_enc_loop:
  2272. movups (INP), IN # load input
  2273. pxor IN, STATE
  2274. call _aesni_enc1
  2275. movups STATE, (OUTP) # store output
  2276. sub $16, LEN
  2277. add $16, INP
  2278. add $16, OUTP
  2279. cmp $16, LEN
  2280. jge .Lcbc_enc_loop
  2281. movups STATE, (IVP)
  2282. .Lcbc_enc_ret:
  2283. #ifndef __x86_64__
  2284. popl KLEN
  2285. popl KEYP
  2286. popl LEN
  2287. popl IVP
  2288. #endif
  2289. FRAME_END
  2290. ret
  2291. ENDPROC(aesni_cbc_enc)
  2292. /*
* void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2294. * size_t len, u8 *iv)
  2295. */
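/*
* CBC decryption (plaintext_i = D(K, ciphertext_i) XOR ciphertext_{i-1})
* has no serial dependency on previous plaintext, so four blocks are
* decrypted in parallel; the 32-bit path reloads the input blocks from
* memory afterwards since only eight XMM registers are available to keep
* IN1..IN4 live across _aesni_dec4.
*/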
  2296. ENTRY(aesni_cbc_dec)
  2297. FRAME_BEGIN
  2298. #ifndef __x86_64__
  2299. pushl IVP
  2300. pushl LEN
  2301. pushl KEYP
  2302. pushl KLEN
  2303. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2304. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2305. movl (FRAME_OFFSET+28)(%esp), INP # src
  2306. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2307. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2308. #endif
  2309. cmp $16, LEN
  2310. jb .Lcbc_dec_just_ret
  2311. mov 480(KEYP), KLEN
  2312. add $240, KEYP
  2313. movups (IVP), IV
  2314. cmp $64, LEN
  2315. jb .Lcbc_dec_loop1
  2316. .align 4
  2317. .Lcbc_dec_loop4:
  2318. movups (INP), IN1
  2319. movaps IN1, STATE1
  2320. movups 0x10(INP), IN2
  2321. movaps IN2, STATE2
  2322. #ifdef __x86_64__
  2323. movups 0x20(INP), IN3
  2324. movaps IN3, STATE3
  2325. movups 0x30(INP), IN4
  2326. movaps IN4, STATE4
  2327. #else
  2328. movups 0x20(INP), IN1
  2329. movaps IN1, STATE3
  2330. movups 0x30(INP), IN2
  2331. movaps IN2, STATE4
  2332. #endif
  2333. call _aesni_dec4
  2334. pxor IV, STATE1
  2335. #ifdef __x86_64__
  2336. pxor IN1, STATE2
  2337. pxor IN2, STATE3
  2338. pxor IN3, STATE4
  2339. movaps IN4, IV
  2340. #else
  2341. pxor IN1, STATE4
  2342. movaps IN2, IV
  2343. movups (INP), IN1
  2344. pxor IN1, STATE2
  2345. movups 0x10(INP), IN2
  2346. pxor IN2, STATE3
  2347. #endif
  2348. movups STATE1, (OUTP)
  2349. movups STATE2, 0x10(OUTP)
  2350. movups STATE3, 0x20(OUTP)
  2351. movups STATE4, 0x30(OUTP)
  2352. sub $64, LEN
  2353. add $64, INP
  2354. add $64, OUTP
  2355. cmp $64, LEN
  2356. jge .Lcbc_dec_loop4
  2357. cmp $16, LEN
  2358. jb .Lcbc_dec_ret
  2359. .align 4
  2360. .Lcbc_dec_loop1:
  2361. movups (INP), IN
  2362. movaps IN, STATE
  2363. call _aesni_dec1
  2364. pxor IV, STATE
  2365. movups STATE, (OUTP)
  2366. movaps IN, IV
  2367. sub $16, LEN
  2368. add $16, INP
  2369. add $16, OUTP
  2370. cmp $16, LEN
  2371. jge .Lcbc_dec_loop1
  2372. .Lcbc_dec_ret:
  2373. movups IV, (IVP)
  2374. .Lcbc_dec_just_ret:
  2375. #ifndef __x86_64__
  2376. popl KLEN
  2377. popl KEYP
  2378. popl LEN
  2379. popl IVP
  2380. #endif
  2381. FRAME_END
  2382. ret
  2383. ENDPROC(aesni_cbc_dec)
  2384. #ifdef __x86_64__
  2385. .pushsection .rodata
  2386. .align 16
  2387. .Lbswap_mask:
  2388. .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  2389. .popsection
  2390. /*
  2391. * _aesni_inc_init: internal ABI
  2392. * setup registers used by _aesni_inc
  2393. * input:
  2394. * IV
  2395. * output:
  2396. * CTR: == IV, in little endian
  2397. * TCTR_LOW: == lower qword of CTR
  2398. * INC: == 1, in little endian
  2399. * BSWAP_MASK == endian swapping mask
  2400. */
  2401. .align 4
  2402. _aesni_inc_init:
  2403. movaps .Lbswap_mask, BSWAP_MASK
  2404. movaps IV, CTR
  2405. PSHUFB_XMM BSWAP_MASK CTR
  2406. mov $1, TCTR_LOW
  2407. MOVQ_R64_XMM TCTR_LOW INC
  2408. MOVQ_R64_XMM CTR TCTR_LOW
  2409. ret
  2410. ENDPROC(_aesni_inc_init)
  2411. /*
  2412. * _aesni_inc: internal ABI
  2413. * Increase IV by 1, IV is in big endian
  2414. * input:
  2415. * IV
  2416. * CTR: == IV, in little endian
  2417. * TCTR_LOW: == lower qword of CTR
  2418. * INC: == 1, in little endian
  2419. * BSWAP_MASK == endian swapping mask
  2420. * output:
* IV: increased by 1
  2422. * changed:
  2423. * CTR: == output IV, in little endian
  2424. * TCTR_LOW: == lower qword of CTR
  2425. */
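/*
* Roughly, as an illustrative C sketch: the counter is kept little endian
* in CTR with its low qword shadowed in TCTR_LOW, and byte-swapped into IV
* on the way out:
*
*     ctr_lo += 1;
*     if (ctr_lo == 0)        // carry out of the low 64 bits
*             ctr_hi += 1;
*     iv = bswap128(ctr);     // back to big endian (bswap128 is a
*                             // placeholder name for the PSHUFB swap)
*/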
  2426. .align 4
  2427. _aesni_inc:
  2428. paddq INC, CTR
  2429. add $1, TCTR_LOW
  2430. jnc .Linc_low
  2431. pslldq $8, INC
  2432. paddq INC, CTR
  2433. psrldq $8, INC
  2434. .Linc_low:
  2435. movaps CTR, IV
  2436. PSHUFB_XMM BSWAP_MASK IV
  2437. ret
  2438. ENDPROC(_aesni_inc)
  2439. /*
* void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2441. * size_t len, u8 *iv)
  2442. */
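/*
* CTR mode: each 16-byte block of output is the input XORed with
* E(K, counter), where the counter starts from *iv (big endian) and is
* bumped per block by _aesni_inc; the incremented counter is stored back
* to *iv. Only whole 16-byte blocks are handled here; any trailing
* partial block is presumably left to the caller.
*/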
  2443. ENTRY(aesni_ctr_enc)
  2444. FRAME_BEGIN
  2445. cmp $16, LEN
  2446. jb .Lctr_enc_just_ret
  2447. mov 480(KEYP), KLEN
  2448. movups (IVP), IV
  2449. call _aesni_inc_init
  2450. cmp $64, LEN
  2451. jb .Lctr_enc_loop1
  2452. .align 4
  2453. .Lctr_enc_loop4:
  2454. movaps IV, STATE1
  2455. call _aesni_inc
  2456. movups (INP), IN1
  2457. movaps IV, STATE2
  2458. call _aesni_inc
  2459. movups 0x10(INP), IN2
  2460. movaps IV, STATE3
  2461. call _aesni_inc
  2462. movups 0x20(INP), IN3
  2463. movaps IV, STATE4
  2464. call _aesni_inc
  2465. movups 0x30(INP), IN4
  2466. call _aesni_enc4
  2467. pxor IN1, STATE1
  2468. movups STATE1, (OUTP)
  2469. pxor IN2, STATE2
  2470. movups STATE2, 0x10(OUTP)
  2471. pxor IN3, STATE3
  2472. movups STATE3, 0x20(OUTP)
  2473. pxor IN4, STATE4
  2474. movups STATE4, 0x30(OUTP)
  2475. sub $64, LEN
  2476. add $64, INP
  2477. add $64, OUTP
  2478. cmp $64, LEN
  2479. jge .Lctr_enc_loop4
  2480. cmp $16, LEN
  2481. jb .Lctr_enc_ret
  2482. .align 4
  2483. .Lctr_enc_loop1:
  2484. movaps IV, STATE
  2485. call _aesni_inc
  2486. movups (INP), IN
  2487. call _aesni_enc1
  2488. pxor IN, STATE
  2489. movups STATE, (OUTP)
  2490. sub $16, LEN
  2491. add $16, INP
  2492. add $16, OUTP
  2493. cmp $16, LEN
  2494. jge .Lctr_enc_loop1
  2495. .Lctr_enc_ret:
  2496. movups IV, (IVP)
  2497. .Lctr_enc_just_ret:
  2498. FRAME_END
  2499. ret
  2500. ENDPROC(aesni_ctr_enc)
  2501. /*
  2502. * _aesni_gf128mul_x_ble: internal ABI
  2503. * Multiply in GF(2^128) for XTS IVs
  2504. * input:
  2505. * IV: current IV
  2506. * GF128MUL_MASK == mask with 0x87 and 0x01
  2507. * output:
  2508. * IV: next IV
  2509. * changed:
  2510. * CTR: == temporary value
  2511. */
  2512. #define _aesni_gf128mul_x_ble() \
  2513. pshufd $0x13, IV, CTR; \
  2514. paddq IV, IV; \
  2515. psrad $31, CTR; \
  2516. pand GF128MUL_MASK, CTR; \
  2517. pxor CTR, IV;
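/*
* Equivalent operation as an illustrative C sketch, with the tweak viewed
* as two little-endian 64-bit halves {lo, hi}:
*
*     carry_hi = hi >> 63;                    // bit leaving the tweak
*     carry_lo = lo >> 63;                    // bit crossing into hi
*     hi = (hi << 1) | carry_lo;
*     lo = (lo << 1) ^ (carry_hi ? 0x87 : 0); // reduce mod x^128+x^7+x^2+x+1
*
* pshufd $0x13 and psrad $31 spread the two carry bits into dword masks,
* and the 0x87/0x01 constants in GF128MUL_MASK apply the reduction and the
* cross-qword carry with a single pxor.
*/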
  2518. /*
* void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2520. * bool enc, u8 *iv)
  2521. */
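/*
* XTS: eight consecutive blocks per call. Each block is XORed with its
* tweak both before and after the block cipher, and the tweak for block
* i+1 is tweak_i multiplied by x via _aesni_gf128mul_x_ble. The tweaks
* are parked in the output buffer while _aesni_enc4/_aesni_dec4 run and
* XORed back in afterwards; the ninth tweak is stored to *iv for the next
* call. The initial tweak in *iv is assumed to have been encrypted with
* the second XTS key already by the caller.
*/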
  2522. ENTRY(aesni_xts_crypt8)
  2523. FRAME_BEGIN
  2524. cmpb $0, %cl
  2525. movl $0, %ecx
  2526. movl $240, %r10d
  2527. leaq _aesni_enc4, %r11
  2528. leaq _aesni_dec4, %rax
  2529. cmovel %r10d, %ecx
  2530. cmoveq %rax, %r11
  2531. movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
  2532. movups (IVP), IV
  2533. mov 480(KEYP), KLEN
  2534. addq %rcx, KEYP
  2535. movdqa IV, STATE1
  2536. movdqu 0x00(INP), INC
  2537. pxor INC, STATE1
  2538. movdqu IV, 0x00(OUTP)
  2539. _aesni_gf128mul_x_ble()
  2540. movdqa IV, STATE2
  2541. movdqu 0x10(INP), INC
  2542. pxor INC, STATE2
  2543. movdqu IV, 0x10(OUTP)
  2544. _aesni_gf128mul_x_ble()
  2545. movdqa IV, STATE3
  2546. movdqu 0x20(INP), INC
  2547. pxor INC, STATE3
  2548. movdqu IV, 0x20(OUTP)
  2549. _aesni_gf128mul_x_ble()
  2550. movdqa IV, STATE4
  2551. movdqu 0x30(INP), INC
  2552. pxor INC, STATE4
  2553. movdqu IV, 0x30(OUTP)
  2554. CALL_NOSPEC %r11
  2555. movdqu 0x00(OUTP), INC
  2556. pxor INC, STATE1
  2557. movdqu STATE1, 0x00(OUTP)
  2558. _aesni_gf128mul_x_ble()
  2559. movdqa IV, STATE1
  2560. movdqu 0x40(INP), INC
  2561. pxor INC, STATE1
  2562. movdqu IV, 0x40(OUTP)
  2563. movdqu 0x10(OUTP), INC
  2564. pxor INC, STATE2
  2565. movdqu STATE2, 0x10(OUTP)
  2566. _aesni_gf128mul_x_ble()
  2567. movdqa IV, STATE2
  2568. movdqu 0x50(INP), INC
  2569. pxor INC, STATE2
  2570. movdqu IV, 0x50(OUTP)
  2571. movdqu 0x20(OUTP), INC
  2572. pxor INC, STATE3
  2573. movdqu STATE3, 0x20(OUTP)
  2574. _aesni_gf128mul_x_ble()
  2575. movdqa IV, STATE3
  2576. movdqu 0x60(INP), INC
  2577. pxor INC, STATE3
  2578. movdqu IV, 0x60(OUTP)
  2579. movdqu 0x30(OUTP), INC
  2580. pxor INC, STATE4
  2581. movdqu STATE4, 0x30(OUTP)
  2582. _aesni_gf128mul_x_ble()
  2583. movdqa IV, STATE4
  2584. movdqu 0x70(INP), INC
  2585. pxor INC, STATE4
  2586. movdqu IV, 0x70(OUTP)
  2587. _aesni_gf128mul_x_ble()
  2588. movups IV, (IVP)
  2589. CALL_NOSPEC %r11
  2590. movdqu 0x40(OUTP), INC
  2591. pxor INC, STATE1
  2592. movdqu STATE1, 0x40(OUTP)
  2593. movdqu 0x50(OUTP), INC
  2594. pxor INC, STATE2
  2595. movdqu STATE2, 0x50(OUTP)
  2596. movdqu 0x60(OUTP), INC
  2597. pxor INC, STATE3
  2598. movdqu STATE3, 0x60(OUTP)
  2599. movdqu 0x70(OUTP), INC
  2600. pxor INC, STATE4
  2601. movdqu STATE4, 0x70(OUTP)
  2602. FRAME_END
  2603. ret
  2604. ENDPROC(aesni_xts_crypt8)
  2605. #endif