  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * Ported x86_64 version to x86:
  24. * Author: Mathias Krause <minipli@googlemail.com>
  25. *
  26. * This program is free software; you can redistribute it and/or modify
  27. * it under the terms of the GNU General Public License as published by
  28. * the Free Software Foundation; either version 2 of the License, or
  29. * (at your option) any later version.
  30. */
  31. #include <linux/linkage.h>
  32. #include <asm/inst.h>
  33. #ifdef __x86_64__
  34. .data
  35. POLY: .octa 0xC2000000000000000000000000000001
  36. TWOONE: .octa 0x00000001000000000000000000000001
  37. # order of these constants should not change.
  38. # more specifically, ALL_F should follow SHIFT_MASK,
  39. # and ZERO should follow ALL_F
  40. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  41. MASK1: .octa 0x0000000000000000ffffffffffffffff
  42. MASK2: .octa 0xffffffffffffffff0000000000000000
  43. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  44. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  45. ZERO: .octa 0x00000000000000000000000000000000
  46. ONE: .octa 0x00000000000000000000000000000001
  47. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  48. dec: .octa 0x1
  49. enc: .octa 0x2
  50. .text
  51. #define STACK_OFFSET 8*3
  52. #define HashKey 16*0 // store HashKey <<1 mod poly here
  53. #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
  54. #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
  55. #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
  56. #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
  57. // bits of HashKey <<1 mod poly here
  58. //(for Karatsuba purposes)
  59. #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
  60. // bits of HashKey^2 <<1 mod poly here
  61. // (for Karatsuba purposes)
  62. #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
  63. // bits of HashKey^3 <<1 mod poly here
  64. // (for Karatsuba purposes)
  65. #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
  66. // bits of HashKey^4 <<1 mod poly here
  67. // (for Karatsuba purposes)
  68. #define VARIABLE_OFFSET 16*8
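/*
 * Layout note (illustrative sketch only; the code below addresses this area
 * with the byte offsets above): the HashKey* defines describe a
 * VARIABLE_OFFSET = 16*8 byte scratch area that aesni_gcm_enc/dec carve out
 * of the stack. As a C picture, with a struct and field names invented purely
 * for documentation:
 *
 *	struct gcm_hashkey_table {      // 128 bytes, 16-byte aligned
 *		u8 h1[16];              // HashKey      = H<<1 mod poly
 *		u8 h2[16];              // HashKey_2    = H^2<<1 mod poly
 *		u8 h3[16];              // HashKey_3    = H^3<<1 mod poly
 *		u8 h4[16];              // HashKey_4    = H^4<<1 mod poly
 *		u8 h1_k[16];            // HashKey_k    = hi64(h1) ^ lo64(h1)
 *		u8 h2_k[16];            // HashKey_2_k  = hi64(h2) ^ lo64(h2)
 *		u8 h3_k[16];            // HashKey_3_k  = hi64(h3) ^ lo64(h3)
 *		u8 h4_k[16];            // HashKey_4_k  = hi64(h4) ^ lo64(h4)
 *	};
 */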
  69. #define arg1 rdi
  70. #define arg2 rsi
  71. #define arg3 rdx
  72. #define arg4 rcx
  73. #define arg5 r8
  74. #define arg6 r9
  75. #define arg7 STACK_OFFSET+8(%r14)
  76. #define arg8 STACK_OFFSET+16(%r14)
  77. #define arg9 STACK_OFFSET+24(%r14)
  78. #define arg10 STACK_OFFSET+32(%r14)
  79. #endif
  80. #define STATE1 %xmm0
  81. #define STATE2 %xmm4
  82. #define STATE3 %xmm5
  83. #define STATE4 %xmm6
  84. #define STATE STATE1
  85. #define IN1 %xmm1
  86. #define IN2 %xmm7
  87. #define IN3 %xmm8
  88. #define IN4 %xmm9
  89. #define IN IN1
  90. #define KEY %xmm2
  91. #define IV %xmm3
  92. #define BSWAP_MASK %xmm10
  93. #define CTR %xmm11
  94. #define INC %xmm12
  95. #ifdef __x86_64__
  96. #define AREG %rax
  97. #define KEYP %rdi
  98. #define OUTP %rsi
  99. #define UKEYP OUTP
  100. #define INP %rdx
  101. #define LEN %rcx
  102. #define IVP %r8
  103. #define KLEN %r9d
  104. #define T1 %r10
  105. #define TKEYP T1
  106. #define T2 %r11
  107. #define TCTR_LOW T2
  108. #else
  109. #define AREG %eax
  110. #define KEYP %edi
  111. #define OUTP AREG
  112. #define UKEYP OUTP
  113. #define INP %edx
  114. #define LEN %esi
  115. #define IVP %ebp
  116. #define KLEN %ebx
  117. #define T1 %ecx
  118. #define TKEYP T1
  119. #endif
  120. #ifdef __x86_64__
  121. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  122. *
  123. *
  124. * Input: A and B (128-bits each, bit-reflected)
  125. * Output: C = A*B*x mod poly, (i.e. >>1 )
  126. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  127. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  128. *
  129. */
  130. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  131. movdqa \GH, \TMP1
  132. pshufd $78, \GH, \TMP2
  133. pshufd $78, \HK, \TMP3
  134. pxor \GH, \TMP2 # TMP2 = a1+a0
  135. pxor \HK, \TMP3 # TMP3 = b1+b0
  136. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  137. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  138. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  139. pxor \GH, \TMP2
  140. pxor \TMP1, \TMP2 # TMP2 = (a1*b0)+(a0*b1)
  141. movdqa \TMP2, \TMP3
  142. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  143. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  144. pxor \TMP3, \GH
  145. pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  146. # first phase of the reduction
  147. movdqa \GH, \TMP2
  148. movdqa \GH, \TMP3
  149. movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
  150. # in order to perform
  151. # independent shifts
  152. pslld $31, \TMP2 # packed left shift <<31
  153. pslld $30, \TMP3 # packed left shift <<30
  154. pslld $25, \TMP4 # packed left shift <<25
  155. pxor \TMP3, \TMP2 # xor the shifted versions
  156. pxor \TMP4, \TMP2
  157. movdqa \TMP2, \TMP5
  158. psrldq $4, \TMP5 # right shift TMP5 1 DW
  159. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  160. pxor \TMP2, \GH
  161. # second phase of the reduction
  162. movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
  163. # in order to perform
  164. # independent shifts
  165. movdqa \GH,\TMP3
  166. movdqa \GH,\TMP4
  167. psrld $1,\TMP2 # packed right shift >>1
  168. psrld $2,\TMP3 # packed right shift >>2
  169. psrld $7,\TMP4 # packed right shift >>7
  170. pxor \TMP3,\TMP2 # xor the shifted versions
  171. pxor \TMP4,\TMP2
  172. pxor \TMP5, \TMP2
  173. pxor \TMP2, \GH
  174. pxor \TMP1, \GH # result is in GH
  175. .endm
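/*
 * Reference sketch of the Karatsuba multiply GHASH_MUL performs, in plain C
 * (illustrative only -- the helper names are invented here and the final
 * reduction mod the GCM polynomial, i.e. the "first/second phase" code above,
 * is left out):
 *
 *	// carry-less 64x64 -> 128 bit multiply; this is what one PCLMULQDQ does
 *	static void clmul64(u64 a, u64 b, u64 *hi, u64 *lo)
 *	{
 *		u64 h = 0, l = 0;
 *		int i;
 *
 *		for (i = 0; i < 64; i++) {
 *			if (b & (1ULL << i)) {
 *				l ^= a << i;
 *				if (i)
 *					h ^= a >> (64 - i);
 *			}
 *		}
 *		*hi = h;
 *		*lo = l;
 *	}
 *
 *	// 128x128 carry-less multiply with three PCLMULQDQs instead of four:
 *	// middle = (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0 = a1*b0 ^ a0*b1
 *	static void clmul128(const u64 a[2], const u64 b[2], u64 p[4])
 *	{
 *		u64 hh, hl, lh, ll, mh, ml;
 *
 *		clmul64(a[1], b[1], &hh, &hl);                  // TMP1 = a1*b1
 *		clmul64(a[0], b[0], &lh, &ll);                  // GH   = a0*b0
 *		clmul64(a[1] ^ a[0], b[1] ^ b[0], &mh, &ml);    // TMP2
 *		mh ^= hh ^ lh;                                  // keep only the
 *		ml ^= hl ^ ll;                                  // middle term
 *		p[0] = ll;
 *		p[1] = lh ^ ml;          // pslldq $8 / pxor into GH
 *		p[2] = hl ^ mh;          // psrldq $8 / pxor into TMP1
 *		p[3] = hh;
 *	}
 */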
  176. /*
  177. * if a = number of total plaintext bytes
  178. * b = floor(a/16)
  179. * num_initial_blocks = b mod 4
  180. * encrypt/decrypt the initial num_initial_blocks blocks and apply GHASH on
  181. * the ciphertext
  182. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  183. * are clobbered
  184. * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  185. */
  186. .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  187. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  188. mov arg7, %r10 # %r10 = AAD
  189. mov arg8, %r12 # %r12 = aadLen
  190. mov %r12, %r11
  191. pxor %xmm\i, %xmm\i
  192. _get_AAD_loop\num_initial_blocks\operation:
  193. movd (%r10), \TMP1
  194. pslldq $12, \TMP1
  195. psrldq $4, %xmm\i
  196. pxor \TMP1, %xmm\i
  197. add $4, %r10
  198. sub $4, %r12
  199. jne _get_AAD_loop\num_initial_blocks\operation
  200. cmp $16, %r11
  201. je _get_AAD_loop2_done\num_initial_blocks\operation
  202. mov $16, %r12
  203. _get_AAD_loop2\num_initial_blocks\operation:
  204. psrldq $4, %xmm\i
  205. sub $4, %r12
  206. cmp %r11, %r12
  207. jne _get_AAD_loop2\num_initial_blocks\operation
  208. _get_AAD_loop2_done\num_initial_blocks\operation:
  209. movdqa SHUF_MASK(%rip), %xmm14
  210. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  211. xor %r11, %r11 # initialise the data pointer offset as zero
  212. # start AES for num_initial_blocks blocks
  213. mov %arg5, %rax # %rax = *Y0
  214. movdqu (%rax), \XMM0 # XMM0 = Y0
  215. movdqa SHUF_MASK(%rip), %xmm14
  216. PSHUFB_XMM %xmm14, \XMM0
  217. .if (\i == 5) || (\i == 6) || (\i == 7)
  218. .irpc index, \i_seq
  219. paddd ONE(%rip), \XMM0 # INCR Y0
  220. movdqa \XMM0, %xmm\index
  221. movdqa SHUF_MASK(%rip), %xmm14
  222. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  223. .endr
  224. .irpc index, \i_seq
  225. pxor 16*0(%arg1), %xmm\index
  226. .endr
  227. .irpc index, \i_seq
  228. movaps 0x10(%rdi), \TMP1
  229. AESENC \TMP1, %xmm\index # Round 1
  230. .endr
  231. .irpc index, \i_seq
  232. movaps 0x20(%arg1), \TMP1
  233. AESENC \TMP1, %xmm\index # Round 2
  234. .endr
  235. .irpc index, \i_seq
  236. movaps 0x30(%arg1), \TMP1
  237. AESENC \TMP1, %xmm\index # Round 3
  238. .endr
  239. .irpc index, \i_seq
  240. movaps 0x40(%arg1), \TMP1
  241. AESENC \TMP1, %xmm\index # Round 4
  242. .endr
  243. .irpc index, \i_seq
  244. movaps 0x50(%arg1), \TMP1
  245. AESENC \TMP1, %xmm\index # Round 5
  246. .endr
  247. .irpc index, \i_seq
  248. movaps 0x60(%arg1), \TMP1
  249. AESENC \TMP1, %xmm\index # Round 6
  250. .endr
  251. .irpc index, \i_seq
  252. movaps 0x70(%arg1), \TMP1
  253. AESENC \TMP1, %xmm\index # Round 7
  254. .endr
  255. .irpc index, \i_seq
  256. movaps 0x80(%arg1), \TMP1
  257. AESENC \TMP1, %xmm\index # Round 8
  258. .endr
  259. .irpc index, \i_seq
  260. movaps 0x90(%arg1), \TMP1
  261. AESENC \TMP1, %xmm\index # Round 9
  262. .endr
  263. .irpc index, \i_seq
  264. movaps 0xa0(%arg1), \TMP1
  265. AESENCLAST \TMP1, %xmm\index # Round 10
  266. .endr
  267. .irpc index, \i_seq
  268. movdqu (%arg3 , %r11, 1), \TMP1
  269. pxor \TMP1, %xmm\index
  270. movdqu %xmm\index, (%arg2 , %r11, 1)
  271. # write back plaintext/ciphertext for num_initial_blocks
  272. add $16, %r11
  273. movdqa \TMP1, %xmm\index
  274. movdqa SHUF_MASK(%rip), %xmm14
  275. PSHUFB_XMM %xmm14, %xmm\index
  276. # prepare plaintext/ciphertext for GHASH computation
  277. .endr
  278. .endif
  279. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  280. # apply GHASH on num_initial_blocks blocks
  281. .if \i == 5
  282. pxor %xmm5, %xmm6
  283. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  284. pxor %xmm6, %xmm7
  285. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  286. pxor %xmm7, %xmm8
  287. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  288. .elseif \i == 6
  289. pxor %xmm6, %xmm7
  290. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  291. pxor %xmm7, %xmm8
  292. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  293. .elseif \i == 7
  294. pxor %xmm7, %xmm8
  295. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  296. .endif
  297. cmp $64, %r13
  298. jl _initial_blocks_done\num_initial_blocks\operation
  299. # no need for precomputed values
  300. /*
  301. *
  302. * Precomputations for HashKey parallel with encryption of first 4 blocks.
  303. * HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey^i
  304. */
  305. paddd ONE(%rip), \XMM0 # INCR Y0
  306. movdqa \XMM0, \XMM1
  307. movdqa SHUF_MASK(%rip), %xmm14
  308. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  309. paddd ONE(%rip), \XMM0 # INCR Y0
  310. movdqa \XMM0, \XMM2
  311. movdqa SHUF_MASK(%rip), %xmm14
  312. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  313. paddd ONE(%rip), \XMM0 # INCR Y0
  314. movdqa \XMM0, \XMM3
  315. movdqa SHUF_MASK(%rip), %xmm14
  316. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  317. paddd ONE(%rip), \XMM0 # INCR Y0
  318. movdqa \XMM0, \XMM4
  319. movdqa SHUF_MASK(%rip), %xmm14
  320. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  321. pxor 16*0(%arg1), \XMM1
  322. pxor 16*0(%arg1), \XMM2
  323. pxor 16*0(%arg1), \XMM3
  324. pxor 16*0(%arg1), \XMM4
  325. movdqa \TMP3, \TMP5
  326. pshufd $78, \TMP3, \TMP1
  327. pxor \TMP3, \TMP1
  328. movdqa \TMP1, HashKey_k(%rsp)
  329. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  330. # TMP5 = HashKey^2<<1 (mod poly)
  331. movdqa \TMP5, HashKey_2(%rsp)
  332. # HashKey_2 = HashKey^2<<1 (mod poly)
  333. pshufd $78, \TMP5, \TMP1
  334. pxor \TMP5, \TMP1
  335. movdqa \TMP1, HashKey_2_k(%rsp)
  336. .irpc index, 1234 # do 4 rounds
  337. movaps 0x10*\index(%arg1), \TMP1
  338. AESENC \TMP1, \XMM1
  339. AESENC \TMP1, \XMM2
  340. AESENC \TMP1, \XMM3
  341. AESENC \TMP1, \XMM4
  342. .endr
  343. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  344. # TMP5 = HashKey^3<<1 (mod poly)
  345. movdqa \TMP5, HashKey_3(%rsp)
  346. pshufd $78, \TMP5, \TMP1
  347. pxor \TMP5, \TMP1
  348. movdqa \TMP1, HashKey_3_k(%rsp)
  349. .irpc index, 56789 # do next 5 rounds
  350. movaps 0x10*\index(%arg1), \TMP1
  351. AESENC \TMP1, \XMM1
  352. AESENC \TMP1, \XMM2
  353. AESENC \TMP1, \XMM3
  354. AESENC \TMP1, \XMM4
  355. .endr
  356. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  357. # TMP5 = HashKey^4<<1 (mod poly)
  358. movdqa \TMP5, HashKey_4(%rsp)
  359. pshufd $78, \TMP5, \TMP1
  360. pxor \TMP5, \TMP1
  361. movdqa \TMP1, HashKey_4_k(%rsp)
  362. movaps 0xa0(%arg1), \TMP2
  363. AESENCLAST \TMP2, \XMM1
  364. AESENCLAST \TMP2, \XMM2
  365. AESENCLAST \TMP2, \XMM3
  366. AESENCLAST \TMP2, \XMM4
  367. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  368. pxor \TMP1, \XMM1
  369. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  370. movdqa \TMP1, \XMM1
  371. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  372. pxor \TMP1, \XMM2
  373. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  374. movdqa \TMP1, \XMM2
  375. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  376. pxor \TMP1, \XMM3
  377. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  378. movdqa \TMP1, \XMM3
  379. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  380. pxor \TMP1, \XMM4
  381. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  382. movdqa \TMP1, \XMM4
  383. add $64, %r11
  384. movdqa SHUF_MASK(%rip), %xmm14
  385. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  386. pxor \XMMDst, \XMM1
  387. # combine GHASHed value with the corresponding ciphertext
  388. movdqa SHUF_MASK(%rip), %xmm14
  389. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  390. movdqa SHUF_MASK(%rip), %xmm14
  391. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  392. movdqa SHUF_MASK(%rip), %xmm14
  393. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  394. _initial_blocks_done\num_initial_blocks\operation:
  395. .endm
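/*
 * How the callers choose the macro arguments (a sketch for orientation only;
 * the real dispatch is the _initial_num_blocks_is_* code in
 * aesni_gcm_enc/aesni_gcm_dec below):
 *
 *	unsigned long full = len & ~15UL;        // bytes in whole 16-byte blocks
 *	unsigned int nblk  = (full >> 4) & 3;    // floor(len/16) mod 4
 *
 *	// nblk == 3: INITIAL_BLOCKS_* 3, ..., i=5, i_seq=678
 *	// nblk == 2: INITIAL_BLOCKS_* 2, ..., i=6, i_seq=78
 *	// nblk == 1: INITIAL_BLOCKS_* 1, ..., i=7, i_seq=8
 *	// nblk == 0: INITIAL_BLOCKS_* 0, ..., i=8, i_seq=0
 *	//            (the .if in the macro skips the per-block code)
 */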
  396. /*
  397. * if a = number of total plaintext bytes
  398. * b = floor(a/16)
  399. * num_initial_blocks = b mod 4
  400. * encrypt the initial num_initial_blocks blocks and apply ghash on
  401. * the ciphertext
  402. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  403. * are clobbered
  404. * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  405. */
  406. .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  407. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  408. mov arg7, %r10 # %r10 = AAD
  409. mov arg8, %r12 # %r12 = aadLen
  410. mov %r12, %r11
  411. pxor %xmm\i, %xmm\i
  412. _get_AAD_loop\num_initial_blocks\operation:
  413. movd (%r10), \TMP1
  414. pslldq $12, \TMP1
  415. psrldq $4, %xmm\i
  416. pxor \TMP1, %xmm\i
  417. add $4, %r10
  418. sub $4, %r12
  419. jne _get_AAD_loop\num_initial_blocks\operation
  420. cmp $16, %r11
  421. je _get_AAD_loop2_done\num_initial_blocks\operation
  422. mov $16, %r12
  423. _get_AAD_loop2\num_initial_blocks\operation:
  424. psrldq $4, %xmm\i
  425. sub $4, %r12
  426. cmp %r11, %r12
  427. jne _get_AAD_loop2\num_initial_blocks\operation
  428. _get_AAD_loop2_done\num_initial_blocks\operation:
  429. movdqa SHUF_MASK(%rip), %xmm14
  430. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  431. xor %r11, %r11 # initialise the data pointer offset as zero
  432. # start AES for num_initial_blocks blocks
  433. mov %arg5, %rax # %rax = *Y0
  434. movdqu (%rax), \XMM0 # XMM0 = Y0
  435. movdqa SHUF_MASK(%rip), %xmm14
  436. PSHUFB_XMM %xmm14, \XMM0
  437. .if (\i == 5) || (\i == 6) || (\i == 7)
  438. .irpc index, \i_seq
  439. paddd ONE(%rip), \XMM0 # INCR Y0
  440. movdqa \XMM0, %xmm\index
  441. movdqa SHUF_MASK(%rip), %xmm14
  442. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  443. .endr
  444. .irpc index, \i_seq
  445. pxor 16*0(%arg1), %xmm\index
  446. .endr
  447. .irpc index, \i_seq
  448. movaps 0x10(%rdi), \TMP1
  449. AESENC \TMP1, %xmm\index # Round 1
  450. .endr
  451. .irpc index, \i_seq
  452. movaps 0x20(%arg1), \TMP1
  453. AESENC \TMP1, %xmm\index # Round 2
  454. .endr
  455. .irpc index, \i_seq
  456. movaps 0x30(%arg1), \TMP1
  457. AESENC \TMP1, %xmm\index # Round 3
  458. .endr
  459. .irpc index, \i_seq
  460. movaps 0x40(%arg1), \TMP1
  461. AESENC \TMP1, %xmm\index # Round 4
  462. .endr
  463. .irpc index, \i_seq
  464. movaps 0x50(%arg1), \TMP1
  465. AESENC \TMP1, %xmm\index # Round 5
  466. .endr
  467. .irpc index, \i_seq
  468. movaps 0x60(%arg1), \TMP1
  469. AESENC \TMP1, %xmm\index # Round 6
  470. .endr
  471. .irpc index, \i_seq
  472. movaps 0x70(%arg1), \TMP1
  473. AESENC \TMP1, %xmm\index # Round 7
  474. .endr
  475. .irpc index, \i_seq
  476. movaps 0x80(%arg1), \TMP1
  477. AESENC \TMP1, %xmm\index # Round 8
  478. .endr
  479. .irpc index, \i_seq
  480. movaps 0x90(%arg1), \TMP1
  481. AESENC \TMP1, %xmm\index # Round 9
  482. .endr
  483. .irpc index, \i_seq
  484. movaps 0xa0(%arg1), \TMP1
  485. AESENCLAST \TMP1, %xmm\index # Round 10
  486. .endr
  487. .irpc index, \i_seq
  488. movdqu (%arg3 , %r11, 1), \TMP1
  489. pxor \TMP1, %xmm\index
  490. movdqu %xmm\index, (%arg2 , %r11, 1)
  491. # write back plaintext/ciphertext for num_initial_blocks
  492. add $16, %r11
  493. movdqa SHUF_MASK(%rip), %xmm14
  494. PSHUFB_XMM %xmm14, %xmm\index
  495. # prepare plaintext/ciphertext for GHASH computation
  496. .endr
  497. .endif
  498. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  499. # apply GHASH on num_initial_blocks blocks
  500. .if \i == 5
  501. pxor %xmm5, %xmm6
  502. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  503. pxor %xmm6, %xmm7
  504. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  505. pxor %xmm7, %xmm8
  506. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  507. .elseif \i == 6
  508. pxor %xmm6, %xmm7
  509. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  510. pxor %xmm7, %xmm8
  511. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  512. .elseif \i == 7
  513. pxor %xmm7, %xmm8
  514. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  515. .endif
  516. cmp $64, %r13
  517. jl _initial_blocks_done\num_initial_blocks\operation
  518. # no need for precomputed values
  519. /*
  520. *
  521. * Precomputations for HashKey parallel with encryption of first 4 blocks.
  522. * HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey^i
  523. */
  524. paddd ONE(%rip), \XMM0 # INCR Y0
  525. movdqa \XMM0, \XMM1
  526. movdqa SHUF_MASK(%rip), %xmm14
  527. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  528. paddd ONE(%rip), \XMM0 # INCR Y0
  529. movdqa \XMM0, \XMM2
  530. movdqa SHUF_MASK(%rip), %xmm14
  531. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  532. paddd ONE(%rip), \XMM0 # INCR Y0
  533. movdqa \XMM0, \XMM3
  534. movdqa SHUF_MASK(%rip), %xmm14
  535. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  536. paddd ONE(%rip), \XMM0 # INCR Y0
  537. movdqa \XMM0, \XMM4
  538. movdqa SHUF_MASK(%rip), %xmm14
  539. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  540. pxor 16*0(%arg1), \XMM1
  541. pxor 16*0(%arg1), \XMM2
  542. pxor 16*0(%arg1), \XMM3
  543. pxor 16*0(%arg1), \XMM4
  544. movdqa \TMP3, \TMP5
  545. pshufd $78, \TMP3, \TMP1
  546. pxor \TMP3, \TMP1
  547. movdqa \TMP1, HashKey_k(%rsp)
  548. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  549. # TMP5 = HashKey^2<<1 (mod poly)
  550. movdqa \TMP5, HashKey_2(%rsp)
  551. # HashKey_2 = HashKey^2<<1 (mod poly)
  552. pshufd $78, \TMP5, \TMP1
  553. pxor \TMP5, \TMP1
  554. movdqa \TMP1, HashKey_2_k(%rsp)
  555. .irpc index, 1234 # do 4 rounds
  556. movaps 0x10*\index(%arg1), \TMP1
  557. AESENC \TMP1, \XMM1
  558. AESENC \TMP1, \XMM2
  559. AESENC \TMP1, \XMM3
  560. AESENC \TMP1, \XMM4
  561. .endr
  562. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  563. # TMP5 = HashKey^3<<1 (mod poly)
  564. movdqa \TMP5, HashKey_3(%rsp)
  565. pshufd $78, \TMP5, \TMP1
  566. pxor \TMP5, \TMP1
  567. movdqa \TMP1, HashKey_3_k(%rsp)
  568. .irpc index, 56789 # do next 5 rounds
  569. movaps 0x10*\index(%arg1), \TMP1
  570. AESENC \TMP1, \XMM1
  571. AESENC \TMP1, \XMM2
  572. AESENC \TMP1, \XMM3
  573. AESENC \TMP1, \XMM4
  574. .endr
  575. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  576. # TMP5 = HashKey^4<<1 (mod poly)
  577. movdqa \TMP5, HashKey_4(%rsp)
  578. pshufd $78, \TMP5, \TMP1
  579. pxor \TMP5, \TMP1
  580. movdqa \TMP1, HashKey_4_k(%rsp)
  581. movaps 0xa0(%arg1), \TMP2
  582. AESENCLAST \TMP2, \XMM1
  583. AESENCLAST \TMP2, \XMM2
  584. AESENCLAST \TMP2, \XMM3
  585. AESENCLAST \TMP2, \XMM4
  586. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  587. pxor \TMP1, \XMM1
  588. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  589. pxor \TMP1, \XMM2
  590. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  591. pxor \TMP1, \XMM3
  592. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  593. pxor \TMP1, \XMM4
  594. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  595. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  596. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  597. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  598. add $64, %r11
  599. movdqa SHUF_MASK(%rip), %xmm14
  600. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  601. pxor \XMMDst, \XMM1
  602. # combine GHASHed value with the corresponding ciphertext
  603. movdqa SHUF_MASK(%rip), %xmm14
  604. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  605. movdqa SHUF_MASK(%rip), %xmm14
  606. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  607. movdqa SHUF_MASK(%rip), %xmm14
  608. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  609. _initial_blocks_done\num_initial_blocks\operation:
  610. .endm
  611. /*
  612. * encrypt 4 blocks at a time
  613. * ghash the 4 previously encrypted ciphertext blocks
  614. * arg1, %arg2, %arg3 are used as pointers only, not modified
  615. * %r11 is the data offset value
  616. */
  617. .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
  618. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  619. movdqa \XMM1, \XMM5
  620. movdqa \XMM2, \XMM6
  621. movdqa \XMM3, \XMM7
  622. movdqa \XMM4, \XMM8
  623. movdqa SHUF_MASK(%rip), %xmm15
  624. # multiply XMM5 * HashKey_4 using Karatsuba
  625. movdqa \XMM5, \TMP4
  626. pshufd $78, \XMM5, \TMP6
  627. pxor \XMM5, \TMP6
  628. paddd ONE(%rip), \XMM0 # INCR CNT
  629. movdqa HashKey_4(%rsp), \TMP5
  630. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  631. movdqa \XMM0, \XMM1
  632. paddd ONE(%rip), \XMM0 # INCR CNT
  633. movdqa \XMM0, \XMM2
  634. paddd ONE(%rip), \XMM0 # INCR CNT
  635. movdqa \XMM0, \XMM3
  636. paddd ONE(%rip), \XMM0 # INCR CNT
  637. movdqa \XMM0, \XMM4
  638. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  639. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  640. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  641. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  642. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  643. pxor (%arg1), \XMM1
  644. pxor (%arg1), \XMM2
  645. pxor (%arg1), \XMM3
  646. pxor (%arg1), \XMM4
  647. movdqa HashKey_4_k(%rsp), \TMP5
  648. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  649. movaps 0x10(%arg1), \TMP1
  650. AESENC \TMP1, \XMM1 # Round 1
  651. AESENC \TMP1, \XMM2
  652. AESENC \TMP1, \XMM3
  653. AESENC \TMP1, \XMM4
  654. movaps 0x20(%arg1), \TMP1
  655. AESENC \TMP1, \XMM1 # Round 2
  656. AESENC \TMP1, \XMM2
  657. AESENC \TMP1, \XMM3
  658. AESENC \TMP1, \XMM4
  659. movdqa \XMM6, \TMP1
  660. pshufd $78, \XMM6, \TMP2
  661. pxor \XMM6, \TMP2
  662. movdqa HashKey_3(%rsp), \TMP5
  663. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  664. movaps 0x30(%arg1), \TMP3
  665. AESENC \TMP3, \XMM1 # Round 3
  666. AESENC \TMP3, \XMM2
  667. AESENC \TMP3, \XMM3
  668. AESENC \TMP3, \XMM4
  669. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  670. movaps 0x40(%arg1), \TMP3
  671. AESENC \TMP3, \XMM1 # Round 4
  672. AESENC \TMP3, \XMM2
  673. AESENC \TMP3, \XMM3
  674. AESENC \TMP3, \XMM4
  675. movdqa HashKey_3_k(%rsp), \TMP5
  676. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  677. movaps 0x50(%arg1), \TMP3
  678. AESENC \TMP3, \XMM1 # Round 5
  679. AESENC \TMP3, \XMM2
  680. AESENC \TMP3, \XMM3
  681. AESENC \TMP3, \XMM4
  682. pxor \TMP1, \TMP4
  683. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  684. pxor \XMM6, \XMM5
  685. pxor \TMP2, \TMP6
  686. movdqa \XMM7, \TMP1
  687. pshufd $78, \XMM7, \TMP2
  688. pxor \XMM7, \TMP2
  689. movdqa HashKey_2(%rsp ), \TMP5
  690. # Multiply XMM7 * HashKey_2 using Karatsuba
  691. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  692. movaps 0x60(%arg1), \TMP3
  693. AESENC \TMP3, \XMM1 # Round 6
  694. AESENC \TMP3, \XMM2
  695. AESENC \TMP3, \XMM3
  696. AESENC \TMP3, \XMM4
  697. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  698. movaps 0x70(%arg1), \TMP3
  699. AESENC \TMP3, \XMM1 # Round 7
  700. AESENC \TMP3, \XMM2
  701. AESENC \TMP3, \XMM3
  702. AESENC \TMP3, \XMM4
  703. movdqa HashKey_2_k(%rsp), \TMP5
  704. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  705. movaps 0x80(%arg1), \TMP3
  706. AESENC \TMP3, \XMM1 # Round 8
  707. AESENC \TMP3, \XMM2
  708. AESENC \TMP3, \XMM3
  709. AESENC \TMP3, \XMM4
  710. pxor \TMP1, \TMP4
  711. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  712. pxor \XMM7, \XMM5
  713. pxor \TMP2, \TMP6
  714. # Multiply XMM8 * HashKey
  715. # XMM8 and TMP5 hold the values for the two operands
  716. movdqa \XMM8, \TMP1
  717. pshufd $78, \XMM8, \TMP2
  718. pxor \XMM8, \TMP2
  719. movdqa HashKey(%rsp), \TMP5
  720. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  721. movaps 0x90(%arg1), \TMP3
  722. AESENC \TMP3, \XMM1 # Round 9
  723. AESENC \TMP3, \XMM2
  724. AESENC \TMP3, \XMM3
  725. AESENC \TMP3, \XMM4
  726. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  727. movaps 0xa0(%arg1), \TMP3
  728. AESENCLAST \TMP3, \XMM1 # Round 10
  729. AESENCLAST \TMP3, \XMM2
  730. AESENCLAST \TMP3, \XMM3
  731. AESENCLAST \TMP3, \XMM4
  732. movdqa HashKey_k(%rsp), \TMP5
  733. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  734. movdqu (%arg3,%r11,1), \TMP3
  735. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  736. movdqu 16(%arg3,%r11,1), \TMP3
  737. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  738. movdqu 32(%arg3,%r11,1), \TMP3
  739. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  740. movdqu 48(%arg3,%r11,1), \TMP3
  741. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  742. movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
  743. movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
  744. movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
  745. movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
  746. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  747. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  748. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  749. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  750. pxor \TMP4, \TMP1
  751. pxor \XMM8, \XMM5
  752. pxor \TMP6, \TMP2
  753. pxor \TMP1, \TMP2
  754. pxor \XMM5, \TMP2
  755. movdqa \TMP2, \TMP3
  756. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  757. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  758. pxor \TMP3, \XMM5
  759. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  760. # first phase of reduction
  761. movdqa \XMM5, \TMP2
  762. movdqa \XMM5, \TMP3
  763. movdqa \XMM5, \TMP4
  764. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  765. pslld $31, \TMP2 # packed left shift << 31
  766. pslld $30, \TMP3 # packed left shift << 30
  767. pslld $25, \TMP4 # packed left shift << 25
  768. pxor \TMP3, \TMP2 # xor the shifted versions
  769. pxor \TMP4, \TMP2
  770. movdqa \TMP2, \TMP5
  771. psrldq $4, \TMP5 # right shift T5 1 DW
  772. pslldq $12, \TMP2 # left shift T2 3 DWs
  773. pxor \TMP2, \XMM5
  774. # second phase of reduction
  775. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  776. movdqa \XMM5,\TMP3
  777. movdqa \XMM5,\TMP4
  778. psrld $1, \TMP2 # packed right shift >>1
  779. psrld $2, \TMP3 # packed right shift >>2
  780. psrld $7, \TMP4 # packed right shift >>7
  781. pxor \TMP3,\TMP2 # xor the shifted versions
  782. pxor \TMP4,\TMP2
  783. pxor \TMP5, \TMP2
  784. pxor \TMP2, \XMM5
  785. pxor \TMP1, \XMM5 # result is in XMM5
  786. pxor \XMM5, \XMM1
  787. .endm
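/*
 * Counter handling note: \XMM0 carries the counter block byte-reflected
 * (SHUF_MASK applied), so the four "paddd ONE(%rip)" increments above
 * implement GCM's 32-bit big-endian counter increment, and each copy is
 * swapped back with PSHUFB before it is fed to AESENC. The same increment
 * written in C, as a sketch only:
 *
 *	// increment the last 32 bits of a big-endian GCM counter block
 *	static void gcm_inc32(u8 ctr[16])
 *	{
 *		u32 c = ((u32)ctr[12] << 24) | ((u32)ctr[13] << 16) |
 *			((u32)ctr[14] << 8) | ctr[15];
 *
 *		c++;
 *		ctr[12] = c >> 24;
 *		ctr[13] = c >> 16;
 *		ctr[14] = c >> 8;
 *		ctr[15] = c;
 *	}
 */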
  788. /*
  789. * decrypt 4 blocks at a time
  790. * ghash the 4 previously decrypted ciphertext blocks
  791. * arg1, %arg2, %arg3 are used as pointers only, not modified
  792. * %r11 is the data offset value
  793. */
  794. .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
  795. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  796. movdqa \XMM1, \XMM5
  797. movdqa \XMM2, \XMM6
  798. movdqa \XMM3, \XMM7
  799. movdqa \XMM4, \XMM8
  800. movdqa SHUF_MASK(%rip), %xmm15
  801. # multiply XMM5 * HashKey_4 using Karatsuba
  802. movdqa \XMM5, \TMP4
  803. pshufd $78, \XMM5, \TMP6
  804. pxor \XMM5, \TMP6
  805. paddd ONE(%rip), \XMM0 # INCR CNT
  806. movdqa HashKey_4(%rsp), \TMP5
  807. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  808. movdqa \XMM0, \XMM1
  809. paddd ONE(%rip), \XMM0 # INCR CNT
  810. movdqa \XMM0, \XMM2
  811. paddd ONE(%rip), \XMM0 # INCR CNT
  812. movdqa \XMM0, \XMM3
  813. paddd ONE(%rip), \XMM0 # INCR CNT
  814. movdqa \XMM0, \XMM4
  815. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  816. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  817. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  818. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  819. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  820. pxor (%arg1), \XMM1
  821. pxor (%arg1), \XMM2
  822. pxor (%arg1), \XMM3
  823. pxor (%arg1), \XMM4
  824. movdqa HashKey_4_k(%rsp), \TMP5
  825. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  826. movaps 0x10(%arg1), \TMP1
  827. AESENC \TMP1, \XMM1 # Round 1
  828. AESENC \TMP1, \XMM2
  829. AESENC \TMP1, \XMM3
  830. AESENC \TMP1, \XMM4
  831. movaps 0x20(%arg1), \TMP1
  832. AESENC \TMP1, \XMM1 # Round 2
  833. AESENC \TMP1, \XMM2
  834. AESENC \TMP1, \XMM3
  835. AESENC \TMP1, \XMM4
  836. movdqa \XMM6, \TMP1
  837. pshufd $78, \XMM6, \TMP2
  838. pxor \XMM6, \TMP2
  839. movdqa HashKey_3(%rsp), \TMP5
  840. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  841. movaps 0x30(%arg1), \TMP3
  842. AESENC \TMP3, \XMM1 # Round 3
  843. AESENC \TMP3, \XMM2
  844. AESENC \TMP3, \XMM3
  845. AESENC \TMP3, \XMM4
  846. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  847. movaps 0x40(%arg1), \TMP3
  848. AESENC \TMP3, \XMM1 # Round 4
  849. AESENC \TMP3, \XMM2
  850. AESENC \TMP3, \XMM3
  851. AESENC \TMP3, \XMM4
  852. movdqa HashKey_3_k(%rsp), \TMP5
  853. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  854. movaps 0x50(%arg1), \TMP3
  855. AESENC \TMP3, \XMM1 # Round 5
  856. AESENC \TMP3, \XMM2
  857. AESENC \TMP3, \XMM3
  858. AESENC \TMP3, \XMM4
  859. pxor \TMP1, \TMP4
  860. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  861. pxor \XMM6, \XMM5
  862. pxor \TMP2, \TMP6
  863. movdqa \XMM7, \TMP1
  864. pshufd $78, \XMM7, \TMP2
  865. pxor \XMM7, \TMP2
  866. movdqa HashKey_2(%rsp ), \TMP5
  867. # Multiply XMM7 * HashKey_2 using Karatsuba
  868. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  869. movaps 0x60(%arg1), \TMP3
  870. AESENC \TMP3, \XMM1 # Round 6
  871. AESENC \TMP3, \XMM2
  872. AESENC \TMP3, \XMM3
  873. AESENC \TMP3, \XMM4
  874. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  875. movaps 0x70(%arg1), \TMP3
  876. AESENC \TMP3, \XMM1 # Round 7
  877. AESENC \TMP3, \XMM2
  878. AESENC \TMP3, \XMM3
  879. AESENC \TMP3, \XMM4
  880. movdqa HashKey_2_k(%rsp), \TMP5
  881. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  882. movaps 0x80(%arg1), \TMP3
  883. AESENC \TMP3, \XMM1 # Round 8
  884. AESENC \TMP3, \XMM2
  885. AESENC \TMP3, \XMM3
  886. AESENC \TMP3, \XMM4
  887. pxor \TMP1, \TMP4
  888. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  889. pxor \XMM7, \XMM5
  890. pxor \TMP2, \TMP6
  891. # Multiply XMM8 * HashKey
  892. # XMM8 and TMP5 hold the values for the two operands
  893. movdqa \XMM8, \TMP1
  894. pshufd $78, \XMM8, \TMP2
  895. pxor \XMM8, \TMP2
  896. movdqa HashKey(%rsp), \TMP5
  897. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  898. movaps 0x90(%arg1), \TMP3
  899. AESENC \TMP3, \XMM1 # Round 9
  900. AESENC \TMP3, \XMM2
  901. AESENC \TMP3, \XMM3
  902. AESENC \TMP3, \XMM4
  903. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  904. movaps 0xa0(%arg1), \TMP3
  905. AESENCLAST \TMP3, \XMM1 # Round 10
  906. AESENCLAST \TMP3, \XMM2
  907. AESENCLAST \TMP3, \XMM3
  908. AESENCLAST \TMP3, \XMM4
  909. movdqa HashKey_k(%rsp), \TMP5
  910. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  911. movdqu (%arg3,%r11,1), \TMP3
  912. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  913. movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
  914. movdqa \TMP3, \XMM1
  915. movdqu 16(%arg3,%r11,1), \TMP3
  916. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  917. movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
  918. movdqa \TMP3, \XMM2
  919. movdqu 32(%arg3,%r11,1), \TMP3
  920. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  921. movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
  922. movdqa \TMP3, \XMM3
  923. movdqu 48(%arg3,%r11,1), \TMP3
  924. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  925. movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
  926. movdqa \TMP3, \XMM4
  927. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  928. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  929. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  930. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  931. pxor \TMP4, \TMP1
  932. pxor \XMM8, \XMM5
  933. pxor \TMP6, \TMP2
  934. pxor \TMP1, \TMP2
  935. pxor \XMM5, \TMP2
  936. movdqa \TMP2, \TMP3
  937. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  938. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  939. pxor \TMP3, \XMM5
  940. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  941. # first phase of reduction
  942. movdqa \XMM5, \TMP2
  943. movdqa \XMM5, \TMP3
  944. movdqa \XMM5, \TMP4
  945. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  946. pslld $31, \TMP2 # packed left shift << 31
  947. pslld $30, \TMP3 # packed left shift << 30
  948. pslld $25, \TMP4 # packed left shift << 25
  949. pxor \TMP3, \TMP2 # xor the shifted versions
  950. pxor \TMP4, \TMP2
  951. movdqa \TMP2, \TMP5
  952. psrldq $4, \TMP5 # right shift T5 1 DW
  953. pslldq $12, \TMP2 # left shift T2 3 DWs
  954. pxor \TMP2, \XMM5
  955. # second phase of reduction
  956. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  957. movdqa \XMM5,\TMP3
  958. movdqa \XMM5,\TMP4
  959. psrld $1, \TMP2 # packed right shift >>1
  960. psrld $2, \TMP3 # packed right shift >>2
  961. psrld $7, \TMP4 # packed right shift >>7
  962. pxor \TMP3,\TMP2 # xor the shifted versions
  963. pxor \TMP4,\TMP2
  964. pxor \TMP5, \TMP2
  965. pxor \TMP2, \XMM5
  966. pxor \TMP1, \XMM5 # result is in XMM5
  967. pxor \XMM5, \XMM1
  968. .endm
  969. /* GHASH the last 4 ciphertext blocks. */
  970. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  971. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  972. # Multiply XMM1 * HashKey_4 (using Karatsuba)
  973. movdqa \XMM1, \TMP6
  974. pshufd $78, \XMM1, \TMP2
  975. pxor \XMM1, \TMP2
  976. movdqa HashKey_4(%rsp), \TMP5
  977. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  978. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  979. movdqa HashKey_4_k(%rsp), \TMP4
  980. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  981. movdqa \XMM1, \XMMDst
  982. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
  983. # Multiply XMM2 * HashKey_3 (using Karatsuba)
  984. movdqa \XMM2, \TMP1
  985. pshufd $78, \XMM2, \TMP2
  986. pxor \XMM2, \TMP2
  987. movdqa HashKey_3(%rsp), \TMP5
  988. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  989. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  990. movdqa HashKey_3_k(%rsp), \TMP4
  991. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  992. pxor \TMP1, \TMP6
  993. pxor \XMM2, \XMMDst
  994. pxor \TMP2, \XMM1
  995. # results accumulated in TMP6, XMMDst, XMM1
  996. # Multiply XMM3 * HashKey_2 (using Karatsuba)
  997. movdqa \XMM3, \TMP1
  998. pshufd $78, \XMM3, \TMP2
  999. pxor \XMM3, \TMP2
  1000. movdqa HashKey_2(%rsp), \TMP5
  1001. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1002. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  1003. movdqa HashKey_2_k(%rsp), \TMP4
  1004. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1005. pxor \TMP1, \TMP6
  1006. pxor \XMM3, \XMMDst
  1007. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
  1008. # Multiply XMM4 * HashKey (using Karatsuba)
  1009. movdqa \XMM4, \TMP1
  1010. pshufd $78, \XMM4, \TMP2
  1011. pxor \XMM4, \TMP2
  1012. movdqa HashKey(%rsp), \TMP5
  1013. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1014. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  1015. movdqa HashKey_k(%rsp), \TMP4
  1016. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1017. pxor \TMP1, \TMP6
  1018. pxor \XMM4, \XMMDst
  1019. pxor \XMM1, \TMP2
  1020. pxor \TMP6, \TMP2
  1021. pxor \XMMDst, \TMP2
  1022. # middle section of the temp results combined as in karatsuba algorithm
  1023. movdqa \TMP2, \TMP4
  1024. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  1025. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1026. pxor \TMP4, \XMMDst
  1027. pxor \TMP2, \TMP6
  1028. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  1029. # first phase of the reduction
  1030. movdqa \XMMDst, \TMP2
  1031. movdqa \XMMDst, \TMP3
  1032. movdqa \XMMDst, \TMP4
  1033. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
  1034. pslld $31, \TMP2 # packed left shifting << 31
  1035. pslld $30, \TMP3 # packed left shifting << 30
  1036. pslld $25, \TMP4 # packed left shifting << 25
  1037. pxor \TMP3, \TMP2 # xor the shifted versions
  1038. pxor \TMP4, \TMP2
  1039. movdqa \TMP2, \TMP7
  1040. psrldq $4, \TMP7 # right shift TMP7 1 DW
  1041. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  1042. pxor \TMP2, \XMMDst
  1043. # second phase of the reduction
  1044. movdqa \XMMDst, \TMP2
  1045. # make 3 copies of XMMDst for doing 3 shift operations
  1046. movdqa \XMMDst, \TMP3
  1047. movdqa \XMMDst, \TMP4
  1048. psrld $1, \TMP2 # packed right shift >> 1
  1049. psrld $2, \TMP3 # packed right shift >> 2
  1050. psrld $7, \TMP4 # packed right shift >> 7
  1051. pxor \TMP3, \TMP2 # xor the shifted versions
  1052. pxor \TMP4, \TMP2
  1053. pxor \TMP7, \TMP2
  1054. pxor \TMP2, \XMMDst
  1055. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  1056. .endm
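/*
 * Reduction note for GHASH_MUL, GHASH_4_ENCRYPT_4_PARALLEL_* and GHASH_LAST_4:
 * the GCM polynomial is g(x) = x^128 + x^127 + x^126 + x^121 + 1, so
 *
 *	x^128 = x^127 + x^126 + x^121 + 1   (mod g)
 *
 * i.e. every bit that overflows position 127 folds back in 1, 2, 7 and 128
 * positions lower. With the bit-reflected operands used here, that is what
 * the paired <<31/<<30/<<25 (first phase) and >>1/>>2/>>7 (second phase)
 * shift-and-xor sequences implement, with the pslldq/psrldq fixups carrying
 * the parts that cross 32-bit lanes. This is only a summary of the standard
 * reflected-reduction trick, not a full derivation.
 */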
  1057. /* Encrypt a single block with the AES round keys at (%arg1) */
  1058. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  1059. pxor (%arg1), \XMM0
  1060. movaps 16(%arg1), \TMP1
  1061. AESENC \TMP1, \XMM0
  1062. movaps 32(%arg1), \TMP1
  1063. AESENC \TMP1, \XMM0
  1064. movaps 48(%arg1), \TMP1
  1065. AESENC \TMP1, \XMM0
  1066. movaps 64(%arg1), \TMP1
  1067. AESENC \TMP1, \XMM0
  1068. movaps 80(%arg1), \TMP1
  1069. AESENC \TMP1, \XMM0
  1070. movaps 96(%arg1), \TMP1
  1071. AESENC \TMP1, \XMM0
  1072. movaps 112(%arg1), \TMP1
  1073. AESENC \TMP1, \XMM0
  1074. movaps 128(%arg1), \TMP1
  1075. AESENC \TMP1, \XMM0
  1076. movaps 144(%arg1), \TMP1
  1077. AESENC \TMP1, \XMM0
  1078. movaps 160(%arg1), \TMP1
  1079. AESENCLAST \TMP1, \XMM0
  1080. .endm
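/*
 * ENCRYPT_SINGLE_BLOCK is one AES-128 encryption unrolled over the 11 round
 * keys starting at (%arg1). Written with compiler intrinsics it would look
 * roughly like this (a sketch for readers, not code used by this file):
 *
 *	#include <immintrin.h>          // build with -maes
 *
 *	static __m128i aes128_encrypt_block(const __m128i rk[11], __m128i b)
 *	{
 *		int i;
 *
 *		b = _mm_xor_si128(b, rk[0]);            // whitening: pxor (%arg1)
 *		for (i = 1; i < 10; i++)
 *			b = _mm_aesenc_si128(b, rk[i]); // rounds 1-9
 *		return _mm_aesenclast_si128(b, rk[10]); // round 10
 *	}
 */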
  1081. /*****************************************************************************
  1082. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1083. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  1084. * const u8 *in, // Ciphertext input
  1085. * u64 plaintext_len, // Length of data in bytes for decryption.
  1086. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1087. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1088. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1089. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1090. * const u8 *aad, // Additional Authentication Data (AAD)
  1091. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1092. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  1093. * // given authentication tag and only return the plaintext if they match.
  1094. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  1095. * // (most likely), 12 or 8.
  1096. *
  1097. * Assumptions:
  1098. *
  1099. * keys:
  1100. * keys are pre-expanded and aligned to 16 bytes. we are using the first
  1101. * set of 11 keys in the data structure void *aes_ctx
  1102. *
  1103. * iv:
  1104. * 0 1 2 3
  1105. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1106. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1107. * | Salt (From the SA) |
  1108. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1109. * | Initialization Vector |
  1110. * | (This is the sequence number from IPSec header) |
  1111. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1112. * | 0x1 |
  1113. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1114. *
  1115. *
  1116. *
  1117. * AAD:
  1118. * AAD padded to 128 bits with 0
  1119. * for example, assume AAD is a u32 vector
  1120. *
  1121. * if AAD is 8 bytes:
  1122. * AAD[3] = {A0, A1};
  1123. * padded AAD in xmm register = {A1 A0 0 0}
  1124. *
  1125. * 0 1 2 3
  1126. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1127. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1128. * | SPI (A1) |
  1129. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1130. * | 32-bit Sequence Number (A0) |
  1131. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1132. * | 0x0 |
  1133. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1134. *
  1135. * AAD Format with 32-bit Sequence Number
  1136. *
  1137. * if AAD is 12 bytes:
  1138. * AAD[3] = {A0, A1, A2};
  1139. * padded AAD in xmm register = {A2 A1 A0 0}
  1140. *
  1141. * 0 1 2 3
  1142. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1143. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1146. * | SPI (A2) |
  1147. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1148. * | 64-bit Extended Sequence Number {A1,A0} |
  1149. * | |
  1150. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1151. * | 0x0 |
  1152. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1153. *
  1154. * AAD Format with 64-bit Extended Sequence Number
  1155. *
  1156. * aadLen:
  1157. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  1158. * The code supports 16 too but for other sizes, the code will fail.
  1159. *
  1160. * TLen:
  1161. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1162. * For other sizes, the code will fail.
  1163. *
  1164. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1165. *
  1166. *****************************************************************************/
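/*
 * For reference, a C-side declaration matching the comment block above
 * (types follow the comment; the kernel's own prototype lives in the aesni
 * glue code):
 *
 *	asmlinkage void aesni_gcm_dec(void *aes_ctx, u8 *out, const u8 *in,
 *				      u64 plaintext_len, u8 *iv,
 *				      u8 *hash_subkey, const u8 *aad,
 *				      u64 aad_len, u8 *auth_tag,
 *				      u64 auth_tag_len);
 */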
  1167. ENTRY(aesni_gcm_dec)
  1168. push %r12
  1169. push %r13
  1170. push %r14
  1171. mov %rsp, %r14
  1172. /*
  1173. * states of %xmm registers %xmm6:%xmm15 not saved
  1174. * all %xmm registers are clobbered
  1175. */
  1176. sub $VARIABLE_OFFSET, %rsp
  1177. and $~63, %rsp # align rsp to 64 bytes
  1178. mov %arg6, %r12
  1179. movdqu (%r12), %xmm13 # %xmm13 = HashKey
  1180. movdqa SHUF_MASK(%rip), %xmm2
  1181. PSHUFB_XMM %xmm2, %xmm13
  1182. # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
  1183. movdqa %xmm13, %xmm2
  1184. psllq $1, %xmm13
  1185. psrlq $63, %xmm2
  1186. movdqa %xmm2, %xmm1
  1187. pslldq $8, %xmm2
  1188. psrldq $8, %xmm1
  1189. por %xmm2, %xmm13
  1190. # Reduction
  1191. pshufd $0x24, %xmm1, %xmm2
  1192. pcmpeqd TWOONE(%rip), %xmm2
  1193. pand POLY(%rip), %xmm2
  1194. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
  1195. # Decrypt first few blocks
  1196. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  1197. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1198. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  1199. mov %r13, %r12
  1200. and $(3<<4), %r12
  1201. jz _initial_num_blocks_is_0_decrypt
  1202. cmp $(2<<4), %r12
  1203. jb _initial_num_blocks_is_1_decrypt
  1204. je _initial_num_blocks_is_2_decrypt
  1205. _initial_num_blocks_is_3_decrypt:
  1206. INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1207. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
  1208. sub $48, %r13
  1209. jmp _initial_blocks_decrypted
  1210. _initial_num_blocks_is_2_decrypt:
  1211. INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1212. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
  1213. sub $32, %r13
  1214. jmp _initial_blocks_decrypted
  1215. _initial_num_blocks_is_1_decrypt:
  1216. INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1217. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
  1218. sub $16, %r13
  1219. jmp _initial_blocks_decrypted
  1220. _initial_num_blocks_is_0_decrypt:
  1221. INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1222. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
  1223. _initial_blocks_decrypted:
  1224. cmp $0, %r13
  1225. je _zero_cipher_left_decrypt
  1226. sub $64, %r13
  1227. je _four_cipher_left_decrypt
  1228. _decrypt_by_4:
  1229. GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1230. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
  1231. add $64, %r11
  1232. sub $64, %r13
  1233. jne _decrypt_by_4
  1234. _four_cipher_left_decrypt:
  1235. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1236. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1237. _zero_cipher_left_decrypt:
  1238. mov %arg4, %r13
  1239. and $15, %r13 # %r13 = arg4 (mod 16)
  1240. je _multiple_of_16_bytes_decrypt
  1241. # Handle the last <16 byte block separately
  1242. paddd ONE(%rip), %xmm0 # increment CNT to get Yn
  1243. movdqa SHUF_MASK(%rip), %xmm10
  1244. PSHUFB_XMM %xmm10, %xmm0
  1245. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
  1246. sub $16, %r11
  1247. add %r13, %r11
  1248. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1249. lea SHIFT_MASK+16(%rip), %r12
  1250. sub %r13, %r12
  1251. # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
  1252. # (%r13 is the number of bytes in plaintext mod 16)
  1253. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
  1255. movdqa %xmm1, %xmm2
  1256. pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
  1257. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1258. # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
  1259. pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
  1260. pand %xmm1, %xmm2
  1261. movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm2
  1263. pxor %xmm2, %xmm8
  1264. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1265. # GHASH computation for the last <16 byte block
  1266. sub %r13, %r11
  1267. add $16, %r11
  1268. # output %r13 bytes
  1269. MOVQ_R64_XMM %xmm0, %rax
  1270. cmp $8, %r13
  1271. jle _less_than_8_bytes_left_decrypt
  1272. mov %rax, (%arg2 , %r11, 1)
  1273. add $8, %r11
  1274. psrldq $8, %xmm0
  1275. MOVQ_R64_XMM %xmm0, %rax
  1276. sub $8, %r13
  1277. _less_than_8_bytes_left_decrypt:
  1278. mov %al, (%arg2, %r11, 1)
  1279. add $1, %r11
  1280. shr $8, %rax
  1281. sub $1, %r13
  1282. jne _less_than_8_bytes_left_decrypt
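/*
 * The 8-byte store plus byte loop above simply write the low %r13
 * (< 16) bytes of the final block to the output.  A C sketch of the
 * same idea (store_partial_block, dst, blk and rem are invented names):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void store_partial_block(uint8_t *dst, const uint8_t blk[16],
 *					unsigned int rem)
 *	{
 *		unsigned int i = 0;
 *
 *		if (rem > 8) {			// one 8-byte store first
 *			memcpy(dst, blk, 8);
 *			i = 8;
 *		}
 *		for (; i < rem; i++)		// then byte by byte
 *			dst[i] = blk[i];
 *	}
 */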
  1283. _multiple_of_16_bytes_decrypt:
mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1285. shl $3, %r12 # convert into number of bits
  1286. movd %r12d, %xmm15 # len(A) in %xmm15
shl $3, %arg4 # len(C) in bits (*8)
  1288. MOVQ_R64_XMM %arg4, %xmm1
  1289. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1290. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1291. pxor %xmm15, %xmm8
  1292. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1293. # final GHASH computation
  1294. movdqa SHUF_MASK(%rip), %xmm10
  1295. PSHUFB_XMM %xmm10, %xmm8
  1296. mov %arg5, %rax # %rax = *Y0
  1297. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1298. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  1299. pxor %xmm8, %xmm0
  1300. _return_T_decrypt:
  1301. mov arg9, %r10 # %r10 = authTag
  1302. mov arg10, %r11 # %r11 = auth_tag_len
  1303. cmp $16, %r11
  1304. je _T_16_decrypt
  1305. cmp $12, %r11
  1306. je _T_12_decrypt
  1307. _T_8_decrypt:
  1308. MOVQ_R64_XMM %xmm0, %rax
  1309. mov %rax, (%r10)
  1310. jmp _return_T_done_decrypt
  1311. _T_12_decrypt:
  1312. MOVQ_R64_XMM %xmm0, %rax
  1313. mov %rax, (%r10)
  1314. psrldq $8, %xmm0
  1315. movd %xmm0, %eax
  1316. mov %eax, 8(%r10)
  1317. jmp _return_T_done_decrypt
  1318. _T_16_decrypt:
  1319. movdqu %xmm0, (%r10)
  1320. _return_T_done_decrypt:
  1321. mov %r14, %rsp
  1322. pop %r14
  1323. pop %r13
  1324. pop %r12
  1325. ret
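/*
 * The _T_8/_T_12/_T_16 labels above just copy the requested number of
 * tag bytes to auth_tag.  A hedged C sketch (write_auth_tag and its
 * parameters are invented names, not the kernel interface):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void write_auth_tag(uint8_t *auth_tag, const uint8_t tag[16],
 *				   unsigned long auth_tag_len)
 *	{
 *		if (auth_tag_len == 16)
 *			memcpy(auth_tag, tag, 16);
 *		else if (auth_tag_len == 12)
 *			memcpy(auth_tag, tag, 12);
 *		else				// 8, the remaining valid size
 *			memcpy(auth_tag, tag, 8);
 *	}
 */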
  1326. /*****************************************************************************
  1327. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1328. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1329. * const u8 *in, // Plaintext input
  1330. * u64 plaintext_len, // Length of data in bytes for encryption.
  1331. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1332. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1333. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1334. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1335. * const u8 *aad, // Additional Authentication Data (AAD)
  1336. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1337. * u8 *auth_tag, // Authenticated Tag output.
  1338. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1339. * // 12 or 8.
  1340. *
  1341. * Assumptions:
  1342. *
  1343. * keys:
  1344. * keys are pre-expanded and aligned to 16 bytes. we are using the
  1345. * first set of 11 keys in the data structure void *aes_ctx
  1346. *
  1347. *
  1348. * iv:
  1349. * 0 1 2 3
  1350. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1351. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1352. * | Salt (From the SA) |
  1353. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1354. * | Initialization Vector |
  1355. * | (This is the sequence number from IPSec header) |
  1356. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1357. * | 0x1 |
  1358. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1359. *
  1360. *
  1361. *
  1362. * AAD:
  1363. * AAD padded to 128 bits with 0
  1364. * for example, assume AAD is a u32 vector
  1365. *
  1366. * if AAD is 8 bytes:
  1367. * AAD[3] = {A0, A1};
  1368. * padded AAD in xmm register = {A1 A0 0 0}
  1369. *
  1370. * 0 1 2 3
  1371. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1372. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1373. * | SPI (A1) |
  1374. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1375. * | 32-bit Sequence Number (A0) |
  1376. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1377. * | 0x0 |
  1378. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1379. *
  1380. * AAD Format with 32-bit Sequence Number
  1381. *
  1382. * if AAD is 12 bytes:
  1383. * AAD[3] = {A0, A1, A2};
  1384. * padded AAD in xmm register = {A2 A1 A0 0}
  1385. *
  1386. * 0 1 2 3
  1387. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1388. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1389. * | SPI (A2) |
  1390. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1391. * | 64-bit Extended Sequence Number {A1,A0} |
  1392. * | |
  1393. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1394. * | 0x0 |
  1395. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1396. *
  1397. * AAD Format with 64-bit Extended Sequence Number
  1398. *
  1399. * aadLen:
  1400. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  1401. * The code supports 16 too but for other sizes, the code will fail.
  1402. *
  1403. * TLen:
  1404. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1405. * For other sizes, the code will fail.
  1406. *
  1407. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1408. ***************************************************************************/
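/*
 * A sketch of how a caller might lay out the pre-counter block j0
 * described above (4-byte salt || 8-byte IV || 0x00000001).  This is
 * illustration only; build_j0, salt and iv are invented names and the
 * actual callers live in the glue code:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void build_j0(uint8_t j0[16], const uint8_t salt[4],
 *			     const uint8_t iv[8])
 *	{
 *		static const uint8_t one[4] = { 0x00, 0x00, 0x00, 0x01 };
 *
 *		memcpy(j0, salt, 4);		// salt from the SA
 *		memcpy(j0 + 4, iv, 8);		// IV / sequence number
 *		memcpy(j0 + 12, one, 4);	// trailing 32-bit 0x00000001
 *	}
 */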
  1409. ENTRY(aesni_gcm_enc)
  1410. push %r12
  1411. push %r13
  1412. push %r14
  1413. mov %rsp, %r14
  1414. #
  1415. # states of %xmm registers %xmm6:%xmm15 not saved
  1416. # all %xmm registers are clobbered
  1417. #
  1418. sub $VARIABLE_OFFSET, %rsp
  1419. and $~63, %rsp
  1420. mov %arg6, %r12
  1421. movdqu (%r12), %xmm13
  1422. movdqa SHUF_MASK(%rip), %xmm2
  1423. PSHUFB_XMM %xmm2, %xmm13
  1424. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  1425. movdqa %xmm13, %xmm2
  1426. psllq $1, %xmm13
  1427. psrlq $63, %xmm2
  1428. movdqa %xmm2, %xmm1
  1429. pslldq $8, %xmm2
  1430. psrldq $8, %xmm1
  1431. por %xmm2, %xmm13
  1432. # reduce HashKey<<1
  1433. pshufd $0x24, %xmm1, %xmm2
  1434. pcmpeqd TWOONE(%rip), %xmm2
  1435. pand POLY(%rip), %xmm2
pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1439. and $-16, %r13
  1440. mov %r13, %r12
  1441. # Encrypt first few blocks
  1442. and $(3<<4), %r12
  1443. jz _initial_num_blocks_is_0_encrypt
  1444. cmp $(2<<4), %r12
  1445. jb _initial_num_blocks_is_1_encrypt
  1446. je _initial_num_blocks_is_2_encrypt
  1447. _initial_num_blocks_is_3_encrypt:
  1448. INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1449. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
  1450. sub $48, %r13
  1451. jmp _initial_blocks_encrypted
  1452. _initial_num_blocks_is_2_encrypt:
  1453. INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1454. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
  1455. sub $32, %r13
  1456. jmp _initial_blocks_encrypted
  1457. _initial_num_blocks_is_1_encrypt:
  1458. INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1459. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
  1460. sub $16, %r13
  1461. jmp _initial_blocks_encrypted
  1462. _initial_num_blocks_is_0_encrypt:
  1463. INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1464. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
  1465. _initial_blocks_encrypted:
  1466. # Main loop - Encrypt remaining blocks
  1467. cmp $0, %r13
  1468. je _zero_cipher_left_encrypt
  1469. sub $64, %r13
  1470. je _four_cipher_left_encrypt
  1471. _encrypt_by_4_encrypt:
  1472. GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1473. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
  1474. add $64, %r11
  1475. sub $64, %r13
  1476. jne _encrypt_by_4_encrypt
  1477. _four_cipher_left_encrypt:
  1478. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1479. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1480. _zero_cipher_left_encrypt:
  1481. mov %arg4, %r13
  1482. and $15, %r13 # %r13 = arg4 (mod 16)
  1483. je _multiple_of_16_bytes_encrypt
  1484. # Handle the last <16 Byte block separately
  1485. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  1486. movdqa SHUF_MASK(%rip), %xmm10
  1487. PSHUFB_XMM %xmm10, %xmm0
  1488. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  1489. sub $16, %r11
  1490. add %r13, %r11
movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1492. lea SHIFT_MASK+16(%rip), %r12
  1493. sub %r13, %r12
  1494. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  1495. # (%r13 is the number of bytes in plaintext mod 16)
  1496. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm1 # shift right 16-%r13 bytes
  1498. pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
  1499. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1500. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  1501. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  1502. movdqa SHUF_MASK(%rip), %xmm10
  1503. PSHUFB_XMM %xmm10,%xmm0
  1504. pxor %xmm0, %xmm8
  1505. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1506. # GHASH computation for the last <16 byte block
  1507. sub %r13, %r11
  1508. add $16, %r11
  1509. movdqa SHUF_MASK(%rip), %xmm10
  1510. PSHUFB_XMM %xmm10, %xmm0
  1511. # shuffle xmm0 back to output as ciphertext
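/*
 * Ignoring the byte-order shuffles, the masking above achieves the
 * following: XOR the <16 byte tail with the keystream E(K, Yn) and
 * zero-pad the resulting ciphertext to a full block before it is
 * absorbed into GHASH.  C sketch with invented names (encrypt_tail,
 * ks = precomputed keystream block, rem = plaintext_len mod 16):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void encrypt_tail(uint8_t *out, const uint8_t *in,
 *				 const uint8_t ks[16], unsigned int rem,
 *				 uint8_t ghash_in[16])
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < rem; i++)
 *			out[i] = in[i] ^ ks[i];
 *		memset(ghash_in, 0, 16);	// top bytes must stay zero
 *		memcpy(ghash_in, out, rem);
 *	}
 */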
  1512. # Output %r13 bytes
  1513. MOVQ_R64_XMM %xmm0, %rax
  1514. cmp $8, %r13
  1515. jle _less_than_8_bytes_left_encrypt
  1516. mov %rax, (%arg2 , %r11, 1)
  1517. add $8, %r11
  1518. psrldq $8, %xmm0
  1519. MOVQ_R64_XMM %xmm0, %rax
  1520. sub $8, %r13
  1521. _less_than_8_bytes_left_encrypt:
  1522. mov %al, (%arg2, %r11, 1)
  1523. add $1, %r11
  1524. shr $8, %rax
  1525. sub $1, %r13
  1526. jne _less_than_8_bytes_left_encrypt
  1527. _multiple_of_16_bytes_encrypt:
mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1529. shl $3, %r12
  1530. movd %r12d, %xmm15 # len(A) in %xmm15
shl $3, %arg4 # len(C) in bits (*8)
  1532. MOVQ_R64_XMM %arg4, %xmm1
  1533. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1534. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1535. pxor %xmm15, %xmm8
  1536. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1537. # final GHASH computation
  1538. movdqa SHUF_MASK(%rip), %xmm10
  1539. PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
  1540. mov %arg5, %rax # %rax = *Y0
  1541. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1542. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
  1543. pxor %xmm8, %xmm0
  1544. _return_T_encrypt:
  1545. mov arg9, %r10 # %r10 = authTag
  1546. mov arg10, %r11 # %r11 = auth_tag_len
  1547. cmp $16, %r11
  1548. je _T_16_encrypt
  1549. cmp $12, %r11
  1550. je _T_12_encrypt
  1551. _T_8_encrypt:
  1552. MOVQ_R64_XMM %xmm0, %rax
  1553. mov %rax, (%r10)
  1554. jmp _return_T_done_encrypt
  1555. _T_12_encrypt:
  1556. MOVQ_R64_XMM %xmm0, %rax
  1557. mov %rax, (%r10)
  1558. psrldq $8, %xmm0
  1559. movd %xmm0, %eax
  1560. mov %eax, 8(%r10)
  1561. jmp _return_T_done_encrypt
  1562. _T_16_encrypt:
  1563. movdqu %xmm0, (%r10)
  1564. _return_T_done_encrypt:
  1565. mov %r14, %rsp
  1566. pop %r14
  1567. pop %r13
  1568. pop %r12
  1569. ret
  1570. #endif
  1571. _key_expansion_128:
  1572. _key_expansion_256a:
  1573. pshufd $0b11111111, %xmm1, %xmm1
  1574. shufps $0b00010000, %xmm0, %xmm4
  1575. pxor %xmm4, %xmm0
  1576. shufps $0b10001100, %xmm0, %xmm4
  1577. pxor %xmm4, %xmm0
  1578. pxor %xmm1, %xmm0
  1579. movaps %xmm0, (TKEYP)
  1580. add $0x10, TKEYP
  1581. ret
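/*
 * The shufps/pxor sequence above computes, in effect, one AES-128
 * key-schedule round.  A plain C sketch (aes128_expand_round, prev,
 * next and t are invented names); t is the word the AESKEYGENASSIST +
 * pshufd pair supplies, i.e. SubWord(RotWord(prev[3])) ^ rcon:
 *
 *	#include <stdint.h>
 *
 *	static void aes128_expand_round(const uint32_t prev[4],
 *					uint32_t next[4], uint32_t t)
 *	{
 *		next[0] = prev[0] ^ t;
 *		next[1] = prev[1] ^ next[0];
 *		next[2] = prev[2] ^ next[1];
 *		next[3] = prev[3] ^ next[2];
 *	}
 */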
  1582. .align 4
  1583. _key_expansion_192a:
  1584. pshufd $0b01010101, %xmm1, %xmm1
  1585. shufps $0b00010000, %xmm0, %xmm4
  1586. pxor %xmm4, %xmm0
  1587. shufps $0b10001100, %xmm0, %xmm4
  1588. pxor %xmm4, %xmm0
  1589. pxor %xmm1, %xmm0
  1590. movaps %xmm2, %xmm5
  1591. movaps %xmm2, %xmm6
  1592. pslldq $4, %xmm5
  1593. pshufd $0b11111111, %xmm0, %xmm3
  1594. pxor %xmm3, %xmm2
  1595. pxor %xmm5, %xmm2
  1596. movaps %xmm0, %xmm1
  1597. shufps $0b01000100, %xmm0, %xmm6
  1598. movaps %xmm6, (TKEYP)
  1599. shufps $0b01001110, %xmm2, %xmm1
  1600. movaps %xmm1, 0x10(TKEYP)
  1601. add $0x20, TKEYP
  1602. ret
  1603. .align 4
  1604. _key_expansion_192b:
  1605. pshufd $0b01010101, %xmm1, %xmm1
  1606. shufps $0b00010000, %xmm0, %xmm4
  1607. pxor %xmm4, %xmm0
  1608. shufps $0b10001100, %xmm0, %xmm4
  1609. pxor %xmm4, %xmm0
  1610. pxor %xmm1, %xmm0
  1611. movaps %xmm2, %xmm5
  1612. pslldq $4, %xmm5
  1613. pshufd $0b11111111, %xmm0, %xmm3
  1614. pxor %xmm3, %xmm2
  1615. pxor %xmm5, %xmm2
  1616. movaps %xmm0, (TKEYP)
  1617. add $0x10, TKEYP
  1618. ret
  1619. .align 4
  1620. _key_expansion_256b:
  1621. pshufd $0b10101010, %xmm1, %xmm1
  1622. shufps $0b00010000, %xmm2, %xmm4
  1623. pxor %xmm4, %xmm2
  1624. shufps $0b10001100, %xmm2, %xmm4
  1625. pxor %xmm4, %xmm2
  1626. pxor %xmm1, %xmm2
  1627. movaps %xmm2, (TKEYP)
  1628. add $0x10, TKEYP
  1629. ret
  1630. /*
  1631. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1632. * unsigned int key_len)
  1633. */
  1634. ENTRY(aesni_set_key)
  1635. #ifndef __x86_64__
  1636. pushl KEYP
  1637. movl 8(%esp), KEYP # ctx
  1638. movl 12(%esp), UKEYP # in_key
  1639. movl 16(%esp), %edx # key_len
  1640. #endif
  1641. movups (UKEYP), %xmm0 # user key (first 16 bytes)
  1642. movaps %xmm0, (KEYP)
  1643. lea 0x10(KEYP), TKEYP # key addr
  1644. movl %edx, 480(KEYP)
  1645. pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
  1646. cmp $24, %dl
  1647. jb .Lenc_key128
  1648. je .Lenc_key192
  1649. movups 0x10(UKEYP), %xmm2 # other user key
  1650. movaps %xmm2, (TKEYP)
  1651. add $0x10, TKEYP
  1652. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1653. call _key_expansion_256a
  1654. AESKEYGENASSIST 0x1 %xmm0 %xmm1
  1655. call _key_expansion_256b
  1656. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1657. call _key_expansion_256a
  1658. AESKEYGENASSIST 0x2 %xmm0 %xmm1
  1659. call _key_expansion_256b
  1660. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1661. call _key_expansion_256a
  1662. AESKEYGENASSIST 0x4 %xmm0 %xmm1
  1663. call _key_expansion_256b
  1664. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1665. call _key_expansion_256a
  1666. AESKEYGENASSIST 0x8 %xmm0 %xmm1
  1667. call _key_expansion_256b
  1668. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1669. call _key_expansion_256a
  1670. AESKEYGENASSIST 0x10 %xmm0 %xmm1
  1671. call _key_expansion_256b
  1672. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1673. call _key_expansion_256a
  1674. AESKEYGENASSIST 0x20 %xmm0 %xmm1
  1675. call _key_expansion_256b
  1676. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1677. call _key_expansion_256a
  1678. jmp .Ldec_key
  1679. .Lenc_key192:
  1680. movq 0x10(UKEYP), %xmm2 # other user key
  1681. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1682. call _key_expansion_192a
  1683. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1684. call _key_expansion_192b
  1685. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1686. call _key_expansion_192a
  1687. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1688. call _key_expansion_192b
  1689. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1690. call _key_expansion_192a
  1691. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1692. call _key_expansion_192b
  1693. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1694. call _key_expansion_192a
  1695. AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
  1696. call _key_expansion_192b
  1697. jmp .Ldec_key
  1698. .Lenc_key128:
  1699. AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
  1700. call _key_expansion_128
  1701. AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
  1702. call _key_expansion_128
  1703. AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
  1704. call _key_expansion_128
  1705. AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
  1706. call _key_expansion_128
  1707. AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
  1708. call _key_expansion_128
  1709. AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
  1710. call _key_expansion_128
  1711. AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
  1712. call _key_expansion_128
  1713. AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
  1714. call _key_expansion_128
  1715. AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
  1716. call _key_expansion_128
  1717. AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
  1718. call _key_expansion_128
  1719. .Ldec_key:
  1720. sub $0x10, TKEYP
  1721. movaps (KEYP), %xmm0
  1722. movaps (TKEYP), %xmm1
  1723. movaps %xmm0, 240(TKEYP)
  1724. movaps %xmm1, 240(KEYP)
  1725. add $0x10, KEYP
  1726. lea 240-16(TKEYP), UKEYP
  1727. .align 4
  1728. .Ldec_key_loop:
  1729. movaps (KEYP), %xmm0
  1730. AESIMC %xmm0 %xmm1
  1731. movaps %xmm1, (UKEYP)
  1732. add $0x10, KEYP
  1733. sub $0x10, UKEYP
  1734. cmp TKEYP, KEYP
  1735. jb .Ldec_key_loop
  1736. xor AREG, AREG
  1737. #ifndef __x86_64__
  1738. popl KEYP
  1739. #endif
  1740. ret
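/*
 * .Ldec_key above builds the decryption schedule for the Equivalent
 * Inverse Cipher: the encryption round keys are reversed and the middle
 * ones run through AESIMC (InvMixColumns).  Intrinsics sketch with
 * invented names (make_dec_schedule, enc, dec, nrounds = 10/12/14):
 *
 *	#include <immintrin.h>
 *
 *	static void make_dec_schedule(const __m128i *enc, __m128i *dec,
 *				      int nrounds)
 *	{
 *		int i;
 *
 *		dec[0] = enc[nrounds];		// last encryption key first
 *		for (i = 1; i < nrounds; i++)
 *			dec[i] = _mm_aesimc_si128(enc[nrounds - i]);
 *		dec[nrounds] = enc[0];		// round-0 key last
 *	}
 */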
  1741. /*
  1742. * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1743. */
  1744. ENTRY(aesni_enc)
  1745. #ifndef __x86_64__
  1746. pushl KEYP
  1747. pushl KLEN
  1748. movl 12(%esp), KEYP
  1749. movl 16(%esp), OUTP
  1750. movl 20(%esp), INP
  1751. #endif
  1752. movl 480(KEYP), KLEN # key length
  1753. movups (INP), STATE # input
  1754. call _aesni_enc1
  1755. movups STATE, (OUTP) # output
  1756. #ifndef __x86_64__
  1757. popl KLEN
  1758. popl KEYP
  1759. #endif
  1760. ret
  1761. /*
  1762. * _aesni_enc1: internal ABI
  1763. * input:
  1764. * KEYP: key struct pointer
* KLEN: key length
  1766. * STATE: initial state (input)
  1767. * output:
* STATE: final state (output)
  1769. * changed:
  1770. * KEY
  1771. * TKEYP (T1)
  1772. */
  1773. .align 4
  1774. _aesni_enc1:
  1775. movaps (KEYP), KEY # key
  1776. mov KEYP, TKEYP
  1777. pxor KEY, STATE # round 0
  1778. add $0x30, TKEYP
  1779. cmp $24, KLEN
  1780. jb .Lenc128
  1781. lea 0x20(TKEYP), TKEYP
  1782. je .Lenc192
  1783. add $0x20, TKEYP
  1784. movaps -0x60(TKEYP), KEY
  1785. AESENC KEY STATE
  1786. movaps -0x50(TKEYP), KEY
  1787. AESENC KEY STATE
  1788. .align 4
  1789. .Lenc192:
  1790. movaps -0x40(TKEYP), KEY
  1791. AESENC KEY STATE
  1792. movaps -0x30(TKEYP), KEY
  1793. AESENC KEY STATE
  1794. .align 4
  1795. .Lenc128:
  1796. movaps -0x20(TKEYP), KEY
  1797. AESENC KEY STATE
  1798. movaps -0x10(TKEYP), KEY
  1799. AESENC KEY STATE
  1800. movaps (TKEYP), KEY
  1801. AESENC KEY STATE
  1802. movaps 0x10(TKEYP), KEY
  1803. AESENC KEY STATE
  1804. movaps 0x20(TKEYP), KEY
  1805. AESENC KEY STATE
  1806. movaps 0x30(TKEYP), KEY
  1807. AESENC KEY STATE
  1808. movaps 0x40(TKEYP), KEY
  1809. AESENC KEY STATE
  1810. movaps 0x50(TKEYP), KEY
  1811. AESENC KEY STATE
  1812. movaps 0x60(TKEYP), KEY
  1813. AESENC KEY STATE
  1814. movaps 0x70(TKEYP), KEY
  1815. AESENCLAST KEY STATE
  1816. ret
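/*
 * An intrinsics sketch of the round structure _aesni_enc1 implements:
 * whiten with the round-0 key, nrounds-1 AESENC rounds, one AESENCLAST.
 * aes_enc_block, rk and nrounds (10/12/14) are invented names for
 * illustration, not how the kernel passes the schedule around:
 *
 *	#include <immintrin.h>
 *
 *	static __m128i aes_enc_block(const __m128i *rk, int nrounds,
 *				     __m128i blk)
 *	{
 *		int i;
 *
 *		blk = _mm_xor_si128(blk, rk[0]);	// round 0
 *		for (i = 1; i < nrounds; i++)
 *			blk = _mm_aesenc_si128(blk, rk[i]);
 *		return _mm_aesenclast_si128(blk, rk[nrounds]);
 *	}
 */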
  1817. /*
  1818. * _aesni_enc4: internal ABI
  1819. * input:
  1820. * KEYP: key struct pointer
* KLEN: key length
  1822. * STATE1: initial state (input)
  1823. * STATE2
  1824. * STATE3
  1825. * STATE4
  1826. * output:
* STATE1: final state (output)
  1828. * STATE2
  1829. * STATE3
  1830. * STATE4
  1831. * changed:
  1832. * KEY
  1833. * TKEYP (T1)
  1834. */
  1835. .align 4
  1836. _aesni_enc4:
  1837. movaps (KEYP), KEY # key
  1838. mov KEYP, TKEYP
  1839. pxor KEY, STATE1 # round 0
  1840. pxor KEY, STATE2
  1841. pxor KEY, STATE3
  1842. pxor KEY, STATE4
  1843. add $0x30, TKEYP
  1844. cmp $24, KLEN
  1845. jb .L4enc128
  1846. lea 0x20(TKEYP), TKEYP
  1847. je .L4enc192
  1848. add $0x20, TKEYP
  1849. movaps -0x60(TKEYP), KEY
  1850. AESENC KEY STATE1
  1851. AESENC KEY STATE2
  1852. AESENC KEY STATE3
  1853. AESENC KEY STATE4
  1854. movaps -0x50(TKEYP), KEY
  1855. AESENC KEY STATE1
  1856. AESENC KEY STATE2
  1857. AESENC KEY STATE3
  1858. AESENC KEY STATE4
  1859. #.align 4
  1860. .L4enc192:
  1861. movaps -0x40(TKEYP), KEY
  1862. AESENC KEY STATE1
  1863. AESENC KEY STATE2
  1864. AESENC KEY STATE3
  1865. AESENC KEY STATE4
  1866. movaps -0x30(TKEYP), KEY
  1867. AESENC KEY STATE1
  1868. AESENC KEY STATE2
  1869. AESENC KEY STATE3
  1870. AESENC KEY STATE4
  1871. #.align 4
  1872. .L4enc128:
  1873. movaps -0x20(TKEYP), KEY
  1874. AESENC KEY STATE1
  1875. AESENC KEY STATE2
  1876. AESENC KEY STATE3
  1877. AESENC KEY STATE4
  1878. movaps -0x10(TKEYP), KEY
  1879. AESENC KEY STATE1
  1880. AESENC KEY STATE2
  1881. AESENC KEY STATE3
  1882. AESENC KEY STATE4
  1883. movaps (TKEYP), KEY
  1884. AESENC KEY STATE1
  1885. AESENC KEY STATE2
  1886. AESENC KEY STATE3
  1887. AESENC KEY STATE4
  1888. movaps 0x10(TKEYP), KEY
  1889. AESENC KEY STATE1
  1890. AESENC KEY STATE2
  1891. AESENC KEY STATE3
  1892. AESENC KEY STATE4
  1893. movaps 0x20(TKEYP), KEY
  1894. AESENC KEY STATE1
  1895. AESENC KEY STATE2
  1896. AESENC KEY STATE3
  1897. AESENC KEY STATE4
  1898. movaps 0x30(TKEYP), KEY
  1899. AESENC KEY STATE1
  1900. AESENC KEY STATE2
  1901. AESENC KEY STATE3
  1902. AESENC KEY STATE4
  1903. movaps 0x40(TKEYP), KEY
  1904. AESENC KEY STATE1
  1905. AESENC KEY STATE2
  1906. AESENC KEY STATE3
  1907. AESENC KEY STATE4
  1908. movaps 0x50(TKEYP), KEY
  1909. AESENC KEY STATE1
  1910. AESENC KEY STATE2
  1911. AESENC KEY STATE3
  1912. AESENC KEY STATE4
  1913. movaps 0x60(TKEYP), KEY
  1914. AESENC KEY STATE1
  1915. AESENC KEY STATE2
  1916. AESENC KEY STATE3
  1917. AESENC KEY STATE4
  1918. movaps 0x70(TKEYP), KEY
  1919. AESENCLAST KEY STATE1 # last round
  1920. AESENCLAST KEY STATE2
  1921. AESENCLAST KEY STATE3
  1922. AESENCLAST KEY STATE4
  1923. ret
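/*
 * _aesni_enc4 runs four independent states through each round key in
 * lockstep so that the AESENC latency of one block is hidden behind the
 * other three.  Intrinsics sketch (aes_enc_4blocks, rk, nrounds and b
 * are invented names):
 *
 *	#include <immintrin.h>
 *
 *	static void aes_enc_4blocks(const __m128i *rk, int nrounds,
 *				    __m128i b[4])
 *	{
 *		int i, j;
 *
 *		for (j = 0; j < 4; j++)
 *			b[j] = _mm_xor_si128(b[j], rk[0]);
 *		for (i = 1; i < nrounds; i++)
 *			for (j = 0; j < 4; j++)		// independent chains
 *				b[j] = _mm_aesenc_si128(b[j], rk[i]);
 *		for (j = 0; j < 4; j++)
 *			b[j] = _mm_aesenclast_si128(b[j], rk[nrounds]);
 *	}
 */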
  1924. /*
  1925. * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1926. */
  1927. ENTRY(aesni_dec)
  1928. #ifndef __x86_64__
  1929. pushl KEYP
  1930. pushl KLEN
  1931. movl 12(%esp), KEYP
  1932. movl 16(%esp), OUTP
  1933. movl 20(%esp), INP
  1934. #endif
  1935. mov 480(KEYP), KLEN # key length
  1936. add $240, KEYP
  1937. movups (INP), STATE # input
  1938. call _aesni_dec1
movups STATE, (OUTP) # output
  1940. #ifndef __x86_64__
  1941. popl KLEN
  1942. popl KEYP
  1943. #endif
  1944. ret
  1945. /*
  1946. * _aesni_dec1: internal ABI
  1947. * input:
  1948. * KEYP: key struct pointer
  1949. * KLEN: key length
  1950. * STATE: initial state (input)
  1951. * output:
* STATE: final state (output)
  1953. * changed:
  1954. * KEY
  1955. * TKEYP (T1)
  1956. */
  1957. .align 4
  1958. _aesni_dec1:
  1959. movaps (KEYP), KEY # key
  1960. mov KEYP, TKEYP
  1961. pxor KEY, STATE # round 0
  1962. add $0x30, TKEYP
  1963. cmp $24, KLEN
  1964. jb .Ldec128
  1965. lea 0x20(TKEYP), TKEYP
  1966. je .Ldec192
  1967. add $0x20, TKEYP
  1968. movaps -0x60(TKEYP), KEY
  1969. AESDEC KEY STATE
  1970. movaps -0x50(TKEYP), KEY
  1971. AESDEC KEY STATE
  1972. .align 4
  1973. .Ldec192:
  1974. movaps -0x40(TKEYP), KEY
  1975. AESDEC KEY STATE
  1976. movaps -0x30(TKEYP), KEY
  1977. AESDEC KEY STATE
  1978. .align 4
  1979. .Ldec128:
  1980. movaps -0x20(TKEYP), KEY
  1981. AESDEC KEY STATE
  1982. movaps -0x10(TKEYP), KEY
  1983. AESDEC KEY STATE
  1984. movaps (TKEYP), KEY
  1985. AESDEC KEY STATE
  1986. movaps 0x10(TKEYP), KEY
  1987. AESDEC KEY STATE
  1988. movaps 0x20(TKEYP), KEY
  1989. AESDEC KEY STATE
  1990. movaps 0x30(TKEYP), KEY
  1991. AESDEC KEY STATE
  1992. movaps 0x40(TKEYP), KEY
  1993. AESDEC KEY STATE
  1994. movaps 0x50(TKEYP), KEY
  1995. AESDEC KEY STATE
  1996. movaps 0x60(TKEYP), KEY
  1997. AESDEC KEY STATE
  1998. movaps 0x70(TKEYP), KEY
  1999. AESDECLAST KEY STATE
  2000. ret
  2001. /*
  2002. * _aesni_dec4: internal ABI
  2003. * input:
  2004. * KEYP: key struct pointer
  2005. * KLEN: key length
  2006. * STATE1: initial state (input)
  2007. * STATE2
  2008. * STATE3
  2009. * STATE4
  2010. * output:
* STATE1: final state (output)
  2012. * STATE2
  2013. * STATE3
  2014. * STATE4
  2015. * changed:
  2016. * KEY
  2017. * TKEYP (T1)
  2018. */
  2019. .align 4
  2020. _aesni_dec4:
  2021. movaps (KEYP), KEY # key
  2022. mov KEYP, TKEYP
  2023. pxor KEY, STATE1 # round 0
  2024. pxor KEY, STATE2
  2025. pxor KEY, STATE3
  2026. pxor KEY, STATE4
  2027. add $0x30, TKEYP
  2028. cmp $24, KLEN
  2029. jb .L4dec128
  2030. lea 0x20(TKEYP), TKEYP
  2031. je .L4dec192
  2032. add $0x20, TKEYP
  2033. movaps -0x60(TKEYP), KEY
  2034. AESDEC KEY STATE1
  2035. AESDEC KEY STATE2
  2036. AESDEC KEY STATE3
  2037. AESDEC KEY STATE4
  2038. movaps -0x50(TKEYP), KEY
  2039. AESDEC KEY STATE1
  2040. AESDEC KEY STATE2
  2041. AESDEC KEY STATE3
  2042. AESDEC KEY STATE4
  2043. .align 4
  2044. .L4dec192:
  2045. movaps -0x40(TKEYP), KEY
  2046. AESDEC KEY STATE1
  2047. AESDEC KEY STATE2
  2048. AESDEC KEY STATE3
  2049. AESDEC KEY STATE4
  2050. movaps -0x30(TKEYP), KEY
  2051. AESDEC KEY STATE1
  2052. AESDEC KEY STATE2
  2053. AESDEC KEY STATE3
  2054. AESDEC KEY STATE4
  2055. .align 4
  2056. .L4dec128:
  2057. movaps -0x20(TKEYP), KEY
  2058. AESDEC KEY STATE1
  2059. AESDEC KEY STATE2
  2060. AESDEC KEY STATE3
  2061. AESDEC KEY STATE4
  2062. movaps -0x10(TKEYP), KEY
  2063. AESDEC KEY STATE1
  2064. AESDEC KEY STATE2
  2065. AESDEC KEY STATE3
  2066. AESDEC KEY STATE4
  2067. movaps (TKEYP), KEY
  2068. AESDEC KEY STATE1
  2069. AESDEC KEY STATE2
  2070. AESDEC KEY STATE3
  2071. AESDEC KEY STATE4
  2072. movaps 0x10(TKEYP), KEY
  2073. AESDEC KEY STATE1
  2074. AESDEC KEY STATE2
  2075. AESDEC KEY STATE3
  2076. AESDEC KEY STATE4
  2077. movaps 0x20(TKEYP), KEY
  2078. AESDEC KEY STATE1
  2079. AESDEC KEY STATE2
  2080. AESDEC KEY STATE3
  2081. AESDEC KEY STATE4
  2082. movaps 0x30(TKEYP), KEY
  2083. AESDEC KEY STATE1
  2084. AESDEC KEY STATE2
  2085. AESDEC KEY STATE3
  2086. AESDEC KEY STATE4
  2087. movaps 0x40(TKEYP), KEY
  2088. AESDEC KEY STATE1
  2089. AESDEC KEY STATE2
  2090. AESDEC KEY STATE3
  2091. AESDEC KEY STATE4
  2092. movaps 0x50(TKEYP), KEY
  2093. AESDEC KEY STATE1
  2094. AESDEC KEY STATE2
  2095. AESDEC KEY STATE3
  2096. AESDEC KEY STATE4
  2097. movaps 0x60(TKEYP), KEY
  2098. AESDEC KEY STATE1
  2099. AESDEC KEY STATE2
  2100. AESDEC KEY STATE3
  2101. AESDEC KEY STATE4
  2102. movaps 0x70(TKEYP), KEY
  2103. AESDECLAST KEY STATE1 # last round
  2104. AESDECLAST KEY STATE2
  2105. AESDECLAST KEY STATE3
  2106. AESDECLAST KEY STATE4
  2107. ret
  2108. /*
* void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2110. * size_t len)
  2111. */
  2112. ENTRY(aesni_ecb_enc)
  2113. #ifndef __x86_64__
  2114. pushl LEN
  2115. pushl KEYP
  2116. pushl KLEN
  2117. movl 16(%esp), KEYP
  2118. movl 20(%esp), OUTP
  2119. movl 24(%esp), INP
  2120. movl 28(%esp), LEN
  2121. #endif
  2122. test LEN, LEN # check length
  2123. jz .Lecb_enc_ret
  2124. mov 480(KEYP), KLEN
  2125. cmp $16, LEN
  2126. jb .Lecb_enc_ret
  2127. cmp $64, LEN
  2128. jb .Lecb_enc_loop1
  2129. .align 4
  2130. .Lecb_enc_loop4:
  2131. movups (INP), STATE1
  2132. movups 0x10(INP), STATE2
  2133. movups 0x20(INP), STATE3
  2134. movups 0x30(INP), STATE4
  2135. call _aesni_enc4
  2136. movups STATE1, (OUTP)
  2137. movups STATE2, 0x10(OUTP)
  2138. movups STATE3, 0x20(OUTP)
  2139. movups STATE4, 0x30(OUTP)
  2140. sub $64, LEN
  2141. add $64, INP
  2142. add $64, OUTP
  2143. cmp $64, LEN
  2144. jge .Lecb_enc_loop4
  2145. cmp $16, LEN
  2146. jb .Lecb_enc_ret
  2147. .align 4
  2148. .Lecb_enc_loop1:
  2149. movups (INP), STATE1
  2150. call _aesni_enc1
  2151. movups STATE1, (OUTP)
  2152. sub $16, LEN
  2153. add $16, INP
  2154. add $16, OUTP
  2155. cmp $16, LEN
  2156. jge .Lecb_enc_loop1
  2157. .Lecb_enc_ret:
  2158. #ifndef __x86_64__
  2159. popl KLEN
  2160. popl KEYP
  2161. popl LEN
  2162. #endif
  2163. ret
  2164. /*
* void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2166. * size_t len);
  2167. */
  2168. ENTRY(aesni_ecb_dec)
  2169. #ifndef __x86_64__
  2170. pushl LEN
  2171. pushl KEYP
  2172. pushl KLEN
  2173. movl 16(%esp), KEYP
  2174. movl 20(%esp), OUTP
  2175. movl 24(%esp), INP
  2176. movl 28(%esp), LEN
  2177. #endif
  2178. test LEN, LEN
  2179. jz .Lecb_dec_ret
  2180. mov 480(KEYP), KLEN
  2181. add $240, KEYP
  2182. cmp $16, LEN
  2183. jb .Lecb_dec_ret
  2184. cmp $64, LEN
  2185. jb .Lecb_dec_loop1
  2186. .align 4
  2187. .Lecb_dec_loop4:
  2188. movups (INP), STATE1
  2189. movups 0x10(INP), STATE2
  2190. movups 0x20(INP), STATE3
  2191. movups 0x30(INP), STATE4
  2192. call _aesni_dec4
  2193. movups STATE1, (OUTP)
  2194. movups STATE2, 0x10(OUTP)
  2195. movups STATE3, 0x20(OUTP)
  2196. movups STATE4, 0x30(OUTP)
  2197. sub $64, LEN
  2198. add $64, INP
  2199. add $64, OUTP
  2200. cmp $64, LEN
  2201. jge .Lecb_dec_loop4
  2202. cmp $16, LEN
  2203. jb .Lecb_dec_ret
  2204. .align 4
  2205. .Lecb_dec_loop1:
  2206. movups (INP), STATE1
  2207. call _aesni_dec1
  2208. movups STATE1, (OUTP)
  2209. sub $16, LEN
  2210. add $16, INP
  2211. add $16, OUTP
  2212. cmp $16, LEN
  2213. jge .Lecb_dec_loop1
  2214. .Lecb_dec_ret:
  2215. #ifndef __x86_64__
  2216. popl KLEN
  2217. popl KEYP
  2218. popl LEN
  2219. #endif
  2220. ret
  2221. /*
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2223. * size_t len, u8 *iv)
  2224. */
  2225. ENTRY(aesni_cbc_enc)
  2226. #ifndef __x86_64__
  2227. pushl IVP
  2228. pushl LEN
  2229. pushl KEYP
  2230. pushl KLEN
  2231. movl 20(%esp), KEYP
  2232. movl 24(%esp), OUTP
  2233. movl 28(%esp), INP
  2234. movl 32(%esp), LEN
  2235. movl 36(%esp), IVP
  2236. #endif
  2237. cmp $16, LEN
  2238. jb .Lcbc_enc_ret
  2239. mov 480(KEYP), KLEN
  2240. movups (IVP), STATE # load iv as initial state
  2241. .align 4
  2242. .Lcbc_enc_loop:
  2243. movups (INP), IN # load input
  2244. pxor IN, STATE
  2245. call _aesni_enc1
  2246. movups STATE, (OUTP) # store output
  2247. sub $16, LEN
  2248. add $16, INP
  2249. add $16, OUTP
  2250. cmp $16, LEN
  2251. jge .Lcbc_enc_loop
  2252. movups STATE, (IVP)
  2253. .Lcbc_enc_ret:
  2254. #ifndef __x86_64__
  2255. popl KLEN
  2256. popl KEYP
  2257. popl LEN
  2258. popl IVP
  2259. #endif
  2260. ret
  2261. /*
* void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2263. * size_t len, u8 *iv)
  2264. */
  2265. ENTRY(aesni_cbc_dec)
  2266. #ifndef __x86_64__
  2267. pushl IVP
  2268. pushl LEN
  2269. pushl KEYP
  2270. pushl KLEN
  2271. movl 20(%esp), KEYP
  2272. movl 24(%esp), OUTP
  2273. movl 28(%esp), INP
  2274. movl 32(%esp), LEN
  2275. movl 36(%esp), IVP
  2276. #endif
  2277. cmp $16, LEN
  2278. jb .Lcbc_dec_just_ret
  2279. mov 480(KEYP), KLEN
  2280. add $240, KEYP
  2281. movups (IVP), IV
  2282. cmp $64, LEN
  2283. jb .Lcbc_dec_loop1
  2284. .align 4
  2285. .Lcbc_dec_loop4:
  2286. movups (INP), IN1
  2287. movaps IN1, STATE1
  2288. movups 0x10(INP), IN2
  2289. movaps IN2, STATE2
  2290. #ifdef __x86_64__
  2291. movups 0x20(INP), IN3
  2292. movaps IN3, STATE3
  2293. movups 0x30(INP), IN4
  2294. movaps IN4, STATE4
  2295. #else
  2296. movups 0x20(INP), IN1
  2297. movaps IN1, STATE3
  2298. movups 0x30(INP), IN2
  2299. movaps IN2, STATE4
  2300. #endif
  2301. call _aesni_dec4
  2302. pxor IV, STATE1
  2303. #ifdef __x86_64__
  2304. pxor IN1, STATE2
  2305. pxor IN2, STATE3
  2306. pxor IN3, STATE4
  2307. movaps IN4, IV
  2308. #else
  2309. pxor (INP), STATE2
  2310. pxor 0x10(INP), STATE3
  2311. pxor IN1, STATE4
  2312. movaps IN2, IV
  2313. #endif
  2314. movups STATE1, (OUTP)
  2315. movups STATE2, 0x10(OUTP)
  2316. movups STATE3, 0x20(OUTP)
  2317. movups STATE4, 0x30(OUTP)
  2318. sub $64, LEN
  2319. add $64, INP
  2320. add $64, OUTP
  2321. cmp $64, LEN
  2322. jge .Lcbc_dec_loop4
  2323. cmp $16, LEN
  2324. jb .Lcbc_dec_ret
  2325. .align 4
  2326. .Lcbc_dec_loop1:
  2327. movups (INP), IN
  2328. movaps IN, STATE
  2329. call _aesni_dec1
  2330. pxor IV, STATE
  2331. movups STATE, (OUTP)
  2332. movaps IN, IV
  2333. sub $16, LEN
  2334. add $16, INP
  2335. add $16, OUTP
  2336. cmp $16, LEN
  2337. jge .Lcbc_dec_loop1
  2338. .Lcbc_dec_ret:
  2339. movups IV, (IVP)
  2340. .Lcbc_dec_just_ret:
  2341. #ifndef __x86_64__
  2342. popl KLEN
  2343. popl KEYP
  2344. popl LEN
  2345. popl IVP
  2346. #endif
  2347. ret
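/*
 * The chaining performed by .Lcbc_dec_loop1 above, as a hedged
 * intrinsics sketch: decrypt, XOR with the previous ciphertext (or the
 * IV), and keep the current ciphertext as the next chain value.
 * cbc_decrypt, rk (the inverse-cipher schedule), nrounds and len (a
 * multiple of 16) are invented names/assumptions of this sketch:
 *
 *	#include <immintrin.h>
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void cbc_decrypt(const __m128i *rk, int nrounds, uint8_t *out,
 *				const uint8_t *in, size_t len, uint8_t iv[16])
 *	{
 *		__m128i chain = _mm_loadu_si128((const __m128i *)iv);
 *		size_t off;
 *		int i;
 *
 *		for (off = 0; off + 16 <= len; off += 16) {
 *			__m128i c = _mm_loadu_si128((const __m128i *)(in + off));
 *			__m128i b = _mm_xor_si128(c, rk[0]);
 *
 *			for (i = 1; i < nrounds; i++)
 *				b = _mm_aesdec_si128(b, rk[i]);
 *			b = _mm_aesdeclast_si128(b, rk[nrounds]);
 *			b = _mm_xor_si128(b, chain);	// undo the chaining
 *			_mm_storeu_si128((__m128i *)(out + off), b);
 *			chain = c;		// ciphertext feeds forward
 *		}
 *		_mm_storeu_si128((__m128i *)iv, chain);	// next IV, as above
 *	}
 */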
  2348. #ifdef __x86_64__
  2349. .align 16
  2350. .Lbswap_mask:
  2351. .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  2352. /*
  2353. * _aesni_inc_init: internal ABI
  2354. * setup registers used by _aesni_inc
  2355. * input:
  2356. * IV
  2357. * output:
  2358. * CTR: == IV, in little endian
  2359. * TCTR_LOW: == lower qword of CTR
  2360. * INC: == 1, in little endian
  2361. * BSWAP_MASK == endian swapping mask
  2362. */
  2363. .align 4
  2364. _aesni_inc_init:
  2365. movaps .Lbswap_mask, BSWAP_MASK
  2366. movaps IV, CTR
  2367. PSHUFB_XMM BSWAP_MASK CTR
  2368. mov $1, TCTR_LOW
  2369. MOVQ_R64_XMM TCTR_LOW INC
  2370. MOVQ_R64_XMM CTR TCTR_LOW
  2371. ret
  2372. /*
  2373. * _aesni_inc: internal ABI
  2374. * Increase IV by 1, IV is in big endian
  2375. * input:
  2376. * IV
  2377. * CTR: == IV, in little endian
  2378. * TCTR_LOW: == lower qword of CTR
  2379. * INC: == 1, in little endian
  2380. * BSWAP_MASK == endian swapping mask
  2381. * output:
* IV: incremented by 1
  2383. * changed:
  2384. * CTR: == output IV, in little endian
  2385. * TCTR_LOW: == lower qword of CTR
  2386. */
  2387. .align 4
  2388. _aesni_inc:
  2389. paddq INC, CTR
  2390. add $1, TCTR_LOW
  2391. jnc .Linc_low
  2392. pslldq $8, INC
  2393. paddq INC, CTR
  2394. psrldq $8, INC
  2395. .Linc_low:
  2396. movaps CTR, IV
  2397. PSHUFB_XMM BSWAP_MASK IV
  2398. ret
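/*
 * Semantically, _aesni_inc treats IV as a 128-bit big-endian integer
 * and adds one, with the byte swaps and the 64-bit carry handled
 * explicitly above.  Byte-wise C sketch (ctr128_inc is an invented
 * name):
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc(uint8_t ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)	// propagate the carry
 *			if (++ctr[i] != 0)
 *				break;
 *	}
 */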
  2399. /*
* void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2401. * size_t len, u8 *iv)
  2402. */
  2403. ENTRY(aesni_ctr_enc)
  2404. cmp $16, LEN
  2405. jb .Lctr_enc_just_ret
  2406. mov 480(KEYP), KLEN
  2407. movups (IVP), IV
  2408. call _aesni_inc_init
  2409. cmp $64, LEN
  2410. jb .Lctr_enc_loop1
  2411. .align 4
  2412. .Lctr_enc_loop4:
  2413. movaps IV, STATE1
  2414. call _aesni_inc
  2415. movups (INP), IN1
  2416. movaps IV, STATE2
  2417. call _aesni_inc
  2418. movups 0x10(INP), IN2
  2419. movaps IV, STATE3
  2420. call _aesni_inc
  2421. movups 0x20(INP), IN3
  2422. movaps IV, STATE4
  2423. call _aesni_inc
  2424. movups 0x30(INP), IN4
  2425. call _aesni_enc4
  2426. pxor IN1, STATE1
  2427. movups STATE1, (OUTP)
  2428. pxor IN2, STATE2
  2429. movups STATE2, 0x10(OUTP)
  2430. pxor IN3, STATE3
  2431. movups STATE3, 0x20(OUTP)
  2432. pxor IN4, STATE4
  2433. movups STATE4, 0x30(OUTP)
  2434. sub $64, LEN
  2435. add $64, INP
  2436. add $64, OUTP
  2437. cmp $64, LEN
  2438. jge .Lctr_enc_loop4
  2439. cmp $16, LEN
  2440. jb .Lctr_enc_ret
  2441. .align 4
  2442. .Lctr_enc_loop1:
  2443. movaps IV, STATE
  2444. call _aesni_inc
  2445. movups (INP), IN
  2446. call _aesni_enc1
  2447. pxor IN, STATE
  2448. movups STATE, (OUTP)
  2449. sub $16, LEN
  2450. add $16, INP
  2451. add $16, OUTP
  2452. cmp $16, LEN
  2453. jge .Lctr_enc_loop1
  2454. .Lctr_enc_ret:
  2455. movups IV, (IVP)
  2456. .Lctr_enc_just_ret:
  2457. ret
  2458. #endif