aesni-intel_avx-x86_64.S 98 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812
  1. ########################################################################
  2. # Copyright (c) 2013, Intel Corporation
  3. #
  4. # This software is available to you under a choice of one of two
  5. # licenses. You may choose to be licensed under the terms of the GNU
  6. # General Public License (GPL) Version 2, available from the file
  7. # COPYING in the main directory of this source tree, or the
  8. # OpenIB.org BSD license below:
  9. #
  10. # Redistribution and use in source and binary forms, with or without
  11. # modification, are permitted provided that the following conditions are
  12. # met:
  13. #
  14. # * Redistributions of source code must retain the above copyright
  15. # notice, this list of conditions and the following disclaimer.
  16. #
  17. # * Redistributions in binary form must reproduce the above copyright
  18. # notice, this list of conditions and the following disclaimer in the
  19. # documentation and/or other materials provided with the
  20. # distribution.
  21. #
  22. # * Neither the name of the Intel Corporation nor the names of its
  23. # contributors may be used to endorse or promote products derived from
  24. # this software without specific prior written permission.
  25. #
  26. #
  27. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
  28. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  30. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  31. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  32. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  33. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  34. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38. ########################################################################
  39. ##
  40. ## Authors:
  41. ## Erdinc Ozturk <erdinc.ozturk@intel.com>
  42. ## Vinodh Gopal <vinodh.gopal@intel.com>
  43. ## James Guilford <james.guilford@intel.com>
  44. ## Tim Chen <tim.c.chen@linux.intel.com>
  45. ##
  46. ## References:
  47. ## This code was derived and highly optimized from the code described in the paper:
  48. ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
  49. ## on Intel Architecture Processors. August, 2010
  50. ## The details of the implementation are explained in:
  51. ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
  52. ## on Intel Architecture Processors. October, 2012.
  53. ##
  54. ## Assumptions:
  55. ##
  56. ##
  57. ##
  58. ## iv:
  59. ## 0 1 2 3
  60. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  61. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  62. ## | Salt (From the SA) |
  63. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  64. ## | Initialization Vector |
  65. ## | (This is the sequence number from IPSec header) |
  66. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67. ## | 0x1 |
  68. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69. ##
  70. ##
  71. ##
  72. ## AAD:
  73. ## AAD padded to 128 bits with 0
  74. ## for example, assume AAD is a u32 vector
  75. ##
  76. ## if AAD is 8 bytes:
  77. ## AAD[3] = {A0, A1};
  78. ## padded AAD in xmm register = {A1 A0 0 0}
  79. ##
  80. ## 0 1 2 3
  81. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  82. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  83. ## | SPI (A1) |
  84. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  85. ## | 32-bit Sequence Number (A0) |
  86. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  87. ## | 0x0 |
  88. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  89. ##
  90. ## AAD Format with 32-bit Sequence Number
  91. ##
  92. ## if AAD is 12 bytes:
  93. ## AAD[3] = {A0, A1, A2};
  94. ## padded AAD in xmm register = {A2 A1 A0 0}
  95. ##
  96. ## 0 1 2 3
  97. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  98. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  99. ## | SPI (A2) |
  100. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  101. ## | 64-bit Extended Sequence Number {A1,A0} |
  102. ## | |
  103. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  104. ## | 0x0 |
  105. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  106. ##
  107. ## AAD Format with 64-bit Extended Sequence Number
  108. ##
  109. ##
  110. ## aadLen:
  111. ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
  112. ## The code additionally supports aadLen of length 16 bytes.
  113. ##
  114. ## TLen:
  115. ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  116. ##
  117. ## poly = x^128 + x^127 + x^126 + x^121 + 1
  118. ## Throughout the code, one-tab and two-tab indentations are used: one tab is
  119. ## for the GHASH part, two tabs are for the AES part.
  120. ##
  121. #include <linux/linkage.h>
  122. #include <asm/inst.h>
  123. .data
  124. .align 16
  # POLY has bits 127, 126, 121 and 0 set, matching the terms below x^128 of
  # the polynomial given in the file header.
  125. POLY: .octa 0xC2000000000000000000000000000001
  126. POLY2: .octa 0xC20000000000000000000001C2000000
  127. TWOONE: .octa 0x00000001000000000000000000000001
  128. # The order of these constants should not change.
  129. # More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F.
  # SHUF_MASK is the vpshufb control that reverses the 16 bytes of a register
  # (destination byte k takes source byte 15-k).
  130. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  131. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  132. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  133. ZERO: .octa 0x00000000000000000000000000000000
  # ONE is added with vpaddd to a byte-swapped counter block to increment it.
  # ONEf carries the 1 in the most significant byte instead and is used by the
  # counter update that does not byte-swap first.
  134. ONE: .octa 0x00000000000000000000000000000001
  135. ONEf: .octa 0x01000000000000000000000000000000
  136. .text
  137. ## Fields of the gcm aes context (accessed below as byte offsets from arg1):
  138. #{
  139. # u8 expanded_keys[16*11] store expanded keys
  140. # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
  141. # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
  142. # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
  143. # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
  144. # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
  145. # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
  146. # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
  147. # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
  148. # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
  149. # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
  150. # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
  151. # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
  152. # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
  153. # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
  154. # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
  155. # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
  156. #} gcm_ctx;
  # Byte offsets of the hash-key fields. They begin right after the eleven
  # 16-byte round keys and advance in 16-byte steps.
  157. HashKey = 16*11 # store HashKey <<1 mod poly here
  158. HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
  159. HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
  160. HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
  161. HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
  162. HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
  163. HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
  164. HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
  165. HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
  166. HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
  167. HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
  168. HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
  169. HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
  170. HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
  171. HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
  172. HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
  # Argument aliases. arg1..arg6 name the registers rdi, rsi, rdx, rcx, r8 and
  # r9; arg7..arg9 are memory operands at STACK_OFFSET+8*k relative to r14.
  # NOTE(review): r14 is presumably loaded with the stack pointer by the
  # function prologue, which is outside this part of the file - confirm.
  # Do not append comments to the lines below: the preprocessor would make
  # them part of the macro replacement text.
  173. #define arg1 %rdi
  174. #define arg2 %rsi
  175. #define arg3 %rdx
  176. #define arg4 %rcx
  177. #define arg5 %r8
  178. #define arg6 %r9
  179. #define arg7 STACK_OFFSET+8*1(%r14)
  180. #define arg8 STACK_OFFSET+8*2(%r14)
  181. #define arg9 STACK_OFFSET+8*3(%r14)
  # Assembly-time symbols. i and j are counters that setreg converts into the
  # register aliases reg_i and reg_j; the others are values compared against
  # macro arguments in .if directives.
  182. i = 0
  183. j = 0
  184. out_order = 0 # loop_idx value selecting the ONEf counter update
  185. in_order = 1 # loop_idx value selecting the ONE update followed by a byte swap
  186. DEC = 0 # ENC_DEC value: the input block is what gets hashed
  187. ENC = 1 # ENC_DEC value: the freshly produced output block gets hashed
  # define_reg r n: bind the assembler symbol reg_<r> to the register %xmm<n>.
  188. .macro define_reg r n
  189. reg_\r = %xmm\n
  190. .endm
  # setreg: rebind reg_i and reg_j to the xmm registers numbered by the current
  # values of the symbols i and j. In .altmacro mode a leading percent sign
  # makes the assembler evaluate the expression and pass the result as text,
  # so "define_reg i %i" expands with the numeric value of i.
  191. .macro setreg
  192. .altmacro
  193. define_reg i %i
  194. define_reg j %j
  195. .noaltmacro
  196. .endm
  197. # STACK_OFFSET covers the 4 registers (8 bytes each) pushed on the stack; it is
  # part of the arg7..arg9 addresses, which are formed relative to r14.
  198. STACK_OFFSET = 8*4
  # Byte offsets of 16-byte scratch slots addressed relative to rsp.
  199. TMP1 = 16*0 # Temporary storage for the GHASH value of the AAD and initial blocks
  200. TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
  201. TMP3 = 16*2 # Temporary storage for AES State 3
  202. TMP4 = 16*3 # Temporary storage for AES State 4
  203. TMP5 = 16*4 # Temporary storage for AES State 5
  204. TMP6 = 16*5 # Temporary storage for AES State 6
  205. TMP7 = 16*6 # Temporary storage for AES State 7
  206. TMP8 = 16*7 # Temporary storage for AES State 8
  207. VARIABLE_OFFSET = 16*8 # equals the combined size of the eight slots above
  208. ################################
  209. # Utility Macros
  210. ################################
  211. # Encryption of a single block
  # XMM0 (in/out): register holding the 16-byte block; it is overwritten with
  # the encrypted block.
  # The round keys are read from arg1: key 0 is XORed in, keys 1 to 9 are
  # applied with vaesenc and key 10 with vaesenclast.
  # Side effect: the assembler symbol i is left at 10 and setreg is rerun, so
  # reg_i is rebound as well.
  212. .macro ENCRYPT_SINGLE_BLOCK XMM0
  213. vpxor (arg1), \XMM0, \XMM0 # XOR with round key 0
  214. i = 1
  215. setreg
  216. .rep 9
  217. vaesenc 16*i(arg1), \XMM0, \XMM0 # round i, key at byte offset 16*i
  218. i = (i+1)
  219. setreg
  220. .endr
  221. vaesenclast 16*10(arg1), \XMM0, \XMM0 # final round with key 10
  222. .endm
  223. #ifdef CONFIG_AS_AVX
  224. ###############################################################################
  225. # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  226. # Input: A and B (128-bits each, bit-reflected)
  227. # Output: C = A*B*x mod poly, (i.e. >>1 )
  228. # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  229. # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  #
  # Operands:
  #   GH      in/out - first factor; overwritten with the reduced product
  #   HK      in     - second factor; only read
  #   T1..T5  scratch registers; all five are overwritten
  # The 256-bit product is built from three carry-less multiplies (Karatsuba)
  # and then reduced in two phases.
  230. ###############################################################################
  231. .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
  232. vpshufd $0b01001110, \GH, \T2 # T2 = GH with its 64-bit halves swapped
  233. vpshufd $0b01001110, \HK, \T3 # T3 = HK with its 64-bit halves swapped
  234. vpxor \GH , \T2, \T2 # T2 = (a1+a0)
  235. vpxor \HK , \T3, \T3 # T3 = (b1+b0)
  236. vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
  237. vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
  238. vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
  239. vpxor \GH, \T2,\T2
  240. vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
  241. vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
  242. vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
  243. vpxor \T3, \GH, \GH
  244. vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
  245. #first phase of the reduction
  246. vpslld $31, \GH, \T2 # packed left shift of each dword by 31
  247. vpslld $30, \GH, \T3 # packed left shift of each dword by 30
  248. vpslld $25, \GH, \T4 # packed left shift of each dword by 25
  249. vpxor \T3, \T2, \T2 # xor the shifted versions
  250. vpxor \T4, \T2, \T2
  251. vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
  252. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  253. vpxor \T2, \GH, \GH # first phase of the reduction complete
  254. #second phase of the reduction
  255. vpsrld $1,\GH, \T2 # packed right shift of each dword by 1
  256. vpsrld $2,\GH, \T3 # packed right shift of each dword by 2
  257. vpsrld $7,\GH, \T4 # packed right shift of each dword by 7
  258. vpxor \T3, \T2, \T2 # xor the shifted versions
  259. vpxor \T4, \T2, \T2
  260. vpxor \T5, \T2, \T2
  261. vpxor \T2, \GH, \GH
  262. vpxor \T1, \GH, \GH # the result is in GH
  263. .endm
  # PRECOMPUTE_AVX: derive the hash-key powers 2 to 8 from HK and store them,
  # together with the Karatsuba helper values, in the context at arg1.
  #   HK      in - per the comments below, HashKey<<1 mod poly; only read
  #   T1..T6  scratch registers; all are overwritten
  # Stores HashKey_2..HashKey_8 and HashKey_k, HashKey_2_k..HashKey_8_k.
  # The HashKey slot itself is not written here.
  # All stores use vmovdqa, so the destination addresses must be 16-byte aligned.
  264. .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
  265. # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
  266. vmovdqa \HK, \T5 # T5 = running power, starting at HashKey
  267. vpshufd $0b01001110, \T5, \T1 # T1 = T5 with its 64-bit halves swapped
  268. vpxor \T5, \T1, \T1 # T1 = high half XOR low half, in both halves
  269. vmovdqa \T1, HashKey_k(arg1)
  270. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
  271. vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
  272. vpshufd $0b01001110, \T5, \T1
  273. vpxor \T5, \T1, \T1
  274. vmovdqa \T1, HashKey_2_k(arg1)
  275. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
  276. vmovdqa \T5, HashKey_3(arg1)
  277. vpshufd $0b01001110, \T5, \T1
  278. vpxor \T5, \T1, \T1
  279. vmovdqa \T1, HashKey_3_k(arg1)
  280. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
  281. vmovdqa \T5, HashKey_4(arg1)
  282. vpshufd $0b01001110, \T5, \T1
  283. vpxor \T5, \T1, \T1
  284. vmovdqa \T1, HashKey_4_k(arg1)
  285. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
  286. vmovdqa \T5, HashKey_5(arg1)
  287. vpshufd $0b01001110, \T5, \T1
  288. vpxor \T5, \T1, \T1
  289. vmovdqa \T1, HashKey_5_k(arg1)
  290. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
  291. vmovdqa \T5, HashKey_6(arg1)
  292. vpshufd $0b01001110, \T5, \T1
  293. vpxor \T5, \T1, \T1
  294. vmovdqa \T1, HashKey_6_k(arg1)
  295. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
  296. vmovdqa \T5, HashKey_7(arg1)
  297. vpshufd $0b01001110, \T5, \T1
  298. vpxor \T5, \T1, \T1
  299. vmovdqa \T1, HashKey_7_k(arg1)
  300. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
  301. vmovdqa \T5, HashKey_8(arg1)
  302. vpshufd $0b01001110, \T5, \T1
  303. vpxor \T5, \T1, \T1
  304. vmovdqa \T1, HashKey_8_k(arg1)
  305. .endm
  306. ## if a = number of total plaintext bytes
  307. ## b = floor(a/16)
  308. ## num_initial_blocks = b mod 4;
  309. ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
  310. ## r10, r11, r12, rax are clobbered
  311. ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
  ##
  ## Inputs read here: arg1 = context (round keys), arg2 = output pointer,
  ## arg3 = input pointer, arg5 = pointer to Y0, arg6 = AAD pointer,
  ## arg7 = aadLen.
  ## T2 is handed to GHASH_MUL_AVX as the hash key operand and is only read.
  ## T1, T3, T4, T5, T6 and T_key are scratch registers.
  ## On exit r11 holds the number of bytes processed (the data offset) and
  ## TMP1 on the stack holds the GHASH value of the AAD and initial blocks.
  ## When r13 is at least 128, eight further blocks are encrypted and left
  ## byte-swapped in XMM1..XMM8, ready to be hashed.
  ##
  ## NOTE(review): r13 is compared against 128 but never written here; it is
  ## presumably the remaining length set by the caller - confirm.
  ## NOTE(review): the AAD is loaded 4 bytes at a time and at least once, so
  ## the loads cover aadLen rounded up to a multiple of 4 (minimum 4 bytes).
  ## NOTE(review): the initial blocks live in xmm(9-n)..xmm8, which allows more
  ## than 3 of them, so the "mod 4" above may be stale - confirm with the caller.
  ## NOTE(review): the "XMM8 has the combined result" step relies on the XMM8
  ## argument being the register xmm8, since reg_j ends at xmm8 - confirm.
  312. .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
  313. i = (8-\num_initial_blocks)
  314. setreg
  315. mov arg6, %r10 # r10 = AAD
  316. mov arg7, %r12 # r12 = aadLen
  317. mov %r12, %r11 # r11 = aadLen, kept for the padding step below
  318. vpxor reg_i, reg_i, reg_i # reg_i accumulates the AAD, starting from zero
  319. _get_AAD_loop\@:
  320. vmovd (%r10), \T1 # load 4 AAD bytes into the low dword
  321. vpslldq $12, \T1, \T1 # move them to the top dword
  322. vpsrldq $4, reg_i, reg_i # make room: shift the accumulator down one dword
  323. vpxor \T1, reg_i, reg_i
  324. add $4, %r10
  325. sub $4, %r12
  326. jg _get_AAD_loop\@ # repeat while AAD bytes remain
  327. cmp $16, %r11
  328. je _get_AAD_loop2_done\@ # a 16-byte AAD needs no padding shift
  329. mov $16, %r12
  330. _get_AAD_loop2\@:
  331. vpsrldq $4, reg_i, reg_i # shift down one dword per missing 4 bytes
  332. sub $4, %r12
  333. cmp %r11, %r12
  334. jg _get_AAD_loop2\@
  335. _get_AAD_loop2_done\@:
  336. #byte-reflect the AAD data
  337. vpshufb SHUF_MASK(%rip), reg_i, reg_i
  338. # initialize the data pointer offset as zero
  339. xor %r11, %r11
  340. # start AES for num_initial_blocks blocks
  341. mov arg5, %rax # rax = *Y0
  342. vmovdqu (%rax), \CTR # CTR = Y0
  343. vpshufb SHUF_MASK(%rip), \CTR, \CTR
  344. i = (9-\num_initial_blocks)
  345. setreg
  346. .rep \num_initial_blocks
  347. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  348. vmovdqa \CTR, reg_i
  349. vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
  350. i = (i+1)
  351. setreg
  352. .endr
  353. vmovdqa (arg1), \T_key # round key 0
  354. i = (9-\num_initial_blocks)
  355. setreg
  356. .rep \num_initial_blocks
  357. vpxor \T_key, reg_i, reg_i
  358. i = (i+1)
  359. setreg
  360. .endr
  361. j = 1
  362. setreg
  363. .rep 9
  364. vmovdqa 16*j(arg1), \T_key # round key j, applied to every initial block
  365. i = (9-\num_initial_blocks)
  366. setreg
  367. .rep \num_initial_blocks
  368. vaesenc \T_key, reg_i, reg_i
  369. i = (i+1)
  370. setreg
  371. .endr
  372. j = (j+1)
  373. setreg
  374. .endr
  375. vmovdqa 16*10(arg1), \T_key # last round key
  376. i = (9-\num_initial_blocks)
  377. setreg
  378. .rep \num_initial_blocks
  379. vaesenclast \T_key, reg_i, reg_i
  380. i = (i+1)
  381. setreg
  382. .endr
  383. i = (9-\num_initial_blocks)
  384. setreg
  385. .rep \num_initial_blocks
  386. vmovdqu (arg3, %r11), \T1 # T1 = input block
  387. vpxor \T1, reg_i, reg_i # keystream XOR input
  388. vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
  389. add $16, %r11
  390. .if \ENC_DEC == DEC
  391. vmovdqa \T1, reg_i # when decrypting, hash the input block instead of the output
  392. .endif
  393. vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
  394. i = (i+1)
  395. setreg
  396. .endr
  397. i = (8-\num_initial_blocks)
  398. j = (9-\num_initial_blocks)
  399. setreg
  400. GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 # hash the AAD block
  401. .rep \num_initial_blocks
  402. vpxor reg_i, reg_j, reg_j # fold the running hash into the next block
  403. GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
  404. i = (i+1)
  405. j = (j+1)
  406. setreg
  407. .endr
  408. # XMM8 has the combined result here
  409. vmovdqa \XMM8, TMP1(%rsp)
  410. vmovdqa \XMM8, \T3
  411. cmp $128, %r13
  412. jl _initial_blocks_done\@ # no need for precomputed constants
  413. ###############################################################################
  414. # Generate the next 8 counter blocks in XMM1..XMM8, encrypt them and XOR with the input
  415. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  416. vmovdqa \CTR, \XMM1
  417. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  418. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  419. vmovdqa \CTR, \XMM2
  420. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  421. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  422. vmovdqa \CTR, \XMM3
  423. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  424. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  425. vmovdqa \CTR, \XMM4
  426. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  427. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  428. vmovdqa \CTR, \XMM5
  429. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  430. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  431. vmovdqa \CTR, \XMM6
  432. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  433. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  434. vmovdqa \CTR, \XMM7
  435. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  436. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  437. vmovdqa \CTR, \XMM8
  438. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  439. vmovdqa (arg1), \T_key # round key 0
  440. vpxor \T_key, \XMM1, \XMM1
  441. vpxor \T_key, \XMM2, \XMM2
  442. vpxor \T_key, \XMM3, \XMM3
  443. vpxor \T_key, \XMM4, \XMM4
  444. vpxor \T_key, \XMM5, \XMM5
  445. vpxor \T_key, \XMM6, \XMM6
  446. vpxor \T_key, \XMM7, \XMM7
  447. vpxor \T_key, \XMM8, \XMM8
  448. i = 1
  449. setreg
  450. .rep 9 # do 9 rounds
  451. vmovdqa 16*i(arg1), \T_key
  452. vaesenc \T_key, \XMM1, \XMM1
  453. vaesenc \T_key, \XMM2, \XMM2
  454. vaesenc \T_key, \XMM3, \XMM3
  455. vaesenc \T_key, \XMM4, \XMM4
  456. vaesenc \T_key, \XMM5, \XMM5
  457. vaesenc \T_key, \XMM6, \XMM6
  458. vaesenc \T_key, \XMM7, \XMM7
  459. vaesenc \T_key, \XMM8, \XMM8
  460. i = (i+1)
  461. setreg
  462. .endr
  463. vmovdqa 16*i(arg1), \T_key # i is 10 here: last round key
  464. vaesenclast \T_key, \XMM1, \XMM1
  465. vaesenclast \T_key, \XMM2, \XMM2
  466. vaesenclast \T_key, \XMM3, \XMM3
  467. vaesenclast \T_key, \XMM4, \XMM4
  468. vaesenclast \T_key, \XMM5, \XMM5
  469. vaesenclast \T_key, \XMM6, \XMM6
  470. vaesenclast \T_key, \XMM7, \XMM7
  471. vaesenclast \T_key, \XMM8, \XMM8
  472. vmovdqu (arg3, %r11), \T1 # for each block: load input, XOR, store output
  473. vpxor \T1, \XMM1, \XMM1
  474. vmovdqu \XMM1, (arg2 , %r11)
  475. .if \ENC_DEC == DEC
  476. vmovdqa \T1, \XMM1 # when decrypting, keep the input block for hashing
  477. .endif
  478. vmovdqu 16*1(arg3, %r11), \T1
  479. vpxor \T1, \XMM2, \XMM2
  480. vmovdqu \XMM2, 16*1(arg2 , %r11)
  481. .if \ENC_DEC == DEC
  482. vmovdqa \T1, \XMM2
  483. .endif
  484. vmovdqu 16*2(arg3, %r11), \T1
  485. vpxor \T1, \XMM3, \XMM3
  486. vmovdqu \XMM3, 16*2(arg2 , %r11)
  487. .if \ENC_DEC == DEC
  488. vmovdqa \T1, \XMM3
  489. .endif
  490. vmovdqu 16*3(arg3, %r11), \T1
  491. vpxor \T1, \XMM4, \XMM4
  492. vmovdqu \XMM4, 16*3(arg2 , %r11)
  493. .if \ENC_DEC == DEC
  494. vmovdqa \T1, \XMM4
  495. .endif
  496. vmovdqu 16*4(arg3, %r11), \T1
  497. vpxor \T1, \XMM5, \XMM5
  498. vmovdqu \XMM5, 16*4(arg2 , %r11)
  499. .if \ENC_DEC == DEC
  500. vmovdqa \T1, \XMM5
  501. .endif
  502. vmovdqu 16*5(arg3, %r11), \T1
  503. vpxor \T1, \XMM6, \XMM6
  504. vmovdqu \XMM6, 16*5(arg2 , %r11)
  505. .if \ENC_DEC == DEC
  506. vmovdqa \T1, \XMM6
  507. .endif
  508. vmovdqu 16*6(arg3, %r11), \T1
  509. vpxor \T1, \XMM7, \XMM7
  510. vmovdqu \XMM7, 16*6(arg2 , %r11)
  511. .if \ENC_DEC == DEC
  512. vmovdqa \T1, \XMM7
  513. .endif
  514. vmovdqu 16*7(arg3, %r11), \T1
  515. vpxor \T1, \XMM8, \XMM8
  516. vmovdqu \XMM8, 16*7(arg2 , %r11)
  517. .if \ENC_DEC == DEC
  518. vmovdqa \T1, \XMM8
  519. .endif
  520. add $128, %r11 # advance the data offset past the 8 blocks
  521. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  522. vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
  523. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  524. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  525. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  526. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  527. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  528. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  529. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  530. ###############################################################################
  531. _initial_blocks_done\@:
  532. .endm
  533. # Encrypt 8 counter blocks at a time, interleaved with
  534. # the GHASH of the 8 blocks passed in XMM1..XMM8 (the previously produced ciphertext blocks)
  535. # arg1, arg2, arg3 are used as pointers only, not modified
  536. # r11 is the data offset value (read only here; the caller advances it)
  #
  # Entry:    XMM1..XMM8 hold the 8 blocks to be hashed. XMM1 is copied to T2
  #           and XMM2..XMM8 are spilled to TMP2..TMP8 on the stack so that
  #           the registers can be reused for the new counter blocks.
  # Exit:     XMM1..XMM8 hold the blocks to be hashed by the next pass, byte
  #           swapped with SHUF_MASK, with the reduced GHASH value (T6)
  #           XORed into XMM1. CTR holds the last counter value generated.
  # loop_idx: in_order  -> add ONE to the counter, then byte-swap each block;
  #           otherwise -> add ONEf and skip the byte swap.
  # ENC_DEC:  ENC       -> the vaesenclast results stay in the block registers
  #                        and are stored to arg2 further down;
  #           otherwise -> the results go through T3 to arg2 and the input
  #                        blocks are reloaded into the block registers.
  # T1..T7 are scratch. Key material is read at 16*k(arg1), k = 0..10, and the
  # hash key powers at HashKey_8(arg1) .. HashKey(arg1) plus their _k variants.
  # NOTE(review): reg_j is bound by setreg, which is defined outside this
  # chunk; presumably it names %xmm<j>, i.e. XMM1..XMM8 as passed by the
  # caller - confirm.
  537. .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
  538. vmovdqa \XMM1, \T2 # keep block 1 in a register; it is multiplied first
  539. vmovdqa \XMM2, TMP2(%rsp) # spill blocks 2..8; each is reloaded when its multiply is due
  540. vmovdqa \XMM3, TMP3(%rsp)
  541. vmovdqa \XMM4, TMP4(%rsp)
  542. vmovdqa \XMM5, TMP5(%rsp)
  543. vmovdqa \XMM6, TMP6(%rsp)
  544. vmovdqa \XMM7, TMP7(%rsp)
  545. vmovdqa \XMM8, TMP8(%rsp)
  546. .if \loop_idx == in_order
  547. vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
  548. vpaddd ONE(%rip), \XMM1, \XMM2
  549. vpaddd ONE(%rip), \XMM2, \XMM3
  550. vpaddd ONE(%rip), \XMM3, \XMM4
  551. vpaddd ONE(%rip), \XMM4, \XMM5
  552. vpaddd ONE(%rip), \XMM5, \XMM6
  553. vpaddd ONE(%rip), \XMM6, \XMM7
  554. vpaddd ONE(%rip), \XMM7, \XMM8
  555. vmovdqa \XMM8, \CTR # CTR = last counter value, saved before the byte swap
  556. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  557. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  558. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  559. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  560. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  561. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  562. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  563. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  564. .else
  565. vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
  566. vpaddd ONEf(%rip), \XMM1, \XMM2
  567. vpaddd ONEf(%rip), \XMM2, \XMM3
  568. vpaddd ONEf(%rip), \XMM3, \XMM4
  569. vpaddd ONEf(%rip), \XMM4, \XMM5
  570. vpaddd ONEf(%rip), \XMM5, \XMM6
  571. vpaddd ONEf(%rip), \XMM6, \XMM7
  572. vpaddd ONEf(%rip), \XMM7, \XMM8
  573. vmovdqa \XMM8, \CTR # CTR = last counter value (no byte swap on this path)
  574. .endif
  575. #######################################################################
  576. vmovdqu (arg1), \T1 # key material at offset 0: XORed into every counter block
  577. vpxor \T1, \XMM1, \XMM1
  578. vpxor \T1, \XMM2, \XMM2
  579. vpxor \T1, \XMM3, \XMM3
  580. vpxor \T1, \XMM4, \XMM4
  581. vpxor \T1, \XMM5, \XMM5
  582. vpxor \T1, \XMM6, \XMM6
  583. vpxor \T1, \XMM7, \XMM7
  584. vpxor \T1, \XMM8, \XMM8
  585. #######################################################################
  586. vmovdqu 16*1(arg1), \T1 # AES round with the key at 16*1(arg1)
  587. vaesenc \T1, \XMM1, \XMM1
  588. vaesenc \T1, \XMM2, \XMM2
  589. vaesenc \T1, \XMM3, \XMM3
  590. vaesenc \T1, \XMM4, \XMM4
  591. vaesenc \T1, \XMM5, \XMM5
  592. vaesenc \T1, \XMM6, \XMM6
  593. vaesenc \T1, \XMM7, \XMM7
  594. vaesenc \T1, \XMM8, \XMM8
  595. vmovdqu 16*2(arg1), \T1 # AES round with the key at 16*2(arg1)
  596. vaesenc \T1, \XMM1, \XMM1
  597. vaesenc \T1, \XMM2, \XMM2
  598. vaesenc \T1, \XMM3, \XMM3
  599. vaesenc \T1, \XMM4, \XMM4
  600. vaesenc \T1, \XMM5, \XMM5
  601. vaesenc \T1, \XMM6, \XMM6
  602. vaesenc \T1, \XMM7, \XMM7
  603. vaesenc \T1, \XMM8, \XMM8
  604. #######################################################################
  605. vmovdqa HashKey_8(arg1), \T5 # saved block 1 (in T2) times HashKey_8
  606. vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
  607. vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
  608. vpshufd $0b01001110, \T2, \T6 # T6 = T2 with its 64-bit halves swapped
  609. vpxor \T2, \T6, \T6 # T6 = a1^a0 (in both halves)
  610. vmovdqa HashKey_8_k(arg1), \T5
  611. vpclmulqdq $0x00, \T5, \T6, \T6 # T6 = (a1^a0) * low half of HashKey_8_k
  612. vmovdqu 16*3(arg1), \T1 # AES round with the key at 16*3(arg1)
  613. vaesenc \T1, \XMM1, \XMM1
  614. vaesenc \T1, \XMM2, \XMM2
  615. vaesenc \T1, \XMM3, \XMM3
  616. vaesenc \T1, \XMM4, \XMM4
  617. vaesenc \T1, \XMM5, \XMM5
  618. vaesenc \T1, \XMM6, \XMM6
  619. vaesenc \T1, \XMM7, \XMM7
  620. vaesenc \T1, \XMM8, \XMM8
  621. vmovdqa TMP2(%rsp), \T1 # saved block 2 times HashKey_7
  622. vmovdqa HashKey_7(arg1), \T5
  623. vpclmulqdq $0x11, \T5, \T1, \T3
  624. vpxor \T3, \T4, \T4 # T4 accumulates the high products
  625. vpclmulqdq $0x00, \T5, \T1, \T3
  626. vpxor \T3, \T7, \T7 # T7 accumulates the low products
  627. vpshufd $0b01001110, \T1, \T3
  628. vpxor \T1, \T3, \T3
  629. vmovdqa HashKey_7_k(arg1), \T5
  630. vpclmulqdq $0x10, \T5, \T3, \T3
  631. vpxor \T3, \T6, \T6 # T6 accumulates the (a1^a0)*k products
  632. vmovdqu 16*4(arg1), \T1 # AES round with the key at 16*4(arg1)
  633. vaesenc \T1, \XMM1, \XMM1
  634. vaesenc \T1, \XMM2, \XMM2
  635. vaesenc \T1, \XMM3, \XMM3
  636. vaesenc \T1, \XMM4, \XMM4
  637. vaesenc \T1, \XMM5, \XMM5
  638. vaesenc \T1, \XMM6, \XMM6
  639. vaesenc \T1, \XMM7, \XMM7
  640. vaesenc \T1, \XMM8, \XMM8
  641. #######################################################################
  642. vmovdqa TMP3(%rsp), \T1 # saved block 3 times HashKey_6
  643. vmovdqa HashKey_6(arg1), \T5
  644. vpclmulqdq $0x11, \T5, \T1, \T3
  645. vpxor \T3, \T4, \T4
  646. vpclmulqdq $0x00, \T5, \T1, \T3
  647. vpxor \T3, \T7, \T7
  648. vpshufd $0b01001110, \T1, \T3
  649. vpxor \T1, \T3, \T3
  650. vmovdqa HashKey_6_k(arg1), \T5
  651. vpclmulqdq $0x10, \T5, \T3, \T3
  652. vpxor \T3, \T6, \T6
  653. vmovdqu 16*5(arg1), \T1 # AES round with the key at 16*5(arg1)
  654. vaesenc \T1, \XMM1, \XMM1
  655. vaesenc \T1, \XMM2, \XMM2
  656. vaesenc \T1, \XMM3, \XMM3
  657. vaesenc \T1, \XMM4, \XMM4
  658. vaesenc \T1, \XMM5, \XMM5
  659. vaesenc \T1, \XMM6, \XMM6
  660. vaesenc \T1, \XMM7, \XMM7
  661. vaesenc \T1, \XMM8, \XMM8
  662. vmovdqa TMP4(%rsp), \T1 # saved block 4 times HashKey_5
  663. vmovdqa HashKey_5(arg1), \T5
  664. vpclmulqdq $0x11, \T5, \T1, \T3
  665. vpxor \T3, \T4, \T4
  666. vpclmulqdq $0x00, \T5, \T1, \T3
  667. vpxor \T3, \T7, \T7
  668. vpshufd $0b01001110, \T1, \T3
  669. vpxor \T1, \T3, \T3
  670. vmovdqa HashKey_5_k(arg1), \T5
  671. vpclmulqdq $0x10, \T5, \T3, \T3
  672. vpxor \T3, \T6, \T6
  673. vmovdqu 16*6(arg1), \T1 # AES round with the key at 16*6(arg1)
  674. vaesenc \T1, \XMM1, \XMM1
  675. vaesenc \T1, \XMM2, \XMM2
  676. vaesenc \T1, \XMM3, \XMM3
  677. vaesenc \T1, \XMM4, \XMM4
  678. vaesenc \T1, \XMM5, \XMM5
  679. vaesenc \T1, \XMM6, \XMM6
  680. vaesenc \T1, \XMM7, \XMM7
  681. vaesenc \T1, \XMM8, \XMM8
  682. vmovdqa TMP5(%rsp), \T1 # saved block 5 times HashKey_4
  683. vmovdqa HashKey_4(arg1), \T5
  684. vpclmulqdq $0x11, \T5, \T1, \T3
  685. vpxor \T3, \T4, \T4
  686. vpclmulqdq $0x00, \T5, \T1, \T3
  687. vpxor \T3, \T7, \T7
  688. vpshufd $0b01001110, \T1, \T3
  689. vpxor \T1, \T3, \T3
  690. vmovdqa HashKey_4_k(arg1), \T5
  691. vpclmulqdq $0x10, \T5, \T3, \T3
  692. vpxor \T3, \T6, \T6
  693. vmovdqu 16*7(arg1), \T1 # AES round with the key at 16*7(arg1)
  694. vaesenc \T1, \XMM1, \XMM1
  695. vaesenc \T1, \XMM2, \XMM2
  696. vaesenc \T1, \XMM3, \XMM3
  697. vaesenc \T1, \XMM4, \XMM4
  698. vaesenc \T1, \XMM5, \XMM5
  699. vaesenc \T1, \XMM6, \XMM6
  700. vaesenc \T1, \XMM7, \XMM7
  701. vaesenc \T1, \XMM8, \XMM8
  702. vmovdqa TMP6(%rsp), \T1 # saved block 6 times HashKey_3
  703. vmovdqa HashKey_3(arg1), \T5
  704. vpclmulqdq $0x11, \T5, \T1, \T3
  705. vpxor \T3, \T4, \T4
  706. vpclmulqdq $0x00, \T5, \T1, \T3
  707. vpxor \T3, \T7, \T7
  708. vpshufd $0b01001110, \T1, \T3
  709. vpxor \T1, \T3, \T3
  710. vmovdqa HashKey_3_k(arg1), \T5
  711. vpclmulqdq $0x10, \T5, \T3, \T3
  712. vpxor \T3, \T6, \T6
  713. vmovdqu 16*8(arg1), \T1 # AES round with the key at 16*8(arg1)
  714. vaesenc \T1, \XMM1, \XMM1
  715. vaesenc \T1, \XMM2, \XMM2
  716. vaesenc \T1, \XMM3, \XMM3
  717. vaesenc \T1, \XMM4, \XMM4
  718. vaesenc \T1, \XMM5, \XMM5
  719. vaesenc \T1, \XMM6, \XMM6
  720. vaesenc \T1, \XMM7, \XMM7
  721. vaesenc \T1, \XMM8, \XMM8
  722. vmovdqa TMP7(%rsp), \T1 # saved block 7 times HashKey_2
  723. vmovdqa HashKey_2(arg1), \T5
  724. vpclmulqdq $0x11, \T5, \T1, \T3
  725. vpxor \T3, \T4, \T4
  726. vpclmulqdq $0x00, \T5, \T1, \T3
  727. vpxor \T3, \T7, \T7
  728. vpshufd $0b01001110, \T1, \T3
  729. vpxor \T1, \T3, \T3
  730. vmovdqa HashKey_2_k(arg1), \T5
  731. vpclmulqdq $0x10, \T5, \T3, \T3
  732. vpxor \T3, \T6, \T6
  733. #######################################################################
  734. vmovdqu 16*9(arg1), \T5 # AES round with the key at 16*9(arg1); T5 is free until the HashKey load below
  735. vaesenc \T5, \XMM1, \XMM1
  736. vaesenc \T5, \XMM2, \XMM2
  737. vaesenc \T5, \XMM3, \XMM3
  738. vaesenc \T5, \XMM4, \XMM4
  739. vaesenc \T5, \XMM5, \XMM5
  740. vaesenc \T5, \XMM6, \XMM6
  741. vaesenc \T5, \XMM7, \XMM7
  742. vaesenc \T5, \XMM8, \XMM8
  743. vmovdqa TMP8(%rsp), \T1 # saved block 8 times HashKey
  744. vmovdqa HashKey(arg1), \T5
  745. vpclmulqdq $0x11, \T5, \T1, \T3
  746. vpxor \T3, \T4, \T4
  747. vpclmulqdq $0x00, \T5, \T1, \T3
  748. vpxor \T3, \T7, \T7
  749. vpshufd $0b01001110, \T1, \T3
  750. vpxor \T1, \T3, \T3
  751. vmovdqa HashKey_k(arg1), \T5
  752. vpclmulqdq $0x10, \T5, \T3, \T3
  753. vpxor \T3, \T6, \T6
  754. vpxor \T4, \T6, \T6 # Karatsuba: remove the high and low products ...
  755. vpxor \T7, \T6, \T6 # ... so that T6 is the middle term
  756. vmovdqu 16*10(arg1), \T5 # key for the final vaesenclast round
  757. i = 0
  758. j = 1
  759. setreg # bind reg_j for the current j (macro defined outside this chunk)
  760. .rep 8
  761. vpxor 16*i(arg3, %r11), \T5, \T2 # T2 = last key XOR input block i, so vaesenclast also applies the data XOR
  762. .if \ENC_DEC == ENC
  763. vaesenclast \T2, reg_j, reg_j # reg_j = output block i (stored to arg2 further down)
  764. .else
  765. vaesenclast \T2, reg_j, \T3 # T3 = output block i
  766. vmovdqu 16*i(arg3, %r11), reg_j # reload input block i; it is what gets hashed on this path
  767. vmovdqu \T3, 16*i(arg2, %r11) # store output block i
  768. .endif
  769. i = (i+1)
  770. j = (j+1)
  771. setreg
  772. .endr
  773. #######################################################################
  774. vpslldq $8, \T6, \T3 # T3 = T6 shifted left by 2 DWs
  775. vpsrldq $8, \T6, \T6 # T6 = T6 shifted right by 2 DWs
  776. vpxor \T3, \T7, \T7
  777. vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
  778. #######################################################################
  779. #first phase of the reduction
  780. #######################################################################
  781. vpslld $31, \T7, \T2 # packed left shift << 31
  782. vpslld $30, \T7, \T3 # packed left shift << 30
  783. vpslld $25, \T7, \T4 # packed left shift << 25
  784. vpxor \T3, \T2, \T2 # xor the shifted versions
  785. vpxor \T4, \T2, \T2
  786. vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
  787. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  788. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  789. #######################################################################
  790. .if \ENC_DEC == ENC
  791. vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
  792. vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
  793. vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
  794. vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
  795. vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
  796. vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
  797. vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
  798. vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
  799. .endif
  800. #######################################################################
  801. #second phase of the reduction
  802. vpsrld $1, \T7, \T2 # packed right shift >> 1
  803. vpsrld $2, \T7, \T3 # packed right shift >> 2
  804. vpsrld $7, \T7, \T4 # packed right shift >> 7
  805. vpxor \T3, \T2, \T2 # xor the shifted versions
  806. vpxor \T4, \T2, \T2
  807. vpxor \T1, \T2, \T2 # fold in T1, saved during the first phase
  808. vpxor \T2, \T7, \T7
  809. vpxor \T7, \T6, \T6 # the result is in T6
  810. #######################################################################
  811. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  812. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  813. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  814. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  815. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  816. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  817. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  818. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  819. vpxor \T6, \XMM1, \XMM1 # fold the running GHASH value into the first block for the next pass
  820. .endm
  821. # GHASH the last 8 ciphertext blocks.
  # XMM1..XMM8 are multiplied by HashKey_8(arg1) .. HashKey(arg1) respectively
  # using the Karatsuba method, the products are summed and then reduced.
  # The result is left in T6. T1..T5 and T7 are scratch, and XMM1 is
  # overwritten (it accumulates the middle Karatsuba products).
  # T5 holds HashKey on exit, because HashKey(arg1) is the last value loaded
  # into it.
  822. .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
  823. ## Karatsuba Method
  824. vpshufd $0b01001110, \XMM1, \T2 # T2 = XMM1 with its 64-bit halves swapped
  825. vpxor \XMM1, \T2, \T2 # T2 = a1^a0
  826. vmovdqa HashKey_8(arg1), \T5
  827. vpclmulqdq $0x11, \T5, \XMM1, \T6 # T6 = a1*b1 (high product accumulator)
  828. vpclmulqdq $0x00, \T5, \XMM1, \T7 # T7 = a0*b0 (low product accumulator)
  829. vmovdqa HashKey_8_k(arg1), \T3
  830. vpclmulqdq $0x00, \T3, \T2, \XMM1 # XMM1 = (a1^a0)*k (middle product accumulator)
  831. ######################
  832. vpshufd $0b01001110, \XMM2, \T2 # block 2 times HashKey_7
  833. vpxor \XMM2, \T2, \T2
  834. vmovdqa HashKey_7(arg1), \T5
  835. vpclmulqdq $0x11, \T5, \XMM2, \T4
  836. vpxor \T4, \T6, \T6
  837. vpclmulqdq $0x00, \T5, \XMM2, \T4
  838. vpxor \T4, \T7, \T7
  839. vmovdqa HashKey_7_k(arg1), \T3
  840. vpclmulqdq $0x00, \T3, \T2, \T2
  841. vpxor \T2, \XMM1, \XMM1
  842. ######################
  843. vpshufd $0b01001110, \XMM3, \T2 # block 3 times HashKey_6
  844. vpxor \XMM3, \T2, \T2
  845. vmovdqa HashKey_6(arg1), \T5
  846. vpclmulqdq $0x11, \T5, \XMM3, \T4
  847. vpxor \T4, \T6, \T6
  848. vpclmulqdq $0x00, \T5, \XMM3, \T4
  849. vpxor \T4, \T7, \T7
  850. vmovdqa HashKey_6_k(arg1), \T3
  851. vpclmulqdq $0x00, \T3, \T2, \T2
  852. vpxor \T2, \XMM1, \XMM1
  853. ######################
  854. vpshufd $0b01001110, \XMM4, \T2 # block 4 times HashKey_5
  855. vpxor \XMM4, \T2, \T2
  856. vmovdqa HashKey_5(arg1), \T5
  857. vpclmulqdq $0x11, \T5, \XMM4, \T4
  858. vpxor \T4, \T6, \T6
  859. vpclmulqdq $0x00, \T5, \XMM4, \T4
  860. vpxor \T4, \T7, \T7
  861. vmovdqa HashKey_5_k(arg1), \T3
  862. vpclmulqdq $0x00, \T3, \T2, \T2
  863. vpxor \T2, \XMM1, \XMM1
  864. ######################
  865. vpshufd $0b01001110, \XMM5, \T2 # block 5 times HashKey_4
  866. vpxor \XMM5, \T2, \T2
  867. vmovdqa HashKey_4(arg1), \T5
  868. vpclmulqdq $0x11, \T5, \XMM5, \T4
  869. vpxor \T4, \T6, \T6
  870. vpclmulqdq $0x00, \T5, \XMM5, \T4
  871. vpxor \T4, \T7, \T7
  872. vmovdqa HashKey_4_k(arg1), \T3
  873. vpclmulqdq $0x00, \T3, \T2, \T2
  874. vpxor \T2, \XMM1, \XMM1
  875. ######################
  876. vpshufd $0b01001110, \XMM6, \T2 # block 6 times HashKey_3
  877. vpxor \XMM6, \T2, \T2
  878. vmovdqa HashKey_3(arg1), \T5
  879. vpclmulqdq $0x11, \T5, \XMM6, \T4
  880. vpxor \T4, \T6, \T6
  881. vpclmulqdq $0x00, \T5, \XMM6, \T4
  882. vpxor \T4, \T7, \T7
  883. vmovdqa HashKey_3_k(arg1), \T3
  884. vpclmulqdq $0x00, \T3, \T2, \T2
  885. vpxor \T2, \XMM1, \XMM1
  886. ######################
  887. vpshufd $0b01001110, \XMM7, \T2 # block 7 times HashKey_2
  888. vpxor \XMM7, \T2, \T2
  889. vmovdqa HashKey_2(arg1), \T5
  890. vpclmulqdq $0x11, \T5, \XMM7, \T4
  891. vpxor \T4, \T6, \T6
  892. vpclmulqdq $0x00, \T5, \XMM7, \T4
  893. vpxor \T4, \T7, \T7
  894. vmovdqa HashKey_2_k(arg1), \T3
  895. vpclmulqdq $0x00, \T3, \T2, \T2
  896. vpxor \T2, \XMM1, \XMM1
  897. ######################
  898. vpshufd $0b01001110, \XMM8, \T2 # block 8 times HashKey
  899. vpxor \XMM8, \T2, \T2
  900. vmovdqa HashKey(arg1), \T5 # T5 is left holding HashKey from here on
  901. vpclmulqdq $0x11, \T5, \XMM8, \T4
  902. vpxor \T4, \T6, \T6
  903. vpclmulqdq $0x00, \T5, \XMM8, \T4
  904. vpxor \T4, \T7, \T7
  905. vmovdqa HashKey_k(arg1), \T3
  906. vpclmulqdq $0x00, \T3, \T2, \T2
  907. vpxor \T2, \XMM1, \XMM1
  908. vpxor \T6, \XMM1, \XMM1 # Karatsuba: remove the high and low products ...
  909. vpxor \T7, \XMM1, \T2 # ... so that T2 is the middle term
  910. vpslldq $8, \T2, \T4 # T4 = middle term shifted left by 2 DWs
  911. vpsrldq $8, \T2, \T2 # T2 = middle term shifted right by 2 DWs
  912. vpxor \T4, \T7, \T7
  913. vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
  914. # the accumulated carry-less multiplications
  915. #######################################################################
  916. #first phase of the reduction
  917. vpslld $31, \T7, \T2 # packed left shift << 31
  918. vpslld $30, \T7, \T3 # packed left shift << 30
  919. vpslld $25, \T7, \T4 # packed left shift << 25
  920. vpxor \T3, \T2, \T2 # xor the shifted versions
  921. vpxor \T4, \T2, \T2
  922. vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
  923. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  924. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  925. #######################################################################
  926. #second phase of the reduction
  927. vpsrld $1, \T7, \T2 # packed right shift >> 1
  928. vpsrld $2, \T7, \T3 # packed right shift >> 2
  929. vpsrld $7, \T7, \T4 # packed right shift >> 7
  930. vpxor \T3, \T2, \T2 # xor the shifted versions
  931. vpxor \T4, \T2, \T2
  932. vpxor \T1, \T2, \T2 # fold in T1, saved during the first phase
  933. vpxor \T2, \T7, \T7
  934. vpxor \T7, \T6, \T6 # the result is in T6
  935. .endm
  936. # Combined body for the GCM encrypt and decrypt functions (ENC_DEC selects which)
  937. # clobbering all xmm registers
  938. # clobbering r10, r11 and rax; r12, r13, r14, r15 are used but saved on entry and restored on exit
  #
  # Flow: INITIAL_BLOCKS_AVX handles (number of full 16-byte blocks mod 8)
  # blocks, the _encrypt_by_8 loop handles groups of 8 blocks,
  # GHASH_LAST_8_AVX hashes the final group, then a trailing partial block
  # (arg4 mod 16 bytes) is processed, the length block is hashed and the tag
  # is written through arg8.
  # arg4 is shifted left by 3 near the end, so it does not survive the macro.
  # NOTE(review): only the tail of INITIAL_BLOCKS_AVX is visible in this chunk.
  # From its use here, xmm9 is presumably the counter block and xmm14 the
  # running GHASH value after it returns - confirm against its definition.
  939. .macro GCM_ENC_DEC_AVX ENC_DEC
  940. #the number of pushes must equal STACK_OFFSET
  941. push %r12
  942. push %r13
  943. push %r14
  944. push %r15
  945. mov %rsp, %r14 # r14 = rsp before the local area is carved out; restored at _return_T_done
  946. sub $VARIABLE_OFFSET, %rsp
  947. and $~63, %rsp # align rsp to 64 bytes
  948. vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
  949. mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
  950. and $-16, %r13 # r13 = r13 - (r13 mod 16)
  951. mov %r13, %r12
  952. shr $4, %r12 # r12 = number of full 16-byte blocks
  953. and $7, %r12 # r12 = number of full blocks mod 8
  954. jz _initial_num_blocks_is_0\@
  955. cmp $7, %r12
  956. je _initial_num_blocks_is_7\@
  957. cmp $6, %r12
  958. je _initial_num_blocks_is_6\@
  959. cmp $5, %r12
  960. je _initial_num_blocks_is_5\@
  961. cmp $4, %r12
  962. je _initial_num_blocks_is_4\@
  963. cmp $3, %r12
  964. je _initial_num_blocks_is_3\@
  965. cmp $2, %r12
  966. je _initial_num_blocks_is_2\@
  967. jmp _initial_num_blocks_is_1\@
  968. _initial_num_blocks_is_7\@:
  969. INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  970. sub $16*7, %r13 # r13 = bytes of full blocks still to account for
  971. jmp _initial_blocks_encrypted\@
  972. _initial_num_blocks_is_6\@:
  973. INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  974. sub $16*6, %r13
  975. jmp _initial_blocks_encrypted\@
  976. _initial_num_blocks_is_5\@:
  977. INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  978. sub $16*5, %r13
  979. jmp _initial_blocks_encrypted\@
  980. _initial_num_blocks_is_4\@:
  981. INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  982. sub $16*4, %r13
  983. jmp _initial_blocks_encrypted\@
  984. _initial_num_blocks_is_3\@:
  985. INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  986. sub $16*3, %r13
  987. jmp _initial_blocks_encrypted\@
  988. _initial_num_blocks_is_2\@:
  989. INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  990. sub $16*2, %r13
  991. jmp _initial_blocks_encrypted\@
  992. _initial_num_blocks_is_1\@:
  993. INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  994. sub $16*1, %r13
  995. jmp _initial_blocks_encrypted\@
  996. _initial_num_blocks_is_0\@:
  997. INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  998. _initial_blocks_encrypted\@:
  999. cmp $0, %r13 # no full blocks beyond the initial ones?
  1000. je _zero_cipher_left\@
  1001. sub $128, %r13 # account for one group of 8 blocks
  1002. je _eight_cipher_left\@ # it was the only group: just hash it
  1003. vmovd %xmm9, %r15d
  1004. and $255, %r15d # r15d = lowest byte of xmm9, tracked to detect wrap of that byte
  1005. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # keep xmm9 byte-swapped so the out_order pass can add ONEf directly
  1006. _encrypt_by_8_new\@:
  1007. cmp $(255-8), %r15d # would 8 more increments overflow the tracked byte?
  1008. jg _encrypt_by_8\@ # yes: use the in_order pass
  1009. add $8, %r15b
  1010. GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
  1011. add $128, %r11
  1012. sub $128, %r13
  1013. jne _encrypt_by_8_new\@
  1014. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # undo the byte swap of the counter before leaving the loop
  1015. jmp _eight_cipher_left\@
  1016. _encrypt_by_8\@:
  1017. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # swap back so the in_order pass can add ONE
  1018. add $8, %r15b # 8-bit add: the tracked byte wraps here
  1019. GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
  1020. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # return to the byte-swapped form used by the loop
  1021. add $128, %r11
  1022. sub $128, %r13
  1023. jne _encrypt_by_8_new\@
  1024. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # undo the byte swap of the counter before leaving the loop
  1025. _eight_cipher_left\@:
  1026. GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
  # After GHASH_LAST_8_AVX the hash value is in xmm14 (its T6) and xmm13 (its
  # T5) holds HashKey again, which GHASH_MUL_AVX below is given as its second argument.
  1027. _zero_cipher_left\@:
  1028. cmp $16, arg4
  1029. jl _only_less_than_16\@
  1030. mov arg4, %r13
  1031. and $15, %r13 # r13 = (arg4 mod 16)
  1032. je _multiple_of_16_bytes\@
  1033. # handle the last <16 Byte block separately
  1034. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  1035. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1036. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  1037. sub $16, %r11
  1038. add %r13, %r11 # r11 now addresses the 16 bytes that end at the end of the input
  1039. vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
  1040. lea SHIFT_MASK+16(%rip), %r12
  1041. sub %r13, %r12 # adjust the shuffle mask pointer to be
  1042. # able to shift 16-r13 bytes (r13 is the
  1043. # number of bytes in plaintext mod 16)
  1044. vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
  1045. vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
  1046. jmp _final_ghash_mul\@
  1047. _only_less_than_16\@:
  1048. # check for 0 length
  1049. mov arg4, %r13
  1050. and $15, %r13 # r13 = (arg4 mod 16)
  1051. je _multiple_of_16_bytes\@
  1052. # handle the last <16 Byte block separately
  1053. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  1054. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1055. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  1056. lea SHIFT_MASK+16(%rip), %r12
  1057. sub %r13, %r12 # adjust the shuffle mask pointer to be
  1058. # able to shift 16-r13 bytes (r13 is the
  1059. # number of bytes in plaintext mod 16)
  # Copy the r13 input bytes one at a time into TMP1 on the stack so that no
  # 16-byte load touches memory outside the short input buffer.
  # NOTE(review): the loop ends when r11 == r13, so it assumes r11 is 0 on
  # entry to this path - confirm against INITIAL_BLOCKS_AVX.
  1060. _get_last_16_byte_loop\@:
  1061. movb (arg3, %r11), %al
  1062. movb %al, TMP1 (%rsp , %r11)
  1063. add $1, %r11
  1064. cmp %r13, %r11
  1065. jne _get_last_16_byte_loop\@
  1066. vmovdqu TMP1(%rsp), %xmm1
  1067. sub $16, %r11 # r11 = r13 - 16; the sub/add pair after GHASH_MUL_AVX turns this back into the output offset
  1068. _final_ghash_mul\@:
  1069. .if \ENC_DEC == DEC
  1070. vmovdqa %xmm1, %xmm2 # keep a copy of the input (ciphertext) block for hashing
  1071. vpxor %xmm1, %xmm9, %xmm9 # Ciphertext XOR E(K, Yn)
  1072. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
  1073. # mask out top 16-r13 bytes of xmm9
  1074. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  1075. vpand %xmm1, %xmm2, %xmm2 # same mask on the copy that gets hashed
  1076. vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
  1077. vpxor %xmm2, %xmm14, %xmm14
  1078. #GHASH computation for the last <16 Byte block
  1079. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  1080. sub %r13, %r11
  1081. add $16, %r11 # r11 = offset of the first output byte of the partial block
  1082. .else
  1083. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  1084. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
  1085. # mask out top 16-r13 bytes of xmm9
  1086. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  1087. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1088. vpxor %xmm9, %xmm14, %xmm14
  1089. #GHASH computation for the last <16 Byte block
  1090. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  1091. sub %r13, %r11
  1092. add $16, %r11 # r11 = offset of the first output byte of the partial block
  1093. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
  1094. .endif
  1095. #############################
  1096. # output r13 Bytes
  1097. vmovq %xmm9, %rax
  1098. cmp $8, %r13
  1099. jle _less_than_8_bytes_left\@ # 8 bytes or fewer: write them one at a time
  1100. mov %rax, (arg2 , %r11) # more than 8: store the low 8 bytes at once
  1101. add $8, %r11
  1102. vpsrldq $8, %xmm9, %xmm9
  1103. vmovq %xmm9, %rax
  1104. sub $8, %r13
  1105. _less_than_8_bytes_left\@:
  1106. movb %al, (arg2 , %r11)
  1107. add $1, %r11
  1108. shr $8, %rax
  1109. sub $1, %r13
  1110. jne _less_than_8_bytes_left\@
  1111. #############################
  1112. _multiple_of_16_bytes\@:
  1113. mov arg7, %r12 # r12 = aadLen (number of bytes)
  1114. shl $3, %r12 # convert into number of bits
  1115. vmovd %r12d, %xmm15 # len(A) in xmm15 (only the low 32 bits of the bit count are moved)
  1116. shl $3, arg4 # len(C) in bits (bytes * 8); this overwrites arg4
  1117. vmovq arg4, %xmm1
  1118. vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
  1119. vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
  1120. vpxor %xmm15, %xmm14, %xmm14
  1121. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
  1122. vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
  1123. mov arg5, %rax # rax = *Y0
  1124. vmovdqu (%rax), %xmm9 # xmm9 = Y0
  1125. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
  1126. vpxor %xmm14, %xmm9, %xmm9 # xmm9 = tag = E(K, Y0) XOR GHASH value
  1127. _return_T\@:
  1128. mov arg8, %r10 # r10 = authTag
  1129. mov arg9, %r11 # r11 = auth_tag_len
  1130. cmp $16, %r11
  1131. je _T_16\@
  1132. cmp $12, %r11
  1133. je _T_12\@
  # Fall through: any auth_tag_len other than 16 or 12 is treated as 8.
  1134. _T_8\@:
  1135. vmovq %xmm9, %rax
  1136. mov %rax, (%r10)
  1137. jmp _return_T_done\@
  1138. _T_12\@:
  1139. vmovq %xmm9, %rax
  1140. mov %rax, (%r10) # first 8 tag bytes
  1141. vpsrldq $8, %xmm9, %xmm9
  1142. vmovd %xmm9, %eax
  1143. mov %eax, 8(%r10) # remaining 4 tag bytes
  1144. jmp _return_T_done\@
  1145. _T_16\@:
  1146. vmovdqu %xmm9, (%r10)
  1147. _return_T_done\@:
  1148. mov %r14, %rsp # drop the aligned local area
  1149. pop %r15
  1150. pop %r14
  1151. pop %r13
  1152. pop %r12
  1153. .endm
  1154. #############################################################
  1155. #void aesni_gcm_precomp_avx_gen2
  1156. # (gcm_data *my_ctx_data,
  1157. # u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
  #
  # Loads H from arg2, byte-swaps it, computes HashKey<<1 mod poly, stores it
  # at HashKey(arg1) and then invokes PRECOMPUTE_AVX (defined outside this
  # chunk) with that value in xmm6.
  # The store uses vmovdqa, so HashKey(arg1) must be 16-byte aligned.
  1158. #############################################################
  1159. ENTRY(aesni_gcm_precomp_avx_gen2)
  1160. #the number of pushes must equal STACK_OFFSET
  1161. push %r12
  1162. push %r13
  1163. push %r14
  1164. push %r15
  1165. mov %rsp, %r14 # r14 = rsp before the local area is carved out
  1166. sub $VARIABLE_OFFSET, %rsp
  1167. and $~63, %rsp # align rsp to 64 bytes
  1168. vmovdqu (arg2), %xmm6 # xmm6 = HashKey
  1169. vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
  1170. ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
  1171. vmovdqa %xmm6, %xmm2
  1172. vpsllq $1, %xmm6, %xmm6 # shift each 64-bit half left by one bit
  1173. vpsrlq $63, %xmm2, %xmm2 # xmm2 = the bit shifted out of each half
  1174. vmovdqa %xmm2, %xmm1
  1175. vpslldq $8, %xmm2, %xmm2 # carry out of the low half moves into the high half
  1176. vpsrldq $8, %xmm1, %xmm1 # xmm1 = carry out of the high half
  1177. vpor %xmm2, %xmm6, %xmm6 # xmm6 = 128-bit HashKey << 1
  1178. #reduction
  # NOTE(review): TWOONE and POLY are defined outside this chunk. Presumably
  # the compare yields a mask that selects POLY only when a bit was carried
  # out of the top of the 128-bit value - confirm against their definitions.
  1179. vpshufd $0b00100100, %xmm1, %xmm2
  1180. vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
  1181. vpand POLY(%rip), %xmm2, %xmm2
  1182. vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
  1183. #######################################################################
  1184. vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
  1185. PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
  1186. mov %r14, %rsp # drop the aligned local area
  1187. pop %r15
  1188. pop %r14
  1189. pop %r13
  1190. pop %r12
  1191. ret
  1192. ENDPROC(aesni_gcm_precomp_avx_gen2)
  1193. ###############################################################################
  1194. #void aesni_gcm_enc_avx_gen2(
  1195. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1196. # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
  1197. # const u8 *in, /* Plaintext input */
  1198. # u64 plaintext_len, /* Length of data in Bytes for encryption. */
  1199. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  1200. # (from Security Association) concatenated with 8 byte
  1201. # Initialisation Vector (from IPSec ESP Payload)
  1202. # concatenated with 0x00000001. 16-byte aligned pointer. */
  1203. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  1204. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  1205. # u8 *auth_tag, /* Authenticated Tag output. */
  1206. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  1207. # Valid values are 16 (most likely), 12 or 8. */
  #
  # The whole body is the GCM_ENC_DEC_AVX macro instantiated with ENC.
  1208. ###############################################################################
  1209. ENTRY(aesni_gcm_enc_avx_gen2)
  1210. GCM_ENC_DEC_AVX ENC
  1211. ret
  1212. ENDPROC(aesni_gcm_enc_avx_gen2)
  1213. ###############################################################################
  1214. #void aesni_gcm_dec_avx_gen2(
  1215. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1216. # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
  1217. # const u8 *in, /* Ciphertext input */
  1218. # u64 plaintext_len, /* Length of data in Bytes for decryption (the ciphertext length). */
  1219. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  1220. # (from Security Association) concatenated with 8 byte
  1221. # Initialisation Vector (from IPSec ESP Payload)
  1222. # concatenated with 0x00000001. 16-byte aligned pointer. */
  1223. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  1224. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  1225. # u8 *auth_tag, /* Authenticated Tag output. */
  1226. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  1227. # Valid values are 16 (most likely), 12 or 8. */
  #
  # The whole body is the GCM_ENC_DEC_AVX macro instantiated with DEC.
  # The computed tag is written to auth_tag; no comparison against a received
  # tag happens in the code visible here.
  1228. ###############################################################################
  1229. ENTRY(aesni_gcm_dec_avx_gen2)
  1230. GCM_ENC_DEC_AVX DEC
  1231. ret
  1232. ENDPROC(aesni_gcm_dec_avx_gen2)
  1233. #endif /* CONFIG_AS_AVX */
  1234. #ifdef CONFIG_AS_AVX2
  1235. ###############################################################################
  1236. # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  1237. # Input: A and B (128-bits each, bit-reflected)
  1238. # Output: C = A*B*x mod poly, (i.e. >>1 )
  1239. # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  1240. # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  #
  # The result is returned in GH (overwritten in place); HK is only read.
  # T1, T2 and T3 are scratch. T4 and T5 are accepted but never referenced in
  # the body.
  1241. ###############################################################################
  1242. .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
  1243. vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
  1244. vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
  1245. vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
  1246. vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
  1247. vpxor \T3, \GH, \GH # GH = sum of the two cross products
  1248. vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
  1249. vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
  1250. vpxor \T3, \T1, \T1 # T1 = high 128 bits of the product
  1251. vpxor \T2, \GH, \GH # GH = low 128 bits of the product
  1252. #######################################################################
  1253. #first phase of the reduction
  1254. vmovdqa POLY2(%rip), \T3 # T3 = reduction constant POLY2 (defined outside this chunk)
  1255. vpclmulqdq $0x01, \GH, \T3, \T2
  1256. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  1257. vpxor \T2, \GH, \GH # first phase of the reduction complete
  1258. #######################################################################
  1259. #second phase of the reduction
  1260. vpclmulqdq $0x00, \GH, \T3, \T2
  1261. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1262. vpclmulqdq $0x10, \GH, \T3, \GH
  1263. vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1264. vpxor \T2, \GH, \GH # second phase of the reduction complete
  1265. #######################################################################
  1266. vpxor \T1, \GH, \GH # the result is in GH
  1267. .endm
  # Precompute the powers HashKey^2 .. HashKey^8 (each <<1 mod poly) from HK and
  # store them at HashKey_2(arg1) .. HashKey_8(arg1).
  # HK is only read; T5 carries the running power; T1, T2, T3, T4 and T6 are
  # handed to GHASH_MUL_AVX2 as its temporaries.
  # The stores use vmovdqa, so the HashKey_i(arg1) slots must be 16-byte aligned.
  1268. .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
  1269. # HashKey_i_k would hold the XOR of the low and high halves of HashKey_i; this macro does not store any HashKey_i_k value
  1270. vmovdqa \HK, \T5 # T5 = HashKey<<1 mod poly, the starting power
  1271. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
  1272. vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
  1273. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
  1274. vmovdqa \T5, HashKey_3(arg1) # [HashKey_3] = HashKey^3<<1 mod poly
  1275. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
  1276. vmovdqa \T5, HashKey_4(arg1) # [HashKey_4] = HashKey^4<<1 mod poly
  1277. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
  1278. vmovdqa \T5, HashKey_5(arg1) # [HashKey_5] = HashKey^5<<1 mod poly
  1279. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
  1280. vmovdqa \T5, HashKey_6(arg1) # [HashKey_6] = HashKey^6<<1 mod poly
  1281. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
  1282. vmovdqa \T5, HashKey_7(arg1) # [HashKey_7] = HashKey^7<<1 mod poly
  1283. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
  1284. vmovdqa \T5, HashKey_8(arg1) # [HashKey_8] = HashKey^8<<1 mod poly
  1285. .endm
  1286. ## if a = number of total plaintext bytes
  1287. ## b = floor(a/16)
  1288. ## num_initial_blocks = b mod 8
  1289. ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
  1290. ## r10, r11, r12, rax are clobbered
  1291. ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
  ##
  ## Register roles, as used by the body below:
  ##   T2            - hash key operand passed to every GHASH_MUL_AVX2 call
  ##   T1, T4-T6     - temporaries; T3 receives a copy of the GHASH value at the end
  ##   CTR           - running counter block, kept un-swapped between uses
  ##   XMM1..XMM8    - the next 8 blocks when at least 128 bytes remain
  ##   T_key         - current AES round key
  ##   ENC_DEC       - ENC or DEC; for DEC the input block (ciphertext) is what gets hashed
  ##   VER           - not referenced in the body
  ## reg_i / reg_j come from `setreg`, which is defined outside this chunk;
  ## presumably it rebinds them to %xmm<i> / %xmm<j> - confirm.
  ## The macro also reads arg5 (Y0), arg6 (AAD), arg7 (aadLen) and r13, and
  ## writes TMP1(%rsp).
  1292. .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
  1293. i = (8-\num_initial_blocks)
  1294. setreg
  # ---- load the AAD into reg_i, one dword per iteration ----
  1295. mov arg6, %r10 # r10 = AAD
  1296. mov arg7, %r12 # r12 = aadLen
  1297. mov %r12, %r11 # keep an unmodified copy of aadLen for the padding loop
  1298. vpxor reg_i, reg_i, reg_i
  # NOTE(review): each iteration reads a whole dword and everything is
  # collected in a single 128-bit register, so this presumably relies on
  # aadLen being a multiple of 4 and at most 16 - confirm against callers.
  1299. _get_AAD_loop\@:
  1300. vmovd (%r10), \T1
  1301. vpslldq $12, \T1, \T1 # move the new dword to the top of T1
  1302. vpsrldq $4, reg_i, reg_i # make room: shift what was gathered so far down one dword
  1303. vpxor \T1, reg_i, reg_i
  1304. add $4, %r10
  1305. sub $4, %r12
  1306. jg _get_AAD_loop\@
  1307. cmp $16, %r11
  1308. je _get_AAD_loop2_done\@ # exactly 16 bytes of AAD: no padding shifts needed
  1309. mov $16, %r12
  # keep shifting right one dword at a time until r12 (16, 12, 8, ...) is no
  # longer greater than aadLen
  1310. _get_AAD_loop2\@:
  1311. vpsrldq $4, reg_i, reg_i
  1312. sub $4, %r12
  1313. cmp %r11, %r12
  1314. jg _get_AAD_loop2\@
  1315. _get_AAD_loop2_done\@:
  1316. #byte-reflect the AAD data
  1317. vpshufb SHUF_MASK(%rip), reg_i, reg_i
  1318. # initialize the data pointer offset as zero
  1319. xor %r11, %r11
  1320. # start AES for num_initial_blocks blocks
  1321. mov arg5, %rax # rax = *Y0
  1322. vmovdqu (%rax), \CTR # CTR = Y0
  1323. vpshufb SHUF_MASK(%rip), \CTR, \CTR
  # build one counter block per initial block in reg_(9-n) .. reg_8
  1324. i = (9-\num_initial_blocks)
  1325. setreg
  1326. .rep \num_initial_blocks
  1327. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1328. vmovdqa \CTR, reg_i
  1329. vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
  1330. i = (i+1)
  1331. setreg
  1332. .endr
  # AES round 0: XOR with the first round key at (arg1)
  1333. vmovdqa (arg1), \T_key
  1334. i = (9-\num_initial_blocks)
  1335. setreg
  1336. .rep \num_initial_blocks
  1337. vpxor \T_key, reg_i, reg_i
  1338. i = (i+1)
  1339. setreg
  1340. .endr
  # AES rounds 1..9 (round keys at 16*1 .. 16*9 from arg1); together with the
  # final round below, the round count is fixed at 10
  1341. j = 1
  1342. setreg
  1343. .rep 9
  1344. vmovdqa 16*j(arg1), \T_key
  1345. i = (9-\num_initial_blocks)
  1346. setreg
  1347. .rep \num_initial_blocks
  1348. vaesenc \T_key, reg_i, reg_i
  1349. i = (i+1)
  1350. setreg
  1351. .endr
  1352. j = (j+1)
  1353. setreg
  1354. .endr
  # final AES round with the round key at 16*10(arg1)
  1355. vmovdqa 16*10(arg1), \T_key
  1356. i = (9-\num_initial_blocks)
  1357. setreg
  1358. .rep \num_initial_blocks
  1359. vaesenclast \T_key, reg_i, reg_i
  1360. i = (i+1)
  1361. setreg
  1362. .endr
  # XOR the keystream with the input at arg3 and store the result at arg2
  1363. i = (9-\num_initial_blocks)
  1364. setreg
  1365. .rep \num_initial_blocks
  1366. vmovdqu (arg3, %r11), \T1
  1367. vpxor \T1, reg_i, reg_i
  1368. vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
  1369. # num_initial_blocks blocks
  1370. add $16, %r11
  1371. .if \ENC_DEC == DEC
  1372. vmovdqa \T1, reg_i # when decrypting, GHASH runs over the input (ciphertext) block
  1373. .endif
  1374. vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
  1375. i = (i+1)
  1376. setreg
  1377. .endr
  # GHASH chain: reg_(8-n) holds the AAD block; each result is XORed into the
  # next register before that register is multiplied by the hash key in T2
  1378. i = (8-\num_initial_blocks)
  1379. j = (9-\num_initial_blocks)
  1380. setreg
  1381. GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
  1382. .rep \num_initial_blocks
  1383. vpxor reg_i, reg_j, reg_j
  1384. GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
  1385. i = (i+1)
  1386. j = (j+1)
  1387. setreg
  1388. .endr
  1389. # XMM8 has the combined result here
  1390. vmovdqa \XMM8, TMP1(%rsp) # saved so it can be folded into XMM1 below
  1391. vmovdqa \XMM8, \T3 # T3 also carries the GHASH value out of the macro
  1392. cmp $128, %r13 # r13 is not set in this macro; it is expected to come from the caller
  1393. jl _initial_blocks_done\@ # no need for precomputed constants
  1394. ###############################################################################
  1395. # r13 >= 128: encrypt the next 8 blocks into XMM1..XMM8; they are not GHASHed here
  1396. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1397. vmovdqa \CTR, \XMM1
  1398. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1399. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1400. vmovdqa \CTR, \XMM2
  1401. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1402. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1403. vmovdqa \CTR, \XMM3
  1404. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1405. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1406. vmovdqa \CTR, \XMM4
  1407. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1408. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1409. vmovdqa \CTR, \XMM5
  1410. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1411. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1412. vmovdqa \CTR, \XMM6
  1413. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1414. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1415. vmovdqa \CTR, \XMM7
  1416. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1417. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1418. vmovdqa \CTR, \XMM8
  1419. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1420. vmovdqa (arg1), \T_key # AES round 0 key
  1421. vpxor \T_key, \XMM1, \XMM1
  1422. vpxor \T_key, \XMM2, \XMM2
  1423. vpxor \T_key, \XMM3, \XMM3
  1424. vpxor \T_key, \XMM4, \XMM4
  1425. vpxor \T_key, \XMM5, \XMM5
  1426. vpxor \T_key, \XMM6, \XMM6
  1427. vpxor \T_key, \XMM7, \XMM7
  1428. vpxor \T_key, \XMM8, \XMM8
  1429. i = 1
  1430. setreg
  1431. .rep 9 # do 9 rounds
  1432. vmovdqa 16*i(arg1), \T_key
  1433. vaesenc \T_key, \XMM1, \XMM1
  1434. vaesenc \T_key, \XMM2, \XMM2
  1435. vaesenc \T_key, \XMM3, \XMM3
  1436. vaesenc \T_key, \XMM4, \XMM4
  1437. vaesenc \T_key, \XMM5, \XMM5
  1438. vaesenc \T_key, \XMM6, \XMM6
  1439. vaesenc \T_key, \XMM7, \XMM7
  1440. vaesenc \T_key, \XMM8, \XMM8
  1441. i = (i+1)
  1442. setreg
  1443. .endr
  1444. vmovdqa 16*i(arg1), \T_key # i is 10 here: key for the final round
  1445. vaesenclast \T_key, \XMM1, \XMM1
  1446. vaesenclast \T_key, \XMM2, \XMM2
  1447. vaesenclast \T_key, \XMM3, \XMM3
  1448. vaesenclast \T_key, \XMM4, \XMM4
  1449. vaesenclast \T_key, \XMM5, \XMM5
  1450. vaesenclast \T_key, \XMM6, \XMM6
  1451. vaesenclast \T_key, \XMM7, \XMM7
  1452. vaesenclast \T_key, \XMM8, \XMM8
  # for each of the 8 blocks: XOR with the input, store the output, and when
  # decrypting keep the input (ciphertext) block in the register for GHASH
  1453. vmovdqu (arg3, %r11), \T1
  1454. vpxor \T1, \XMM1, \XMM1
  1455. vmovdqu \XMM1, (arg2 , %r11)
  1456. .if \ENC_DEC == DEC
  1457. vmovdqa \T1, \XMM1
  1458. .endif
  1459. vmovdqu 16*1(arg3, %r11), \T1
  1460. vpxor \T1, \XMM2, \XMM2
  1461. vmovdqu \XMM2, 16*1(arg2 , %r11)
  1462. .if \ENC_DEC == DEC
  1463. vmovdqa \T1, \XMM2
  1464. .endif
  1465. vmovdqu 16*2(arg3, %r11), \T1
  1466. vpxor \T1, \XMM3, \XMM3
  1467. vmovdqu \XMM3, 16*2(arg2 , %r11)
  1468. .if \ENC_DEC == DEC
  1469. vmovdqa \T1, \XMM3
  1470. .endif
  1471. vmovdqu 16*3(arg3, %r11), \T1
  1472. vpxor \T1, \XMM4, \XMM4
  1473. vmovdqu \XMM4, 16*3(arg2 , %r11)
  1474. .if \ENC_DEC == DEC
  1475. vmovdqa \T1, \XMM4
  1476. .endif
  1477. vmovdqu 16*4(arg3, %r11), \T1
  1478. vpxor \T1, \XMM5, \XMM5
  1479. vmovdqu \XMM5, 16*4(arg2 , %r11)
  1480. .if \ENC_DEC == DEC
  1481. vmovdqa \T1, \XMM5
  1482. .endif
  1483. vmovdqu 16*5(arg3, %r11), \T1
  1484. vpxor \T1, \XMM6, \XMM6
  1485. vmovdqu \XMM6, 16*5(arg2 , %r11)
  1486. .if \ENC_DEC == DEC
  1487. vmovdqa \T1, \XMM6
  1488. .endif
  1489. vmovdqu 16*6(arg3, %r11), \T1
  1490. vpxor \T1, \XMM7, \XMM7
  1491. vmovdqu \XMM7, 16*6(arg2 , %r11)
  1492. .if \ENC_DEC == DEC
  1493. vmovdqa \T1, \XMM7
  1494. .endif
  1495. vmovdqu 16*7(arg3, %r11), \T1
  1496. vpxor \T1, \XMM8, \XMM8
  1497. vmovdqu \XMM8, 16*7(arg2 , %r11)
  1498. .if \ENC_DEC == DEC
  1499. vmovdqa \T1, \XMM8
  1500. .endif
  1501. add $128, %r11 # advance the data offset past the 8 blocks
  1502. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1503. vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
  1504. # the corresponding ciphertext
  1505. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1506. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1507. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1508. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1509. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1510. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1511. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1512. ###############################################################################
  1513. _initial_blocks_done\@:
  1514. .endm
  1515. # encrypt 8 blocks at a time
  1516. # ghash the 8 previously encrypted ciphertext blocks
  1517. # arg1, arg2, arg3 are used as pointers only, not modified
  1518. # r11 is the data offset value
  #
  # On entry XMM1..XMM8 hold the 8 previous (byte-swapped) blocks to be hashed.
  # On exit they hold the 8 new byte-swapped blocks, with the reduced GHASH
  # value XORed into XMM1.
  #   T1-T7    - temporaries (T4/T7/T6 accumulate the high/low/middle products)
  #   CTR      - counter block, updated to the last counter generated here
  #   loop_idx - in_order: add ONE, then byte-swap each counter;
  #              otherwise: add ONEf with no swap (ONEf is defined elsewhere;
  #              presumably the increment for an already byte-swapped counter)
  #   ENC_DEC  - ENC or DEC; selects where and when the output is stored
  # The AES rounds are interleaved with the carry-less multiplies; TMP2..TMP8
  # on the stack hold the previous blocks 2..8 while their registers are reused.
  1519. .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
  1520. vmovdqa \XMM1, \T2 # previous block 1 stays in a register
  1521. vmovdqa \XMM2, TMP2(%rsp)
  1522. vmovdqa \XMM3, TMP3(%rsp)
  1523. vmovdqa \XMM4, TMP4(%rsp)
  1524. vmovdqa \XMM5, TMP5(%rsp)
  1525. vmovdqa \XMM6, TMP6(%rsp)
  1526. vmovdqa \XMM7, TMP7(%rsp)
  1527. vmovdqa \XMM8, TMP8(%rsp)
  1528. .if \loop_idx == in_order
  1529. vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
  1530. vpaddd ONE(%rip), \XMM1, \XMM2
  1531. vpaddd ONE(%rip), \XMM2, \XMM3
  1532. vpaddd ONE(%rip), \XMM3, \XMM4
  1533. vpaddd ONE(%rip), \XMM4, \XMM5
  1534. vpaddd ONE(%rip), \XMM5, \XMM6
  1535. vpaddd ONE(%rip), \XMM6, \XMM7
  1536. vpaddd ONE(%rip), \XMM7, \XMM8
  1537. vmovdqa \XMM8, \CTR
  1538. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1539. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1540. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1541. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1542. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1543. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1544. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1545. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1546. .else
  1547. vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
  1548. vpaddd ONEf(%rip), \XMM1, \XMM2
  1549. vpaddd ONEf(%rip), \XMM2, \XMM3
  1550. vpaddd ONEf(%rip), \XMM3, \XMM4
  1551. vpaddd ONEf(%rip), \XMM4, \XMM5
  1552. vpaddd ONEf(%rip), \XMM5, \XMM6
  1553. vpaddd ONEf(%rip), \XMM6, \XMM7
  1554. vpaddd ONEf(%rip), \XMM7, \XMM8
  1555. vmovdqa \XMM8, \CTR
  1556. .endif
  1557. #######################################################################
  # AES round 0: XOR all 8 counter blocks with the first round key
  1558. vmovdqu (arg1), \T1
  1559. vpxor \T1, \XMM1, \XMM1
  1560. vpxor \T1, \XMM2, \XMM2
  1561. vpxor \T1, \XMM3, \XMM3
  1562. vpxor \T1, \XMM4, \XMM4
  1563. vpxor \T1, \XMM5, \XMM5
  1564. vpxor \T1, \XMM6, \XMM6
  1565. vpxor \T1, \XMM7, \XMM7
  1566. vpxor \T1, \XMM8, \XMM8
  1567. #######################################################################
  # AES rounds 1 and 2
  1568. vmovdqu 16*1(arg1), \T1
  1569. vaesenc \T1, \XMM1, \XMM1
  1570. vaesenc \T1, \XMM2, \XMM2
  1571. vaesenc \T1, \XMM3, \XMM3
  1572. vaesenc \T1, \XMM4, \XMM4
  1573. vaesenc \T1, \XMM5, \XMM5
  1574. vaesenc \T1, \XMM6, \XMM6
  1575. vaesenc \T1, \XMM7, \XMM7
  1576. vaesenc \T1, \XMM8, \XMM8
  1577. vmovdqu 16*2(arg1), \T1
  1578. vaesenc \T1, \XMM1, \XMM1
  1579. vaesenc \T1, \XMM2, \XMM2
  1580. vaesenc \T1, \XMM3, \XMM3
  1581. vaesenc \T1, \XMM4, \XMM4
  1582. vaesenc \T1, \XMM5, \XMM5
  1583. vaesenc \T1, \XMM6, \XMM6
  1584. vaesenc \T1, \XMM7, \XMM7
  1585. vaesenc \T1, \XMM8, \XMM8
  1586. #######################################################################
  # previous block 1 (in T2) times HashKey_8; this initialises the accumulators
  1587. vmovdqa HashKey_8(arg1), \T5
  1588. vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
  1589. vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
  1590. vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
  1591. vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
  1592. vpxor \T5, \T6, \T6 # T6 = sum of the two middle products
  # AES round 3
  1593. vmovdqu 16*3(arg1), \T1
  1594. vaesenc \T1, \XMM1, \XMM1
  1595. vaesenc \T1, \XMM2, \XMM2
  1596. vaesenc \T1, \XMM3, \XMM3
  1597. vaesenc \T1, \XMM4, \XMM4
  1598. vaesenc \T1, \XMM5, \XMM5
  1599. vaesenc \T1, \XMM6, \XMM6
  1600. vaesenc \T1, \XMM7, \XMM7
  1601. vaesenc \T1, \XMM8, \XMM8
  # previous block 2 times HashKey_7, accumulated into T4 (high), T7 (low), T6 (middle)
  1602. vmovdqa TMP2(%rsp), \T1
  1603. vmovdqa HashKey_7(arg1), \T5
  1604. vpclmulqdq $0x11, \T5, \T1, \T3
  1605. vpxor \T3, \T4, \T4
  1606. vpclmulqdq $0x00, \T5, \T1, \T3
  1607. vpxor \T3, \T7, \T7
  1608. vpclmulqdq $0x01, \T5, \T1, \T3
  1609. vpxor \T3, \T6, \T6
  1610. vpclmulqdq $0x10, \T5, \T1, \T3
  1611. vpxor \T3, \T6, \T6
  # AES round 4
  1612. vmovdqu 16*4(arg1), \T1
  1613. vaesenc \T1, \XMM1, \XMM1
  1614. vaesenc \T1, \XMM2, \XMM2
  1615. vaesenc \T1, \XMM3, \XMM3
  1616. vaesenc \T1, \XMM4, \XMM4
  1617. vaesenc \T1, \XMM5, \XMM5
  1618. vaesenc \T1, \XMM6, \XMM6
  1619. vaesenc \T1, \XMM7, \XMM7
  1620. vaesenc \T1, \XMM8, \XMM8
  1621. #######################################################################
  # previous block 3 times HashKey_6
  1622. vmovdqa TMP3(%rsp), \T1
  1623. vmovdqa HashKey_6(arg1), \T5
  1624. vpclmulqdq $0x11, \T5, \T1, \T3
  1625. vpxor \T3, \T4, \T4
  1626. vpclmulqdq $0x00, \T5, \T1, \T3
  1627. vpxor \T3, \T7, \T7
  1628. vpclmulqdq $0x01, \T5, \T1, \T3
  1629. vpxor \T3, \T6, \T6
  1630. vpclmulqdq $0x10, \T5, \T1, \T3
  1631. vpxor \T3, \T6, \T6
  # AES round 5
  1632. vmovdqu 16*5(arg1), \T1
  1633. vaesenc \T1, \XMM1, \XMM1
  1634. vaesenc \T1, \XMM2, \XMM2
  1635. vaesenc \T1, \XMM3, \XMM3
  1636. vaesenc \T1, \XMM4, \XMM4
  1637. vaesenc \T1, \XMM5, \XMM5
  1638. vaesenc \T1, \XMM6, \XMM6
  1639. vaesenc \T1, \XMM7, \XMM7
  1640. vaesenc \T1, \XMM8, \XMM8
  # previous block 4 times HashKey_5
  1641. vmovdqa TMP4(%rsp), \T1
  1642. vmovdqa HashKey_5(arg1), \T5
  1643. vpclmulqdq $0x11, \T5, \T1, \T3
  1644. vpxor \T3, \T4, \T4
  1645. vpclmulqdq $0x00, \T5, \T1, \T3
  1646. vpxor \T3, \T7, \T7
  1647. vpclmulqdq $0x01, \T5, \T1, \T3
  1648. vpxor \T3, \T6, \T6
  1649. vpclmulqdq $0x10, \T5, \T1, \T3
  1650. vpxor \T3, \T6, \T6
  # AES round 6
  1651. vmovdqu 16*6(arg1), \T1
  1652. vaesenc \T1, \XMM1, \XMM1
  1653. vaesenc \T1, \XMM2, \XMM2
  1654. vaesenc \T1, \XMM3, \XMM3
  1655. vaesenc \T1, \XMM4, \XMM4
  1656. vaesenc \T1, \XMM5, \XMM5
  1657. vaesenc \T1, \XMM6, \XMM6
  1658. vaesenc \T1, \XMM7, \XMM7
  1659. vaesenc \T1, \XMM8, \XMM8
  # previous block 5 times HashKey_4
  1660. vmovdqa TMP5(%rsp), \T1
  1661. vmovdqa HashKey_4(arg1), \T5
  1662. vpclmulqdq $0x11, \T5, \T1, \T3
  1663. vpxor \T3, \T4, \T4
  1664. vpclmulqdq $0x00, \T5, \T1, \T3
  1665. vpxor \T3, \T7, \T7
  1666. vpclmulqdq $0x01, \T5, \T1, \T3
  1667. vpxor \T3, \T6, \T6
  1668. vpclmulqdq $0x10, \T5, \T1, \T3
  1669. vpxor \T3, \T6, \T6
  # AES round 7
  1670. vmovdqu 16*7(arg1), \T1
  1671. vaesenc \T1, \XMM1, \XMM1
  1672. vaesenc \T1, \XMM2, \XMM2
  1673. vaesenc \T1, \XMM3, \XMM3
  1674. vaesenc \T1, \XMM4, \XMM4
  1675. vaesenc \T1, \XMM5, \XMM5
  1676. vaesenc \T1, \XMM6, \XMM6
  1677. vaesenc \T1, \XMM7, \XMM7
  1678. vaesenc \T1, \XMM8, \XMM8
  # previous block 6 times HashKey_3
  1679. vmovdqa TMP6(%rsp), \T1
  1680. vmovdqa HashKey_3(arg1), \T5
  1681. vpclmulqdq $0x11, \T5, \T1, \T3
  1682. vpxor \T3, \T4, \T4
  1683. vpclmulqdq $0x00, \T5, \T1, \T3
  1684. vpxor \T3, \T7, \T7
  1685. vpclmulqdq $0x01, \T5, \T1, \T3
  1686. vpxor \T3, \T6, \T6
  1687. vpclmulqdq $0x10, \T5, \T1, \T3
  1688. vpxor \T3, \T6, \T6
  # AES round 8
  1689. vmovdqu 16*8(arg1), \T1
  1690. vaesenc \T1, \XMM1, \XMM1
  1691. vaesenc \T1, \XMM2, \XMM2
  1692. vaesenc \T1, \XMM3, \XMM3
  1693. vaesenc \T1, \XMM4, \XMM4
  1694. vaesenc \T1, \XMM5, \XMM5
  1695. vaesenc \T1, \XMM6, \XMM6
  1696. vaesenc \T1, \XMM7, \XMM7
  1697. vaesenc \T1, \XMM8, \XMM8
  # previous block 7 times HashKey_2
  1698. vmovdqa TMP7(%rsp), \T1
  1699. vmovdqa HashKey_2(arg1), \T5
  1700. vpclmulqdq $0x11, \T5, \T1, \T3
  1701. vpxor \T3, \T4, \T4
  1702. vpclmulqdq $0x00, \T5, \T1, \T3
  1703. vpxor \T3, \T7, \T7
  1704. vpclmulqdq $0x01, \T5, \T1, \T3
  1705. vpxor \T3, \T6, \T6
  1706. vpclmulqdq $0x10, \T5, \T1, \T3
  1707. vpxor \T3, \T6, \T6
  1708. #######################################################################
  # AES round 9 (T5 carries the round key here because T1 is needed next)
  1709. vmovdqu 16*9(arg1), \T5
  1710. vaesenc \T5, \XMM1, \XMM1
  1711. vaesenc \T5, \XMM2, \XMM2
  1712. vaesenc \T5, \XMM3, \XMM3
  1713. vaesenc \T5, \XMM4, \XMM4
  1714. vaesenc \T5, \XMM5, \XMM5
  1715. vaesenc \T5, \XMM6, \XMM6
  1716. vaesenc \T5, \XMM7, \XMM7
  1717. vaesenc \T5, \XMM8, \XMM8
  # previous block 8 times HashKey; the high sum moves from T4 into T1
  1718. vmovdqa TMP8(%rsp), \T1
  1719. vmovdqa HashKey(arg1), \T5
  1720. vpclmulqdq $0x00, \T5, \T1, \T3
  1721. vpxor \T3, \T7, \T7
  1722. vpclmulqdq $0x01, \T5, \T1, \T3
  1723. vpxor \T3, \T6, \T6
  1724. vpclmulqdq $0x10, \T5, \T1, \T3
  1725. vpxor \T3, \T6, \T6
  1726. vpclmulqdq $0x11, \T5, \T1, \T3
  1727. vpxor \T3, \T4, \T1
  # final AES round: the input block is XORed into the last round key first,
  # so vaesenclast produces (keystream XOR input) in one step
  1728. vmovdqu 16*10(arg1), \T5
  1729. i = 0
  1730. j = 1
  1731. setreg
  1732. .rep 8
  1733. vpxor 16*i(arg3, %r11), \T5, \T2
  1734. .if \ENC_DEC == ENC
  1735. vaesenclast \T2, reg_j, reg_j
  1736. .else
  1737. vaesenclast \T2, reg_j, \T3
  1738. vmovdqu 16*i(arg3, %r11), reg_j # keep the input (ciphertext) block for the next GHASH
  1739. vmovdqu \T3, 16*i(arg2, %r11) # store the decrypted block right away
  1740. .endif
  1741. i = (i+1)
  1742. j = (j+1)
  1743. setreg
  1744. .endr
  1745. #######################################################################
  1746. vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
  1747. vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
  1748. vpxor \T3, \T7, \T7
  1749. vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
  1750. #######################################################################
  1751. #first phase of the reduction
  1752. vmovdqa POLY2(%rip), \T3
  1753. vpclmulqdq $0x01, \T7, \T3, \T2
  1754. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  1755. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  1756. #######################################################################
  1757. .if \ENC_DEC == ENC
  1758. vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
  1759. vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
  1760. vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
  1761. vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
  1762. vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
  1763. vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
  1764. vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
  1765. vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
  1766. .endif
  1767. #######################################################################
  1768. #second phase of the reduction
  1769. vpclmulqdq $0x00, \T7, \T3, \T2
  1770. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1771. vpclmulqdq $0x10, \T7, \T3, \T4
  1772. vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1773. vpxor \T2, \T4, \T4 # second phase of the reduction complete
  1774. #######################################################################
  1775. vpxor \T4, \T1, \T1 # the result is in T1
  1776. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1777. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1778. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1779. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1780. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1781. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1782. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1783. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1784. vpxor \T1, \XMM1, \XMM1 # fold the reduced GHASH value into the first new block
  1785. .endm
  1786. # GHASH the last 8 ciphertext blocks.
  # XMM1..XMM8 are multiplied by HashKey_8 .. HashKey respectively (read from
  # the context at arg1) and the products are summed before a single reduction.
  #   T6   - accumulates the high products (a1*b1); holds the final result
  #   T7   - accumulates the low products (a0*b0)
  #   XMM1 - reused to accumulate the Karatsuba middle products
  #          (a1^a0)*(b1^b0), so its input value is destroyed
  #   T2-T5 - temporaries; T1 is not referenced in the body
  1787. .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
  1788. ## Karatsuba Method
  1789. vmovdqa HashKey_8(arg1), \T5
  1790. vpshufd $0b01001110, \XMM1, \T2 # swap the two 64-bit halves
  1791. vpshufd $0b01001110, \T5, \T3
  1792. vpxor \XMM1, \T2, \T2 # T2 = (high ^ low) of the block, in both halves
  1793. vpxor \T5, \T3, \T3 # T3 = (high ^ low) of the hash key power, in both halves
  1794. vpclmulqdq $0x11, \T5, \XMM1, \T6 # T6 = a1*b1
  1795. vpclmulqdq $0x00, \T5, \XMM1, \T7 # T7 = a0*b0
  1796. vpclmulqdq $0x00, \T3, \T2, \XMM1 # XMM1 = (a1^a0)*(b1^b0)
  1797. ######################
  # the same three products for XMM2 with HashKey_7, accumulated
  1798. vmovdqa HashKey_7(arg1), \T5
  1799. vpshufd $0b01001110, \XMM2, \T2
  1800. vpshufd $0b01001110, \T5, \T3
  1801. vpxor \XMM2, \T2, \T2
  1802. vpxor \T5, \T3, \T3
  1803. vpclmulqdq $0x11, \T5, \XMM2, \T4
  1804. vpxor \T4, \T6, \T6
  1805. vpclmulqdq $0x00, \T5, \XMM2, \T4
  1806. vpxor \T4, \T7, \T7
  1807. vpclmulqdq $0x00, \T3, \T2, \T2
  1808. vpxor \T2, \XMM1, \XMM1
  1809. ######################
  # XMM3 with HashKey_6
  1810. vmovdqa HashKey_6(arg1), \T5
  1811. vpshufd $0b01001110, \XMM3, \T2
  1812. vpshufd $0b01001110, \T5, \T3
  1813. vpxor \XMM3, \T2, \T2
  1814. vpxor \T5, \T3, \T3
  1815. vpclmulqdq $0x11, \T5, \XMM3, \T4
  1816. vpxor \T4, \T6, \T6
  1817. vpclmulqdq $0x00, \T5, \XMM3, \T4
  1818. vpxor \T4, \T7, \T7
  1819. vpclmulqdq $0x00, \T3, \T2, \T2
  1820. vpxor \T2, \XMM1, \XMM1
  1821. ######################
  # XMM4 with HashKey_5
  1822. vmovdqa HashKey_5(arg1), \T5
  1823. vpshufd $0b01001110, \XMM4, \T2
  1824. vpshufd $0b01001110, \T5, \T3
  1825. vpxor \XMM4, \T2, \T2
  1826. vpxor \T5, \T3, \T3
  1827. vpclmulqdq $0x11, \T5, \XMM4, \T4
  1828. vpxor \T4, \T6, \T6
  1829. vpclmulqdq $0x00, \T5, \XMM4, \T4
  1830. vpxor \T4, \T7, \T7
  1831. vpclmulqdq $0x00, \T3, \T2, \T2
  1832. vpxor \T2, \XMM1, \XMM1
  1833. ######################
  # XMM5 with HashKey_4
  1834. vmovdqa HashKey_4(arg1), \T5
  1835. vpshufd $0b01001110, \XMM5, \T2
  1836. vpshufd $0b01001110, \T5, \T3
  1837. vpxor \XMM5, \T2, \T2
  1838. vpxor \T5, \T3, \T3
  1839. vpclmulqdq $0x11, \T5, \XMM5, \T4
  1840. vpxor \T4, \T6, \T6
  1841. vpclmulqdq $0x00, \T5, \XMM5, \T4
  1842. vpxor \T4, \T7, \T7
  1843. vpclmulqdq $0x00, \T3, \T2, \T2
  1844. vpxor \T2, \XMM1, \XMM1
  1845. ######################
  # XMM6 with HashKey_3
  1846. vmovdqa HashKey_3(arg1), \T5
  1847. vpshufd $0b01001110, \XMM6, \T2
  1848. vpshufd $0b01001110, \T5, \T3
  1849. vpxor \XMM6, \T2, \T2
  1850. vpxor \T5, \T3, \T3
  1851. vpclmulqdq $0x11, \T5, \XMM6, \T4
  1852. vpxor \T4, \T6, \T6
  1853. vpclmulqdq $0x00, \T5, \XMM6, \T4
  1854. vpxor \T4, \T7, \T7
  1855. vpclmulqdq $0x00, \T3, \T2, \T2
  1856. vpxor \T2, \XMM1, \XMM1
  1857. ######################
  # XMM7 with HashKey_2
  1858. vmovdqa HashKey_2(arg1), \T5
  1859. vpshufd $0b01001110, \XMM7, \T2
  1860. vpshufd $0b01001110, \T5, \T3
  1861. vpxor \XMM7, \T2, \T2
  1862. vpxor \T5, \T3, \T3
  1863. vpclmulqdq $0x11, \T5, \XMM7, \T4
  1864. vpxor \T4, \T6, \T6
  1865. vpclmulqdq $0x00, \T5, \XMM7, \T4
  1866. vpxor \T4, \T7, \T7
  1867. vpclmulqdq $0x00, \T3, \T2, \T2
  1868. vpxor \T2, \XMM1, \XMM1
  1869. ######################
  # XMM8 with HashKey (T5 is left holding HashKey when the macro ends)
  1870. vmovdqa HashKey(arg1), \T5
  1871. vpshufd $0b01001110, \XMM8, \T2
  1872. vpshufd $0b01001110, \T5, \T3
  1873. vpxor \XMM8, \T2, \T2
  1874. vpxor \T5, \T3, \T3
  1875. vpclmulqdq $0x11, \T5, \XMM8, \T4
  1876. vpxor \T4, \T6, \T6
  1877. vpclmulqdq $0x00, \T5, \XMM8, \T4
  1878. vpxor \T4, \T7, \T7
  1879. vpclmulqdq $0x00, \T3, \T2, \T2
  1880. vpxor \T2, \XMM1, \XMM1
  # Karatsuba recombination: middle term = XMM1 ^ T6 ^ T7, then split it
  # across the high (T6) and low (T7) halves of the product
  1881. vpxor \T6, \XMM1, \XMM1
  1882. vpxor \T7, \XMM1, \T2
  1883. vpslldq $8, \T2, \T4
  1884. vpsrldq $8, \T2, \T2
  1885. vpxor \T4, \T7, \T7
  1886. vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
  1887. # accumulated carry-less multiplications
  1888. #######################################################################
  1889. #first phase of the reduction
  1890. vmovdqa POLY2(%rip), \T3
  1891. vpclmulqdq $0x01, \T7, \T3, \T2
  1892. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  1893. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  1894. #######################################################################
  1895. #second phase of the reduction
  1896. vpclmulqdq $0x00, \T7, \T3, \T2
  1897. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1898. vpclmulqdq $0x10, \T7, \T3, \T4
  1899. vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1900. vpxor \T2, \T4, \T4 # second phase of the reduction complete
  1901. #######################################################################
  1902. vpxor \T4, \T6, \T6 # the result is in T6
  1903. .endm
  1904. # combined for GCM encrypt and decrypt functions
  1905. # clobbering all xmm registers
  1906. # clobbering r10, r11, r12, r13, r14, r15
  #
  # ENC_DEC is ENC or DEC and is forwarded to the block macros.
  # Arguments as used below:
  #   arg1 - context holding the AES round keys and the HashKey* slots
  #   arg2 - output buffer          arg3 - input buffer
  #   arg4 - length in bytes (shifted left by 3 in place near the end)
  #   arg5 - pointer to Y0          arg7 - AAD length in bytes
  #   arg8 - auth tag buffer        arg9 - auth tag length (16, 12, otherwise 8 bytes are written)
  # Register roles across the macro:
  #   xmm9 - counter block      xmm13 - hash key      xmm14 - running GHASH value
  #   r11  - data offset into arg2/arg3 (zeroed inside INITIAL_BLOCKS_AVX2)
  #   r13  - bytes of full blocks still to process, later the tail length
  #   r14  - caller's rsp, restored on exit
  #   r15  - low byte of the counter, used to detect a carry out of that byte
  1907. .macro GCM_ENC_DEC_AVX2 ENC_DEC
  1908. #the number of pushes must equal STACK_OFFSET
  1909. push %r12
  1910. push %r13
  1911. push %r14
  1912. push %r15
  1913. mov %rsp, %r14
  1914. sub $VARIABLE_OFFSET, %rsp
  1915. and $~63, %rsp # align rsp to 64 bytes
  1916. vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
  1917. mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1918. and $-16, %r13 # r13 = r13 - (r13 mod 16)
  1919. mov %r13, %r12
  1920. shr $4, %r12 # r12 = number of full 16-byte blocks
  1921. and $7, %r12 # r12 = number of full blocks mod 8 = initial blocks to process
  1922. jz _initial_num_blocks_is_0\@
  1923. cmp $7, %r12
  1924. je _initial_num_blocks_is_7\@
  1925. cmp $6, %r12
  1926. je _initial_num_blocks_is_6\@
  1927. cmp $5, %r12
  1928. je _initial_num_blocks_is_5\@
  1929. cmp $4, %r12
  1930. je _initial_num_blocks_is_4\@
  1931. cmp $3, %r12
  1932. je _initial_num_blocks_is_3\@
  1933. cmp $2, %r12
  1934. je _initial_num_blocks_is_2\@
  1935. jmp _initial_num_blocks_is_1\@
  # each case expands INITIAL_BLOCKS_AVX2 for a fixed block count, then
  # subtracts the bytes it consumed from r13
  1936. _initial_num_blocks_is_7\@:
  1937. INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1938. sub $16*7, %r13
  1939. jmp _initial_blocks_encrypted\@
  1940. _initial_num_blocks_is_6\@:
  1941. INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1942. sub $16*6, %r13
  1943. jmp _initial_blocks_encrypted\@
  1944. _initial_num_blocks_is_5\@:
  1945. INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1946. sub $16*5, %r13
  1947. jmp _initial_blocks_encrypted\@
  1948. _initial_num_blocks_is_4\@:
  1949. INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1950. sub $16*4, %r13
  1951. jmp _initial_blocks_encrypted\@
  1952. _initial_num_blocks_is_3\@:
  1953. INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1954. sub $16*3, %r13
  1955. jmp _initial_blocks_encrypted\@
  1956. _initial_num_blocks_is_2\@:
  1957. INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1958. sub $16*2, %r13
  1959. jmp _initial_blocks_encrypted\@
  1960. _initial_num_blocks_is_1\@:
  1961. INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1962. sub $16*1, %r13
  1963. jmp _initial_blocks_encrypted\@
  1964. _initial_num_blocks_is_0\@:
  1965. INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1966. _initial_blocks_encrypted\@:
  1967. cmp $0, %r13
  1968. je _zero_cipher_left\@ # no full blocks beyond the initial ones
  1969. sub $128, %r13 # account for the 8 blocks INITIAL_BLOCKS_AVX2 already encrypted
  1970. je _eight_cipher_left\@ # only those 8 remain: just GHASH them
  1971. vmovd %xmm9, %r15d
  1972. and $255, %r15d # r15d = low byte of the counter
  1973. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # loop invariant: xmm9 is kept byte-swapped
  # Main loop. If adding 8 would carry out of the counter's low byte, take
  # the in_order path (swap, full-width add, swap back); otherwise use the
  # out_order variant that adds ONEf without swapping.
  1974. _encrypt_by_8_new\@:
  1975. cmp $(255-8), %r15d
  1976. jg _encrypt_by_8\@
  1977. add $8, %r15b
  1978. GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
  1979. add $128, %r11
  1980. sub $128, %r13
  1981. jne _encrypt_by_8_new\@
  1982. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # undo the loop's byte swap of the counter
  1983. jmp _eight_cipher_left\@
  1984. _encrypt_by_8\@:
  1985. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1986. add $8, %r15b # wraps modulo 256 together with the counter's low byte
  1987. GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
  1988. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1989. add $128, %r11
  1990. sub $128, %r13
  1991. jne _encrypt_by_8_new\@
  1992. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  # GHASH the last 8 blocks still held in xmm1..xmm8; result lands in xmm14
  1993. _eight_cipher_left\@:
  1994. GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
  1995. _zero_cipher_left\@:
  1996. cmp $16, arg4
  1997. jl _only_less_than_16\@
  1998. mov arg4, %r13
  1999. and $15, %r13 # r13 = (arg4 mod 16)
  2000. je _multiple_of_16_bytes\@
  2001. # handle the last <16 Byte block separately
  2002. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  2003. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2004. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  # load the 16 bytes that end at the end of the input (overlapping the
  # previous block), then shift the tail bytes down with a shuffle mask
  2005. sub $16, %r11
  2006. add %r13, %r11
  2007. vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
  2008. lea SHIFT_MASK+16(%rip), %r12
  2009. sub %r13, %r12 # adjust the shuffle mask pointer
  2010. # to be able to shift 16-r13 bytes
  2011. # (r13 is the number of bytes in plaintext mod 16)
  2012. vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
  2013. vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
  2014. jmp _final_ghash_mul\@
  2015. _only_less_than_16\@:
  2016. # check for 0 length
  2017. mov arg4, %r13
  2018. and $15, %r13 # r13 = (arg4 mod 16)
  2019. je _multiple_of_16_bytes\@
  2020. # handle the last <16 Byte block separately
  2021. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  2022. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2023. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  2024. lea SHIFT_MASK+16(%rip), %r12
  2025. sub %r13, %r12 # adjust the shuffle mask pointer to be
  2026. # able to shift 16-r13 bytes (r13 is the
  2027. # number of bytes in plaintext mod 16)
  # the whole message is shorter than 16 bytes: copy it byte by byte into
  # TMP1 on the stack so the 16-byte load below never reads past the input
  2028. _get_last_16_byte_loop\@:
  2029. movb (arg3, %r11), %al
  2030. movb %al, TMP1 (%rsp , %r11)
  2031. add $1, %r11
  2032. cmp %r13, %r11
  2033. jne _get_last_16_byte_loop\@
  2034. vmovdqu TMP1(%rsp), %xmm1
  2035. sub $16, %r11 # match the r11 convention of the other path (fixed up after the GHASH)
  2036. _final_ghash_mul\@:
  2037. .if \ENC_DEC == DEC
  2038. vmovdqa %xmm1, %xmm2 # keep the ciphertext: it is what gets hashed when decrypting
  2039. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  2040. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
  2041. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  2042. vpand %xmm1, %xmm2, %xmm2 # apply the same mask to the ciphertext copy
  2043. vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
  2044. vpxor %xmm2, %xmm14, %xmm14
  2045. #GHASH computation for the last <16 Byte block
  2046. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  2047. sub %r13, %r11
  2048. add $16, %r11 # r11 = offset of the first tail byte in the output
  2049. .else
  2050. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  2051. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
  2052. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  2053. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2054. vpxor %xmm9, %xmm14, %xmm14
  2055. #GHASH computation for the last <16 Byte block
  2056. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  2057. sub %r13, %r11
  2058. add $16, %r11 # r11 = offset of the first tail byte in the output
  2059. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
  2060. .endif
  2061. #############################
  2062. # output r13 Bytes
  2063. vmovq %xmm9, %rax
  2064. cmp $8, %r13
  2065. jle _less_than_8_bytes_left\@
  2066. mov %rax, (arg2 , %r11) # more than 8 bytes: store the low 8 at once
  2067. add $8, %r11
  2068. vpsrldq $8, %xmm9, %xmm9
  2069. vmovq %xmm9, %rax
  2070. sub $8, %r13
  # store the remaining 1..8 bytes one at a time from rax
  2071. _less_than_8_bytes_left\@:
  2072. movb %al, (arg2 , %r11)
  2073. add $1, %r11
  2074. shr $8, %rax
  2075. sub $1, %r13
  2076. jne _less_than_8_bytes_left\@
  2077. #############################
  # fold the length block len(A)||len(C) into the GHASH, then compute the tag
  2078. _multiple_of_16_bytes\@:
  2079. mov arg7, %r12 # r12 = aadLen (number of bytes)
  2080. shl $3, %r12 # convert into number of bits
  2081. vmovd %r12d, %xmm15 # len(A) in xmm15
  2082. shl $3, arg4 # len(C) in bits (bytes * 8); arg4 itself is modified
  2083. vmovq arg4, %xmm1
  2084. vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
  2085. vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
  2086. vpxor %xmm15, %xmm14, %xmm14
  2087. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
  2088. vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
  2089. mov arg5, %rax # rax = *Y0
  2090. vmovdqu (%rax), %xmm9 # xmm9 = Y0
  2091. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
  2092. vpxor %xmm14, %xmm9, %xmm9 # xmm9 = E(K, Y0) XOR GHASH = authentication tag
  2093. _return_T\@:
  2094. mov arg8, %r10 # r10 = authTag
  2095. mov arg9, %r11 # r11 = auth_tag_len
  2096. cmp $16, %r11
  2097. je _T_16\@
  2098. cmp $12, %r11
  2099. je _T_12\@
  # any length other than 16 or 12 falls through and writes 8 bytes
  2100. _T_8\@:
  2101. vmovq %xmm9, %rax
  2102. mov %rax, (%r10)
  2103. jmp _return_T_done\@
  2104. _T_12\@:
  2105. vmovq %xmm9, %rax
  2106. mov %rax, (%r10)
  2107. vpsrldq $8, %xmm9, %xmm9
  2108. vmovd %xmm9, %eax
  2109. mov %eax, 8(%r10)
  2110. jmp _return_T_done\@
  2111. _T_16\@:
  2112. vmovdqu %xmm9, (%r10)
  2113. _return_T_done\@:
  2114. mov %r14, %rsp # discard the aligned scratch area
  2115. pop %r15
  2116. pop %r14
  2117. pop %r13
  2118. pop %r12
  2119. .endm
  2120. #############################################################
  2121. #void aesni_gcm_precomp_avx_gen4
  2122. # (gcm_data *my_ctx_data,
  2123. # u8 *hash_subkey)# /* H, the Hash sub key input.
  2124. # Data starts on a 16-byte boundary. */
  2125. #############################################################
  2126. ENTRY(aesni_gcm_precomp_avx_gen4) # derive HashKey<<1 mod poly from the hash subkey at (arg2), store it at HashKey(arg1), then run PRECOMPUTE_AVX2 on it
  2127. #the number of pushes must equal STACK_OFFSET
  2128. push %r12 # save r12-r15; they are popped in reverse order just before ret
  2129. push %r13
  2130. push %r14
  2131. push %r15
  2132. mov %rsp, %r14 # keep the unaligned stack pointer in r14 so it can be restored before the pops
  2133. sub $VARIABLE_OFFSET, %rsp # reserve VARIABLE_OFFSET bytes below the saved registers (constant defined outside this view)
  2134. and $~63, %rsp # align rsp to 64 bytes
  2135. vmovdqu (arg2), %xmm6 # xmm6 = HashKey
  2136. vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 # reorder the bytes of xmm6 through SHUF_MASK (the same shuffle is described as a 16Byte swap elsewhere in this file)
  2137. ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
  2138. vmovdqa %xmm6, %xmm2 # xmm2 = copy of the unshifted key, used to recover the bits shifted out
  2139. vpsllq $1, %xmm6, %xmm6 # shift each 64-bit half left by one bit (bit 63 of each half is dropped here)
  2140. vpsrlq $63, %xmm2, %xmm2 # xmm2 = top bit of each 64-bit half, moved down to bit 0 of that half
  2141. vmovdqa %xmm2, %xmm1 # xmm1 = second copy of the two carried bits
  2142. vpslldq $8, %xmm2, %xmm2 # move the carry of the low half up into bit 0 of the high half
  2143. vpsrldq $8, %xmm1, %xmm1 # move the carry of the high half (bit 127 of the key) down into bit 0 of xmm1
  2144. vpor %xmm2, %xmm6, %xmm6 # xmm6 = full 128-bit HashKey<<1, with the low-to-high carry merged in
  2145. #reduction
  2146. vpshufd $0b00100100, %xmm1, %xmm2 # xmm2 dwords 0 and 3 = dword 0 of xmm1 (the bit shifted out of bit 127); dwords 1 and 2 copied unchanged
  2147. vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 # per-dword compare with TWOONE: all-ones where equal, zero elsewhere (TWOONE is defined outside this view)
  2148. vpand POLY(%rip), %xmm2, %xmm2 # keep the POLY bits selected by that mask; presumably POLY when a bit was carried out and zero otherwise - TODO confirm against the TWOONE and POLY definitions
  2149. vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
  2150. #######################################################################
  2151. vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
  2152. PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 # macro defined outside this view; receives the reduced key in xmm6 and xmm0-xmm5 as further register arguments
  2153. mov %r14, %rsp # drop the aligned scratch area: restore the stack pointer saved in r14
  2154. pop %r15 # restore the saved registers in reverse push order
  2155. pop %r14
  2156. pop %r13
  2157. pop %r12
  2158. ret
  2159. ENDPROC(aesni_gcm_precomp_avx_gen4)
  2160. ###############################################################################
  2161. #void aesni_gcm_enc_avx_gen4(
  2162. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2163. # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
  2164. # const u8 *in, /* Plaintext input */
  2165. # u64 plaintext_len, /* Length of data in Bytes for encryption. */
  2166. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  2167. # (from Security Association) concatenated with 8 byte
  2168. # Initialisation Vector (from IPSec ESP Payload)
  2169. # concatenated with 0x00000001. 16-byte aligned pointer. */
  2170. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  2171. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  2172. # u8 *auth_tag, /* Authenticated Tag output. */
  2173. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  2174. # Valid values are 16 (most likely), 12 or 8. */
  2175. ###############################################################################
  2176. ENTRY(aesni_gcm_enc_avx_gen4) # encrypt entry point; the whole body is one expansion of GCM_ENC_DEC_AVX2
  2177. GCM_ENC_DEC_AVX2 ENC # expand the shared macro with ENC as its argument
  2178. ret # the macro expansion ends by restoring rsp from r14 and popping r15-r12, so only the return is left
  2179. ENDPROC(aesni_gcm_enc_avx_gen4)
  2180. ###############################################################################
  2181. #void aesni_gcm_dec_avx_gen4(
  2182. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2183. # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
  2184. # const u8 *in, /* Ciphertext input */
  2185. # u64 plaintext_len, /* Length of data in Bytes for decryption. */
  2186. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  2187. # (from Security Association) concatenated with 8 byte
  2188. # Initialisation Vector (from IPSec ESP Payload)
  2189. # concatenated with 0x00000001. 16-byte aligned pointer. */
  2190. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  2191. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  2192. # u8 *auth_tag, /* Authenticated Tag output. */
  2193. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  2194. # Valid values are 16 (most likely), 12 or 8. */
  2195. ###############################################################################
  2196. ENTRY(aesni_gcm_dec_avx_gen4) # decrypt entry point; the whole body is one expansion of GCM_ENC_DEC_AVX2
  2197. GCM_ENC_DEC_AVX2 DEC # expand the shared macro with DEC as its argument
  2198. ret # the macro expansion ends by restoring rsp from r14 and popping r15-r12, so only the return is left
  2199. ENDPROC(aesni_gcm_dec_avx_gen4)
  2200. #endif /* CONFIG_AS_AVX2 */