mvpkm_main.c

/*
 * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support
 *
 * Copyright (C) 2010-2013 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief The kernel level driver.
 */

#define __KERNEL_SYSCALLS__
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/proc_fs.h>
#include <linux/fcntl.h>
#include <linux/syscalls.h>
#include <linux/kmod.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/miscdevice.h>
#include <linux/poll.h>
#include <linux/smp.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sysfs.h>
#include <linux/debugfs.h>
#include <linux/pid.h>
#include <linux/highmem.h>
#ifdef CONFIG_HAS_WAKELOCK
#include <linux/wakelock.h>
#endif
#include <net/sock.h>

#include <asm/cacheflush.h>
#include <asm/memory.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <linux/uaccess.h>

#include "mvp.h"
#include "mvp_version.h"
#include "mvpkm_types.h"
#include "mvpkm_private.h"
#include "mvpkm_kernel.h"
#include "actions.h"
#include "wscalls.h"
#include "arm_inline.h"
#include "tsc.h"
#include "mksck_kernel.h"
#include "mmu_types.h"
#include "mvp_timer.h"
#include "qp.h"
#include "qp_host_kernel.h"
#include "cpufreq_kernel.h"
#include "mvpkm_comm_ev.h"
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
#include "mvp_balloon.h"
#endif

/*
 * Definition of the file operations
 */
static _Bool
LockedListAdd(struct MvpkmVM *vm,
              __u32 mpn,
              __u32 order,
              PhysMem_RegionType forRegion);
static _Bool LockedListDel(struct MvpkmVM *vm, __u32 mpn);
static void LockedListUnlockAll(struct MvpkmVM *vm);
static _Bool LockedListLookup(struct MvpkmVM *vm, __u32 mpn);
static int SetupMonitor(struct MvpkmVM *vm);
static int RunMonitor(struct MvpkmVM *vm);
static MPN
AllocZeroedFreePages(struct MvpkmVM *vm,
                     uint32 order,
                     _Bool highmem,
                     PhysMem_RegionType forRegion,
                     HKVA *hkvaRet);
static HKVA MapWSPHKVA(struct MvpkmVM *vm, HkvaMapInfo *mapInfo);
static void UnmapWSPHKVA(struct MvpkmVM *vm);
static int MvpkmWaitForInt(struct MvpkmVM *vm, _Bool suspend);
static void ReleaseVM(struct MvpkmVM *vm);

/*
 * Mksck open requests must come from this uid. It remains root until
 * it is set via an ioctl from mvpd.
 */
uid_t Mvpkm_vmwareUid;
EXPORT_SYMBOL(Mvpkm_vmwareUid);
gid_t Mvpkm_vmwareGid;
EXPORT_SYMBOL(Mvpkm_vmwareGid);

/*
 * Mvpd should copy the contents of /sys/module/lowmemorykiller/parameters/adj
 * here, as we don't have access to these numbers within the kernel itself.
 * Note: Android uses 6 values, and we rely on this.
 */
static int lowmemAdjSize;
static int lowmemAdj[6];

/*
 * vCPU cpu affinity to let monitor/guest run on some CPUs only (when possible)
 */
static DECLARE_BITMAP(vcpuAffinity, NR_CPUS);

/*
 * Which CPUs are running a monitor?
 */
struct cpumask inMonitor;

/*********************************************************************
 *
 * Sysfs nodes
 *
 *********************************************************************/

/*
 * kobject for our sysfs representation, used for global nodes.
 */
static struct kobject *mvpkmKObj;

/*
 * kobject for the balloon exports.
 */
static struct kobject *balloonKObj;

/**
 * @brief sysfs show function for global version attribute.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr kobj_attribute reference, not used.
 * @param buf PAGE_SIZEd buffer to write to.
 *
 * @return number of characters printed (not including trailing null character).
 */
static ssize_t
version_show(struct kobject *kobj,
             struct kobj_attribute *attr,
             char *buf)
{
        return snprintf(buf, PAGE_SIZE,
                        MVP_VERSION_FORMATSTR "\n", MVP_VERSION_FORMATARGS);
}

static struct kobj_attribute versionAttr = __ATTR_RO(version);

/**
 * @brief sysfs show function for global background_pages attribute.
 *
 * Used by the vmx balloon policy controller to gauge the amount of freeable
 * anonymous memory.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr kobj_attribute reference, not used.
 * @param buf PAGE_SIZEd buffer to write to.
 *
 * @return number of characters printed (not including trailing null character).
 */
static ssize_t
background_show(struct kobject *kobj,
                struct kobj_attribute *attr,
                char *buf)
{
#ifndef CONFIG_ANDROID_LOW_MEMORY_KILLER
        return snprintf(buf, PAGE_SIZE, "0\n");
#else
        /* The HIDDEN_APP_MIN_ADJ value is the 5th in a list of 6 parameters. */
        FATAL_IF(lowmemAdjSize != 6);

        return snprintf(buf, PAGE_SIZE, "%d\n",
                        Balloon_AndroidBackgroundPages(lowmemAdj[4]));
#endif
}

static struct kobj_attribute backgroundAttr = __ATTR_RO(background);

/**
 * @brief sysfs show function to export the other_file calculation in
 *        lowmemorykiller.
 *
 * It's helpful, in the balloon controller, to know what the lowmemorykiller
 * module is using to decide when the system has crossed a minfree threshold.
 * Since a number of different other_file calculations exist in various
 * lowmemorykiller patches (@see{MVP-1674}), and the module itself doesn't
 * provide a clean export of this figure, we provide it on a case-by-case basis
 * for the various supported hosts here.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr kobj_attribute reference, not used.
 * @param buf PAGE_SIZEd buffer to write to.
 *
 * @return number of characters printed (not including trailing null character).
 */
static ssize_t
other_file_show(struct kobject *kobj,
                struct kobj_attribute *attr,
                char *buf)
{
        int32 other_file = 0;

#ifndef LOWMEMKILLER_VARIANT
#define LOWMEMKILLER_VARIANT 0
#endif
#ifndef LOWMEMKILLER_MD5
#define LOWMEMKILLER_MD5 0
#endif
#ifndef LOWMEMKILLER_SHRINK_MD5
#define LOWMEMKILLER_SHRINK_MD5 0
#endif

        /*
         * The build system hashes the lowmemorykiller section related to the
         * other_file calculation in the kernel source for us; here we have to
         * provide the matching code.
         */
#if LOWMEMKILLER_VARIANT == 1
        /*
         * This is the same as the non-exported global_reclaimable_pages()
         * when there is no swap.
         */
        other_file = global_page_state(NR_ACTIVE_FILE) +
                     global_page_state(NR_INACTIVE_FILE);
#elif LOWMEMKILLER_VARIANT == 2
        other_file = global_page_state(NR_FILE_PAGES);
#elif LOWMEMKILLER_VARIANT == 3
        other_file = global_page_state(NR_FILE_PAGES) -
                     global_page_state(NR_SHMEM);
#elif LOWMEMKILLER_VARIANT == 4
        /*
         * Here free/file pages are fungible and max(free, file) isn't used,
         * but we can continue to use max(free, file) since
         * max(free, file) = other_file in this case.
         */
        other_file = global_page_state(NR_FREE_PAGES) +
                     global_page_state(NR_FILE_PAGES);
#elif LOWMEMKILLER_VARIANT == 5
        /*
         * other_free and other_file are modified depending on zone index
         * and/or memory offlining, and compared to
         * "lowmem_minfree[i] - zone_adj".
         */
        other_file = global_page_state(NR_FILE_PAGES) -
                     global_page_state(NR_SHMEM);
#elif defined(NONANDROID)
        /*
         * Non-Android host platforms don't have ballooning enabled.
         */
#else
        /*
         * If you get this message, you need to run 'make lowmem-info' and
         * inspect lowmemorykiller.c. If the "other_file = ..." calculation in
         * lowmem_shrink appears above, simply add the "Shrink#" to an existing
         * entry in lowmemkiller-variant.sh, pointing to the variant number
         * above. Otherwise, provide a new entry and variant number above,
         * with the appropriate other_file calculation, and update
         * lowmemkiller-variant.sh accordingly.
         */
        /*
         * Fall back on the default - this may bias strangely for/against the
         * host, but nothing catastrophic should result.
         */
        /* other_file = global_page_state(NR_FILE_PAGES); */
        other_file = global_page_state(NR_FILE_PAGES) -
                     global_page_state(NR_SHMEM);
#endif

#define _STRINGIFY(x) (#x)
#define STRINGIFY(x) _STRINGIFY(x)
        return snprintf(buf, PAGE_SIZE, "%d %d %s %s\n", other_file,
                        LOWMEMKILLER_VARIANT, STRINGIFY(LOWMEMKILLER_MD5),
                        STRINGIFY(LOWMEMKILLER_SHRINK_MD5));
#undef _STRINGIFY
#undef STRINGIFY
}

static struct kobj_attribute otherFileAttr = __ATTR_RO(other_file);
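
/*
 * A read of other_file therefore returns four fields, e.g.
 * "51234 3 <md5> <md5>" (values illustrative): the other_file page count,
 * the detected lowmemorykiller variant number, and the two section hashes
 * baked in at build time.
 */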
/*********************************************************************
 *
 * Debugfs nodes
 *
 *********************************************************************/

static struct dentry *mvpDebugDentry;

/**
 * @brief debugfs show function for global inMonitor
 * @param m seq_file reference
 * @param private ignored
 * @return 0 for success
 */
static int
InMonitorShow(struct seq_file *m,
              void *private)
{
        seq_bitmap_list(m, cpumask_bits(&inMonitor), nr_cpumask_bits);
        seq_puts(m, "\n");
        return 0;
}

/**
 * @brief debugfs open function for global inMonitor
 * @param inode inode
 * @param file file
 * @return result of single_open
 */
static int
InMonitorOpen(struct inode *inode,
              struct file *file)
{
        return single_open(file, InMonitorShow, NULL);
}

static const struct file_operations inMonitorFops = {
        .open    = InMonitorOpen,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

/*
 * kset for our sysfs representation, used for per-VM nodes.
 */
static struct kset *mvpkmKSet;

static ssize_t
MvpkmAttrShow(struct kobject *kobj,
              struct attribute *attr,
              char *buf);
static ssize_t
MvpkmAttrStore(struct kobject *kobj,
               struct attribute *attr,
               const char *buf,
               size_t count);

static void MvpkmKObjRelease(struct kobject *kobj)
        __attribute__((optimize("-fomit-frame-pointer")));

/**
 * @brief Releases the vm structure containing the kobject.
 *
 * @param kobj the vm's kobject.
 */
static void
MvpkmKObjRelease(struct kobject *kobj)
{
        struct MvpkmVM *vm = container_of(kobj, struct MvpkmVM, kobj);

        ReleaseVM(vm);
        module_put(THIS_MODULE);
}

/**
 * @name mvpkm ktype attribute structures for locked_pages.
 *
 * @{
 */
static const struct sysfs_ops mvpkmSysfsOps = {
        .show  = MvpkmAttrShow,
        .store = MvpkmAttrStore
};

static struct attribute mvpkmLockedPagesAttr = {
        .name = "locked_pages",
        .mode = 0444,
};

static struct attribute mvpkmBalloonWatchdogAttr = {
        .name = "balloon_watchdog",
        .mode = 0444
};

static struct attribute mvpkmMonitorAttr = {
        .name = "monitor",
        .mode = 0400,
};

static struct attribute *mvpkmDefaultAttrs[] = {
        &mvpkmLockedPagesAttr,
        &mvpkmBalloonWatchdogAttr,
        &mvpkmMonitorAttr,
        NULL,
};

static struct kobj_type mvpkmKType = {
        .sysfs_ops     = &mvpkmSysfsOps,
        .release       = MvpkmKObjRelease,
        .default_attrs = mvpkmDefaultAttrs,
};
/*@}*/

/*
 * As it is not very common for host kernels to have SYS_HYPERVISOR enabled and
 * you have to "hack" a Kconfig file to enable it, just include the
 * functionality inline if it is not enabled.
 */
#ifndef CONFIG_SYS_HYPERVISOR
struct kobject *hypervisor_kobj;
EXPORT_SYMBOL_GPL(hypervisor_kobj);
#endif

/*
 * kobject and kset utilities.
 */
extern struct kobject *kset_find_obj(struct kset *, const char *)
        __attribute__((weak));

/**
 * @brief Finds a kobject in a kset. The actual implementation is copied from
 *        kernel source in lib/kobject.c. Although the symbol is
 *        extern-declared, it is not EXPORT_SYMBOL-ed. We use a weak reference
 *        in case the symbol might be exported in future kernel versions.
 *
 * @param kset set to search.
 * @param name object name.
 *
 * @return retained kobject if found, NULL otherwise.
 */
struct kobject *
kset_find_obj(struct kset *kset,
              const char *name)
{
        struct kobject *k;
        struct kobject *ret = NULL;

        spin_lock(&kset->list_lock);
        list_for_each_entry(k, &kset->list, entry) {
                if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
                        ret = kobject_get(k);
                        break;
                }
        }
        spin_unlock(&kset->list_lock);

        return ret;
}

/**
 * @brief Finds one of the VM's pre-defined ksets.
 *
 * @param vmID a VM ID.
 * @param name name of one of the VM's pre-defined ksets.
 *
 * @return retained kset if found, NULL otherwise.
 */
struct kset *
Mvpkm_FindVMNamedKSet(int vmID,
                      const char *name)
{
        struct MvpkmVM *vm;
        struct kobject *kobj;
        char vmName[32] = {}; /* Large enough for externally-formatted int32. */
        struct kset *res = NULL;

        if (!mvpkmKSet)
                return NULL;

        snprintf(vmName, sizeof(vmName), "%d", vmID);
        /* Always null-terminate, no overflow. */
        vmName[sizeof(vmName) - 1] = '\0';

        kobj = kset_find_obj(mvpkmKSet, vmName);
        if (!kobj)
                return NULL;

        vm = container_of(kobj, struct MvpkmVM, kobj);

        if (!strcmp(name, "devices"))
                res = kset_get(vm->devicesKSet);
        else if (!strcmp(name, "misc"))
                res = kset_get(vm->miscKSet);

        kobject_put(kobj);

        return res;
}
EXPORT_SYMBOL(Mvpkm_FindVMNamedKSet);
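
/*
 * Example usage from a sibling module (a sketch; 'vmId' and the work done
 * with the kset are hypothetical):
 *
 *     struct kset *misc = Mvpkm_FindVMNamedKSet(vmId, "misc");
 *
 *     if (misc) {
 *             ... create kobjects parented under 'misc' ...
 *             kset_put(misc);  // release the reference taken by the lookup
 *     }
 *
 * The returned kset is retained, so the caller must kset_put() it.
 */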
/*********************************************************************
 *
 * Standard Linux miscellaneous device registration
 *
 *********************************************************************/

MODULE_LICENSE("GPL"); /* for kallsyms_lookup_name */

static int MvpkmFault(struct vm_area_struct *vma, struct vm_fault *vmf);

/**
 * @brief Linux vma operations for /dev/mem-like kernel module mmap. We
 *        enforce the restriction that only MPNs that have been allocated
 *        to the opened VM may be mapped, and also increment the reference
 *        count (via vm_insert_page), so that even if the memory is later
 *        freed by the VM, host process vma's containing the MPN can't
 *        compromise the system.
 *
 *        However, only trusted host processes (e.g. the vmx) should be allowed
 *        to use this interface, since you can mmap the monitor's code/data/
 *        page tables etc. with it. Untrusted host processes are limited to
 *        typed messages for sharing memory with the monitor. Unix file system
 *        access permissions are the intended method of restricting access.
 *        Unfortunately, today _any_ host process utilizing Mksck requires
 *        access to mvpkm to set up its Mksck pages and obtain socket info via
 *        ioctls - we probably should be exporting two devices, one for trusted
 *        and one for arbitrary host processes, to avoid this confusion of
 *        concerns.
 */
static struct vm_operations_struct mvpkmVMOps = {
        .fault = MvpkmFault
};

/*
 * Generic kernel module file ops. These functions will be registered
 * at the time the kernel module is loaded.
 */
static long
MvpkmUnlockedIoctl(struct file *filep,
                   unsigned int cmd,
                   unsigned long arg);
static int MvpkmOpen(struct inode *inode, struct file *filp);
static int MvpkmRelease(struct inode *inode, struct file *filp);
static int MvpkmMMap(struct file *filp, struct vm_area_struct *vma);

/**
 * @brief the file_operations structure contains the callback functions
 *        that are registered with Linux to handle file operations on
 *        the mvpkm device.
 *
 *        The structure contains other members that the mvpkm device
 *        does not use. Those members are auto-initialized to NULL.
 *
 *        WARNING: this structure changed after Linux kernel 2.6.19:
 *        readv/writev were replaced by aio_read/aio_write (neither is
 *        used here).
 */
static const struct file_operations mvpkmFileOps = {
        .owner          = THIS_MODULE,
        .unlocked_ioctl = MvpkmUnlockedIoctl,
        .open           = MvpkmOpen,
        .release        = MvpkmRelease,
        .mmap           = MvpkmMMap
};

/**
 * @brief The mvpkm device identifying information to be used to register
 *        the device with the Linux kernel.
 */
static struct miscdevice mvpkmDev = {
        .minor = 165,
        .name  = "mvpkm",
        .fops  = &mvpkmFileOps
};
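
/*
 * With the fixed minor of 165 the driver registers as misc device (10, 165);
 * udev surfaces it as /dev/mvpkm, and sysfs as /sys/class/misc/mvpkm.
 */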
/**
 * Mvpkm is loaded by mvpd and only mvpd will be allowed to open
 * it. There is a very simple way to verify that: record the process
 * id (thread group id) at the time the module is loaded and test it
 * at the time the module is opened.
 */
static struct pid *initTgid;

#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
/**
 * @name Slab shrinker for triggering balloon adjustment.
 *
 * @note the shrinker is used as a trigger for the guest balloon.
 *
 * @{
 */
static int MvpkmShrink(struct shrinker *this, struct shrink_control *sc);

static struct shrinker mvpkmShrinker = {
        .shrink = MvpkmShrink,
        .seeks  = DEFAULT_SEEKS
};
/*@}*/
#endif

module_param_array(vcpuAffinity, ulong, NULL, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(vcpuAffinity, "vCPU affinity");

/**
 * @brief Initialize the mvpkm device, register it with the Linux kernel.
 *
 * @return A zero is returned on success and a negative errno code for failure.
 *         (Same as the return policy of misc_register(9).)
 */
static int __init
MvpkmInit(void)
{
        int err = 0;
        _Bool mksckInited = false;
        _Bool cpuFreqInited = false;

        pr_info("Mvpkm: " MVP_VERSION_FORMATSTR "\n", MVP_VERSION_FORMATARGS);
        pr_info("Mvpkm: started from process %s tgid=%d, pid=%d\n",
                current->comm, task_tgid_vnr(current), task_pid_vnr(current));

        if (bitmap_empty(vcpuAffinity, nr_cpumask_bits))
                bitmap_copy(vcpuAffinity, cpumask_bits(cpu_possible_mask),
                            nr_cpumask_bits);

        err = misc_register(&mvpkmDev);
        if (err)
                return -ENOENT;

        err = Mksck_Init();
        if (err)
                goto error;
        else
                mksckInited = true;

        QP_HostInit();

        CpuFreq_Init();
        cpuFreqInited = true;

        /*
         * Reference mvpd (module loader) tgid struct, so that we can avoid
         * attacks based on pid number wraparound.
         */
        initTgid = get_pid(task_tgid(current));

#ifndef CONFIG_SYS_HYPERVISOR
        hypervisor_kobj = kobject_create_and_add("hypervisor", NULL);
        if (!hypervisor_kobj) {
                err = -ENOMEM;
                goto error;
        }
#endif

        mvpkmKObj = kobject_create_and_add("mvp", hypervisor_kobj);
        if (!mvpkmKObj) {
                err = -ENOMEM;
                goto error;
        }

        balloonKObj = kobject_create_and_add("lowmem", mvpkmKObj);
        if (!balloonKObj) {
                err = -ENOMEM;
                goto error;
        }

        mvpkmKSet = kset_create_and_add("vm", NULL, mvpkmKObj);
        if (!mvpkmKSet) {
                err = -ENOMEM;
                goto error;
        }

        err = sysfs_create_file(mvpkmKObj, &versionAttr.attr);
        if (err)
                goto error;

        err = sysfs_create_file(balloonKObj, &backgroundAttr.attr);
        if (err)
                goto error;

        err = sysfs_create_file(balloonKObj, &otherFileAttr.attr);
        if (err)
                goto error;
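
        /*
         * At this point the sysfs hierarchy rooted at hypervisor_kobj looks
         * like:
         *
         *   /sys/hypervisor/mvp/version
         *   /sys/hypervisor/mvp/lowmem/{background,other_file}
         *   /sys/hypervisor/mvp/vm/<vmID>/...  (per-VM kset, populated as
         *                                       VMs are created)
         */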
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
        register_shrinker(&mvpkmShrinker);
#endif

        /* Create /sys/kernel/debug/mvp for debugfs nodes. */
        mvpDebugDentry = debugfs_create_dir("mvp", NULL);
        if (mvpDebugDentry) {
                debugfs_create_file("inMonitor", S_IRUGO,
                                    mvpDebugDentry, NULL, &inMonitorFops);
                MksckPageInfo_Init(mvpDebugDentry);
        }

        return 0;

error:
        if (mvpkmKSet)
                kset_unregister(mvpkmKSet);
        if (balloonKObj) {
                kobject_del(balloonKObj);
                kobject_put(balloonKObj);
        }
        if (mvpkmKObj) {
                kobject_del(mvpkmKObj);
                kobject_put(mvpkmKObj);
        }
#ifndef CONFIG_SYS_HYPERVISOR
        if (hypervisor_kobj) {
                kobject_del(hypervisor_kobj);
                kobject_put(hypervisor_kobj);
        }
#endif

        if (cpuFreqInited)
                CpuFreq_Exit();
        if (mksckInited)
                Mksck_Exit();
        if (initTgid)
                put_pid(initTgid);
        misc_deregister(&mvpkmDev);

        return err;
}

/**
 * @brief De-register the mvpkm device with the Linux kernel.
 */
void
MvpkmExit(void)
{
        PRINTK("MvpkmExit called!\n");

        if (mvpDebugDentry)
                debugfs_remove_recursive(mvpDebugDentry);

#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
        unregister_shrinker(&mvpkmShrinker);
#endif

        kset_unregister(mvpkmKSet);
        kobject_del(balloonKObj);
        kobject_put(balloonKObj);
        kobject_del(mvpkmKObj);
        kobject_put(mvpkmKObj);
#ifndef CONFIG_SYS_HYPERVISOR
        kobject_del(hypervisor_kobj);
        kobject_put(hypervisor_kobj);
#endif

        CpuFreq_Exit();
        Mksck_Exit();
        put_pid(initTgid);
        misc_deregister(&mvpkmDev);
}

/*
 * The standard module registration macros of Linux.
 */
module_init(MvpkmInit);
module_exit(MvpkmExit);

module_param_array_named(lowmemAdj, lowmemAdj, int, &lowmemAdjSize,
                         S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(lowmemAdj,
                 "copy of /sys/module/lowmemorykiller/parameters/adj");

#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER
/**
 * @brief Balloon watchdog timeout callback.
 *
 * Terminate the VM since it's not responsive.
 *
 * @param data vm reference representation.
 */
static void
WatchdogCB(unsigned long data)
{
        struct MvpkmVM *vm = (struct MvpkmVM *)data;

        pr_err("Balloon watchdog expired (%d s)!\n",
               BALLOON_WATCHDOG_TIMEOUT_SECS);
        vm->watchdogTriggered = true;
        Mvpkm_WakeGuest(vm, ACTION_ABORT);
}

/**
 * @brief Slab shrinker.
 *
 * Called by the Linux kernel when we're under memory pressure. We treat all
 * locked pages as a slab for this purpose, similar to the Android low memory
 * killer.
 *
 * @param this reference to registered shrinker for callback context.
 * @param sc shrink control; sc->nr_to_scan is the number of entries to scan.
 *           If 0 then just return the number of present entries. We ignore
 *           the actual value when > 0, since the shrinker is merely a trigger
 *           to readjust guest balloons, where the actual balloon size is
 *           determined in conjunction with the guest.
 *
 * @return number of locked pages.
 */
static int
MvpkmShrink(struct shrinker *this,
            struct shrink_control *sc)
{
        uint32 locked = 0;
        struct kobject *k;
        int nrToScan = sc->nr_to_scan;

        spin_lock(&mvpkmKSet->list_lock);
        list_for_each_entry(k, &mvpkmKSet->list, entry) {
                struct MvpkmVM *vm = container_of(k, struct MvpkmVM, kobj);

                locked += ATOMIC_GETO(vm->usedPages);

                /*
                 * Try and grab the WSP semaphore - if we fail, we must be in
                 * VM setup or teardown, no point trying to wake the guest.
                 */
                if (nrToScan > 0 &&
                    down_read_trylock(&vm->wspSem)) {
                        if (vm->wsp) {
                                /*
                                 * Balloon watchdog.
                                 * We start the timer before waking up the
                                 * guest to avoid races in case of immediate
                                 * descheduling.
                                 */
                                if (vm->balloonWDEnabled) {
                                        struct timer_list *t =
                                                &vm->balloonWDTimer;

                                        if (!timer_pending(t)) {
                                                t->data = (unsigned long)vm;
                                                t->function = WatchdogCB;
                                                t->expires = jiffies +
                                                        BALLOON_WATCHDOG_TIMEOUT_SECS * HZ;
                                                add_timer(t);
                                        }
                                }
                                Mvpkm_WakeGuest(vm, ACTION_BALLOON);
                        }
                        up_read(&vm->wspSem);
                }
        }
        spin_unlock(&mvpkmKSet->list_lock);

        return locked;
}
#endif

/**
 * @brief The open file operation. Initializes the vm specific structure.
 */
int
MvpkmOpen(struct inode *inode,
          struct file *filp)
{
        struct MvpkmVM *vm;

        if (initTgid != task_tgid(current)) {
                pr_err("%s: MVPKM can be opened only from MVPD (process %d).\n",
                       __func__, pid_vnr(initTgid));
                return -EPERM;
        }
        pr_debug("%s: Allocating an MvpkmVM structure from process %s tgid=%d, pid=%d\n",
                 __func__, current->comm, task_tgid_vnr(current),
                 task_pid_vnr(current));

        vm = kmalloc(sizeof(struct MvpkmVM), GFP_KERNEL);
        if (!vm)
                return -ENOMEM;

        memset(vm, 0, sizeof(*vm));

        init_timer(&vm->balloonWDTimer);
        init_rwsem(&vm->lockedSem);
        init_rwsem(&vm->wspSem);
        init_rwsem(&vm->monThreadTaskSem);
        vm->monThreadTask = NULL;
        vm->isMonitorInited = false;

        filp->private_data = vm;

        if (!Mvpkm_vmwareUid)
                current_uid_gid(&Mvpkm_vmwareUid, &Mvpkm_vmwareGid);

        return 0;
}

/**
 * @brief Releases a VM's resources.
 * @param vm vm to release
 */
static void
ReleaseVM(struct MvpkmVM *vm)
{
        /*
         * Delete the balloon watchdog timer. We are already out of the VM
         * kset, so there is no race with the shrink callback.
         */
        del_timer_sync(&vm->balloonWDTimer);

        down_write(&vm->wspSem);

        if (vm->isMonitorInited) {
                MonitorTimer_Request(&vm->monTimer, 0);
                Mksck_WspRelease(vm->wsp);
                vm->wsp = NULL;
#ifdef CONFIG_HAS_WAKELOCK
                /*
                 * Destroy wakelock after WSP is released (and MksckPage
                 * detached).
                 */
                wake_lock_destroy(&vm->wakeLock);
#endif
        }

        up_write(&vm->wspSem);

        LockedListUnlockAll(vm);
        UnmapWSPHKVA(vm);

        /*
         * All sockets potentially connected to sockets of this vm's vmId
         * will fail at send now. DGRAM sockets are not required to tear
         * down the connection explicitly.
         */
        kfree(vm);
}

/**
 * @brief The release file operation. Releases the vm specific
 *        structure including all the locked pages.
 *
 * @param inode Unused
 * @param filp which VM we're dealing with
 * @return 0
 */
int
MvpkmRelease(struct inode *inode,
             struct file *filp)
{
        struct MvpkmVM *vm = filp->private_data;

        /*
         * Tear down any queue pairs associated with this VM.
         */
        if (vm->isMonitorInited) {
                ASSERT(vm->wsp);
                QP_DetachAll(vm->wsp->guestId);
        }

        /*
         * Release the VM's ksets.
         */
        kset_unregister(vm->miscKSet);
        kset_unregister(vm->devicesKSet);

        if (vm->haveKObj) {
                /*
                 * Release the VM's kobject.
                 * 'vm' will be kfree-d in its kobject's release function.
                 */
                kobject_del(&vm->kobj);
                kobject_put(&vm->kobj);
        } else {
                ReleaseVM(vm);
        }

        filp->private_data = NULL;

        pr_info("%s: Released MvpkmVM structure from process %s tgid=%d, pid=%d\n",
                __func__, current->comm, task_tgid_vnr(current),
                task_pid_vnr(current));

        return 0;
}

/**
 * @brief Page fault handler for /dev/mem-like regions (see mvpkmVMOps
 *        block comment).
 */
static int
MvpkmFault(struct vm_area_struct *vma,
           struct vm_fault *vmf)
{
        unsigned long address = (unsigned long)vmf->virtual_address;
        MPN mpn = vmf->pgoff;
        struct MvpkmVM *vm = vma->vm_file->private_data;

        /*
         * Only insert pages belonging to the VM. The check is slow, O(n) in
         * the number of MPNs associated with the VM, but it doesn't matter -
         * the mmap interface should only be used by trusted processes at
         * initialization time and for debugging.
         *
         * The mpn can be either in the memory reserved for the monitor or
         * mvpd through the regular mechanisms, or it could be a mksck page.
         */
        if (!pfn_valid(mpn)) {
                pr_err("MvpkmMMap: Failed to insert %x @ %lx, mpn invalid\n",
                       mpn, address);
        } else if (LockedListLookup(vm, mpn)) {
                if (vm_insert_page(vma, address, pfn_to_page(mpn)) == 0)
                        return VM_FAULT_NOPAGE;

                pr_err("MvpkmMMap: Failed to insert %x @ %lx\n",
                       mpn, address);
        } else if (MksckPage_LookupAndInsertPage(vma, address, mpn) == 0) {
                return VM_FAULT_NOPAGE;
        }

        if (vm->stubPageMPN) {
                if (vm_insert_page(vma, address,
                                   pfn_to_page(vm->stubPageMPN)) == 0) {
                        pr_info("MvpkmMMap: mapped the stub page at %x @ %lx\n",
                                mpn, address);
                        return VM_FAULT_NOPAGE;
                }

                pr_err("MvpkmMMap: Could not insert stub page %x @ %lx\n",
                       mpn, address);
        }

        return VM_FAULT_SIGBUS;
}

/**
 * @brief sysfs show function for the per-VM attributes.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr attribute reference.
 * @param buf PAGE_SIZEd buffer to write to.
 *
 * @return number of characters printed (not including trailing null character).
 */
static ssize_t
MvpkmAttrShow(struct kobject *kobj,
              struct attribute *attr,
              char *buf)
{
        if (attr == &mvpkmLockedPagesAttr) {
                struct MvpkmVM *vm = container_of(kobj, struct MvpkmVM, kobj);

                return snprintf(buf, PAGE_SIZE, "%d\n",
                                ATOMIC_GETO(vm->usedPages));
        } else if (attr == &mvpkmMonitorAttr) {
                struct MvpkmVM *vm = container_of(kobj, struct MvpkmVM, kobj);

                return snprintf(buf, PAGE_SIZE, "hostActions %x callno %d\n",
                                ATOMIC_GETO(vm->wsp->hostActions),
                                WSP_Params(vm->wsp)->callno);
        } else if (attr == &mvpkmBalloonWatchdogAttr) {
                struct MvpkmVM *vm = container_of(kobj, struct MvpkmVM, kobj);

                /*
                 * Enable the balloon watchdog on first read. This includes
                 * all ballooning-capable guests.
                 */
                vm->balloonWDEnabled = true;
                del_timer_sync(&vm->balloonWDTimer);

                buf[0] = 1;
                return 1;
        } else {
                return -EPERM;
        }
}

/**
 * @brief sysfs store function for the per-VM attributes.
 *
 * @param kobj reference to kobj nested in MvpkmVM struct.
 * @param attr attribute reference.
 * @param buf input buffer.
 * @param count input buffer length.
 *
 * @return number of bytes consumed or negative error code.
 */
static ssize_t
MvpkmAttrStore(struct kobject *kobj,
               struct attribute *attr,
               const char *buf,
               size_t count)
{
        return -EPERM;
}

/**
 * @brief Map machine address space region into host process.
 *
 * @param filp file reference (ignored).
 * @param vma Linux virtual memory area defining the region.
 *
 * @return 0 on success, otherwise error code.
 */
static int
MvpkmMMap(struct file *filp,
          struct vm_area_struct *vma)
{
        vma->vm_ops = &mvpkmVMOps;

        return 0;
}

#ifdef CONFIG_ARM_LPAE
/**
 * @brief Determine host cacheability/shareability attributes.
 *
 * Used to ensure monitor/guest shared mappings are consistent with
 * those of host user/kernel.
 *
 * @param[out] attribMAN when setting up the HW monitor this provides the
 *             attributes in the generic ARM_MemAttrNormal form,
 *             suitable for configuring the monitor and guest's
 *             [H]MAIR0 and setting the shareability attributes of
 *             the LPAE descriptors.
 */
static void
DetermineMemAttrLPAE(ARM_MemAttrNormal *attribMAN)
{
        /*
         * We use set_pte_ext to sample what {S,TEX,CB} bits Linux is using
         * for normal kernel/user L2D mappings. These bits should be
         * consistent both with each other and with what we use in the
         * monitor, since we share various pages with host processes, the
         * kernel module and the monitor, and the ARM ARM requires that
         * synonyms have the same cacheability attributes, see end of
         * A3.5.{4,7} ARM DDI 0406A.
         */
        HKVA hkva = __get_free_pages(GFP_KERNEL, 0);
        ARM_LPAE_L3D *pt = (ARM_LPAE_L3D *)hkva;
        ARM_LPAE_L3D *kernL3D = &pt[0], *userL3D = &pt[1];
        uint32 attr, mair0, mair1;

        set_pte_ext((pte_t *)kernL3D, pfn_pte(0, PAGE_KERNEL), 0);
        set_pte_ext((pte_t *)userL3D, pfn_pte(0, PAGE_NONE), 0);

        pr_info("DetermineMemAttr: Kernel L3D AttrIndx=%x SH=%x\n",
                kernL3D->blockS1.attrIndx, kernL3D->blockS1.sh);
        pr_info("DetermineMemAttr: User L3D AttrIndx=%x SH=%x\n",
                userL3D->blockS1.attrIndx, userL3D->blockS1.sh);

        ASSERT(kernL3D->blockS1.attrIndx == userL3D->blockS1.attrIndx);
        ASSERT(kernL3D->blockS1.sh == userL3D->blockS1.sh);

        switch (kernL3D->blockS1.sh) {
        case 0:
                attribMAN->share = ARM_SHARE_ATTR_NONE;
                break;
        case 2:
                attribMAN->share = ARM_SHARE_ATTR_OUTER;
                break;
        case 3:
                attribMAN->share = ARM_SHARE_ATTR_INNER;
                break;
        default:
                FATAL();
        }

        ARM_MRC_CP15(MAIR0, mair0);
        ARM_MRC_CP15(MAIR1, mair1);

        attr = MVP_EXTRACT_FIELD(kernL3D->blockS1.attrIndx >= 4 ?
                                 mair1 : mair0,
                                 8 * (kernL3D->blockS1.attrIndx % 4),
                                 8);

        /*
         * See B4-1615 ARM DDI 0406C-2c for magic.
         */
#define MAIR_ATTR_2_CACHE_ATTR(x, y) \
        do { \
                switch (x) { \
                case 2: \
                        (y) = ARM_CACHE_ATTR_NORMAL_WT; \
                        break; \
                case 3: \
                        (y) = ARM_CACHE_ATTR_NORMAL_WB; \
                        break; \
                default: \
                        FATAL(); \
                } \
        } while (0)

        MAIR_ATTR_2_CACHE_ATTR(MVP_EXTRACT_FIELD(attr, 2, 2),
                               attribMAN->innerCache);
        MAIR_ATTR_2_CACHE_ATTR(MVP_EXTRACT_FIELD(attr, 6, 2),
                               attribMAN->outerCache);
#undef MAIR_ATTR_2_CACHE_ATTR

        pr_info("DetermineMemAttr: innerCache %x outerCache %x share %x\n",
                attribMAN->innerCache,
                attribMAN->outerCache,
                attribMAN->share);

        free_pages(hkva, 0);
}
#else
/**
 * @brief Determine host cacheability/shareability attributes.
 *
 * Used to ensure monitor/guest shared mappings are consistent with
 * those of host user/kernel.
 *
 * @param[out] attribL2D when setting up the LPV monitor, a template L2D
 *             containing cacheability attributes {S,TEX,CB} used by the
 *             host kernel for normal memory mappings. These may be
 *             used directly for monitor/guest mappings, since both
 *             worlds share a common {TRE, PRRR, NMRR}.
 * @param[out] attribMAN when setting up TTBR0 in the LPV monitor and the page
 *             tables for the HW monitor, this provides the attributes
 *             in the generic ARM_MemAttrNormal form, suitable for
 *             configuring TTBR0 + the monitor and guest's [H]MAIR0
 *             and setting the shareability attributes of the LPAE
 *             descriptors.
 */
static void
DetermineMemAttrNonLPAE(ARM_L2D *attribL2D,
                        ARM_MemAttrNormal *attribMAN)
{
        /*
         * We use set_pte_ext to sample what {S,TEX,CB} bits Linux is using
         * for normal kernel/user L2D mappings. These bits should be
         * consistent both with each other and with what we use in the
         * monitor, since we share various pages with host processes, the
         * kernel module and the monitor, and the ARM ARM requires that
         * synonyms have the same cacheability attributes, see end of
         * A3.5.{4,7} ARM DDI 0406A.
         */
        HKVA hkva = __get_free_pages(GFP_KERNEL, 0);
        uint32 sctlr;
        ARM_L2D *pt = (ARM_L2D *)hkva;
        ARM_L2D *kernL2D = &pt[0], *userL2D = &pt[1];

        /*
         * Linux 2.6.38 switched the order of Linux vs hardware page tables.
         * See mainline d30e45eeabefadc6039d7f876a59e5f5f6cb11c6.
         */
        const uint32 set_pte_ext_offset = 0;

        set_pte_ext((pte_t *)(kernL2D + set_pte_ext_offset/sizeof(ARM_L2D)),
                    pfn_pte(0, PAGE_KERNEL),
                    0);
        set_pte_ext((pte_t *)(userL2D + set_pte_ext_offset/sizeof(ARM_L2D)),
                    pfn_pte(0, PAGE_NONE),
                    0);

        /*
         * Linux 2.6.38 switched the order of Linux vs hardware page tables.
         * See mainline d30e45eeabefadc6039d7f876a59e5f5f6cb11c6.
         */
        kernL2D += 2048/sizeof(ARM_L2D);
        userL2D += 2048/sizeof(ARM_L2D);

        pr_info("DetermineMemAttr: Kernel L2D TEX=%x CB=%x S=%x\n",
                kernL2D->small.tex,
                kernL2D->small.cb,
                kernL2D->small.s);
        pr_info("DetermineMemAttr: User L2D TEX=%x CB=%x S=%x\n",
                userL2D->small.tex,
                userL2D->small.cb,
                userL2D->small.s);

        ASSERT((kernL2D->small.tex & 1) == (userL2D->small.tex & 1));
        ASSERT(kernL2D->small.cb == userL2D->small.cb);
        ASSERT(kernL2D->small.s == userL2D->small.s);

        *attribL2D = *kernL2D;

        /*
         * We now decode TEX remap and obtain the more generic form for use in
         * the LPV monitor's TTBR0 initialization and the HW monitor.
         */
        ARM_MRC_CP15(CONTROL_REGISTER, sctlr);

        if (sctlr & ARM_CP15_CNTL_TRE) {
                uint32 prrr, nmrr, indx, type;
                uint32 innerCache, outerCache, outerShare, share;

                pr_info("DetermineMemAttr: TEX remapping enabled\n");

                ARM_MRC_CP15(PRIMARY_REGION_REMAP, prrr);
                ARM_MRC_CP15(NORMAL_MEMORY_REMAP, nmrr);

                pr_info("DetermineMemAttr: PRRR=%x NMRR=%x\n",
                        prrr, nmrr);

                /*
                 * Decode PRRR/NMRR below. See B3.7 ARM DDI 0406B for register
                 * encodings, tables and magic numbers.
                 */
                indx = (MVP_BIT(kernL2D->small.tex, 0) << 2) |
                       kernL2D->small.cb;

                /*
                 * Only normal memory makes sense here.
                 */
                type = MVP_EXTRACT_FIELD(prrr, 2 * indx, 2);
                ASSERT(type == 2);

                innerCache = MVP_EXTRACT_FIELD(nmrr, 2 * indx, 2);
                outerCache = MVP_EXTRACT_FIELD(nmrr, 16 + 2 * indx, 2);
                outerShare = !MVP_BIT(prrr, 24 + indx);
                share = MVP_BIT(prrr, 18 + kernL2D->small.s);

                pr_info("DetermineMemAttr: type %x innerCache %x outerCache %x"
                        " share %x outerShare %x\n",
                        type, innerCache, outerCache, share, outerShare);

                if (share) {
                        if (outerShare)
                                attribMAN->share = ARM_SHARE_ATTR_OUTER;
                        else
                                attribMAN->share = ARM_SHARE_ATTR_INNER;
                } else {
                        attribMAN->share = ARM_SHARE_ATTR_NONE;
                }

                attribMAN->innerCache = innerCache;
                attribMAN->outerCache = outerCache;
        } else {
                NOT_IMPLEMENTED_JIRA(1849);
        }

        free_pages(hkva, 0);
}
#endif

/**
 * @brief The ioctl file operation.
 *
 * The ioctl command is the main communication method between the
 * vmx and the mvpkm kernel module.
 *
 * @param filp which VM we're dealing with
 * @param cmd select which cmd function needs to be performed
 * @param arg argument for command
 * @return error code, 0 on success
 */
long
MvpkmUnlockedIoctl(struct file *filp,
                   unsigned int cmd,
                   unsigned long arg)
{
        struct MvpkmVM *vm = filp->private_data;
        int retval = 0;

        switch (cmd) {
        case MVPKM_DISABLE_FAULT:
                if (!vm->stubPageMPN) {
                        uint32 *ptr;

                        vm->stubPageMPN =
                                AllocZeroedFreePages(vm, 0, false,
                                                     MEMREGION_MAINMEM,
                                                     (HKVA *)&ptr);
                        if (!vm->stubPageMPN)
                                break;
                        ptr[0] = MVPKM_STUBPAGE_BEG;
                        ptr[PAGE_SIZE/sizeof(uint32) - 1] = MVPKM_STUBPAGE_END;
                }
                break;

        /*
         * Allocate some pinned pages from the kernel.
         * Returns -ENOMEM if no host pages are available for allocation.
         */
        case MVPKM_LOCK_MPN: {
                struct MvpkmLockMPN buf;

                if (copy_from_user(&buf, (void *)arg, sizeof(buf)))
                        return -EFAULT;

                buf.mpn = AllocZeroedFreePages(vm, buf.order, false,
                                               buf.forRegion, NULL);
                if (buf.mpn == 0)
                        return -ENOMEM;

                if (copy_to_user((void *)arg, &buf, sizeof(buf)))
                        return -EFAULT;
                break;
        }
        case MVPKM_UNLOCK_MPN: {
                struct MvpkmLockMPN buf;

                if (copy_from_user(&buf, (void *)arg, sizeof(buf)))
                        return -EFAULT;

                if (!LockedListDel(vm, buf.mpn))
                        return -EINVAL;
                break;
        }
        case MVPKM_MAP_WSPHKVA: {
                MvpkmMapHKVA mvpkmMapInfo;
                HkvaMapInfo mapInfo[WSP_PAGE_COUNT];

                if (copy_from_user(&mvpkmMapInfo, (void *)arg,
                                   sizeof(mvpkmMapInfo)))
                        return -EFAULT;
                if (copy_from_user(mapInfo, (void *)mvpkmMapInfo.mapInfo,
                                   sizeof(mapInfo)))
                        return -EFAULT;

                mvpkmMapInfo.hkva = MapWSPHKVA(vm, mapInfo);
                BUG_ON(mvpkmMapInfo.hkva == 0);

                if (mvpkmMapInfo.forRegion == MEMREGION_WSP)
                        vm->wsp = (WorldSwitchPage *)mvpkmMapInfo.hkva;

                if (copy_to_user((void *)arg, &mvpkmMapInfo,
                                 sizeof(mvpkmMapInfo)))
                        return -EFAULT;
                break;
        }
        case MVPKM_RUN_MONITOR:
                if (!vm->isMonitorInited)
                        vm->isMonitorInited =
                                ((retval = SetupMonitor(vm)) == 0);

                if (vm->isMonitorInited)
                        retval = RunMonitor(vm);

                break;
        case MVPKM_ABORT_MONITOR:
                if (!vm->isMonitorInited)
                        return -EINVAL;

                ASSERT(vm->wsp != NULL);

                pr_err("MvpkmIoctl: Aborting monitor.\n");
                Mvpkm_WakeGuest(vm, ACTION_ABORT);
                break;
        case MVPKM_CPU_INFO: {
                struct MvpkmCpuInfo buf;
                uint32 mpidr;

#ifdef CONFIG_ARM_LPAE
                DetermineMemAttrLPAE(&buf.attribMAN);

                /**
                 * We need to add support to the LPV monitor for LPAE page
                 * tables if we want to use it on an LPAE host, due to the
                 * costs involved in transitioning between LPAE and non-LPAE
                 * page tables without Hyp assistance.
                 *
                 * @knownjira{MVP-2184}
                 */
                buf.attribL2D.u = 0;
#else
                DetermineMemAttrNonLPAE(&buf.attribL2D, &buf.attribMAN);
#endif

                /*
                 * Are MP extensions implemented?
                 * See B4-1618 ARM DDI 0406C-2c for magic.
                 */
                ARM_MRC_CP15(MPIDR, mpidr);

                buf.mpExt = mpidr & ARM_CP15_MPIDR_MP;

                if (copy_to_user((int *)arg, &buf,
                                 sizeof(struct MvpkmCpuInfo)))
                        retval = -EFAULT;
                break;
        }
        default:
                retval = -EINVAL;
                break;
        }

        PRINTK("Returning from IOCTL(%d) retval = %d %s\n",
               cmd, retval, signal_pending(current) ? "(pending signal)" : "");

        return retval;
}
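
/*
 * Sketch of how a trusted host process (e.g. the vmx) might drive this
 * interface from userspace; hypothetical code, assuming mvpkm's ioctl
 * definitions are in scope:
 *
 *     int fd = open("/dev/mvpkm", O_RDWR);
 *     struct MvpkmLockMPN buf = {
 *             .order = 0,
 *             .forRegion = MEMREGION_MAINMEM,
 *     };
 *
 *     if (fd >= 0 && ioctl(fd, MVPKM_LOCK_MPN, &buf) == 0) {
 *             // buf.mpn now names a pinned, zeroed host page...
 *             ioctl(fd, MVPKM_UNLOCK_MPN, &buf);  // ...until released here
 *     }
 */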
/*********************************************************************
 *
 * Locked page management
 *
 *********************************************************************/

/*
 * Pages locked by the kernel module are remembered so an unlockAll
 * operation can be performed when the vmm is closed. The locked page
 * identifiers are stored in a red-black tree to support O(log n)
 * removal and search (required for /dev/mem-like mmap).
 */

/**
 * @brief Descriptor of a locked page range.
 */
struct LockedPage {
        struct {
                __u32 mpn:20;       /**< MPN. */
                __u32 order:6;      /**< Size/alignment exponent for page. */
                __u32 forRegion:6;  /**< Annotate/identify guest page alloc. */
        } page;
        struct rb_node rb;
};
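
/*
 * The bitfields above pack into a single 32-bit word: 20 bits of MPN cover
 * 2^20 4 KB pages (4 GB of physical address space), and 6 bits each are
 * ample for the allocation order and the PhysMem_RegionType annotation.
 */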
static void FreeLockedPages(struct LockedPage *lp);

/**
 * @brief Search for an mpn inside an RB tree of LockedPages. The mpn
 *        will match a LockedPage as long as it is covered by the
 *        entry, i.e. in a non-zero order entry it doesn't have to be
 *        the base MPN.
 *
 * This must be called with the relevant vm->lockedSem held.
 *
 * @param root RB tree root.
 * @param mpn  MPN to search for.
 *
 * @return reference to LockedPage entry if found, otherwise NULL.
 */
static struct LockedPage *
LockedListSearch(struct rb_root *root,
		 __u32 mpn)
{
	struct rb_node *n = root->rb_node;

	while (n) {
		struct LockedPage *lp = rb_entry(n, struct LockedPage, rb);

		if (lp->page.mpn == (mpn & (~0UL << lp->page.order)))
			return lp;

		if (mpn < lp->page.mpn)
			n = n->rb_left;
		else
			n = n->rb_right;
	}

	return NULL;
}
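
/*
 * Worked example of the masked comparison above: an entry with
 * page.mpn == 0x12340 and page.order == 4 covers MPNs 0x12340..0x1234f.
 * Searching for mpn 0x12347 masks off the low-order bits,
 * 0x12347 & (~0UL << 4) == 0x12340, which equals the entry's base MPN,
 * so the lookup matches even though 0x12347 is not the base.
 */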
/**
 * @brief Delete an mpn from the list of locked pages.
 *
 * @param vm  Mvpkm module control structure pointer
 * @param mpn MPN to be unlocked and freed for reuse
 *
 * @return true if list contained MPN and it was deleted from list.
 */
static _Bool
LockedListDel(struct MvpkmVM *vm,
	      __u32 mpn)
{
	struct LockedPage *lp;

	down_write(&vm->lockedSem);

	lp = LockedListSearch(&vm->lockedRoot, mpn);

	/*
	 * The MPN should be in the locked pages RB tree and it should be the
	 * base of an entry, i.e. we can't fragment existing allocations for
	 * a VM.
	 */
	if (lp == NULL || lp->page.mpn != mpn) {
		up_write(&vm->lockedSem);
		return false;
	}

	FreeLockedPages(lp);

	if (lp->page.forRegion == MEMREGION_MAINMEM)
		ATOMIC_SUBV(vm->usedPages, 1U << lp->page.order);

	rb_erase(&lp->rb, &vm->lockedRoot);
	kfree(lp);

	up_write(&vm->lockedSem);

	return true;
}
/**
 * @brief Scan the list of locked pages to see if an MPN matches.
 *
 * @param vm  Mvpkm module control structure pointer
 * @param mpn MPN to check
 *
 * @return true iff list contains MPN.
 */
static _Bool
LockedListLookup(struct MvpkmVM *vm,
		 __u32 mpn)
{
	struct LockedPage *lp;

	down_read(&vm->lockedSem);

	lp = LockedListSearch(&vm->lockedRoot, mpn);

	up_read(&vm->lockedSem);

	return lp != NULL;
}
/**
 * @brief Add a new mpn to the locked pages RB tree.
 *
 * @param vm control structure pointer
 * @param mpn mpn of a page that was locked with get_user_pages or some
 *            sort of get that is undone by put_page.
 *            The mpn is assumed to be non-zero.
 * @param order size/alignment exponent for page
 * @param forRegion Annotation for Page pool to identify guest page
 *                  allocations
 *
 * @return false: couldn't allocate internal memory to record mpn in<br>
 *         true: successful.
 */
static _Bool
LockedListAdd(struct MvpkmVM *vm,
	      __u32 mpn,
	      __u32 order,
	      PhysMem_RegionType forRegion)
{
	struct rb_node *parent, **p;
	struct LockedPage *tp, *lp = kmalloc(sizeof(*lp), GFP_KERNEL);

	if (!lp)
		return false;

	lp->page.mpn = mpn;
	lp->page.order = order;
	lp->page.forRegion = forRegion;

	down_write(&vm->lockedSem);

	if (forRegion == MEMREGION_MAINMEM)
		ATOMIC_ADDV(vm->usedPages, 1U << order);

	/*
	 * Insert as a red leaf in the tree (see include/linux/rbtree.h).
	 */
	p = &vm->lockedRoot.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		tp = rb_entry(parent, struct LockedPage, rb);

		/*
		 * MPN should not already exist in the tree.
		 */
		ASSERT(tp->page.mpn != (mpn & (~0UL << tp->page.order)));

		if (mpn < tp->page.mpn)
			p = &(*p)->rb_left;
		else
			p = &(*p)->rb_right;
	}

	rb_link_node(&lp->rb, parent, p);

	/*
	 * Restructure tree if necessary (see include/linux/rbtree.h).
	 */
	rb_insert_color(&lp->rb, &vm->lockedRoot);

	up_write(&vm->lockedSem);

	return true;
}
/**
 * @brief Traverse RB locked tree, freeing every entry.
 *
 * This must be called with the relevant vm->lockedSem held.
 *
 * @param node reference to RB node at root of subtree.
 */
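/*
 * The loop below is an iterative post-order walk: descend to a leaf,
 * free it, clear the parent's link to it and resume from the parent.
 * rb_erase() is deliberately avoided; the whole tree is being torn
 * down, so there is no point rebalancing after each deletion.
 */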
static void
LockedListNuke(struct rb_node *node)
{
	while (node) {
		if (node->rb_left) {
			node = node->rb_left;
		} else if (node->rb_right) {
			node = node->rb_right;
		} else {
			/*
			 * We found a leaf, free it and go back to parent.
			 */
			struct LockedPage *lp =
				rb_entry(node, struct LockedPage, rb);

			node = rb_parent(node);

			if (node) {
				if (node->rb_left)
					node->rb_left = NULL;
				else
					node->rb_right = NULL;
			}

			FreeLockedPages(lp);
			kfree(lp);
		}
	}
}
/**
 * @brief Unlock all pages at vm close time.
 *
 * @param vm control structure pointer
 */
static void
LockedListUnlockAll(struct MvpkmVM *vm)
{
	down_write(&vm->lockedSem);

	LockedListNuke(vm->lockedRoot.rb_node);

	ATOMIC_SETV(vm->usedPages, 0);

	up_write(&vm->lockedSem);
}
/**
 * @brief Allocate zeroed free pages.
 *
 * @param[in] vm which VM the pages are for, so they will be freed when
 *               the vm closes
 * @param[in] order log2(number of contiguous pages to allocate)
 * @param[in] highmem is it OK to allocate this page in ZONE_HIGHMEM? This
 *                    option should only be specified for pages the host
 *                    kernel will not need to address directly.
 * @param[in] forRegion Annotation for Page pool to identify guest page
 *                      allocations
 * @param[out] hkvaRet where to return the host kernel virtual address of
 *                     the allocated pages, if non-NULL, and ONLY IF
 *                     !highmem.
 *
 * @return 0: no host memory available<br>
 *         else: starting MPN, *hkvaRet filled in.
 */
static MPN
AllocZeroedFreePages(struct MvpkmVM *vm,
		     uint32 order,
		     _Bool highmem,
		     PhysMem_RegionType forRegion,
		     HKVA *hkvaRet)
{
	MPN mpn;
	struct page *page;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		pr_warn("Order %d allocation for region %d exceeds the safe "
			"maximum order %d\n",
			order,
			forRegion,
			PAGE_ALLOC_COSTLY_ORDER);

	/*
	 * System RAM bank at 0x00000000 workaround. Should only happen once
	 * in the host's lifetime, as the memory page is leaked forever. Also
	 * leak the MVP's INVALID_MPN page if it appears.
	 */
	do {
		/*
		 * Get some pages for the requested range. They will be
		 * physically contiguous and have the requested alignment.
		 * They will also have a kernel virtual mapping if !highmem.
		 *
		 * We allocate out of ZONE_MOVABLE even though we can't just
		 * pick up our bags. We do this to support platforms that
		 * explicitly configure ZONE_MOVABLE, such as the Qualcomm
		 * MSM8960, to enable deep power down of memory banks. When
		 * the kernel attempts to take a memory bank offline, it will
		 * try to place the pages on the isolate LRU - only pages
		 * already on an LRU, such as anon/file, can get there, so it
		 * will not be able to migrate/move our pages (and hence the
		 * bank will not be offlined). The other alternative is to
		 * live within ZONE_NORMAL, and only have available a small
		 * fraction of system memory. Long term we plan on hooking
		 * the offlining callback in mvpkm and performing our own
		 * migration with the cooperation of the monitor, but we
		 * don't have a dev board to support this today.
		 *
		 * @knownjira{MVP-3477}
		 *
		 * Allocating all memory as MOVABLE breaks the Linux
		 * Contiguous Memory Allocator. It sets up several memory
		 * regions reserved for MOVABLE memory, so that it is able to
		 * move pages from them on request to satisfy a large memory
		 * allocation. But as our pages are not really movable, it
		 * can end up unable to find enough contiguous memory.
		 * As a workaround, we now only allocate MOVABLE pages when
		 * CONFIG_MEMORY_HOTPLUG is enabled.
		 *
		 * @knownjira{HW-28182}
		 *
		 * In order to fully support Linux memory hotplug, we should
		 * implement a mapping with the "migrate_page" callback and a
		 * corresponding backend in the monitor.
		 *
		 * @knownjira{HW-28658}
		 */
		gfp_t gfp = GFP_USER | __GFP_COMP | __GFP_ZERO;

		if (highmem) {
			gfp |= __GFP_HIGHMEM;
#ifdef CONFIG_MEMORY_HOTPLUG
			gfp |= __GFP_MOVABLE;
#endif
		}

		page = alloc_pages(gfp, order);
		if (page == NULL)
			return 0;

		/*
		 * Return the corresponding page number.
		 */
		mpn = page_to_pfn(page);
	} while (mpn == 0 || mpn == INVALID_MPN);

	/*
	 * Remember to unlock the pages when the FD is closed.
	 */
	if (!LockedListAdd(vm, mpn, order, forRegion)) {
		__free_pages(page, order);
		return 0;
	}

	if (hkvaRet)
		*hkvaRet = highmem ? 0 : __phys_to_virt(page_to_phys(page));

	return mpn;
}
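
/*
 * Illustrative call, mirroring WSCALL_ACQUIRE_PAGE in RunMonitor() below
 * (a sketch, not additional driver code): a zero return means no host
 * memory; on success the range is already recorded in the locked-pages
 * tree and is released through LockedListDel() or LockedListUnlockAll().
 *
 *	MPN mpn = AllocZeroedFreePages(vm, 0, true, MEMREGION_MAINMEM, NULL);
 *
 *	if (mpn == 0)
 *		return -ENOMEM;
 */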
/**
 * @brief Map already-pinned WSP memory into the host kernel virtual
 *        address (HKVA) space. Assumes 2 world switch pages on an 8k
 *        boundary.
 *
 * @param[in] vm which VM the HKVA area is to be mapped for
 * @param[in] mapInfo array of MPNs and execute permission flags to be used
 *                    in inserting a new contiguous map in HKVA space
 *
 * @return 0: HKVA space could not be mapped<br>
 *         else: HKVA where mapping was inserted
 */
static HKVA
MapWSPHKVA(struct MvpkmVM *vm,
	   HkvaMapInfo *mapInfo)
{
	unsigned int i;
	struct page **pages = NULL;
	struct page **pagesPtr;
	pgprot_t prot;
	int retval;
	int allocateCount = WSP_PAGE_COUNT + 1; /* extra page for alignment */
	int pageIndex = 0;
	HKVA dummyPage = (HKVA)NULL;
	HKVA start;
	HKVA startSegment;
	HKVA endSegment;

	/*
	 * Add one page for alignment purposes in case __get_vm_area returns
	 * an unaligned address.
	 */
	ASSERT(allocateCount == 3);
	ASSERT_ON_COMPILE(WSP_PAGE_COUNT == 2);

	/*
	 * NOT_IMPLEMENTED if MapHKVA is called more than once.
	 */
	BUG_ON(vm->wspHkvaArea);

	/*
	 * Reserve virtual address space.
	 */
	vm->wspHkvaArea = __get_vm_area((allocateCount * PAGE_SIZE),
					VM_ALLOC, MODULES_VADDR, MODULES_END);
	if (!vm->wspHkvaArea)
		return 0;

	pages = kmalloc(allocateCount * sizeof(struct page *), GFP_TEMPORARY);
	if (!pages)
		goto err;

	pagesPtr = pages;

	/*
	 * Use a dummy page to boundary align the section, if needed.
	 */
	dummyPage = __get_free_pages(GFP_KERNEL, 0);
	if (!dummyPage)
		goto err;

	vm->wspHKVADummyPage = dummyPage;

	/*
	 * Back every entry with the dummy page.
	 */
	for (i = 0; i < allocateCount; i++)
		pages[i] = virt_to_page(dummyPage);

	/*
	 * World switch pages must not span a 1MB boundary in order to
	 * maintain only a single L2 page table.
	 */
	start = (HKVA)vm->wspHkvaArea->addr;
	startSegment = start & ~(ARM_L1D_SECTION_SIZE - 1);
	endSegment = (start + PAGE_SIZE) & ~(ARM_L1D_SECTION_SIZE - 1);

	/*
	 * Insert dummy page at pageIndex, if needed.
	 */
	pageIndex = (startSegment != endSegment);

	/*
	 * Back the rest with the actual world switch pages.
	 */
	for (i = pageIndex; i < pageIndex + WSP_PAGE_COUNT; i++)
		pages[i] = pfn_to_page(mapInfo[i - pageIndex].mpn);

	/*
	 * Given the lack of functionality in the kernel for being able to
	 * mark mappings for a given vm area with different sets of
	 * protection bits, we simply mark the entire vm area as
	 * PAGE_KERNEL_EXEC for now (i.e., the union of all the protection
	 * bits). Given that the kernel itself does something similar while
	 * loading modules, this should be a reasonable workaround for now.
	 * In the future, we should set the protection bits to strictly
	 * adhere to what has been requested in the mapInfo parameter.
	 */
	prot = PAGE_KERNEL_EXEC;

	retval = map_vm_area(vm->wspHkvaArea, prot, &pagesPtr);
	if (retval < 0)
		goto err;

	kfree(pages);

	return (HKVA)(vm->wspHkvaArea->addr) + pageIndex * PAGE_SIZE;

err:
	if (dummyPage) {
		free_pages(dummyPage, 0);
		vm->wspHKVADummyPage = (HKVA)NULL;
	}

	kfree(pages);

	free_vm_area(vm->wspHkvaArea);
	vm->wspHkvaArea = (struct vm_struct *)NULL;

	return 0;
}
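
/*
 * Worked example of the dummy-page alignment above, assuming 4 KB pages
 * and 1 MB sections: if __get_vm_area() hands back start == 0xbf0ff000,
 * then start and start + PAGE_SIZE lie in different 1 MB segments, so
 * pageIndex == 1 and the dummy page fills slot 0, pushing both world
 * switch pages wholly into the section starting at 0xbf100000. If start
 * is already well inside a section, pageIndex == 0 and the dummy page
 * simply backs the unused third slot.
 */

/**
 * @brief Undo MapWSPHKVA: release the reserved HKVA area and the
 *        alignment dummy page, if present.
 *
 * @param vm which VM the HKVA area was mapped for
 */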
static void
UnmapWSPHKVA(struct MvpkmVM *vm)
{
	if (vm->wspHkvaArea)
		free_vm_area(vm->wspHkvaArea);

	if (vm->wspHKVADummyPage) {
		free_pages(vm->wspHKVADummyPage, 0);
		vm->wspHKVADummyPage = (HKVA)NULL;
	}
}
/**
 * @brief Clean and release locked pages.
 *
 * @param lp Reference to the locked pages
 */
static void
FreeLockedPages(struct LockedPage *lp)
{
	struct page *page;
	int count;

	page = pfn_to_page(lp->page.mpn);
	count = page_count(page);

	if (count == 0) {
		pr_err("%s: found locked page with 0 reference (mpn %05x)\n",
		       __func__, lp->page.mpn);
		return;
	}

	if (count == 1) {
		int i;

		/*
		 * There is no other user for this page, clean it.
		 *
		 * We don't bother checking whether the page was highmem or
		 * not, clear_highpage works for both.
		 * We clear the content of the page, and rely on the fact
		 * that the previous worldswitch has cleaned the potential
		 * VIVT I-CACHE.
		 */
		for (i = 0; i < (1 << lp->page.order); i++)
			clear_highpage(page + i);
	} else if (lp->page.forRegion != MEMREGION_MAINMEM) {
		pr_warn("%s: mpn 0x%05x for region %d is still in use\n",
			__func__, lp->page.mpn, lp->page.forRegion);
	}

	__free_pages(page, lp->page.order);
}
/*********************************************************************
 *
 * Communicate with monitor
 *
 *********************************************************************/

/**
 * @brief Register a new monitor page.
 *
 * @param vm which virtual machine we're running
 *
 * @return 0: successful<br>
 *         else: -errno
 */
static int
SetupMonitor(struct MvpkmVM *vm)
{
	int retval;
	WorldSwitchPage *wsp = vm->wsp;

#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 40501
#define USE_ARCH_EXTENSION_SEC 1
#else
#define USE_ARCH_EXTENSION_SEC 0
#endif

	if (!wsp || wsp->wspHKVA != (HKVA)wsp)
		return -EINVAL;

	retval = Mksck_WspInitialize(vm);
	if (retval)
		return retval;

	vm->kobj.kset = mvpkmKSet;
	retval = kobject_init_and_add(&vm->kobj, &mvpkmKType,
				      NULL, "%d", wsp->guestId);
	if (retval)
		goto error;

	/*
	 * Get a reference to this module such that it cannot be unloaded
	 * until our kobject's release function completes.
	 */
	__module_get(THIS_MODULE);
	vm->haveKObj = true;

	/*
	 * Caution: From here on, if we fail, we must not call kobject_put()
	 * on vm->kobj since that may (and eventually will) deallocate 'vm'.
	 * Unregistering VM ksets on failures is fine and should be done for
	 * proper ref counting.
	 */
	vm->devicesKSet = kset_create_and_add("devices", NULL, &vm->kobj);
	if (!vm->devicesKSet) {
		retval = -ENOMEM;
		goto error;
	}

	vm->miscKSet = kset_create_and_add("misc", NULL, &vm->kobj);
	if (!vm->miscKSet) {
		kset_unregister(vm->devicesKSet);
		vm->devicesKSet = NULL;
		retval = -ENOMEM;
		goto error;
	}

	down_write(&vm->wspSem);

	/*
	 * The VE monitor needs to issue an SMC to bootstrap Hyp mode.
	 */
	if (wsp->monType == MONITOR_TYPE_VE) {
		/*
		 * Here we assemble the monitor's HMAIR0 based on
		 * wsp->memAttr. We map from the inner/outer normal page
		 * cacheability attributes obtained from
		 * DetermineCacheabilityAttribs to the format required in
		 * 4.2.8 ARM PRD03-GENC-008469 13.0 (see this document for
		 * the magic numbers).
		 *
		 * Where a choice is available, we opt for read and/or
		 * write allocation.
		 */
		static const uint32 normalCacheAttr2MAIR[4] = {
			0x4, 0xf, 0xa, 0xe };
		uint32 hmair0 =
			((normalCacheAttr2MAIR[wsp->memAttr.innerCache] |
			  (normalCacheAttr2MAIR[wsp->memAttr.outerCache] << 4))
			 << 8 * MVA_MEMORY) |
			(0x4 << 8 * MVA_DEVICE);
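
		/*
		 * For example, if both innerCache and outerCache were 1, the
		 * table above yields 0xf | (0xf << 4) == 0xff in the
		 * MVA_MEMORY attribute byte, while the MVA_DEVICE byte
		 * always holds the Device memory encoding (0x4).
		 */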
		/*
		 * See B4.1.74 ARM DDI 0406C-2c for the HTCR magic.
		 */
		uint32 htcr =
			0x80000000 |
			(wsp->memAttr.innerCache << 8) |
			(wsp->memAttr.outerCache << 10) |
			(wsp->memAttr.share << 12);

		/**
		 * @knownjira{MVP-377}
		 * Set HSCTLR to enable MMU and caches. We should really run
		 * the monitor WXN, in non-MVP_DEVEL builds.
		 * See 13.18 ARM PRD03-GENC-008353 11.0 for the magic.
		 */
		static const uint32 hsctlr = 0x30c5187d;

		register uint32 r0 asm("r0") = wsp->monVA.excVec;
		register uint32 r1 asm("r1") = wsp->regSave.ve.mHTTBR;
		register uint32 r2 asm("r2") = htcr;
		register uint32 r3 asm("r3") = hmair0;
		register uint32 r4 asm("r4") = hsctlr;

		asm volatile (
#if USE_ARCH_EXTENSION_SEC
			".arch_extension sec\n\t"
#endif
			"smc 0"
			:
			: "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4)
			: "memory"
		);
	}

	/*
	 * Initialize guest wait-for-interrupt waitqueue.
	 */
	init_waitqueue_head(&vm->wfiWaitQ);

	MonitorTimer_Setup(vm);

#ifdef CONFIG_HAS_WAKELOCK
	wake_lock_init(&vm->wakeLock, WAKE_LOCK_SUSPEND, "mvpkm");
#endif

	wsp->mvpkmVersion = MVP_VERSION_CODE;

	up_write(&vm->wspSem);

	/*
	 * Ensure coherence of monitor loading and page tables.
	 */
	flush_cache_all();

	return 0;

error:
	Mksck_WspRelease(wsp);
	vm->wsp = NULL;

	return retval;
}
/**
 * @brief Dummy wrapper to drop the info parameter.
 *
 * @param info ignored
 */
static void
FlushAllCpuCaches(void *info)
{
	flush_cache_all();
}
/**
 * @brief Return to where the monitor called worldswitch.
 *
 * @param vm which virtual machine we're running
 *
 * @return 0: successful, just call back when ready<br>
 *         1: successful, process code in WSP_Params(wsp)->callno<br>
 *         else: -errno
 */
static int
RunMonitor(struct MvpkmVM *vm)
{
	int ii;
	unsigned long flags;
	WorldSwitchPage *wsp = vm->wsp;
	int retval = 0;
	unsigned int freq = -1;

	ASSERT(wsp);

#ifdef CONFIG_HAS_WAKELOCK
	wake_lock(&vm->wakeLock);
#endif

	/*
	 * Set VCPUThread affinity.
	 */
	if (cpumask_intersects(to_cpumask(vcpuAffinity), cpu_active_mask))
		set_cpus_allowed_ptr(current, to_cpumask(vcpuAffinity));

	/*
	 * Record the current task structure, so an ABORT will know whom to
	 * wake.
	 */
	down_write(&vm->monThreadTaskSem);
	vm->monThreadTask = get_current();
	up_write(&vm->monThreadTaskSem);
	/*
	 * Keep going as long as the monitor is in critical section or
	 * there are no pending signals such as SIGINT or SIGKILL. Block
	 * interrupts before checking so any IPI sent will remain pending
	 * if our check just misses detecting the signal.
	 */
	local_irq_save(flags);

	while (wsp->critSecCount > 0 ||
	       (!signal_pending(current) &&
		!(ATOMIC_GETO(wsp->hostActions) & ACTION_ABORT))) {
		cpumask_set_cpu(smp_processor_id(), &inMonitor);

		/*
		 * ARMv7 performance counters are per CPU core and might be
		 * disabled over CPU core sleep if there is nothing else in
		 * the system to re-enable them, so now that we have been
		 * allocated a CPU core to run the guest, enable them and in
		 * particular the TSC (CCNT), which is used for monitor
		 * timing between world switches.
		 */
		{
			uint32 pmnc;
			uint32 pmcnt;

			/* Make sure that performance counters are enabled. */
			ARM_MRC_CP15(PERF_MON_CONTROL_REGISTER, pmnc);
			if ((pmnc & (ARM_PMNC_E | ARM_PMNC_D)) !=
			    (ARM_PMNC_E)) {
				pmnc |= ARM_PMNC_E;	/* Enable TSC */
				/* Disable cycle count divider */
				pmnc &= ~ARM_PMNC_D;
				ARM_MCR_CP15(PERF_MON_CONTROL_REGISTER, pmnc);
			}

			/* Make sure that the CCNT is enabled. */
			ARM_MRC_CP15(PERF_MON_COUNT_SET, pmcnt);
			if ((pmcnt & ARM_PMCNT_C) != ARM_PMCNT_C) {
				pmcnt |= ARM_PMCNT_C;
				ARM_MCR_CP15(PERF_MON_COUNT_SET, pmcnt);
			}
		}
		/*
		 * Update the TSC to RATE64 ratio.
		 */
		{
			struct TscToRate64Cb ttr;

			if (CpuFreqUpdate(&freq, &ttr)) {
				wsp->tscToRate64Mult = ttr.mult;
				wsp->tscToRate64Shift = ttr.shift;
			}
		}
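
		/*
		 * The mult/shift pair is presumably the usual fixed-point
		 * rate conversion, i.e. conversions of the form
		 * ns = (tscDelta * tscToRate64Mult) >> tscToRate64Shift
		 * can be done without a division (MVP_TIMER_RATE64 ==
		 * NSEC_PER_SEC, per the compile-time assert below).
		 */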
		/*
		 * Save the time of day for the monitor's timer facility.
		 * The timing facility in the vmm needs to compute current
		 * time in the host Linux's time representation. It uses
		 * the formula:
		 *
		 *	now = wsp->switchedAt64 +
		 *	      (uint32)(TSC_READ() - wsp->lowerTSC)
		 *
		 * Read the timestamp counter *immediately after* ktime_get()
		 * as that will give the most consistent offset between
		 * reading the hardware clock register in ktime_get() and
		 * reading the hardware timestamp counter with TSC_READ().
		 */
		ASSERT_ON_COMPILE(MVP_TIMER_RATE64 == NSEC_PER_SEC);
		{
			ktime_t now = ktime_get();

			TSC_READ(wsp->switchedAtTSC);
			wsp->switchedAt64 = ktime_to_ns(now);
		}
		/*
		 * Save host FPU contents and load monitor contents.
		 */
		SWITCH_VFP_TO_MONITOR;

		/*
		 * Call into the monitor to run guest instructions until it
		 * wants us to do something for it. Note that any hardware
		 * interrupt request will cause it to volunteer.
		 */
		switch (wsp->monType) {
		case MONITOR_TYPE_LPV: {
			uint32 hostVBAR;

			ARM_MRC_CP15(VECTOR_BASE, hostVBAR);
			(*wsp->switchToMonitor)(&wsp->regSave);
			ARM_MCR_CP15(VECTOR_BASE, hostVBAR);
			break;
		}
		case MONITOR_TYPE_VE: {
			register uint32 r1 asm("r1") = wsp->regSave.ve.mHTTBR;

			asm volatile (
				".word " MVP_STRINGIFY(ARM_INSTR_HVC_A1_ENC(0))
				: "=r" (r1) : "r" (r1) : "r0", "r2", "memory"
			);
			break;
		}
		default:
			FATAL();
		}

		/*
		 * Save monitor FPU contents and load host contents.
		 */
		SWITCH_VFP_TO_HOST;

		cpumask_clear_cpu(smp_processor_id(), &inMonitor);

		/*
		 * Re-enable local interrupts now that we are back in the
		 * host world.
		 */
		local_irq_restore(flags);

		/*
		 * Maybe the monitor wrote some messages to monitor->host
		 * sockets. This will wake the corresponding host threads to
		 * receive them.
		 */
		/**
		 * @todo This lousy loop is in the critical path. It should
		 * be changed to some faster algorithm to wake blocked host
		 * sockets.
		 */
		for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) {
			if (wsp->isPageMapped[ii])
				Mksck_WakeBlockedSockets(
					MksckPage_GetFromIdx(ii));
		}
		switch (WSP_Params(wsp)->callno) {
		case WSCALL_ACQUIRE_PAGE: {
			uint32 i;

			for (i = 0; i < WSP_Params(wsp)->pages.pages; ++i) {
				MPN mpn = AllocZeroedFreePages(vm,
					WSP_Params(wsp)->pages.order,
					true,
					WSP_Params(wsp)->pages.forRegion,
					NULL);

				if (mpn == 0) {
					pr_err("WSCALL_ACQUIRE_PAGE: no order "
					       "%u pages available\n",
					       WSP_Params(wsp)->pages.order);
					WSP_Params(wsp)->pages.pages = i;
					break;
				}

				WSP_Params(wsp)->pages.mpns[i] = mpn;
			}
			break;
		}
		case WSCALL_RELEASE_PAGE: {
			uint32 i;

			for (i = 0; i < WSP_Params(wsp)->pages.pages; ++i) {
				if (!LockedListDel(vm,
					WSP_Params(wsp)->pages.mpns[i])) {
					WSP_Params(wsp)->pages.pages = i;
					break;
				}
			}
			break;
		}
		case WSCALL_MUTEXLOCK:
			retval =
			    Mutex_Lock((void *)WSP_Params(wsp)->mutex.mtxHKVA,
				       WSP_Params(wsp)->mutex.mode);
			if (retval < 0) {
				WSP_Params(wsp)->mutex.ok = false;
				goto monitorExit;
			}

			/*
			 * The locking succeeded. From this point on the
			 * monitor is in critical section. Even if an
			 * interrupt comes right here, it must return to the
			 * monitor to unlock the mutex.
			 */
			wsp->critSecCount++;
			WSP_Params(wsp)->mutex.ok = true;
			break;
		case WSCALL_MUTEXUNLOCK:
			Mutex_Unlock((void *)WSP_Params(wsp)->mutex.mtxHKVA,
				     WSP_Params(wsp)->mutex.mode);
			break;
		case WSCALL_MUTEXUNLSLEEP:
			/*
			 * The vcpu has just come back from the monitor.
			 * During the transition interrupts were disabled.
			 * Above, however, interrupts were enabled again and
			 * it is possible that a context switch happened into
			 * a thread (serve_vmx) that instructed the vcpu
			 * thread to abort. After returning to this thread
			 * the vcpu may enter a sleep below, never to return
			 * from it. To avoid this deadlock we need to test
			 * the abort flag in Mutex_UnlSleepTest.
			 */
			retval = Mutex_UnlSleepTest(
				(void *)WSP_Params(wsp)->mutex.mtxHKVA,
				WSP_Params(wsp)->mutex.mode,
				WSP_Params(wsp)->mutex.cvi,
				&wsp->hostActions,
				ACTION_ABORT);
			if (retval < 0)
				goto monitorExit;
			break;
		case WSCALL_MUTEXUNLWAKE:
			Mutex_UnlWake((void *)WSP_Params(wsp)->mutex.mtxHKVA,
				      WSP_Params(wsp)->mutex.mode,
				      WSP_Params(wsp)->mutex.cvi,
				      WSP_Params(wsp)->mutex.all);
			break;
		/*
		 * The monitor wants us to block (allowing other host threads
		 * to run) until an async message is waiting for the monitor
		 * to process.
		 *
		 * If MvpkmWaitForInt() returns an error, it should only be
		 * because there is another signal pending (such as SIGINT).
		 * So we pretend it completed normally, as the monitor is
		 * ready to be called again (it will see no messages to
		 * process and wait again), and return to user mode so the
		 * signals can be processed.
		 */
		case WSCALL_WAIT:
#ifdef CONFIG_HAS_WAKELOCK
			if (WSP_Params(wsp)->wait.suspendMode) {
				/*
				 * Guest has ok'ed suspend mode, so release
				 * the SUSPEND wakelock.
				 */
				wake_unlock(&vm->wakeLock);
				retval = MvpkmWaitForInt(vm, true);
				wake_lock(&vm->wakeLock);
				WSP_Params(wsp)->wait.suspendMode = 0;
			} else {
				/*
				 * Guest has asked for WFI, not suspend, so
				 * keep holding the SUSPEND wakelock.
				 */
				retval = MvpkmWaitForInt(vm, false);
			}
#else
			retval =
			    MvpkmWaitForInt(vm,
					    WSP_Params(wsp)->wait.suspendMode);
#endif
			if (retval < 0)
				goto monitorExit;
			break;
		/*
		 * The only reason the monitor returned was because there was
		 * a pending hardware interrupt. The host serviced and
		 * cleared that interrupt when we enabled interrupts above.
		 * Now we call the scheduler in case that interrupt woke
		 * another thread; we want to allow that thread to run before
		 * returning to do more guest code.
		 */
		case WSCALL_IRQ:
			break;
		case WSCALL_GET_PAGE_FROM_VMID: {
			MksckPage *mksckPage;

			mksckPage = MksckPage_GetFromVmIdIncRefc(
				WSP_Params(wsp)->pageMgmnt.vmId);

			if (mksckPage) {
				int ii;
				int pageIndex;

				WSP_Params(wsp)->pageMgmnt.found = true;

				for (ii = 0; ii < MKSCKPAGE_TOTAL; ii++) {
					WSP_Params(wsp)->pageMgmnt.mpn[ii] =
					    vmalloc_to_pfn(
						(void *)(((HKVA)mksckPage) +
							 ii * PAGE_SIZE));
				}

				pageIndex = MKSCK_VMID2IDX(mksckPage->vmId);
				ASSERT(!wsp->isPageMapped[pageIndex]);
				wsp->isPageMapped[pageIndex] = true;
			} else {
				WSP_Params(wsp)->pageMgmnt.found = false;
			}
			break;
		}
		case WSCALL_REMOVE_PAGE_FROM_VMID: {
			MksckPage *mksckPage;
			int pageIndex;

			mksckPage = MksckPage_GetFromVmId(
				WSP_Params(wsp)->pageMgmnt.vmId);

			pageIndex = MKSCK_VMID2IDX(mksckPage->vmId);
			ASSERT(wsp->isPageMapped[pageIndex]);
			wsp->isPageMapped[pageIndex] = false;

			MksckPage_DecRefc(mksckPage);
			break;
		}
		/*
		 * Read current wallclock time.
		 */
		case WSCALL_READTOD: {
			struct timeval nowTV;

			do_gettimeofday(&nowTV);
			WSP_Params(wsp)->tod.now = nowTV.tv_sec;
			WSP_Params(wsp)->tod.nowusec = nowTV.tv_usec;
			break;
		}
		case WSCALL_LOG: {
			int len = strlen(WSP_Params(wsp)->log.messg);

			pr_info("VMM: %s%s",
				WSP_Params(wsp)->log.messg,
				(WSP_Params(wsp)->log.messg[len - 1] == '\n') ?
				"" : "\n");
			break;
		}
		case WSCALL_ABORT:
			retval = WSP_Params(wsp)->abort.status;
			goto monitorExit;
		case WSCALL_QP_GUEST_ATTACH: {
			int32 rc;
			QPInitArgs args;
			uint32 base;
			uint32 nrPages;

			args.id = WSP_Params(wsp)->qp.id;
			args.capacity = WSP_Params(wsp)->qp.capacity;
			args.type = WSP_Params(wsp)->qp.type;
			base = WSP_Params(wsp)->qp.base;
			nrPages = WSP_Params(wsp)->qp.nrPages;

			rc = QP_GuestAttachRequest(vm, &args, base, nrPages);

			WSP_Params(wsp)->qp.rc = rc;
			WSP_Params(wsp)->qp.id = args.id;
			break;
		}
		case WSCALL_QP_NOTIFY: {
			QPInitArgs args;

			args.id = WSP_Params(wsp)->qp.id;
			args.capacity = WSP_Params(wsp)->qp.capacity;
			args.type = WSP_Params(wsp)->qp.type;

			WSP_Params(wsp)->qp.rc = QP_NotifyListener(&args);
			break;
		}
		case WSCALL_MONITOR_TIMER:
			MonitorTimer_Request(&vm->monTimer,
					     WSP_Params(wsp)->timer.when64);
			break;
		case WSCALL_COMM_SIGNAL:
			Mvpkm_CommEvSignal(&WSP_Params(wsp)->commEvent.transpID,
					   WSP_Params(wsp)->commEvent.event);
			break;
		case WSCALL_FLUSH_ALL_DCACHES:
			/*
			 * Broadcast the DCache flush request to all cores and
			 * block while waiting for all of them to get done.
			 */
			on_each_cpu(FlushAllCpuCaches, NULL, 1);
			break;
		default:
			retval = -EPIPE;
			goto monitorExit;
		}
		/*
		 * The params.callno callback was handled in kernel mode and
		 * completed successfully. Repeat for another call without
		 * returning to user mode, unless there are signals pending.
		 *
		 * But first, call the Linux scheduler to switch threads if
		 * there is some other thread Linux wants to run now.
		 */
		if (need_resched())
			schedule();

		/*
		 * Check if the cpus allowed mask has to be updated.
		 * Updating it must be done outside of an atomic context.
		 */
		if (cpumask_intersects(to_cpumask(vcpuAffinity),
				       cpu_active_mask) &&
		    !cpumask_equal(to_cpumask(vcpuAffinity),
				   &current->cpus_allowed))
			set_cpus_allowed_ptr(current, to_cpumask(vcpuAffinity));

		local_irq_save(flags);
	}

	/*
	 * There are signals pending so don't try to do any more
	 * monitor/guest stuff. But since we were at the point of just about
	 * to run the monitor, return success status as user mode can simply
	 * call us back to run the monitor again.
	 */
	local_irq_restore(flags);

monitorExit:
	ASSERT(wsp->critSecCount == 0);

	if (ATOMIC_GETO(wsp->hostActions) & ACTION_ABORT) {
		PRINTK("Monitor has ABORT flag set.\n");
		retval = ExitStatusHostRequest;
	}

	if (retval == ExitStatusHostRequest && vm->watchdogTriggered)
		retval = ExitStatusVMMFatalKnown;

#ifdef CONFIG_HAS_WAKELOCK
	wake_unlock(&vm->wakeLock);
#endif

	down_write(&vm->monThreadTaskSem);
	vm->monThreadTask = NULL;
	up_write(&vm->monThreadTaskSem);

	return retval;
}
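
/*
 * Sketch of the expected user-mode loop around MVPKM_RUN_MONITOR
 * (hypothetical vmx-side code, not part of this module). Per RunMonitor's
 * contract above: 0 means signals interrupted us and the caller can simply
 * re-enter once they are handled, 1 asks user mode to process
 * WSP_Params(wsp)->callno, and anything else is an exit status or -errno.
 *
 *	for (;;) {
 *		int status = ioctl(vmFd, MVPKM_RUN_MONITOR, NULL);
 *
 *		if (status == 0)
 *			continue;	// signals delivered on return
 *		if (status == 1) {
 *			// handle WSP_Params(wsp)->callno in user mode
 *			continue;
 *		}
 *		break;			// exit status, e.g. from WSCALL_ABORT
 *	}
 */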
/**
 * @brief Guest is waiting for interrupts, sleep if necessary.
 *
 * @param vm which virtual machine we're running
 * @param suspend is the guest entering suspend or just WFI?
 *
 * @return 0: woken up, hostActions should have pending events<br>
 *         -ERESTARTSYS: broke out because other signals are pending
 *
 * This function is called in the VCPU context after the world switch to
 * wait for an incoming message. If any message gets queued to this VCPU,
 * the sender will wake us up.
 */
int
MvpkmWaitForInt(struct MvpkmVM *vm,
		_Bool suspend)
{
	WorldSwitchPage *wsp = vm->wsp;
	wait_queue_head_t *q = &vm->wfiWaitQ;

	if (suspend) {
		return wait_event_interruptible(*q,
			ATOMIC_GETO(wsp->hostActions) != 0);
	} else {
		int ret;

		ret = wait_event_interruptible_timeout(*q,
			ATOMIC_GETO(wsp->hostActions) != 0, 10*HZ);

		if (ret == 0)
			pr_warn("MvpkmWaitForInt: guest stuck for 10s in "
				"WFI! (hostActions %08x)\n",
				ATOMIC_GETO(wsp->hostActions));

		return ret > 0 ? 0 : ret;
	}
}
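
/*
 * MvpkmWaitForInt() pairs with Mvpkm_WakeGuest() below: the waker first
 * sets bits in wsp->hostActions and only then wakes wfiWaitQ. Since
 * wait_event re-checks its condition before sleeping, no wakeup can be
 * lost between the flag update and the sleep.
 */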
/**
 * @brief Force the guest to evaluate its hostActions flag field.
 *
 * @param vm which guest needs waking
 * @param why why should the guest be woken up?
 *
 * This function updates the hostActions flag field and wakes up the guest
 * as required so that it can evaluate it. The guest could be executing
 * guest code in an SMP system, in which case we send an IPI; or it could
 * be sleeping, in which case we wake it up.
 */
void
Mvpkm_WakeGuest(struct MvpkmVM *vm,
		int why)
{
	ASSERT(why != 0);

	/* Set the host action. */
	if (ATOMIC_ORO(vm->wsp->hostActions, why) & why)
		/* Guest has already been woken up, no need to do it again. */
		return;

	/*
	 * The VCPU is certainly in a 'wait for interrupt' wait. Wake it up!
	 */
#ifdef CONFIG_HAS_WAKELOCK
	/*
	 * To prevent the system from entering suspend mode before the
	 * monitor has had a chance to be scheduled, we hold the VM wakelock
	 * from now on. As wakelocks are not reference counted, it is not an
	 * issue to take a wake_lock twice in a row.
	 */
	wake_lock(&vm->wakeLock);
#endif

	/*
	 * On a UP system, we ensure the monitor thread isn't blocked.
	 *
	 * On an MP system the other CPU might be running the guest; the
	 * kick is a no-op on UP.
	 *
	 * When the guest is running, it is an invariant that
	 * monThreadTaskSem is not held as a write lock, so we should not
	 * fail to acquire the lock. Mvpkm_WakeGuest may be called from an
	 * atomic context, so we can't sleep here.
	 */
	if (down_read_trylock(&vm->monThreadTaskSem)) {
		if (vm->monThreadTask) {
			wake_up_process(vm->monThreadTask);
			kick_process(vm->monThreadTask);
		}
		up_read(&vm->monThreadTaskSem);
	} else {
		pr_warn("Unexpected failure to acquire monThreadTaskSem!\n");
	}
}