cgroup-v1.c 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340
  1. #include "cgroup-internal.h"
  2. #include <linux/ctype.h>
  3. #include <linux/kmod.h>
  4. #include <linux/sort.h>
  5. #include <linux/delay.h>
  6. #include <linux/mm.h>
  7. #include <linux/sched/signal.h>
  8. #include <linux/sched/task.h>
  9. #include <linux/magic.h>
  10. #include <linux/slab.h>
  11. #include <linux/vmalloc.h>
  12. #include <linux/delayacct.h>
  13. #include <linux/pid_namespace.h>
  14. #include <linux/cgroupstats.h>
  15. #include <trace/events/cgroup.h>
  16. #ifdef CONFIG_MTK_TASK_TURBO
  17. #include <mt-plat/turbo_common.h>
  18. #endif
/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read
 * calls.  Expiring in the middle is a performance problem, not a
 * correctness one.  1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY HZ

/*
 * Controllers blocked by the commandline in v1.  Bit (1 << ssid) set means
 * the subsystem with that id is blocked; see cgroup1_ssid_disabled().
 */
static u16 cgroup_no_v1_mask;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);
  38. bool cgroup1_ssid_disabled(int ssid)
  39. {
  40. return cgroup_no_v1_mask & (1 << ssid);
  41. }
  42. /**
  43. * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
  44. * @from: attach to all cgroups of a given task
  45. * @tsk: the task to be attached
  46. */
  47. int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
  48. {
  49. struct cgroup_root *root;
  50. int retval = 0;
  51. mutex_lock(&cgroup_mutex);
  52. percpu_down_write(&cgroup_threadgroup_rwsem);
  53. for_each_root(root) {
  54. struct cgroup *from_cgrp;
  55. if (root == &cgrp_dfl_root)
  56. continue;
  57. spin_lock_irq(&css_set_lock);
  58. from_cgrp = task_cgroup_from_root(from, root);
  59. spin_unlock_irq(&css_set_lock);
  60. retval = cgroup_attach_task(from_cgrp, tsk, false);
  61. if (retval)
  62. break;
  63. }
  64. percpu_up_write(&cgroup_threadgroup_rwsem);
  65. mutex_unlock(&cgroup_mutex);
  66. return retval;
  67. }
  68. EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that, if a task is forking while being migrated, the new child
 * is guaranteed to be either visible in the source cgroup after the
 * parent's migration is complete or put into the target cgroup.  No task
 * can slip out of migration through forking.
 *
 * Return: 0 on success; -EINVAL if @to is on the default hierarchy;
 * otherwise the error returned by cgroup_migrate_vet_dst(),
 * cgroup_migrate_prepare_dst() or cgroup_migrate().
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgrp_cset_link *link;
	struct css_task_iter it;
	struct task_struct *task;
	int ret;

	if (cgroup_on_dfl(to))
		return -EINVAL;

	ret = cgroup_migrate_vet_dst(to);
	if (ret)
		return ret;

	mutex_lock(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &from->cset_links, cset_link)
		cgroup_migrate_add_src(link->cset, to, &mgctx);
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_err;

	/*
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
	 * ->can_attach() fails.
	 */
	do {
		/*
		 * Restart the iterator on every pass and pick the first
		 * task that is not already exiting.
		 */
		css_task_iter_start(&from->self, 0, &it);

		do {
			task = css_task_iter_next(&it);
		} while (task && (task->flags & PF_EXITING));

		/* pin the task so it stays valid after the iterator ends */
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			ret = cgroup_migrate(task, false, &mgctx);
			if (!ret)
				trace_cgroup_transfer_tasks(to, task, false);
			put_task_struct(task);
		}
	} while (task && !ret);
out_err:
	/* reached on both the success and the failure path */
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
  127. /*
  128. * Stuff for reading the 'tasks'/'procs' files.
  129. *
  130. * Reading this file can return large amounts of data if a cgroup has
  131. * *lots* of attached tasks. So it may need several calls to read(),
  132. * but we cannot guarantee that the information we produce is correct
  133. * unless we produce it entirely atomically.
  134. *
  135. */
/*
 * Which pidlist file are we talking about?  The value is stored in the
 * ->private field of the matching cftype entries and is part of the
 * pidlist lookup key.
 */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,	/* "cgroup.procs": one tgid per process */
	CGROUP_FILE_TASKS,	/* "tasks": one pid per task */
};
/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks").  We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted.  doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids, sorted by pidlist_array_load() */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup (cgroup->pidlists) */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* for delayed destruction, see cgroup_pidlist_destroy_work_fn() */
	struct delayed_work destroy_dwork;
};
  164. /*
  165. * The following two functions "fix" the issue where there are more pids
  166. * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
  167. * TODO: replace with a kernel-wide solution to this problem
  168. */
  169. #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
  170. static void *pidlist_allocate(int count)
  171. {
  172. if (PIDLIST_TOO_LARGE(count))
  173. return vmalloc(count * sizeof(pid_t));
  174. else
  175. return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
  176. }
/*
 * Release a buffer obtained from pidlist_allocate().  kvfree() is used, so
 * the caller does not need to know which allocator produced @p.
 */
static void pidlist_free(void *p)
{
	kvfree(p);
}
/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 *
 * Every pidlist on @cgrp->pidlists gets its destroy work re-queued with a
 * zero delay, then the destroy workqueue is flushed.
 */
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	/* the flushed work items are expected to have unlinked every entry */
	BUG_ON(!list_empty(&cgrp->pidlists));
}
/*
 * Delayed-work callback that tears down one pidlist: unlinks it from its
 * owner cgroup, frees the pid array, drops the pid namespace reference and
 * frees the pidlist itself.  Nothing is done if the work was queued again
 * in the meantime.
 */
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	/* @l is still dereferenced for the unlock, so free it afterwards */
	mutex_unlock(&l->owner->pidlist_mutex);
	kfree(tofree);
}
  215. /*
  216. * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
  217. * Returns the number of unique elements.
  218. */
  219. static int pidlist_uniq(pid_t *list, int length)
  220. {
  221. int src, dest = 1;
  222. /*
  223. * we presume the 0th element is unique, so i starts at 1. trivial
  224. * edge cases first; no work needs to be done for either
  225. */
  226. if (length == 0 || length == 1)
  227. return length;
  228. /* src and dest walk down the list; dest counts unique elements */
  229. for (src = 1; src < length; src++) {
  230. /* find next unique element */
  231. while (list[src] == list[src-1]) {
  232. src++;
  233. if (src == length)
  234. goto after;
  235. }
  236. /* dest always points to where the next unique element goes */
  237. list[dest] = list[src];
  238. dest++;
  239. }
  240. after:
  241. return dest;
  242. }
  243. /*
  244. * The two pid files - task and cgroup.procs - guaranteed that the result
  245. * is sorted, which forced this whole pidlist fiasco. As pid order is
  246. * different per namespace, each namespace needs differently sorted list,
  247. * making it impossible to use, for example, single rbtree of member tasks
  248. * sorted by task pointer. As pidlists can be fairly large, allocating one
  249. * per open file is dangerous, so cgroup had to implement shared pool of
  250. * pidlists keyed by cgroup and namespace.
  251. */
  252. static int cmppid(const void *a, const void *b)
  253. {
  254. return *(pid_t *)a - *(pid_t *)b;
  255. }
  256. static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
  257. enum cgroup_filetype type)
  258. {
  259. struct cgroup_pidlist *l;
  260. /* don't need task_nsproxy() if we're looking at ourself */
  261. struct pid_namespace *ns = task_active_pid_ns(current);
  262. lockdep_assert_held(&cgrp->pidlist_mutex);
  263. list_for_each_entry(l, &cgrp->pidlists, links)
  264. if (l->key.type == type && l->key.ns == ns)
  265. return l;
  266. return NULL;
  267. }
  268. /*
  269. * find the appropriate pidlist for our purpose (given procs vs tasks)
  270. * returns with the lock on that pidlist already held, and takes care
  271. * of the use count, or returns NULL with no locks held if we're out of
  272. * memory.
  273. */
  274. static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
  275. enum cgroup_filetype type)
  276. {
  277. struct cgroup_pidlist *l;
  278. lockdep_assert_held(&cgrp->pidlist_mutex);
  279. l = cgroup_pidlist_find(cgrp, type);
  280. if (l)
  281. return l;
  282. /* entry not found; create a new one */
  283. l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
  284. if (!l)
  285. return l;
  286. INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
  287. l->key.type = type;
  288. /* don't need task_nsproxy() if we're looking at ourself */
  289. l->key.ns = get_pid_ns(task_active_pid_ns(current));
  290. l->owner = cgrp;
  291. list_add(&l->links, &cgrp->pidlists);
  292. return l;
  293. }
  294. /**
  295. * cgroup_task_count - count the number of tasks in a cgroup.
  296. * @cgrp: the cgroup in question
  297. */
  298. int cgroup_task_count(const struct cgroup *cgrp)
  299. {
  300. int count = 0;
  301. struct cgrp_cset_link *link;
  302. spin_lock_irq(&css_set_lock);
  303. list_for_each_entry(link, &cgrp->cset_links, cset_link)
  304. count += link->cset->nr_tasks;
  305. spin_unlock_irq(&css_set_lock);
  306. return count;
  307. }
/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 *
 * Builds a fresh, sorted pid array for @cgrp, installs it in the pidlist
 * matching @type (creating that pidlist if needed) and stores the pidlist
 * in *@lp.  The caller must hold @cgrp->pidlist_mutex.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails.
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct css_task_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
	array = pidlist_allocate(length);
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
	css_task_iter_start(&cgrp->self, 0, &it);
	while ((tsk = css_task_iter_next(&it))) {
		/* array is full; any further tasks are dropped (see above) */
		if (unlikely(n == length))
			break;
		/* mtk: don't get pid when proc/task killed */
		if ((SIGNAL_GROUP_EXIT & tsk->signal->flags) ||
		    (PF_EXITING & tsk->flags))
			continue;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	css_task_iter_end(&it);
	/* from here on only the entries actually filled in count */
	length = n;
	/* now sort & (if procs) strip out duplicates */
	sort(array, length, sizeof(pid_t), cmppid, NULL);
	if (type == CGROUP_FILE_PROCS)
		length = pidlist_uniq(array, length);

	l = cgroup_pidlist_find_create(cgrp, type);
	if (!l) {
		pidlist_free(array);
		return -ENOMEM;
	}

	/* store array, freeing old if necessary */
	pidlist_free(l->list);
	l->list = array;
	l->length = length;
	*lp = l;
	return 0;
}
/*
 * seq_file methods for the tasks/procs files.  The seq_file position is the
 * next pid to display; the seq_file iterator is a pointer to the pid
 * in the cgroup->l->list array.
 *
 * cgroup_pidlist_start() takes the cgroup's pidlist_mutex and does not
 * release it on any return path; cgroup_pidlist_stop() is the function
 * that unlocks it.
 * NOTE(review): this relies on seq_file calling ->stop() even when
 * ->start() returns an ERR_PTR or NULL - confirm against seq_file.
 */
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start).  Use a binary-search to find the
	 * next pid to display, if any
	 */
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct cgroup_pidlist *l;
	enum cgroup_filetype type = seq_cft(s)->private;
	int index = 0, pid = *pos;
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @of->priv indicates that this isn't the first start()
	 * after open.  If the matching pidlist is around, we can use that.
	 * Look for it.  Note that @of->priv can't be used directly.  It
	 * could already have been destroyed.
	 */
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed inbetween.  Create a new one.
	 */
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
		if (ret)
			return ERR_PTR(ret);
	}
	l = of->priv;

	if (pid) {
		int end = l->length;

		/*
		 * Binary search the sorted list.  Stops on an exact match;
		 * otherwise index ends at the first entry greater than @pid.
		 */
		while (index < end) {
			int mid = (index + end) / 2;
			if (l->list[mid] == pid) {
				index = mid;
				break;
			} else if (l->list[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = *iter;
	return iter;
}
  426. static void cgroup_pidlist_stop(struct seq_file *s, void *v)
  427. {
  428. struct kernfs_open_file *of = s->private;
  429. struct cgroup_pidlist *l = of->priv;
  430. if (l)
  431. mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
  432. CGROUP_PIDLIST_DESTROY_DELAY);
  433. mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
  434. }
  435. static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
  436. {
  437. struct kernfs_open_file *of = s->private;
  438. struct cgroup_pidlist *l = of->priv;
  439. pid_t *p = v;
  440. pid_t *end = l->list + l->length;
  441. /*
  442. * Advance to the next pid in the array. If this goes off the
  443. * end, we're done
  444. */
  445. p++;
  446. if (p >= end) {
  447. (*pos)++;
  448. return NULL;
  449. } else {
  450. *pos = *p;
  451. return p;
  452. }
  453. }
  454. static int cgroup_pidlist_show(struct seq_file *s, void *v)
  455. {
  456. seq_printf(s, "%d\n", *(int *)v);
  457. return 0;
  458. }
  459. static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
  460. char *buf, size_t nbytes, loff_t off,
  461. bool threadgroup)
  462. {
  463. struct cgroup *cgrp;
  464. struct task_struct *task;
  465. const struct cred *cred, *tcred;
  466. ssize_t ret;
  467. cgrp = cgroup_kn_lock_live(of->kn, false);
  468. if (!cgrp)
  469. return -ENODEV;
  470. task = cgroup_procs_write_start(buf, threadgroup);
  471. ret = PTR_ERR_OR_ZERO(task);
  472. if (ret)
  473. goto out_unlock;
  474. /*
  475. * Even if we're attaching all tasks in the thread group, we only
  476. * need to check permissions on one of them.
  477. */
  478. cred = current_cred();
  479. tcred = get_task_cred(task);
  480. if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
  481. !uid_eq(cred->euid, tcred->uid) &&
  482. !uid_eq(cred->euid, tcred->suid) &&
  483. !ns_capable(tcred->user_ns, CAP_SYS_NICE))
  484. ret = -EACCES;
  485. put_cred(tcred);
  486. if (ret)
  487. goto out_finish;
  488. ret = cgroup_attach_task(cgrp, task, threadgroup);
  489. #ifdef CONFIG_MTK_TASK_TURBO
  490. if (!ret)
  491. cgroup_set_turbo_task(task);
  492. #endif
  493. out_finish:
  494. cgroup_procs_write_finish(task);
  495. out_unlock:
  496. cgroup_kn_unlock(of->kn);
  497. return ret ?: nbytes;
  498. }
/*
 * Write handler for "cgroup.procs": forwards to __cgroup1_procs_write()
 * with threadgroup == true.
 */
static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	return __cgroup1_procs_write(of, buf, nbytes, off, true);
}
/*
 * Write handler for "tasks": forwards to __cgroup1_procs_write() with
 * threadgroup == false.
 */
static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	return __cgroup1_procs_write(of, buf, nbytes, off, false);
}
  509. static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
  510. char *buf, size_t nbytes, loff_t off)
  511. {
  512. struct cgroup *cgrp;
  513. BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
  514. cgrp = cgroup_kn_lock_live(of->kn, false);
  515. if (!cgrp)
  516. return -ENODEV;
  517. spin_lock(&release_agent_path_lock);
  518. strlcpy(cgrp->root->release_agent_path, strstrip(buf),
  519. sizeof(cgrp->root->release_agent_path));
  520. spin_unlock(&release_agent_path_lock);
  521. cgroup_kn_unlock(of->kn);
  522. return nbytes;
  523. }
  524. static int cgroup_release_agent_show(struct seq_file *seq, void *v)
  525. {
  526. struct cgroup *cgrp = seq_css(seq)->cgroup;
  527. spin_lock(&release_agent_path_lock);
  528. seq_puts(seq, cgrp->root->release_agent_path);
  529. spin_unlock(&release_agent_path_lock);
  530. seq_putc(seq, '\n');
  531. return 0;
  532. }
/*
 * Show handler for "cgroup.sane_behavior": unconditionally prints "0"
 * followed by a newline and returns 0.
 */
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");
	return 0;
}
  538. static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
  539. struct cftype *cft)
  540. {
  541. return notify_on_release(css->cgroup);
  542. }
  543. static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
  544. struct cftype *cft, u64 val)
  545. {
  546. if (val)
  547. set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
  548. else
  549. clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
  550. return 0;
  551. }
  552. static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
  553. struct cftype *cft)
  554. {
  555. return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
  556. }
  557. static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
  558. struct cftype *cft, u64 val)
  559. {
  560. if (val)
  561. set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
  562. else
  563. clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
  564. return 0;
  565. }
/*
 * cgroup core interface files for the legacy hierarchies.
 *
 * "cgroup.procs" and "tasks" share the pidlist seq_file methods and are
 * told apart by ->private, which holds an enum cgroup_filetype value.
 */
struct cftype cgroup1_base_files[] = {
	{
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write = cgroup1_procs_write,
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},
	{
		.name = "tasks",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
		.write = cgroup1_tasks_write,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_release_agent_show,
		.write = cgroup_release_agent_write,
		/* the write handler stores into a buffer of at least PATH_MAX */
		.max_write_len = PATH_MAX - 1,
	},
	{ }	/* terminate */
};
  610. /* Display information about each subsystem and each hierarchy */
  611. static int proc_cgroupstats_show(struct seq_file *m, void *v)
  612. {
  613. struct cgroup_subsys *ss;
  614. int i;
  615. seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
  616. /*
  617. * ideally we don't want subsystems moving around while we do this.
  618. * cgroup_mutex is also necessary to guarantee an atomic snapshot of
  619. * subsys/hierarchy state.
  620. */
  621. mutex_lock(&cgroup_mutex);
  622. for_each_subsys(ss, i)
  623. seq_printf(m, "%s\t%d\t%d\t%d\n",
  624. ss->legacy_name, ss->root->hierarchy_id,
  625. atomic_read(&ss->root->nr_cgrps),
  626. cgroup_ssid_enabled(i));
  627. mutex_unlock(&cgroup_mutex);
  628. return 0;
  629. }
/*
 * ->open() for proc_cgroupstats_operations: set up a single-shot seq_file
 * whose show callback is proc_cgroupstats_show(), with no private data.
 */
static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}
/*
 * File operations backed by proc_cgroupstats_show() through the
 * single_open()/seq_file helpers.
 */
const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 *
 * The counters in @stats are only incremented here, never reset.
 * NOTE(review): the caller presumably passes in a zeroed @stats - confirm.
 *
 * Return: 0 on success, -EINVAL if @dentry is not a cgroupfs directory,
 * -ENOENT if the cgroup is missing or dead.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup *cgrp;
	struct css_task_iter it;
	struct task_struct *tsk;

	/* it should be kernfs_node belonging to cgroupfs and is a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

	mutex_lock(&cgroup_mutex);

	/*
	 * We aren't being called from kernfs and there's no guarantee on
	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
	 */
	rcu_read_lock();
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (!cgrp || cgroup_is_dead(cgrp)) {
		rcu_read_unlock();
		mutex_unlock(&cgroup_mutex);
		return -ENOENT;
	}
	rcu_read_unlock();

	/* classify every task of the cgroup by its current state */
	css_task_iter_start(&cgrp->self, 0, &it);
	while ((tsk = css_task_iter_next(&it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			/* any other state is counted only when waiting on I/O */
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	css_task_iter_end(&it);

	mutex_unlock(&cgroup_mutex);
	return 0;
}
  698. void cgroup1_check_for_release(struct cgroup *cgrp)
  699. {
  700. if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
  701. !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
  702. schedule_work(&cgrp->release_agent_work);
  703. }
  704. /*
  705. * Notify userspace when a cgroup is released, by running the
  706. * configured release agent with the name of the cgroup (path
  707. * relative to the root of cgroup file system) as the argument.
  708. *
  709. * Most likely, this user command will try to rmdir this cgroup.
  710. *
  711. * This races with the possibility that some other task will be
  712. * attached to this cgroup before it is removed, or that some other
  713. * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
  714. * The presumed 'rmdir' will fail quietly if this cgroup is no longer
  715. * unused, and this cgroup will be reprieved from its death sentence,
  716. * to continue to serve a useful existence. Next time it's released,
  717. * we will get notified again, if it still has 'notify_on_release' set.
  718. *
  719. * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
  720. * means only wait until the task is successfully execve()'d. The
  721. * separate release agent task is forked by call_usermodehelper(),
  722. * then control in this thread returns here, without waiting for the
  723. * release agent task. We don't bother to wait because the caller of
  724. * this routine has no use for the exit status of the release agent
  725. * task, so no sense holding our caller up for that.
  726. */
  727. void cgroup1_release_agent(struct work_struct *work)
  728. {
  729. struct cgroup *cgrp =
  730. container_of(work, struct cgroup, release_agent_work);
  731. char *pathbuf = NULL, *agentbuf = NULL;
  732. char *argv[3], *envp[3];
  733. int ret;
  734. mutex_lock(&cgroup_mutex);
  735. pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
  736. agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
  737. if (!pathbuf || !agentbuf || !strlen(agentbuf))
  738. goto out;
  739. spin_lock_irq(&css_set_lock);
  740. ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
  741. spin_unlock_irq(&css_set_lock);
  742. if (ret < 0 || ret >= PATH_MAX)
  743. goto out;
  744. argv[0] = agentbuf;
  745. argv[1] = pathbuf;
  746. argv[2] = NULL;
  747. /* minimal command environment */
  748. envp[0] = "HOME=/";
  749. envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
  750. envp[2] = NULL;
  751. mutex_unlock(&cgroup_mutex);
  752. call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
  753. goto out_free;
  754. out:
  755. mutex_unlock(&cgroup_mutex);
  756. out_free:
  757. kfree(agentbuf);
  758. kfree(pathbuf);
  759. }
  760. /*
  761. * cgroup_rename - Only allow simple rename of directories in place.
  762. */
  763. static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
  764. const char *new_name_str)
  765. {
  766. struct cgroup *cgrp = kn->priv;
  767. int ret;
  768. /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
  769. if (strchr(new_name_str, '\n'))
  770. return -EINVAL;
  771. if (kernfs_type(kn) != KERNFS_DIR)
  772. return -ENOTDIR;
  773. if (kn->parent != new_parent)
  774. return -EIO;
  775. /*
  776. * We're gonna grab cgroup_mutex which nests outside kernfs
  777. * active_ref. kernfs_rename() doesn't require active_ref
  778. * protection. Break them before grabbing cgroup_mutex.
  779. */
  780. kernfs_break_active_protection(new_parent);
  781. kernfs_break_active_protection(kn);
  782. mutex_lock(&cgroup_mutex);
  783. ret = kernfs_rename(kn, new_parent, new_name_str);
  784. if (!ret)
  785. trace_cgroup_rename(cgrp);
  786. mutex_unlock(&cgroup_mutex);
  787. kernfs_unbreak_active_protection(kn);
  788. kernfs_unbreak_active_protection(new_parent);
  789. return ret;
  790. }
  791. static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
  792. {
  793. struct cgroup_root *root = cgroup_root_from_kf(kf_root);
  794. struct cgroup_subsys *ss;
  795. int ssid;
  796. for_each_subsys(ss, ssid)
  797. if (root->subsys_mask & (1 << ssid))
  798. seq_show_option(seq, ss->legacy_name, NULL);
  799. if (root->flags & CGRP_ROOT_NOPREFIX)
  800. seq_puts(seq, ",noprefix");
  801. if (root->flags & CGRP_ROOT_XATTR)
  802. seq_puts(seq, ",xattr");
  803. if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
  804. seq_puts(seq, ",cpuset_v2_mode");
  805. spin_lock(&release_agent_path_lock);
  806. if (strlen(root->release_agent_path))
  807. seq_show_option(seq, "release_agent",
  808. root->release_agent_path);
  809. spin_unlock(&release_agent_path_lock);
  810. if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
  811. seq_puts(seq, ",clone_children");
  812. if (strlen(root->name))
  813. seq_show_option(seq, "name", root->name);
  814. return 0;
  815. }
  816. static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
  817. {
  818. char *token, *o = data;
  819. bool all_ss = false, one_ss = false;
  820. u16 mask = U16_MAX;
  821. struct cgroup_subsys *ss;
  822. int nr_opts = 0;
  823. int i;
  824. #ifdef CONFIG_CPUSETS
  825. mask = ~((u16)1 << cpuset_cgrp_id);
  826. #endif
  827. memset(opts, 0, sizeof(*opts));
  828. while ((token = strsep(&o, ",")) != NULL) {
  829. nr_opts++;
  830. if (!*token)
  831. return -EINVAL;
  832. if (!strcmp(token, "none")) {
  833. /* Explicitly have no subsystems */
  834. opts->none = true;
  835. continue;
  836. }
  837. if (!strcmp(token, "all")) {
  838. /* Mutually exclusive option 'all' + subsystem name */
  839. if (one_ss)
  840. return -EINVAL;
  841. all_ss = true;
  842. continue;
  843. }
  844. if (!strcmp(token, "noprefix")) {
  845. opts->flags |= CGRP_ROOT_NOPREFIX;
  846. continue;
  847. }
  848. if (!strcmp(token, "clone_children")) {
  849. opts->cpuset_clone_children = true;
  850. continue;
  851. }
  852. if (!strcmp(token, "cpuset_v2_mode")) {
  853. opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
  854. continue;
  855. }
  856. if (!strcmp(token, "xattr")) {
  857. opts->flags |= CGRP_ROOT_XATTR;
  858. continue;
  859. }
  860. if (!strncmp(token, "release_agent=", 14)) {
  861. /* Specifying two release agents is forbidden */
  862. if (opts->release_agent)
  863. return -EINVAL;
  864. opts->release_agent =
  865. kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
  866. if (!opts->release_agent)
  867. return -ENOMEM;
  868. continue;
  869. }
  870. if (!strncmp(token, "name=", 5)) {
  871. const char *name = token + 5;
  872. /* Can't specify an empty name */
  873. if (!strlen(name))
  874. return -EINVAL;
  875. /* Must match [\w.-]+ */
  876. for (i = 0; i < strlen(name); i++) {
  877. char c = name[i];
  878. if (isalnum(c))
  879. continue;
  880. if ((c == '.') || (c == '-') || (c == '_'))
  881. continue;
  882. return -EINVAL;
  883. }
  884. /* Specifying two names is forbidden */
  885. if (opts->name)
  886. return -EINVAL;
  887. opts->name = kstrndup(name,
  888. MAX_CGROUP_ROOT_NAMELEN - 1,
  889. GFP_KERNEL);
  890. if (!opts->name)
  891. return -ENOMEM;
  892. continue;
  893. }
  894. for_each_subsys(ss, i) {
  895. if (strcmp(token, ss->legacy_name))
  896. continue;
  897. if (!cgroup_ssid_enabled(i))
  898. continue;
  899. if (cgroup1_ssid_disabled(i))
  900. continue;
  901. /* Mutually exclusive option 'all' + subsystem name */
  902. if (all_ss)
  903. return -EINVAL;
  904. opts->subsys_mask |= (1 << i);
  905. one_ss = true;
  906. break;
  907. }
  908. if (i == CGROUP_SUBSYS_COUNT)
  909. return -ENOENT;
  910. }
  911. /*
  912. * If the 'all' option was specified select all the subsystems,
  913. * otherwise if 'none', 'name=' and a subsystem name options were
  914. * not specified, let's default to 'all'
  915. */
  916. if (all_ss || (!one_ss && !opts->none && !opts->name))
  917. for_each_subsys(ss, i)
  918. if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
  919. opts->subsys_mask |= (1 << i);
  920. /*
  921. * We either have to specify by name or by subsystems. (So all
  922. * empty hierarchies must have a name).
  923. */
  924. if (!opts->subsys_mask && !opts->name)
  925. return -EINVAL;
  926. /*
  927. * Option noprefix was introduced just for backward compatibility
  928. * with the old cpuset, so we allow noprefix only if mounting just
  929. * the cpuset subsystem.
  930. */
  931. if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
  932. return -EINVAL;
  933. /* Can't specify "none" and some subsystems */
  934. if (opts->subsys_mask && opts->none)
  935. return -EINVAL;
  936. return 0;
  937. }
  938. static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
  939. {
  940. int ret = 0;
  941. struct cgroup_root *root = cgroup_root_from_kf(kf_root);
  942. struct cgroup_sb_opts opts;
  943. u16 added_mask, removed_mask;
  944. cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
  945. /* See what subsystems are wanted */
  946. ret = parse_cgroupfs_options(data, &opts);
  947. if (ret)
  948. goto out_unlock;
  949. if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
  950. pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
  951. task_tgid_nr(current), current->comm);
  952. added_mask = opts.subsys_mask & ~root->subsys_mask;
  953. removed_mask = root->subsys_mask & ~opts.subsys_mask;
  954. /* Don't allow flags or name to change at remount */
  955. if ((opts.flags ^ root->flags) ||
  956. (opts.name && strcmp(opts.name, root->name))) {
  957. pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
  958. opts.flags, opts.name ?: "", root->flags, root->name);
  959. ret = -EINVAL;
  960. goto out_unlock;
  961. }
  962. /* remounting is not allowed for populated hierarchies */
  963. if (!list_empty(&root->cgrp.self.children)) {
  964. ret = -EBUSY;
  965. goto out_unlock;
  966. }
  967. ret = rebind_subsystems(root, added_mask);
  968. if (ret)
  969. goto out_unlock;
  970. WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
  971. if (opts.release_agent) {
  972. spin_lock(&release_agent_path_lock);
  973. strcpy(root->release_agent_path, opts.release_agent);
  974. spin_unlock(&release_agent_path_lock);
  975. }
  976. trace_cgroup_remount(root);
  977. out_unlock:
  978. kfree(opts.release_agent);
  979. kfree(opts.name);
  980. mutex_unlock(&cgroup_mutex);
  981. return ret;
  982. }
  983. struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
  984. .rename = cgroup1_rename,
  985. .show_options = cgroup1_show_options,
  986. .remount_fs = cgroup1_remount,
  987. .mkdir = cgroup_mkdir,
  988. .rmdir = cgroup_rmdir,
  989. .show_path = cgroup_show_path,
  990. };
  991. struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
  992. void *data, unsigned long magic,
  993. struct cgroup_namespace *ns)
  994. {
  995. struct super_block *pinned_sb = NULL;
  996. struct cgroup_sb_opts opts;
  997. struct cgroup_root *root;
  998. struct cgroup_subsys *ss;
  999. struct dentry *dentry;
  1000. int i, ret;
  1001. bool new_root = false;
  1002. cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
  1003. /* First find the desired set of subsystems */
  1004. ret = parse_cgroupfs_options(data, &opts);
  1005. if (ret)
  1006. goto out_unlock;
  1007. /*
  1008. * Destruction of cgroup root is asynchronous, so subsystems may
  1009. * still be dying after the previous unmount. Let's drain the
  1010. * dying subsystems. We just need to ensure that the ones
  1011. * unmounted previously finish dying and don't care about new ones
  1012. * starting. Testing ref liveliness is good enough.
  1013. */
  1014. for_each_subsys(ss, i) {
  1015. if (!(opts.subsys_mask & (1 << i)) ||
  1016. ss->root == &cgrp_dfl_root)
  1017. continue;
  1018. if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
  1019. mutex_unlock(&cgroup_mutex);
  1020. msleep(10);
  1021. ret = restart_syscall();
  1022. goto out_free;
  1023. }
  1024. cgroup_put(&ss->root->cgrp);
  1025. }
  1026. for_each_root(root) {
  1027. bool name_match = false;
  1028. if (root == &cgrp_dfl_root)
  1029. continue;
  1030. /*
  1031. * If we asked for a name then it must match. Also, if
  1032. * name matches but sybsys_mask doesn't, we should fail.
  1033. * Remember whether name matched.
  1034. */
  1035. if (opts.name) {
  1036. if (strcmp(opts.name, root->name))
  1037. continue;
  1038. name_match = true;
  1039. }
  1040. /*
  1041. * If we asked for subsystems (or explicitly for no
  1042. * subsystems) then they must match.
  1043. */
  1044. if ((opts.subsys_mask || opts.none) &&
  1045. (opts.subsys_mask != root->subsys_mask)) {
  1046. if (!name_match)
  1047. continue;
  1048. ret = -EBUSY;
  1049. goto out_unlock;
  1050. }
  1051. if (root->flags ^ opts.flags)
  1052. pr_warn("new mount options do not match the existing superblock, will be ignored\n");
  1053. /*
  1054. * We want to reuse @root whose lifetime is governed by its
  1055. * ->cgrp. Let's check whether @root is alive and keep it
  1056. * that way. As cgroup_kill_sb() can happen anytime, we
  1057. * want to block it by pinning the sb so that @root doesn't
  1058. * get killed before mount is complete.
  1059. *
  1060. * With the sb pinned, tryget_live can reliably indicate
  1061. * whether @root can be reused. If it's being killed,
  1062. * drain it. We can use wait_queue for the wait but this
  1063. * path is super cold. Let's just sleep a bit and retry.
  1064. */
  1065. pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
  1066. if (IS_ERR(pinned_sb) ||
  1067. !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
  1068. mutex_unlock(&cgroup_mutex);
  1069. if (!IS_ERR_OR_NULL(pinned_sb))
  1070. deactivate_super(pinned_sb);
  1071. msleep(10);
  1072. ret = restart_syscall();
  1073. goto out_free;
  1074. }
  1075. ret = 0;
  1076. goto out_unlock;
  1077. }
  1078. /*
  1079. * No such thing, create a new one. name= matching without subsys
  1080. * specification is allowed for already existing hierarchies but we
  1081. * can't create new one without subsys specification.
  1082. */
  1083. if (!opts.subsys_mask && !opts.none) {
  1084. ret = -EINVAL;
  1085. goto out_unlock;
  1086. }
  1087. /* Hierarchies may only be created in the initial cgroup namespace. */
  1088. if (ns != &init_cgroup_ns) {
  1089. ret = -EPERM;
  1090. goto out_unlock;
  1091. }
  1092. root = kzalloc(sizeof(*root), GFP_KERNEL);
  1093. if (!root) {
  1094. ret = -ENOMEM;
  1095. goto out_unlock;
  1096. }
  1097. new_root = true;
  1098. init_cgroup_root(root, &opts);
  1099. ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
  1100. if (ret)
  1101. cgroup_free_root(root);
  1102. out_unlock:
  1103. mutex_unlock(&cgroup_mutex);
  1104. out_free:
  1105. kfree(opts.release_agent);
  1106. kfree(opts.name);
  1107. if (ret)
  1108. return ERR_PTR(ret);
  1109. dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
  1110. CGROUP_SUPER_MAGIC, ns);
  1111. /*
  1112. * There's a race window after we release cgroup_mutex and before
  1113. * allocating a superblock. Make sure a concurrent process won't
  1114. * be able to re-use the root during this window by delaying the
  1115. * initialization of root refcnt.
  1116. */
  1117. if (new_root) {
  1118. mutex_lock(&cgroup_mutex);
  1119. percpu_ref_reinit(&root->cgrp.self.refcnt);
  1120. mutex_unlock(&cgroup_mutex);
  1121. }
  1122. /*
  1123. * If @pinned_sb, we're reusing an existing root and holding an
  1124. * extra ref on its sb. Mount is complete. Put the extra ref.
  1125. */
  1126. if (pinned_sb)
  1127. deactivate_super(pinned_sb);
  1128. return dentry;
  1129. }
  1130. static int __init cgroup1_wq_init(void)
  1131. {
  1132. /*
  1133. * Used to destroy pidlists and separate to serve as flush domain.
  1134. * Cap @max_active to 1 too.
  1135. */
  1136. cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
  1137. 0, 1);
  1138. BUG_ON(!cgroup_pidlist_destroy_wq);
  1139. return 0;
  1140. }
  1141. core_initcall(cgroup1_wq_init);
  1142. static int __init cgroup_no_v1(char *str)
  1143. {
  1144. struct cgroup_subsys *ss;
  1145. char *token;
  1146. int i;
  1147. while ((token = strsep(&str, ",")) != NULL) {
  1148. if (!*token)
  1149. continue;
  1150. if (!strcmp(token, "all")) {
  1151. cgroup_no_v1_mask = U16_MAX;
  1152. break;
  1153. }
  1154. for_each_subsys(ss, i) {
  1155. if (strcmp(token, ss->name) &&
  1156. strcmp(token, ss->legacy_name))
  1157. continue;
  1158. cgroup_no_v1_mask |= 1 << i;
  1159. }
  1160. }
  1161. return 1;
  1162. }
  1163. __setup("cgroup_no_v1=", cgroup_no_v1);