pids.c

/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * that may be in place, PIDs are a fundamental resource. As such, PID
 * exhaustion must be preventable in the scope of a cgroup hierarchy by
 * allowing resource limiting of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(): fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License. See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */

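/*
 * Minimal usage sketch (cgroup v2; the mount point and group name below are
 * illustrative, not defined by this file):
 *
 *   mkdir /sys/fs/cgroup/mygroup
 *   echo 100 > /sys/fs/cgroup/mygroup/pids.max
 *   echo $$ > /sys/fs/cgroup/mygroup/cgroup.procs
 *   cat /sys/fs/cgroup/mygroup/pids.current
 *
 * Once pids.current reaches the limit, further fork()/clone() calls by tasks
 * in the group fail with -EAGAIN, and the "max" counter reported by
 * pids.events is incremented.
 */
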
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

struct pids_cgroup {
        struct cgroup_subsys_state css;

        /*
         * Use 64-bit types so that we can safely represent "max" as
         * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
         */
        atomic64_t counter;
        atomic64_t limit;

        /* Handle for "pids.events" */
        struct cgroup_file events_file;

        /* Number of times fork failed because limit was hit. */
        atomic64_t events_limit;
};

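/*
 * Conversion helpers between a generic css and this controller's state.
 * Observation (not from the original source): since @css is the first
 * member of struct pids_cgroup, css_pids(NULL) evaluates to NULL, which
 * is what terminates the parent walks below at the root cgroup.
 */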
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
        return container_of(css, struct pids_cgroup, css);
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
        return css_pids(pids->css.parent);
}

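/*
 * Allocate the per-cgroup state. New cgroups start at limit = %PIDS_MAX,
 * i.e. the "max" (no limit) default described in the header comment.
 */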
static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
        struct pids_cgroup *pids;

        pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
        if (!pids)
                return ERR_PTR(-ENOMEM);

        atomic64_set(&pids->counter, 0);
        atomic64_set(&pids->limit, PIDS_MAX);
        atomic64_set(&pids->events_limit, 0);
        return &pids->css;
}

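/* Free the per-cgroup state allocated in pids_css_alloc(). */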
static void pids_css_free(struct cgroup_subsys_state *css)
{
        kfree(css_pids(css));
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
        /*
         * A negative count (or overflow for that matter) is invalid,
         * and indicates a bug in the `pids` controller proper.
         */
        WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; parent_pids(p); p = parent_pids(p))
                pids_cancel(p, num);
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the new
 * pid count may exceed the limit. This is only used for reverting failed
 * attaches, where there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; parent_pids(p); p = parent_pids(p))
                atomic64_add(num, &p->counter);
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p, *q;

        for (p = pids; parent_pids(p); p = parent_pids(p)) {
                int64_t new = atomic64_add_return(num, &p->counter);
                int64_t limit = atomic64_read(&p->limit);

                /*
                 * Since new is capped to the maximum number of pid_t, if
                 * p->limit is %PIDS_MAX then we know that this test will never
                 * fail.
                 */
                if (new > limit)
                        goto revert;
        }

        return 0;

revert:
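        /*
         * Unwind the partial charge: every level from @pids up to @p has
         * already been charged, including @p itself via the
         * atomic64_add_return() that tripped the limit check.
         */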
        for (q = pids; q != p; q = parent_pids(q))
                pids_cancel(q, num);
        pids_cancel(p, num);

        return -EAGAIN;
}

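/*
 * Charge the destination cgroup of a migration unconditionally: as the
 * header comment notes, organisational operations are not blocked by the
 * limit, so pids_charge() is used here rather than pids_try_charge().
 */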
static int pids_can_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *pids = css_pids(dst_css);
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                /*
                 * No need to pin @old_css between here and cancel_attach()
                 * because cgroup core protects it from being freed before
                 * the migration completes or fails.
                 */
                old_css = task_css(task, pids_cgrp_id);
                old_pids = css_pids(old_css);

                pids_charge(pids, 1);
                pids_uncharge(old_pids, 1);
        }

        return 0;
}

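/* Revert the charges made in pids_can_attach() when a migration fails. */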
static void pids_cancel_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *pids = css_pids(dst_css);
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                old_css = task_css(task, pids_cgrp_id);
                old_pids = css_pids(old_css);

                pids_charge(old_pids, 1);
                pids_uncharge(pids, 1);
        }
}

/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by copy_process().
 */
static int pids_can_fork(struct task_struct *task)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;
        int err;

        css = task_css_check(current, pids_cgrp_id, true);
        pids = css_pids(css);
        err = pids_try_charge(pids, 1);
        if (err) {
                /* Only log the first time events_limit is incremented. */
                if (atomic64_inc_return(&pids->events_limit) == 1) {
                        pr_info("cgroup: fork rejected by pids controller in ");
                        pr_cont_cgroup_path(css->cgroup);
                        pr_cont("\n");
                }
                cgroup_file_notify(&pids->events_file);
        }
        return err;
}

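/* Undo the charge taken in pids_can_fork() when the fork() later fails. */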
static void pids_cancel_fork(struct task_struct *task)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;

        css = task_css_check(current, pids_cgrp_id, true);
        pids = css_pids(css);
        pids_uncharge(pids, 1);
}

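/* Uncharge a task's pid when the task is released, balancing its fork charge. */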
static void pids_release(struct task_struct *task)
{
        struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

        pids_uncharge(pids, 1);
}

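/*
 * Handle writes to pids.max: accept the literal string "max" (meaning no
 * limit) or an integer in the range [0, %PIDS_MAX).
 */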
static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, loff_t off)
{
        struct cgroup_subsys_state *css = of_css(of);
        struct pids_cgroup *pids = css_pids(css);
        int64_t limit;
        int err;

        buf = strstrip(buf);
        if (!strcmp(buf, PIDS_MAX_STR)) {
                limit = PIDS_MAX;
                goto set_limit;
        }

        err = kstrtoll(buf, 0, &limit);
        if (err)
                return err;

        if (limit < 0 || limit >= PIDS_MAX)
                return -EINVAL;

set_limit:
        /*
         * Limit updates don't need to be mutex'd, since it isn't
         * critical that any racing fork()s follow the new limit.
         */
        atomic64_set(&pids->limit, limit);
        return nbytes;
}

static int pids_max_show(struct seq_file *sf, void *v)
{
        struct cgroup_subsys_state *css = seq_css(sf);
        struct pids_cgroup *pids = css_pids(css);
        int64_t limit = atomic64_read(&pids->limit);

        if (limit >= PIDS_MAX)
                seq_printf(sf, "%s\n", PIDS_MAX_STR);
        else
                seq_printf(sf, "%lld\n", limit);

        return 0;
}

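/* pids.current: the hierarchical pid count, including all child cgroups. */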
static s64 pids_current_read(struct cgroup_subsys_state *css,
                             struct cftype *cft)
{
        struct pids_cgroup *pids = css_pids(css);

        return atomic64_read(&pids->counter);
}

static int pids_events_show(struct seq_file *sf, void *v)
{
        struct pids_cgroup *pids = css_pids(seq_css(sf));

        seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
        return 0;
}

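/* Control files exposed by the controller; none appear in the root cgroup. */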
static struct cftype pids_files[] = {
        {
                .name = "max",
                .write = pids_max_write,
                .seq_show = pids_max_show,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "current",
                .read_s64 = pids_current_read,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "events",
                .seq_show = pids_events_show,
                .file_offset = offsetof(struct pids_cgroup, events_file),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        { }     /* terminate */
};

struct cgroup_subsys pids_cgrp_subsys = {
        .css_alloc      = pids_css_alloc,
        .css_free       = pids_css_free,
        .can_attach     = pids_can_attach,
        .cancel_attach  = pids_cancel_attach,
        .can_fork       = pids_can_fork,
        .cancel_fork    = pids_cancel_fork,
        .release        = pids_release,
        .legacy_cftypes = pids_files,
        .dfl_cftypes    = pids_files,
        .threaded       = true,
};