/* kernel/cgroup/rdma.c — web-viewer page header and line-number gutter removed */
  1. /*
  2. * RDMA resource limiting controller for cgroups.
  3. *
  4. * Used to allow a cgroup hierarchy to stop processes from consuming
  5. * additional RDMA resources after a certain limit is reached.
  6. *
  7. * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
  8. *
  9. * This file is subject to the terms and conditions of version 2 of the GNU
  10. * General Public License. See the file COPYING in the main directory of the
  11. * Linux distribution for more details.
  12. */
  13. #include <linux/bitops.h>
  14. #include <linux/slab.h>
  15. #include <linux/seq_file.h>
  16. #include <linux/cgroup.h>
  17. #include <linux/parser.h>
  18. #include <linux/cgroup_rdma.h>
  19. #define RDMACG_MAX_STR "max"
  20. /*
  21. * Protects list of resource pools maintained on per cgroup basis
  22. * and rdma device list.
  23. */
  24. static DEFINE_MUTEX(rdmacg_mutex);
  25. static LIST_HEAD(rdmacg_devices);
/*
 * Identifies which interface file a seq_show callback is serving:
 * the "rdma.max" limit file or the "rdma.current" usage file.
 * Stored in the cftype ->private field (see rdmacg_files[]).
 */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX,
	RDMACG_RESOURCE_TYPE_STAT,
};
/*
 * resource table definition as to be seen by the user.
 * Need to add entries to it when more resources are
 * added/defined at IB verb/core layer.
 * Indexed by enum rdmacg_resource_type.
 */
static char const *rdmacg_resource_names[] = {
	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
};
/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
	int max;	/* configured limit; S32_MAX represents "max" (unlimited) */
	int usage;	/* number of currently charged resources */
};
/*
 * resource pool object which represents per cgroup, per device
 * resources. There are multiple instances of this object per cgroup,
 * therefore it cannot be embedded within rdma_cgroup structure. It
 * is maintained as list.
 */
struct rdmacg_resource_pool {
	struct rdmacg_device *device;	/* device this pool accounts for */
	struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];

	struct list_head cg_node;	/* entry on rdma_cgroup ->rpools list */
	struct list_head dev_node;	/* entry on rdmacg_device ->rpools list */

	/* count active user tasks of this pool */
	u64 usage_sum;
	/* total number counts which are set to max */
	int num_max_cnt;
};
/* Convert a generic css pointer to its containing rdma_cgroup. */
static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
{
	return container_of(css, struct rdma_cgroup, css);
}
/*
 * Return the parent rdma cgroup of @cg. For the root cgroup the css
 * parent is NULL; callers terminate their upward walks themselves
 * (see the p != stop_cg / p loops in the charge/uncharge paths).
 */
static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
{
	return css_rdmacg(cg->css.parent);
}
/*
 * Get the rdma cgroup of the current task, taking a css reference.
 * The reference is dropped by css_put() in rdmacg_uncharge_hierarchy().
 */
static inline struct rdma_cgroup *get_current_rdmacg(void)
{
	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
}
  72. static void set_resource_limit(struct rdmacg_resource_pool *rpool,
  73. int index, int new_max)
  74. {
  75. if (new_max == S32_MAX) {
  76. if (rpool->resources[index].max != S32_MAX)
  77. rpool->num_max_cnt++;
  78. } else {
  79. if (rpool->resources[index].max == S32_MAX)
  80. rpool->num_max_cnt--;
  81. }
  82. rpool->resources[index].max = new_max;
  83. }
  84. static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
  85. {
  86. int i;
  87. for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
  88. set_resource_limit(rpool, i, S32_MAX);
  89. }
/*
 * Unlink @rpool from both its cgroup list and its device list and
 * free it. Caller must hold rdmacg_mutex (asserted below).
 */
static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
{
	lockdep_assert_held(&rdmacg_mutex);

	list_del(&rpool->cg_node);
	list_del(&rpool->dev_node);
	kfree(rpool);
}
/*
 * Look up the resource pool of @cg for @device.
 * Returns NULL when the cgroup has no pool for this device yet.
 * Caller must hold rdmacg_mutex (asserted below).
 */
static struct rdmacg_resource_pool *
find_cg_rpool_locked(struct rdma_cgroup *cg,
		     struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *pool;

	lockdep_assert_held(&rdmacg_mutex);

	list_for_each_entry(pool, &cg->rpools, cg_node)
		if (pool->device == device)
			return pool;

	return NULL;
}
  108. static struct rdmacg_resource_pool *
  109. get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
  110. {
  111. struct rdmacg_resource_pool *rpool;
  112. rpool = find_cg_rpool_locked(cg, device);
  113. if (rpool)
  114. return rpool;
  115. rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
  116. if (!rpool)
  117. return ERR_PTR(-ENOMEM);
  118. rpool->device = device;
  119. set_all_resource_max_limit(rpool);
  120. INIT_LIST_HEAD(&rpool->cg_node);
  121. INIT_LIST_HEAD(&rpool->dev_node);
  122. list_add_tail(&rpool->cg_node, &cg->rpools);
  123. list_add_tail(&rpool->dev_node, &device->rpools);
  124. return rpool;
  125. }
  126. /**
  127. * uncharge_cg_locked - uncharge resource for rdma cgroup
  128. * @cg: pointer to cg to uncharge and all parents in hierarchy
  129. * @device: pointer to rdmacg device
  130. * @index: index of the resource to uncharge in cg (resource pool)
  131. *
  132. * It also frees the resource pool which was created as part of
  133. * charging operation when there are no resources attached to
  134. * resource pool.
  135. */
  136. static void
  137. uncharge_cg_locked(struct rdma_cgroup *cg,
  138. struct rdmacg_device *device,
  139. enum rdmacg_resource_type index)
  140. {
  141. struct rdmacg_resource_pool *rpool;
  142. rpool = find_cg_rpool_locked(cg, device);
  143. /*
  144. * rpool cannot be null at this stage. Let kernel operate in case
  145. * if there a bug in IB stack or rdma controller, instead of crashing
  146. * the system.
  147. */
  148. if (unlikely(!rpool)) {
  149. pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
  150. return;
  151. }
  152. rpool->resources[index].usage--;
  153. /*
  154. * A negative count (or overflow) is invalid,
  155. * it indicates a bug in the rdma controller.
  156. */
  157. WARN_ON_ONCE(rpool->resources[index].usage < 0);
  158. rpool->usage_sum--;
  159. if (rpool->usage_sum == 0 &&
  160. rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
  161. /*
  162. * No user of the rpool and all entries are set to max, so
  163. * safe to delete this rpool.
  164. */
  165. free_cg_rpool_locked(rpool);
  166. }
  167. }
/**
 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
 * @cg: pointer to the cgroup at which the uncharge walk starts
 * @device: pointer to rdmacg device
 * @stop_cg: while traversing hierarchy, when meet with stop_cg cgroup
 *           stop uncharging
 * @index: index of the resource to uncharge in cg in given resource pool
 *
 * Walks from @cg towards the root, stopping before @stop_cg (a NULL
 * @stop_cg uncharges the whole path to the root), dropping one usage
 * count at each level, then releases the css reference taken at charge
 * time by get_current_rdmacg().
 */
static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
				      struct rdmacg_device *device,
				      struct rdma_cgroup *stop_cg,
				      enum rdmacg_resource_type index)
{
	struct rdma_cgroup *p;

	mutex_lock(&rdmacg_mutex);

	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
		uncharge_cg_locked(p, device, index);

	mutex_unlock(&rdmacg_mutex);

	/* Pairs with the reference taken in get_current_rdmacg(). */
	css_put(&cg->css);
}
/**
 * rdmacg_uncharge - hierarchically uncharge rdma resource count
 * @cg: pointer to the cgroup that was charged (as set by
 *      rdmacg_try_charge())
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cgroup in given resource pool
 */
void rdmacg_uncharge(struct rdma_cgroup *cg,
		     struct rdmacg_device *device,
		     enum rdmacg_resource_type index)
{
	/* Silently ignore out-of-range indices rather than corrupting state. */
	if (index >= RDMACG_RESOURCE_MAX)
		return;

	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
}
EXPORT_SYMBOL(rdmacg_uncharge);
/**
 * rdmacg_try_charge - hierarchically try to charge the rdma resource
 * @rdmacg: pointer to rdma cgroup which will own this resource
 * @device: pointer to rdmacg device
 * @index: index of the resource to charge in cgroup (resource pool)
 *
 * This function follows charging resource in hierarchical way.
 * It will fail if the charge would cause the new value to exceed the
 * hierarchical limit.
 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
 * On success *@rdmacg is set to the cgroup owning the charge; the caller
 * passes it back to rdmacg_uncharge() later.
 *
 * Charger needs to account resources on two criteria.
 * (a) per cgroup & (b) per device resource usage.
 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
 * the configured limits. Per device provides granular configuration
 * in multi device usage. It allocates resource pool in the hierarchy
 * for each parent it come across for first resource. Later on resource
 * pool will be available. Therefore it will be much faster thereon
 * to charge/uncharge.
 */
int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
		      struct rdmacg_device *device,
		      enum rdmacg_resource_type index)
{
	struct rdma_cgroup *cg, *p;
	struct rdmacg_resource_pool *rpool;
	s64 new;
	int ret = 0;

	if (index >= RDMACG_RESOURCE_MAX)
		return -EINVAL;

	/*
	 * hold on to css, as cgroup can be removed but resource
	 * accounting happens on css.
	 */
	cg = get_current_rdmacg();

	mutex_lock(&rdmacg_mutex);
	for (p = cg; p; p = parent_rdmacg(p)) {
		rpool = get_cg_rpool_locked(p, device);
		if (IS_ERR(rpool)) {
			ret = PTR_ERR(rpool);
			goto err;
		} else {
			/* s64 arithmetic: usage + 1 cannot overflow the s32. */
			new = rpool->resources[index].usage + 1;
			if (new > rpool->resources[index].max) {
				ret = -EAGAIN;
				goto err;
			} else {
				rpool->resources[index].usage = new;
				rpool->usage_sum++;
			}
		}
	}
	mutex_unlock(&rdmacg_mutex);

	*rdmacg = cg;
	return 0;

err:
	mutex_unlock(&rdmacg_mutex);
	/* Roll back the levels already charged: cg up to (excluding) p. */
	rdmacg_uncharge_hierarchy(cg, device, p, index);
	return ret;
}
EXPORT_SYMBOL(rdmacg_try_charge);
/**
 * rdmacg_register_device - register rdmacg device to rdma controller.
 * @device: pointer to rdmacg device whose resources need to be accounted.
 *
 * If IB stack wish a device to participate in rdma cgroup resource
 * tracking, it must invoke this API to register with rdma cgroup before
 * any user space application can start using the RDMA resources.
 * Always returns 0 (the code below has no failure path; the return
 * type is kept for the existing caller contract).
 */
int rdmacg_register_device(struct rdmacg_device *device)
{
	INIT_LIST_HEAD(&device->dev_node);
	INIT_LIST_HEAD(&device->rpools);

	mutex_lock(&rdmacg_mutex);
	list_add_tail(&device->dev_node, &rdmacg_devices);
	mutex_unlock(&rdmacg_mutex);
	return 0;
}
EXPORT_SYMBOL(rdmacg_register_device);
/**
 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
 * @device: pointer to rdmacg device which was previously registered with rdma
 * controller using rdmacg_register_device().
 *
 * IB stack must invoke this after all the resources of the IB device
 * are destroyed and after ensuring that no more resources will be created
 * when this API is invoked.
 */
void rdmacg_unregister_device(struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *rpool, *tmp;

	/*
	 * Synchronize with any active resource settings,
	 * usage query happening via configfs.
	 */
	mutex_lock(&rdmacg_mutex);
	list_del_init(&device->dev_node);

	/*
	 * Now that this device is off the cgroup list, its safe to free
	 * all the rpool resources.
	 */
	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
		free_cg_rpool_locked(rpool);

	mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_unregister_device);
  310. static int parse_resource(char *c, int *intval)
  311. {
  312. substring_t argstr;
  313. const char **table = &rdmacg_resource_names[0];
  314. char *name, *value = c;
  315. size_t len;
  316. int ret, i = 0;
  317. name = strsep(&value, "=");
  318. if (!name || !value)
  319. return -EINVAL;
  320. len = strlen(value);
  321. for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
  322. if (strcmp(table[i], name))
  323. continue;
  324. argstr.from = value;
  325. argstr.to = value + len;
  326. ret = match_int(&argstr, intval);
  327. if (ret >= 0) {
  328. if (*intval < 0)
  329. break;
  330. return i;
  331. }
  332. if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
  333. *intval = S32_MAX;
  334. return i;
  335. }
  336. break;
  337. }
  338. return -EINVAL;
  339. }
  340. static int rdmacg_parse_limits(char *options,
  341. int *new_limits, unsigned long *enables)
  342. {
  343. char *c;
  344. int err = -EINVAL;
  345. /* parse resource options */
  346. while ((c = strsep(&options, " ")) != NULL) {
  347. int index, intval;
  348. index = parse_resource(c, &intval);
  349. if (index < 0)
  350. goto err;
  351. new_limits[index] = intval;
  352. *enables |= BIT(index);
  353. }
  354. return 0;
  355. err:
  356. return err;
  357. }
/*
 * Find a registered rdmacg device by name.
 * Returns NULL when no device named @name is registered.
 * Caller must hold rdmacg_mutex (asserted below).
 */
static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
{
	struct rdmacg_device *device;

	lockdep_assert_held(&rdmacg_mutex);

	list_for_each_entry(device, &rdmacg_devices, dev_node)
		if (!strcmp(name, device->name))
			return device;

	return NULL;
}
/*
 * Write handler for the "rdma.max" file. Expected input:
 *	<device_name> <resource>=<value|"max"> [<resource>=... ]
 * Applies the parsed limits to this cgroup's resource pool for the
 * named device. Returns @nbytes on success or a negative errno.
 */
static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
{
	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
	const char *dev_name;
	struct rdmacg_resource_pool *rpool;
	struct rdmacg_device *device;
	char *options = strstrip(buf);
	int *new_limits;
	unsigned long enables = 0;
	int i = 0, ret = 0;

	/* extract the device name first */
	dev_name = strsep(&options, " ");
	if (!dev_name) {
		ret = -EINVAL;
		goto err;
	}

	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
	if (!new_limits) {
		ret = -ENOMEM;
		goto err;
	}

	ret = rdmacg_parse_limits(options, new_limits, &enables);
	if (ret)
		goto parse_err;

	/* acquire lock to synchronize with hot plug devices */
	mutex_lock(&rdmacg_mutex);

	device = rdmacg_get_device_locked(dev_name);
	if (!device) {
		ret = -ENODEV;
		goto dev_err;
	}

	rpool = get_cg_rpool_locked(cg, device);
	if (IS_ERR(rpool)) {
		ret = PTR_ERR(rpool);
		goto dev_err;
	}

	/* now set the new limits of the rpool */
	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
		set_resource_limit(rpool, i, new_limits[i]);

	if (rpool->usage_sum == 0 &&
	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
		/*
		 * No user of the rpool and all entries are set to max, so
		 * safe to delete this rpool.
		 */
		free_cg_rpool_locked(rpool);
	}

	/* goto-based cleanup: labels unwind in reverse acquisition order */
dev_err:
	mutex_unlock(&rdmacg_mutex);

parse_err:
	kfree(new_limits);

err:
	return ret ?: nbytes;
}
  422. static void print_rpool_values(struct seq_file *sf,
  423. struct rdmacg_resource_pool *rpool)
  424. {
  425. enum rdmacg_file_type sf_type;
  426. int i;
  427. u32 value;
  428. sf_type = seq_cft(sf)->private;
  429. for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
  430. seq_puts(sf, rdmacg_resource_names[i]);
  431. seq_putc(sf, '=');
  432. if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
  433. if (rpool)
  434. value = rpool->resources[i].max;
  435. else
  436. value = S32_MAX;
  437. } else {
  438. if (rpool)
  439. value = rpool->resources[i].usage;
  440. else
  441. value = 0;
  442. }
  443. if (value == S32_MAX)
  444. seq_puts(sf, RDMACG_MAX_STR);
  445. else
  446. seq_printf(sf, "%d", value);
  447. seq_putc(sf, ' ');
  448. }
  449. }
/*
 * seq_show handler shared by "rdma.max" and "rdma.current": prints one
 * line per registered device ("<name> <res>=<val> ..."). Which value
 * is shown is decided inside print_rpool_values() from the cftype
 * private id. Always returns 0.
 */
static int rdmacg_resource_read(struct seq_file *sf, void *v)
{
	struct rdmacg_device *device;
	struct rdmacg_resource_pool *rpool;
	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));

	mutex_lock(&rdmacg_mutex);

	list_for_each_entry(device, &rdmacg_devices, dev_node) {
		seq_printf(sf, "%s ", device->name);

		/* rpool may be NULL; print_rpool_values() shows defaults then */
		rpool = find_cg_rpool_locked(cg, device);
		print_rpool_values(sf, rpool);

		seq_putc(sf, '\n');
	}

	mutex_unlock(&rdmacg_mutex);
	return 0;
}
/*
 * Controller interface files: "max" is the writable limit file,
 * "current" is the read-only usage file. Neither is exposed on the
 * root cgroup.
 */
static struct cftype rdmacg_files[] = {
	{
		.name = "max",
		.write = rdmacg_resource_set_max,
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_MAX,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_STAT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
/*
 * css_alloc callback: allocate the rdma cgroup for a newly created
 * css with an empty resource-pool list. Returns ERR_PTR(-ENOMEM) on
 * allocation failure.
 */
static struct cgroup_subsys_state *
rdmacg_css_alloc(struct cgroup_subsys_state *parent)
{
	struct rdma_cgroup *cg;

	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
	if (!cg)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&cg->rpools);
	return &cg->css;
}
/* css_free callback: counterpart of rdmacg_css_alloc(). */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	struct rdma_cgroup *cg = css_rdmacg(css);

	kfree(cg);
}
/**
 * rdmacg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away and responsible
 * for shooting down all rdmacg associated with @css. As part of that it
 * marks all the resource pool entries to max value, so that when resources are
 * uncharged, associated resource pool can be freed as well.
 */
static void rdmacg_css_offline(struct cgroup_subsys_state *css)
{
	struct rdma_cgroup *cg = css_rdmacg(css);
	struct rdmacg_resource_pool *rpool;

	mutex_lock(&rdmacg_mutex);

	/* with all limits at max, the final uncharge frees each rpool */
	list_for_each_entry(rpool, &cg->rpools, cg_node)
		set_all_resource_max_limit(rpool);

	mutex_unlock(&rdmacg_mutex);
}
/*
 * rdma cgroup controller registration; the same cftypes are exposed on
 * both the legacy (v1) and default (v2) hierarchies.
 */
struct cgroup_subsys rdma_cgrp_subsys = {
	.css_alloc	= rdmacg_css_alloc,
	.css_free	= rdmacg_css_free,
	.css_offline	= rdmacg_css_offline,
	.legacy_cftypes	= rdmacg_files,
	.dfl_cftypes	= rdmacg_files,
};