arraymap.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
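
/* Free the per-CPU storage of every element of a BPF_MAP_TYPE_PERCPU_ARRAY.
 * cond_resched() keeps the loop preemptible for maps with many entries.
 */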
static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		free_percpu(array->pptrs[i]);
		cond_resched();
	}
}
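
/* Allocate 8-byte aligned per-CPU storage for every element; on failure
 * everything allocated so far is released and -ENOMEM is returned.
 */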
static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
		cond_resched();
	}

	return 0;
}

/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	u32 elem_size, index_mask, max_entries;
	bool unpriv = !capable(CAP_SYS_ADMIN);
	u64 cost, array_size, mask64;
	struct bpf_array *array;
	int ret;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 || attr->map_flags)
		return ERR_PTR(-EINVAL);

	if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return ERR_PTR(-E2BIG);

	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * upper most bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
	if (unpriv) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) max_entries * sizeof(void *);
	else
		array_size += (u64) max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	cost = array_size;
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);
	if (percpu) {
		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
		if (cost >= U32_MAX - PAGE_SIZE)
			return ERR_PTR(-ENOMEM);
	}
	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	ret = bpf_map_precharge_memlock(cost);
	if (ret < 0)
		return ERR_PTR(ret);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size);
	if (!array)
		return ERR_PTR(-ENOMEM);
	array->index_mask = index_mask;
	array->map.unpriv_array = unpriv;

	/* copy mandatory map attributes */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->map.map_flags = attr->map_flags;
	array->map.pages = cost;
	array->elem_size = elem_size;

	if (percpu &&
	    (elem_size > PCPU_MIN_UNIT_SIZE ||
	     bpf_array_alloc_percpu(array))) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}

	return &array->map;
}
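
/* The accessors below mask the index with index_mask so that, for
 * unprivileged maps, the access stays within the rounded-up array even
 * if the CPU speculates past the max_entries check.
 */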
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * (index & array->index_mask);
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
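
/* Called from the syscall side: gather the element's value from every
 * possible CPU into the flat buffer at @value, one round_up(value_size, 8)
 * sized chunk per CPU.
 */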
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
		       value, map->value_size);
	else
		memcpy(array->value +
		       array->elem_size * (index & array->index_mask),
		       value, map->value_size);
	return 0;
}
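
/* Counterpart of bpf_percpu_array_copy(): scatter a flat buffer that holds
 * one chunk per possible CPU back into the element's per-CPU storage.
 */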
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* user space will provide round_up(value_size, 8) bytes that
	 * will be copied into the per-cpu area. bpf programs can only
	 * access value_size of it. During lookup the same extra bytes
	 * will be returned, or zeros which were zero-filled by
	 * percpu_alloc, so no kernel data leak is possible.
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
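
/* Array elements are pre-allocated for the lifetime of the map and always
 * exist, so deleting one is not supported.
 */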
/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	bpf_map_area_free(array);
}

static const struct bpf_map_ops array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list array_type __read_mostly = {
	.ops = &array_ops,
	.type = BPF_MAP_TYPE_ARRAY,
};

static const struct bpf_map_ops percpu_array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list percpu_array_type __read_mostly = {
	.ops = &percpu_array_ops,
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
};

static int __init register_array_map(void)
{
	bpf_register_map_type(&array_type);
	bpf_register_map_type(&percpu_array_type);
	return 0;
}
late_initcall(register_array_map);
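
/* The map types below store pointers to kernel objects that user space
 * refers to by file descriptor (bpf programs, perf events, cgroups), so
 * value_size is fixed to sizeof(u32).
 */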
static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}
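
/* Elements of fd-based arrays are kernel pointers and are never handed out
 * through a direct lookup; programs reach them through dedicated helpers
 * instead (e.g. tail calls for prog arrays).
 */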
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}
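
/* Note that there is no .map_update_elem here: updates of fd arrays go
 * through bpf_fd_array_map_update_elem() on the syscall path, which also
 * receives the map's struct file.
 */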
static const struct bpf_map_ops prog_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
};

static struct bpf_map_type_list prog_array_type __read_mostly = {
	.ops = &prog_array_ops,
	.type = BPF_MAP_TYPE_PROG_ARRAY,
};

static int __init register_prog_array_map(void)
{
	bpf_register_map_type(&prog_array_type);
	return 0;
}
late_initcall(register_prog_array_map);

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}
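
/* Resolve a perf event fd into a bpf_event_entry. Only non-inherited
 * events are accepted: software events must be PERF_COUNT_SW_BPF_OUTPUT,
 * while raw and hardware events are taken as-is.
 */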
static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	const struct perf_event_attr *attr;
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	event = perf_file->private_data;
	ee = ERR_PTR(-EINVAL);

	attr = perf_event_attrs(event);
	if (IS_ERR(attr) || attr->inherit)
		goto err_out;

	switch (attr->type) {
	case PERF_TYPE_SOFTWARE:
		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
			goto err_out;
		/* fall-through */
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
		ee = bpf_event_entry_gen(perf_file, map_file);
		if (ee)
			return ee;
		ee = ERR_PTR(-ENOMEM);
		/* fall-through */
	default:
		break;
	}

err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}
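
/* When a map file is released, purge every entry that was installed
 * through that file so the perf event files pinned by those entries are
 * dropped as well.
 */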
static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static const struct bpf_map_ops perf_event_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
};

static struct bpf_map_type_list perf_event_array_type __read_mostly = {
	.ops = &perf_event_array_ops,
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

static int __init register_perf_event_array_map(void)
{
	bpf_register_map_type(&perf_event_array_type);
	return 0;
}
late_initcall(register_perf_event_array_map);
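
/* Cgroup arrays store struct cgroup pointers obtained from user-supplied
 * fds; the cgroup reference is dropped when an entry is replaced, deleted
 * or the map is freed.
 */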
#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees cgrp after an RCU grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static const struct bpf_map_ops cgroup_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};

static struct bpf_map_type_list cgroup_array_type __read_mostly = {
	.ops = &cgroup_array_ops,
	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
};

static int __init register_cgroup_array_map(void)
{
	bpf_register_map_type(&cgroup_array_type);
	return 0;
}
late_initcall(register_cgroup_array_map);
#endif