/*
 * zbud.c
 *
 * Copyright (C) 2013, Seth Jennings, IBM
 *
 * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
 *
 * zbud is a special purpose allocator for storing compressed pages. Contrary
 * to what its name may suggest, zbud is not a buddy allocator, but rather an
 * allocator that "buddies" two compressed pages together in a single memory
 * page.
 *
 * While this design limits storage density, it has simple and deterministic
 * reclaim properties that make it preferable to a higher density approach when
 * reclaim will be used.
 *
 * zbud works by storing compressed pages, or "zpages", together in pairs in a
 * single memory page called a "zbud page". The first buddy is "left
 * justified" at the beginning of the zbud page, and the last buddy is "right
 * justified" at the end of the zbud page. The benefit is that if either
 * buddy is freed, the freed buddy space, coalesced with whatever slack space
 * that existed between the buddies, results in the largest possible free region
 * within the zbud page.
 *
 * zbud also provides an attractive lower bound on density. The ratio of zpages
 * to zbud pages cannot be less than 1. This ensures that zbud can never "do
 * harm" by using more pages to store zpages than the uncompressed zpages would
 * have used on their own.
 *
 * zbud pages are divided into "chunks". The size of the chunks is fixed at
 * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
 * into chunks allows organizing unbuddied zbud pages into a manageable number
 * of unbuddied lists according to the number of free chunks available in the
 * zbud page.
 *
 * The zbud API differs from that of conventional allocators in that the
 * allocation function, zbud_alloc(), returns an opaque handle to the user,
 * not a dereferenceable pointer. The user must map the handle using
 * zbud_map() in order to get a usable pointer by which to access the
 * allocation data and unmap the handle with zbud_unmap() when operations
 * on the allocation data are complete.
 */
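
/*
 * A minimal usage sketch of the API described above, roughly as a caller
 * such as zswap might use it. The names my_evict, my_ops, store_zpage, src
 * and src_len are illustrative placeholders, not part of zbud:
 *
 *        static int my_evict(struct zbud_pool *pool, unsigned long handle)
 *        {
 *                return 0;
 *        }
 *
 *        static struct zbud_ops my_ops = { .evict = my_evict };
 *
 *        static int store_zpage(void *src, int src_len)
 *        {
 *                struct zbud_pool *pool = zbud_create_pool(GFP_KERNEL, &my_ops);
 *                unsigned long handle;
 *                void *dst;
 *
 *                if (!pool)
 *                        return -ENOMEM;
 *                if (zbud_alloc(pool, src_len, GFP_KERNEL, &handle) == 0) {
 *                        dst = zbud_map(pool, handle);
 *                        memcpy(dst, src, src_len);
 *                        zbud_unmap(pool, handle);
 *                        zbud_free(pool, handle);
 *                }
 *                zbud_destroy_pool(pool);
 *                return 0;
 *        }
 */
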
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zbud.h>

/*****************
 * Structures
*****************/
/*
 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
 * adjusting internal fragmentation. It also determines the number of
 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
 * will be 64 freelists per pool.
 */
#define NCHUNKS_ORDER	6

#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
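
/*
 * Worked example, assuming a 4 KiB PAGE_SIZE (PAGE_SHIFT == 12):
 * CHUNK_SHIFT = 12 - 6 = 6, so CHUNK_SIZE is 64 bytes, NCHUNKS is 64, and
 * ZHDR_SIZE_ALIGNED is 64 bytes. The first chunk of every zbud page is
 * reserved for the zbud header, leaving 63 chunks (4032 bytes) to split
 * between the two buddies.
 */
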
/**
 * struct zbud_pool - stores metadata for each zbud pool
 * @lock:	protects all pool fields and first|last_chunk fields of any
 *		zbud page in the pool
 * @unbuddied:	array of lists tracking zbud pages that only contain one buddy;
 *		the lists each zbud page is added to depends on the size of
 *		its free region.
 * @buddied:	list tracking the zbud pages that contain two buddies;
 *		these zbud pages are full
 * @lru:	list tracking the zbud pages in LRU order by most recently
 *		added buddy.
 * @pages_nr:	number of zbud pages in the pool.
 * @ops:	pointer to a structure of user defined operations specified at
 *		pool creation time.
 *
 * This structure is allocated at pool creation time and maintains metadata
 * pertaining to a particular zbud pool.
 */
struct zbud_pool {
        spinlock_t lock;
        struct list_head unbuddied[NCHUNKS];
        struct list_head buddied;
        struct list_head lru;
        u64 pages_nr;
        struct zbud_ops *ops;
};

/*
 * struct zbud_header - zbud page metadata occupying the first chunk of each
 *			zbud page.
 * @buddy:	links the zbud page into the unbuddied/buddied lists in the pool
 * @lru:	links the zbud page into the lru list in the pool
 * @first_chunks:	the size of the first buddy in chunks, 0 if free
 * @last_chunks:	the size of the last buddy in chunks, 0 if free
 * @under_reclaim:	zbud page is under reclaim; zbud_free() defers freeing
 *			the page until reclaim is done with it
 */
struct zbud_header {
        struct list_head buddy;
        struct list_head lru;
        unsigned int first_chunks;
        unsigned int last_chunks;
        bool under_reclaim;
};

/*****************
 * Helpers
*****************/
/* Just to make the code easier to read */
enum buddy {
        FIRST,
        LAST
};

/* Converts an allocation size in bytes to size in zbud chunks */
static int size_to_chunks(int size)
{
        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
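
/*
 * For example, with 64-byte chunks (4 KiB pages) a 100-byte allocation
 * rounds up to size_to_chunks(100) == 2 chunks, i.e. 128 bytes; the 28
 * bytes of slack remain unused until the buddy is freed.
 */
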
#define for_each_unbuddied_list(_iter, _begin) \
        for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)

/* Initializes the zbud header of a newly allocated zbud page */
static struct zbud_header *init_zbud_page(struct page *page)
{
        struct zbud_header *zhdr = page_address(page);
        zhdr->first_chunks = 0;
        zhdr->last_chunks = 0;
        INIT_LIST_HEAD(&zhdr->buddy);
        INIT_LIST_HEAD(&zhdr->lru);
        zhdr->under_reclaim = false;
        return zhdr;
}

/* Frees the zbud page back to the page allocator */
static void free_zbud_page(struct zbud_header *zhdr)
{
        __free_page(virt_to_page(zhdr));
}

/*
 * Encodes the handle of a particular buddy within a zbud page
 * Pool lock should be held as this function accesses first|last_chunks
 */
static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
{
        unsigned long handle;

        /*
         * For now, the encoded handle is actually just the pointer to the data
         * but this might not always be the case. A little information hiding.
         * Add CHUNK_SIZE to the handle if it is the first allocation to jump
         * over the zbud header in the first chunk.
         */
        handle = (unsigned long)zhdr;
        if (bud == FIRST)
                /* skip over zbud header */
                handle += ZHDR_SIZE_ALIGNED;
        else /* bud == LAST */
                handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
        return handle;
}
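
/*
 * For example, with 64-byte chunks the FIRST handle is the zbud page
 * address plus ZHDR_SIZE_ALIGNED (64), just past the header chunk, while a
 * LAST buddy of 3 chunks gets the page address plus PAGE_SIZE - 192, so
 * that the allocation ends exactly at the page boundary.
 */
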
/* Returns the zbud page where a given handle is stored */
static struct zbud_header *handle_to_zbud_header(unsigned long handle)
{
        return (struct zbud_header *)(handle & PAGE_MASK);
}

/* Returns the number of free chunks in a zbud page */
static int num_free_chunks(struct zbud_header *zhdr)
{
        /*
         * Rather than branch for different situations, just use the fact that
         * free buddies have a length of zero to simplify everything. -1 at the
         * end for the zbud header.
         */
        return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
}
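
/*
 * For example, with NCHUNKS == 64, a zbud page whose first buddy occupies
 * 10 chunks and whose last buddy is free has 64 - 10 - 0 - 1 = 53 free
 * chunks, so zbud_alloc() links it into pool->unbuddied[53].
 */
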
/*****************
 * API Functions
*****************/
/**
 * zbud_create_pool() - create a new zbud pool
 * @gfp:	gfp flags when allocating the zbud pool structure
 * @ops:	user-defined operations for the zbud pool
 *
 * Return: pointer to the new zbud pool or NULL if the metadata allocation
 * failed.
 */
struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
{
        struct zbud_pool *pool;
        int i;

        pool = kmalloc(sizeof(struct zbud_pool), gfp);
        if (!pool)
                return NULL;
        spin_lock_init(&pool->lock);
        for_each_unbuddied_list(i, 0)
                INIT_LIST_HEAD(&pool->unbuddied[i]);
        INIT_LIST_HEAD(&pool->buddied);
        INIT_LIST_HEAD(&pool->lru);
        pool->pages_nr = 0;
        pool->ops = ops;
        return pool;
}

/**
 * zbud_destroy_pool() - destroys an existing zbud pool
 * @pool:	the zbud pool to be destroyed
 *
 * The pool should be emptied before this function is called.
 */
void zbud_destroy_pool(struct zbud_pool *pool)
{
        kfree(pool);
}

/**
 * zbud_alloc() - allocates a region of a given size
 * @pool:	zbud pool from which to allocate
 * @size:	size in bytes of the desired allocation
 * @gfp:	gfp flags used if the pool needs to grow
 * @handle:	handle of the new allocation
 *
 * This function will attempt to find a free region in the pool large enough to
 * satisfy the allocation request. A search of the unbuddied lists is
 * performed first. If no suitable free region is found, then a new page is
 * allocated and added to the pool to satisfy the request.
 *
 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 * as zbud pool pages.
 *
 * Return: 0 if success and @handle is set, -EINVAL if the size or gfp
 * arguments are invalid, -ENOSPC if the request can never fit in a zbud
 * page, or -ENOMEM if the pool was unable to allocate a new page.
 */
int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
                        unsigned long *handle)
{
        int chunks, i, freechunks;
        struct zbud_header *zhdr = NULL;
        enum buddy bud;
        struct page *page;

        if (size <= 0 || gfp & __GFP_HIGHMEM)
                return -EINVAL;
        if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED)
                return -ENOSPC;
        chunks = size_to_chunks(size);

        spin_lock(&pool->lock);

        /* First, try to find an unbuddied zbud page. */
        zhdr = NULL;
        for_each_unbuddied_list(i, chunks) {
                if (!list_empty(&pool->unbuddied[i])) {
                        zhdr = list_first_entry(&pool->unbuddied[i],
                                        struct zbud_header, buddy);
                        list_del(&zhdr->buddy);
                        if (zhdr->first_chunks == 0)
                                bud = FIRST;
                        else
                                bud = LAST;
                        goto found;
                }
        }

        /* Couldn't find unbuddied zbud page, create new one */
        spin_unlock(&pool->lock);
        page = alloc_page(gfp);
        if (!page)
                return -ENOMEM;
        spin_lock(&pool->lock);
        pool->pages_nr++;
        zhdr = init_zbud_page(page);
        bud = FIRST;

found:
        if (bud == FIRST)
                zhdr->first_chunks = chunks;
        else
                zhdr->last_chunks = chunks;

        if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
                /* Add to unbuddied list */
                freechunks = num_free_chunks(zhdr);
                list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
        } else {
                /* Add to buddied list */
                list_add(&zhdr->buddy, &pool->buddied);
        }

        /* Add/move zbud page to beginning of LRU */
        if (!list_empty(&zhdr->lru))
                list_del(&zhdr->lru);
        list_add(&zhdr->lru, &pool->lru);

        *handle = encode_handle(zhdr, bud);
        spin_unlock(&pool->lock);

        return 0;
}

/**
 * zbud_free() - frees the allocation associated with the given handle
 * @pool:	pool in which the allocation resided
 * @handle:	handle associated with the allocation returned by zbud_alloc()
 *
 * In the case that the zbud page in which the allocation resides is under
 * reclaim, as indicated by the under_reclaim flag in the zbud header, this
 * function only sets the first|last_chunks to 0. The page is actually
 * freed once both buddies are evicted (see zbud_reclaim_page() below).
 */
void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
        struct zbud_header *zhdr;
        int freechunks;

        spin_lock(&pool->lock);
        zhdr = handle_to_zbud_header(handle);

        /* If first buddy, handle minus the header size is page aligned */
        if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
                zhdr->last_chunks = 0;
        else
                zhdr->first_chunks = 0;

        if (zhdr->under_reclaim) {
                /* zbud page is under reclaim, reclaim will free */
                spin_unlock(&pool->lock);
                return;
        }

        /* Remove from existing buddy list */
        list_del(&zhdr->buddy);

        if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
                /* zbud page is empty, free */
                list_del(&zhdr->lru);
                free_zbud_page(zhdr);
                pool->pages_nr--;
        } else {
                /* Add to unbuddied list */
                freechunks = num_free_chunks(zhdr);
                list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
        }

        spin_unlock(&pool->lock);
}

#define list_tail_entry(ptr, type, member) \
        list_entry((ptr)->prev, type, member)

/**
 * zbud_reclaim_page() - evicts allocations from a pool page and frees it
 * @pool:	pool from which a page will attempt to be evicted
 * @retries:	number of pages on the LRU list for which eviction will
 *		be attempted before failing
 *
 * zbud reclaim is different from normal system reclaim in that the reclaim is
 * done from the bottom, up. This is because only the bottom layer, zbud, has
 * information on how the allocations are organized within each zbud page. This
 * has the potential to create interesting locking situations between zbud and
 * the user, however.
 *
 * To avoid these, this is how zbud_reclaim_page() should be called:
 * The user detects a page should be reclaimed and calls zbud_reclaim_page().
 * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
 * the user-defined eviction handler with the pool and handle as arguments.
 *
 * If the handle can not be evicted, the eviction handler should return
 * non-zero. zbud_reclaim_page() will add the zbud page back to the
 * appropriate list and try the next zbud page on the LRU up to
 * a user defined number of retries.
 *
 * If the handle is successfully evicted, the eviction handler should
 * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
 * contains logic to delay freeing the page if the page is under reclaim,
 * as indicated by the under_reclaim flag in the zbud header.
 *
 * If all buddies in the zbud page are successfully evicted, then the
 * zbud page can be freed.
 *
 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
 * no pages to evict or an eviction handler is not registered, -EAGAIN if
 * the retry limit was hit.
 */
int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
{
        int i, ret, freechunks;
        struct zbud_header *zhdr;
        unsigned long first_handle = 0, last_handle = 0;

        spin_lock(&pool->lock);
        if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
                        retries == 0) {
                spin_unlock(&pool->lock);
                return -EINVAL;
        }
        for (i = 0; i < retries; i++) {
                zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
                list_del(&zhdr->lru);
                list_del(&zhdr->buddy);
                /* Protect zbud page against free */
                zhdr->under_reclaim = true;
                /*
                 * We need to encode the handles before unlocking, since we can
                 * race with free that will set (first|last)_chunks to 0
                 */
                first_handle = 0;
                last_handle = 0;
                if (zhdr->first_chunks)
                        first_handle = encode_handle(zhdr, FIRST);
                if (zhdr->last_chunks)
                        last_handle = encode_handle(zhdr, LAST);
                spin_unlock(&pool->lock);

                /* Issue the eviction callback(s) */
                if (first_handle) {
                        ret = pool->ops->evict(pool, first_handle);
                        if (ret)
                                goto next;
                }
                if (last_handle) {
                        ret = pool->ops->evict(pool, last_handle);
                        if (ret)
                                goto next;
                }
next:
                spin_lock(&pool->lock);
                zhdr->under_reclaim = false;
                if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
                        /*
                         * Both buddies are now free, free the zbud page and
                         * return success.
                         */
                        free_zbud_page(zhdr);
                        pool->pages_nr--;
                        spin_unlock(&pool->lock);
                        return 0;
                } else if (zhdr->first_chunks == 0 ||
                                zhdr->last_chunks == 0) {
                        /* add to unbuddied list */
                        freechunks = num_free_chunks(zhdr);
                        list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
                } else {
                        /* add to buddied list */
                        list_add(&zhdr->buddy, &pool->buddied);
                }

                /* add to beginning of LRU */
                list_add(&zhdr->lru, &pool->lru);
        }
        spin_unlock(&pool->lock);
        return -EAGAIN;
}
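
/*
 * Sketch of an eviction handler that follows the protocol above. The
 * my_writeback() helper is hypothetical; a real user such as zswap would
 * decompress the zpage and write it back to its final destination here.
 * Returning non-zero leaves the allocation in place and lets
 * zbud_reclaim_page() try the next LRU page; returning 0 requires that
 * zbud_free() has already been called on the handle.
 *
 *        static int my_evict(struct zbud_pool *pool, unsigned long handle)
 *        {
 *                void *addr = zbud_map(pool, handle);
 *                int ret = my_writeback(addr);
 *
 *                zbud_unmap(pool, handle);
 *                if (ret)
 *                        return ret;
 *                zbud_free(pool, handle);
 *                return 0;
 *        }
 */
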
/**
 * zbud_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * While trivial for zbud, the mapping functions for other allocators
 * implementing this allocation API could have more complex information encoded
 * in the handle and could create temporary mappings to make the data
 * accessible to the user.
 *
 * Returns: a pointer to the mapped allocation
 */
void *zbud_map(struct zbud_pool *pool, unsigned long handle)
{
        return (void *)(handle);
}

/**
 * zbud_unmap() - unmaps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be unmapped
 */
void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
{
}

/**
 * zbud_get_pool_size() - gets the zbud pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Returns: size in pages of the given pool. The pool lock need not be
 * taken to access pages_nr.
 */
u64 zbud_get_pool_size(struct zbud_pool *pool)
{
        return pool->pages_nr;
}

static int __init init_zbud(void)
{
        /* Make sure the zbud header will fit in one chunk */
        BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
        pr_info("loaded\n");
        return 0;
}

static void __exit exit_zbud(void)
{
        pr_info("unloaded\n");
}

module_init(init_zbud);
module_exit(exit_zbud);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");