zcache.c

/*
 * linux/mm/zcache.c
 *
 * A cleancache backend for file page compression.
 * Concepts based on original zcache by Dan Magenheimer.
 * Copyright (C) 2013 Bob Liu <bob.liu@xxxxxxxxxx>
 *
 * With zcache, active file pages can be compressed in memory during page
 * reclaim. When their data is needed again, the read I/O is avoided. This
 * results in a significant performance gain under memory pressure for
 * systems with many file pages.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/atomic.h>
#include <linux/cleancache.h>
#include <linux/cpu.h>
#include <linux/crypto.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/types.h>
#include <linux/zbud.h>

/*
 * Enable/disable zcache (disabled by default)
 */
static bool zcache_enabled __read_mostly;
module_param_named(enabled, zcache_enabled, bool, 0);

/*
 * Compressor to be used by zcache
 */
#define ZCACHE_COMPRESSOR_DEFAULT "lzo"
static char *zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT;
module_param_named(compressor, zcache_compressor, charp, 0);

/*
 * The maximum percentage of memory that the compressed pool can occupy.
 */
static unsigned int zcache_max_pool_percent = 10;
module_param_named(max_pool_percent, zcache_max_pool_percent, uint, 0644);
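
/*
 * Used by zcache_is_full() and zcache_shrink(): when the number of file pages
 * drops below this percentage of total RAM, zcache treats itself as over
 * budget and starts clearing its own compressed pool.
 */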
static unsigned int zcache_clear_percent = 4;
module_param_named(clear_percent, zcache_clear_percent, uint, 0644);

/*
 * zcache statistics
 */
static u64 zcache_pool_limit_hit;
static u64 zcache_dup_entry;
static u64 zcache_zbud_alloc_fail;
static u64 zcache_evict_zpages;
static u64 zcache_evict_filepages;
static u64 zcache_inactive_pages_refused;
static u64 zcache_reclaim_fail;
static u64 zcache_pool_shrink;
static u64 zcache_pool_shrink_fail;
static u64 zcache_pool_shrink_pages;
static u64 zcache_store_failed;
static atomic_t zcache_stored_pages = ATOMIC_INIT(0);
static atomic_t zcache_stored_zero_pages = ATOMIC_INIT(0);
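
/*
 * Allocation flags used on the store path (zbud allocations and new rbnodes):
 * fail fast without retries or allocation-failure warnings, never dip into
 * emergency reserves, don't wake kswapd, and return zeroed memory.
 */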
#define GFP_ZCACHE \
    (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | \
    __GFP_NOMEMALLOC | __GFP_NO_KSWAPD | __GFP_ZERO)

/*
 * Make sure this is different from a radix tree
 * indirect ptr or exceptional entry.
 */
#define ZERO_HANDLE ((void *)~(~0UL >> 1))

/*
 * Zcache receives pages for compression through the cleancache API and is able
 * to evict pages from its own compressed pool on an LRU basis in the case that
 * the compressed pool is full.
 *
 * Zcache makes use of zbud for managing the compressed memory pool. Each
 * allocation in zbud is not directly accessible by address. Rather, a handle
 * (zaddr) is returned by the allocation routine, and that handle must be
 * mapped before being accessed. The compressed memory pool grows on demand and
 * shrinks as compressed pages are freed.
 *
 * When a file page is passed from cleancache to zcache, zcache maintains a
 * mapping of the <filesystem_type, inode_number, page_index> to the zbud
 * address that references that compressed file page. This mapping is achieved
 * with a red-black tree per filesystem type, plus a radix tree per red-black
 * node.
 *
 * A zcache pool with pool_id as the index is created when a filesystem is
 * mounted. Each zcache pool has a red-black tree, with the inode number
 * (rb_index) as the search key. Each red-black tree node has a radix tree
 * which uses page->index (ra_index) as the index. Each radix tree slot points
 * to the zbud address combined with some extra information (zcache_ra_handle).
 */
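/*
 * A rough sketch of the lookup hierarchy described above:
 *
 *   zcache.pools[pool_id]               (one per mounted filesystem)
 *     -> zcache_pool->rbtree            (red-black tree keyed by inode number)
 *          -> zcache_rbnode->ratree     (radix tree keyed by page->index)
 *               -> zcache_ra_handle + compressed page data (one zbud zaddr)
 */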
#define MAX_ZCACHE_POOLS 32

/*
 * One zcache_pool per (cleancache aware) filesystem mount instance
 */
struct zcache_pool {
    struct rb_root rbtree;
    rwlock_t rb_lock;       /* Protects rbtree */
    u64 size;
    struct zbud_pool *pool; /* Zbud pool used */
};

/*
 * Manage all zcache pools
 */
struct _zcache {
    struct zcache_pool *pools[MAX_ZCACHE_POOLS];
    u32 num_pools;          /* Current no. of zcache pools */
    spinlock_t pool_lock;   /* Protects pools[] and num_pools */
};
struct _zcache zcache;

/*
 * Red-black tree node; each node has a page-index radix tree.
 * Indexed by inode number.
 */
struct zcache_rbnode {
    struct rb_node rb_node;
    int rb_index;
    struct radix_tree_root ratree; /* Page radix tree per inode rbtree */
    spinlock_t ra_lock;            /* Protects radix tree */
    struct kref refcount;
};

/*
 * Radix-tree leaf, indexed by page->index
 */
struct zcache_ra_handle {
    int rb_index;              /* Red-black tree index */
    int ra_index;              /* Radix tree index */
    int zlen;                  /* Compressed page size */
    struct zcache_pool *zpool; /* For finding the zcache_pool during evict */
};
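
/*
 * The zcache_ra_handle is written at the front of every zbud allocation, with
 * the compressed page data immediately following it (see zcache_store_page()).
 * This lets the zbud eviction callback map a zaddr and find its way back to
 * the owning pool and tree entry.
 */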
u64 zcache_pages(void)
{
    int i;
    u64 count = 0;

    for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++)
        count += zcache.pools[i]->size;
    return count;
}

static struct kmem_cache *zcache_rbnode_cache;
static int zcache_rbnode_cache_create(void)
{
    zcache_rbnode_cache = KMEM_CACHE(zcache_rbnode, 0);
    return zcache_rbnode_cache == NULL;
}

static void zcache_rbnode_cache_destroy(void)
{
    kmem_cache_destroy(zcache_rbnode_cache);
}
static int zcache_shrink(struct shrinker *s, struct shrink_control *sc)
{
    unsigned long active_file;
    unsigned long file;
    long file_gap;
    unsigned long freed = 0;
    unsigned long pool;
    static bool running;
    int i = 0;
    int retries;

    if (running)
        goto end;
    running = true;

    active_file = global_page_state(NR_ACTIVE_FILE);
    file = global_page_state(NR_FILE_PAGES);
    pool = zcache_pages();

    file_gap = pool - file;
    if ((file_gap >= 0) &&
        (totalram_pages * zcache_clear_percent / 100 > file)) {
        file_gap = pool;
        zcache_pool_shrink++;
        goto reclaim;
    }

    /*
     * Otherwise only reclaim the part of the compressed pool that exceeds
     * the number of active file pages; a non-positive file_gap means the
     * pool is already within that budget.
     */
    file_gap = pool - active_file;
    if (file_gap < 0)
        file_gap = 0;
    else
        zcache_pool_shrink++;

reclaim:
    retries = file_gap;
    while ((file_gap > 0) && retries) {
        struct zcache_pool *zpool =
            zcache.pools[i++ % MAX_ZCACHE_POOLS];
        if (!zpool || !zpool->size)
            continue;
        if (zbud_reclaim_page(zpool->pool, 8)) {
            zcache_pool_shrink_fail++;
            retries--;
            continue;
        }
        freed++;
        file_gap--;
    }
    zcache_pool_shrink_pages += freed;

    for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++)
        zcache.pools[i]->size =
            zbud_get_pool_size(zcache.pools[i]->pool);

    running = false;
end:
    return freed;
}

static struct shrinker zcache_shrinker = {
    .shrink = zcache_shrink,
    .seeks = DEFAULT_SEEKS * 16
};
/*
 * Compression functions
 * (The functions below are copied from zswap!)
 */
static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms;

enum comp_op {
    ZCACHE_COMPOP_COMPRESS,
    ZCACHE_COMPOP_DECOMPRESS
};
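
/*
 * Run a compress or decompress operation using the calling CPU's crypto_comp
 * transform. get_cpu()/put_cpu() disable preemption so the per-cpu tfm cannot
 * change under us while the operation runs.
 */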
static int zcache_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
        u8 *dst, unsigned int *dlen)
{
    struct crypto_comp *tfm;
    int ret;

    tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
    switch (op) {
    case ZCACHE_COMPOP_COMPRESS:
        ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
        break;
    case ZCACHE_COMPOP_DECOMPRESS:
        ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
        break;
    default:
        ret = -EINVAL;
    }
    put_cpu();
    return ret;
}

static int __init zcache_comp_init(void)
{
    if (!crypto_has_comp(zcache_compressor, 0, 0)) {
        pr_info("%s compressor not available\n", zcache_compressor);
        /* fall back to the default compressor */
        zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT;
        if (!crypto_has_comp(zcache_compressor, 0, 0))
            /* can't even load the default compressor */
            return -ENODEV;
    }
    pr_info("using %s compressor\n", zcache_compressor);

    /* alloc percpu transforms */
    zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
    if (!zcache_comp_pcpu_tfms)
        return -ENOMEM;
    return 0;
}

static void zcache_comp_exit(void)
{
    /* free percpu transforms */
    if (zcache_comp_pcpu_tfms)
        free_percpu(zcache_comp_pcpu_tfms);
}

/*
 * Per-cpu code
 * (The functions below are also copied from zswap!)
 */
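/*
 * Per-CPU destination buffer for compression output. It is allocated two
 * pages large, presumably so that output which expands past PAGE_SIZE
 * (poorly compressible data) still fits without overflowing.
 */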
static DEFINE_PER_CPU(u8 *, zcache_dstmem);

static int __zcache_cpu_notifier(unsigned long action, unsigned long cpu)
{
    struct crypto_comp *tfm;
    u8 *dst;

    switch (action) {
    case CPU_UP_PREPARE:
        tfm = crypto_alloc_comp(zcache_compressor, 0, 0);
        if (IS_ERR(tfm)) {
            pr_err("can't allocate compressor transform\n");
            return NOTIFY_BAD;
        }
        *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
        dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
        if (!dst) {
            pr_err("can't allocate compressor buffer\n");
            crypto_free_comp(tfm);
            *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
            return NOTIFY_BAD;
        }
        per_cpu(zcache_dstmem, cpu) = dst;
        break;
    case CPU_DEAD:
    case CPU_UP_CANCELED:
        tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
        if (tfm) {
            crypto_free_comp(tfm);
            *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
        }
        dst = per_cpu(zcache_dstmem, cpu);
        kfree(dst);
        per_cpu(zcache_dstmem, cpu) = NULL;
        break;
    default:
        break;
    }
    return NOTIFY_OK;
}

static int zcache_cpu_notifier(struct notifier_block *nb,
        unsigned long action, void *pcpu)
{
    unsigned long cpu = (unsigned long)pcpu;

    return __zcache_cpu_notifier(action, cpu);
}

static struct notifier_block zcache_cpu_notifier_block = {
    .notifier_call = zcache_cpu_notifier
};

static int zcache_cpu_init(void)
{
    unsigned long cpu;

    get_online_cpus();
    for_each_online_cpu(cpu)
        if (__zcache_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
            goto cleanup;
    register_cpu_notifier(&zcache_cpu_notifier_block);
    put_online_cpus();
    return 0;

cleanup:
    for_each_online_cpu(cpu)
        __zcache_cpu_notifier(CPU_UP_CANCELED, cpu);
    put_online_cpus();
    return -ENOMEM;
}

/*
 * Zcache helpers
 */
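/*
 * The cache is considered full when the compressed pool has grown past
 * max_pool_percent of total RAM, or when the remaining file pages have
 * dropped below clear_percent of total RAM.
 */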
static bool zcache_is_full(void)
{
    long file = global_page_state(NR_FILE_PAGES);

    return ((totalram_pages * zcache_max_pool_percent / 100 <
            zcache_pages()) ||
        (totalram_pages * zcache_clear_percent / 100 >
            file));
}

/*
 * The caller must hold at least zpool->rb_lock (read-locked).
 */
static struct zcache_rbnode *zcache_find_rbnode(struct rb_root *rbtree,
        int index, struct rb_node **rb_parent, struct rb_node ***rb_link)
{
    struct zcache_rbnode *entry;
    struct rb_node **__rb_link, *__rb_parent, *rb_prev;

    __rb_link = &rbtree->rb_node;
    rb_prev = __rb_parent = NULL;

    while (*__rb_link) {
        __rb_parent = *__rb_link;
        entry = rb_entry(__rb_parent, struct zcache_rbnode, rb_node);
        if (entry->rb_index > index)
            __rb_link = &__rb_parent->rb_left;
        else if (entry->rb_index < index) {
            rb_prev = __rb_parent;
            __rb_link = &__rb_parent->rb_right;
        } else
            return entry;
    }

    if (rb_parent)
        *rb_parent = __rb_parent;
    if (rb_link)
        *rb_link = __rb_link;
    return NULL;
}
static struct zcache_rbnode *zcache_find_get_rbnode(struct zcache_pool *zpool,
        int rb_index)
{
    unsigned long flags;
    struct zcache_rbnode *rbnode;

    read_lock_irqsave(&zpool->rb_lock, flags);
    rbnode = zcache_find_rbnode(&zpool->rbtree, rb_index, 0, 0);
    if (rbnode)
        kref_get(&rbnode->refcount);
    read_unlock_irqrestore(&zpool->rb_lock, flags);
    return rbnode;
}

/*
 * kref_put callback for zcache_rbnode.
 *
 * The rbnode must have been isolated from rbtree already.
 */
static void zcache_rbnode_release(struct kref *kref)
{
    struct zcache_rbnode *rbnode;

    rbnode = container_of(kref, struct zcache_rbnode, refcount);
    BUG_ON(rbnode->ratree.rnode);
    kmem_cache_free(zcache_rbnode_cache, rbnode);
}

/*
 * Check whether the radix-tree of this rbnode is empty.
 * If that's true, then we can delete this zcache_rbnode from
 * zcache_pool->rbtree.
 *
 * Caller must hold zcache_rbnode->ra_lock.
 */
static int zcache_rbnode_empty(struct zcache_rbnode *rbnode)
{
    return rbnode->ratree.rnode == NULL;
}
/*
 * Remove zcache_rbnode from zpool->rbtree.
 *
 * holded_rblock - whether the caller already holds zpool->rb_lock
 */
static void zcache_rbnode_isolate(struct zcache_pool *zpool,
        struct zcache_rbnode *rbnode, bool holded_rblock)
{
    unsigned long flags;

    if (!holded_rblock)
        write_lock_irqsave(&zpool->rb_lock, flags);
    /*
     * Someone can get a reference on this rbnode before we could
     * acquire the write lock above.
     * We want to remove it from zpool->rbtree only when the caller and
     * the corresponding ratree hold the sole references to this rbnode.
     * The check below ensures that a racing zcache put will not end up
     * adding a page to an isolated node and thereby losing that memory.
     */
    if (atomic_read(&rbnode->refcount.refcount) == 2) {
        rb_erase(&rbnode->rb_node, &zpool->rbtree);
        RB_CLEAR_NODE(&rbnode->rb_node);
        kref_put(&rbnode->refcount, zcache_rbnode_release);
    }
    if (!holded_rblock)
        write_unlock_irqrestore(&zpool->rb_lock, flags);
}
/*
 * Store a zaddr returned by zbud_alloc() into the rbtree-ratree hierarchy.
 */
static int zcache_store_zaddr(struct zcache_pool *zpool,
        int ra_index, int rb_index, unsigned long zaddr)
{
    unsigned long flags;
    struct zcache_rbnode *rbnode, *tmp;
    struct rb_node **link = NULL, *parent = NULL;
    int ret;
    void *dup_zaddr;

    rbnode = zcache_find_get_rbnode(zpool, rb_index);
    if (!rbnode) {
        /* alloc and init a new rbnode */
        rbnode = kmem_cache_alloc(zcache_rbnode_cache,
                GFP_ZCACHE);
        if (!rbnode)
            return -ENOMEM;

        INIT_RADIX_TREE(&rbnode->ratree, GFP_ATOMIC|__GFP_NOWARN);
        spin_lock_init(&rbnode->ra_lock);
        rbnode->rb_index = rb_index;
        kref_init(&rbnode->refcount);
        RB_CLEAR_NODE(&rbnode->rb_node);

        /* add that rbnode to the rbtree */
        write_lock_irqsave(&zpool->rb_lock, flags);
        tmp = zcache_find_rbnode(&zpool->rbtree, rb_index,
                &parent, &link);
        if (tmp) {
            /* somebody else allocated a new rbnode */
            kmem_cache_free(zcache_rbnode_cache, rbnode);
            rbnode = tmp;
        } else {
            rb_link_node(&rbnode->rb_node, parent, link);
            rb_insert_color(&rbnode->rb_node, &zpool->rbtree);
        }
        /* Inc the reference of this zcache_rbnode */
        kref_get(&rbnode->refcount);
        write_unlock_irqrestore(&zpool->rb_lock, flags);
    }

    /* Successfully got a zcache_rbnode when arriving here */
    spin_lock_irqsave(&rbnode->ra_lock, flags);
    dup_zaddr = radix_tree_delete(&rbnode->ratree, ra_index);
    if (unlikely(dup_zaddr)) {
        if (dup_zaddr == ZERO_HANDLE) {
            atomic_dec(&zcache_stored_zero_pages);
        } else {
            zbud_free(zpool->pool, (unsigned long)dup_zaddr);
            atomic_dec(&zcache_stored_pages);
            zpool->size = zbud_get_pool_size(zpool->pool);
        }
        zcache_dup_entry++;
    }

    /* Insert the zcache_ra_handle into the ratree */
    ret = radix_tree_insert(&rbnode->ratree, ra_index,
            (void *)zaddr);
    spin_unlock_irqrestore(&rbnode->ra_lock, flags);
    if (unlikely(ret)) {
        write_lock_irqsave(&zpool->rb_lock, flags);
        spin_lock(&rbnode->ra_lock);
        if (zcache_rbnode_empty(rbnode))
            zcache_rbnode_isolate(zpool, rbnode, 1);
        spin_unlock(&rbnode->ra_lock);
        write_unlock_irqrestore(&zpool->rb_lock, flags);
    }

    kref_put(&rbnode->refcount, zcache_rbnode_release);
    return ret;
}
/*
 * Load zaddr and delete it from the radix tree.
 * If the radix tree of the corresponding rbnode is empty, delete the rbnode
 * from zpool->rbtree as well.
 */
static void *zcache_load_delete_zaddr(struct zcache_pool *zpool,
        int rb_index, int ra_index)
{
    struct zcache_rbnode *rbnode;
    void *zaddr = NULL;
    unsigned long flags;

    rbnode = zcache_find_get_rbnode(zpool, rb_index);
    if (!rbnode)
        goto out;

    BUG_ON(rbnode->rb_index != rb_index);

    spin_lock_irqsave(&rbnode->ra_lock, flags);
    zaddr = radix_tree_delete(&rbnode->ratree, ra_index);
    spin_unlock_irqrestore(&rbnode->ra_lock, flags);

    /* rb_lock and ra_lock must be taken again in the given sequence */
    write_lock_irqsave(&zpool->rb_lock, flags);
    spin_lock(&rbnode->ra_lock);
    if (zcache_rbnode_empty(rbnode))
        zcache_rbnode_isolate(zpool, rbnode, 1);
    spin_unlock(&rbnode->ra_lock);
    write_unlock_irqrestore(&zpool->rb_lock, flags);

    kref_put(&rbnode->refcount, zcache_rbnode_release);
out:
    return zaddr;
}
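
/*
 * A page that is entirely zero-filled is recorded as ZERO_HANDLE in the radix
 * tree instead of being compressed, so it consumes no zbud space at all.
 */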
static bool zero_page(struct page *page)
{
    unsigned long *ptr = kmap_atomic(page);
    int i;
    bool ret = false;

    for (i = 0; i < PAGE_SIZE / sizeof(*ptr); i++) {
        if (ptr[i])
            goto out;
    }
    ret = true;
out:
    kunmap_atomic(ptr);
    return ret;
}
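
/*
 * Cleancache put_page hook: compress an active file page and stash it in the
 * per-filesystem zbud pool, keyed by <pool_id, inode number, page index>.
 */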
static void zcache_store_page(int pool_id, struct cleancache_filekey key,
        pgoff_t index, struct page *page)
{
    struct zcache_ra_handle *zhandle;
    u8 *zpage, *src, *dst;
    /* Address of zhandle + compressed data (zpage) */
    unsigned long zaddr = 0;
    unsigned int zlen = PAGE_SIZE;
    bool zero = false;
    int ret;
    struct zcache_pool *zpool = zcache.pools[pool_id];

    /*
     * Zcache would be ineffective if the compressed memory pool filled up
     * with compressed inactive file pages, most of which will never be
     * used again. So we refuse to compress pages that are not on the
     * active file list.
     */
    if (!PageWasActive(page)) {
        zcache_inactive_pages_refused++;
        return;
    }

    zero = zero_page(page);
    if (zero)
        goto zero;

    if (zcache_is_full()) {
        zcache_pool_limit_hit++;
        if (zbud_reclaim_page(zpool->pool, 8)) {
            zcache_reclaim_fail++;
            return;
        }
        /*
         * Continue if a page frame was reclaimed successfully.
         */
        zcache_evict_filepages++;
        zpool->size = zbud_get_pool_size(zpool->pool);
    }

    /* compress */
    dst = get_cpu_var(zcache_dstmem);
    src = kmap_atomic(page);
    ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src, PAGE_SIZE, dst,
            &zlen);
    kunmap_atomic(src);
    if (ret) {
        pr_err("zcache compress error ret %d\n", ret);
        put_cpu_var(zcache_dstmem);
        return;
    }

    /* store the zcache handle together with the compressed page data */
    ret = zbud_alloc(zpool->pool, zlen + sizeof(struct zcache_ra_handle),
            GFP_ZCACHE, &zaddr);
    if (ret) {
        zcache_zbud_alloc_fail++;
        put_cpu_var(zcache_dstmem);
        return;
    }

    zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, zaddr);
    /* Compressed page data is stored at the end of zcache_ra_handle */
    zpage = (u8 *)(zhandle + 1);
    memcpy(zpage, dst, zlen);
    zbud_unmap(zpool->pool, zaddr);
    put_cpu_var(zcache_dstmem);

zero:
    if (zero)
        zaddr = (unsigned long)ZERO_HANDLE;

    /* store the zcache handle */
    ret = zcache_store_zaddr(zpool, index, key.u.ino, zaddr);
    if (ret) {
        zcache_store_failed++;
        if (!zero)
            zbud_free(zpool->pool, zaddr);
        return;
    }

    /* update stats */
    if (zero) {
        atomic_inc(&zcache_stored_zero_pages);
    } else {
        zhandle->ra_index = index;
        zhandle->rb_index = key.u.ino;
        zhandle->zlen = zlen;
        zhandle->zpool = zpool;
        atomic_inc(&zcache_stored_pages);
        zpool->size = zbud_get_pool_size(zpool->pool);
    }
}
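
/*
 * Cleancache get_page hook. Loads are exclusive: the entry is removed from
 * the tree and (unless it was a zero page) its zbud allocation is freed once
 * the data has been decompressed into the target page.
 */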
static int zcache_load_page(int pool_id, struct cleancache_filekey key,
        pgoff_t index, struct page *page)
{
    int ret = 0;
    u8 *src, *dst;
    void *zaddr;
    unsigned int dlen = PAGE_SIZE;
    struct zcache_ra_handle *zhandle;
    struct zcache_pool *zpool = zcache.pools[pool_id];

    zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index);
    if (!zaddr)
        return -ENOENT;
    else if (zaddr == ZERO_HANDLE)
        goto map;

    zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool,
            (unsigned long)zaddr);
    /* Compressed page data is stored at the end of zcache_ra_handle */
    src = (u8 *)(zhandle + 1);

    /* decompress */
map:
    dst = kmap_atomic(page);
    if (zaddr != ZERO_HANDLE) {
        ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, src,
                zhandle->zlen, dst, &dlen);
    } else {
        memset(dst, 0, PAGE_SIZE);
        kunmap_atomic(dst);
        flush_dcache_page(page);
        atomic_dec(&zcache_stored_zero_pages);
        goto out;
    }
    kunmap_atomic(dst);
    zbud_unmap(zpool->pool, (unsigned long)zaddr);
    zbud_free(zpool->pool, (unsigned long)zaddr);

    BUG_ON(ret);
    BUG_ON(dlen != PAGE_SIZE);

    /* update stats */
    atomic_dec(&zcache_stored_pages);
    zpool->size = zbud_get_pool_size(zpool->pool);
out:
    SetPageWasActive(page);
    return ret;
}

static void zcache_flush_page(int pool_id, struct cleancache_filekey key,
        pgoff_t index)
{
    struct zcache_pool *zpool = zcache.pools[pool_id];
    void *zaddr = NULL;

    zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index);
    if (zaddr && (zaddr != ZERO_HANDLE)) {
        zbud_free(zpool->pool, (unsigned long)zaddr);
        atomic_dec(&zcache_stored_pages);
        zpool->size = zbud_get_pool_size(zpool->pool);
    } else if (zaddr == ZERO_HANDLE) {
        atomic_dec(&zcache_stored_zero_pages);
    }
}
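
/*
 * zcache_flush_ratree() below walks an inode's radix tree and drops its
 * entries in gang-lookup batches of this many slots at a time.
 */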
#define FREE_BATCH 16
/*
 * Caller must hold rbnode->ra_lock.
 */
static void zcache_flush_ratree(struct zcache_pool *zpool,
        struct zcache_rbnode *rbnode)
{
    unsigned long index = 0;
    int count, i;
    struct zcache_ra_handle *zhandle;
    void *zaddr = NULL;

    do {
        void *zaddrs[FREE_BATCH];
        unsigned long indices[FREE_BATCH];

        count = radix_tree_gang_lookup_index(&rbnode->ratree,
                (void **)zaddrs, indices,
                index, FREE_BATCH);

        for (i = 0; i < count; i++) {
            if (zaddrs[i] == ZERO_HANDLE) {
                zaddr = radix_tree_delete(&rbnode->ratree,
                        indices[i]);
                if (zaddr)
                    atomic_dec(&zcache_stored_zero_pages);
                continue;
            }
            zhandle = (struct zcache_ra_handle *)zbud_map(
                    zpool->pool, (unsigned long)zaddrs[i]);
            index = zhandle->ra_index;
            zaddr = radix_tree_delete(&rbnode->ratree, index);
            if (!zaddr)
                continue;
            zbud_unmap(zpool->pool, (unsigned long)zaddrs[i]);
            zbud_free(zpool->pool, (unsigned long)zaddrs[i]);
            atomic_dec(&zcache_stored_pages);
            zpool->size = zbud_get_pool_size(zpool->pool);
        }
        index++;
    } while (count == FREE_BATCH);
}
static void zcache_flush_inode(int pool_id, struct cleancache_filekey key)
{
    struct zcache_rbnode *rbnode;
    unsigned long flags1, flags2;
    struct zcache_pool *zpool = zcache.pools[pool_id];

    /*
     * Refuse new pages being added to the same rbnode, so take rb_lock
     * first.
     */
    write_lock_irqsave(&zpool->rb_lock, flags1);
    rbnode = zcache_find_rbnode(&zpool->rbtree, key.u.ino, 0, 0);
    if (!rbnode) {
        write_unlock_irqrestore(&zpool->rb_lock, flags1);
        return;
    }

    kref_get(&rbnode->refcount);

    spin_lock_irqsave(&rbnode->ra_lock, flags2);
    zcache_flush_ratree(zpool, rbnode);
    if (zcache_rbnode_empty(rbnode))
        /* When we arrive here, we already hold rb_lock */
        zcache_rbnode_isolate(zpool, rbnode, 1);
    spin_unlock_irqrestore(&rbnode->ra_lock, flags2);

    write_unlock_irqrestore(&zpool->rb_lock, flags1);
    kref_put(&rbnode->refcount, zcache_rbnode_release);
}
static void zcache_destroy_pool(struct zcache_pool *zpool);
static void zcache_flush_fs(int pool_id)
{
    struct zcache_rbnode *z_rbnode = NULL;
    struct rb_node *rbnode;
    unsigned long flags1, flags2;
    struct zcache_pool *zpool;

    if (pool_id < 0)
        return;
    zpool = zcache.pools[pool_id];
    if (!zpool)
        return;

    /*
     * Refuse new pages being added, so take rb_lock first.
     */
    write_lock_irqsave(&zpool->rb_lock, flags1);

    rbnode = rb_first(&zpool->rbtree);
    while (rbnode) {
        z_rbnode = rb_entry(rbnode, struct zcache_rbnode, rb_node);
        rbnode = rb_next(rbnode);
        if (z_rbnode) {
            kref_get(&z_rbnode->refcount);
            spin_lock_irqsave(&z_rbnode->ra_lock, flags2);
            zcache_flush_ratree(zpool, z_rbnode);
            if (zcache_rbnode_empty(z_rbnode))
                zcache_rbnode_isolate(zpool, z_rbnode, 1);
            spin_unlock_irqrestore(&z_rbnode->ra_lock, flags2);
            kref_put(&z_rbnode->refcount, zcache_rbnode_release);
        }
    }

    write_unlock_irqrestore(&zpool->rb_lock, flags1);
    zcache_destroy_pool(zpool);
}
/*
 * Evict compressed pages from the zcache pool on an LRU basis when the
 * compressed pool is full.
 */
static int zcache_evict_zpage(struct zbud_pool *pool, unsigned long zaddr)
{
    struct zcache_pool *zpool;
    struct zcache_ra_handle *zhandle;
    void *zaddr_intree;

    BUG_ON(zaddr == (unsigned long)ZERO_HANDLE);

    zhandle = (struct zcache_ra_handle *)zbud_map(pool, zaddr);

    zpool = zhandle->zpool;
    /* There can be a race with a zcache store */
    if (!zpool)
        return -EINVAL;

    BUG_ON(pool != zpool->pool);

    zaddr_intree = zcache_load_delete_zaddr(zpool, zhandle->rb_index,
            zhandle->ra_index);
    if (zaddr_intree) {
        BUG_ON((unsigned long)zaddr_intree != zaddr);
        zbud_unmap(pool, zaddr);
        zbud_free(pool, zaddr);
        atomic_dec(&zcache_stored_pages);
        zpool->size = zbud_get_pool_size(pool);
        zcache_evict_zpages++;
    }
    return 0;
}

static struct zbud_ops zcache_zbud_ops = {
    .evict = zcache_evict_zpage
};
/* Return the pool id */
static int zcache_create_pool(void)
{
    int ret;
    struct zcache_pool *zpool;

    zpool = kzalloc(sizeof(*zpool), GFP_KERNEL);
    if (!zpool) {
        ret = -ENOMEM;
        goto out;
    }

    zpool->pool = zbud_create_pool(GFP_KERNEL, &zcache_zbud_ops);
    if (!zpool->pool) {
        kfree(zpool);
        ret = -ENOMEM;
        goto out;
    }

    spin_lock(&zcache.pool_lock);
    if (zcache.num_pools == MAX_ZCACHE_POOLS) {
        pr_err("Cannot create new pool (limit:%u)\n", MAX_ZCACHE_POOLS);
        zbud_destroy_pool(zpool->pool);
        kfree(zpool);
        ret = -EPERM;
        goto out_unlock;
    }

    rwlock_init(&zpool->rb_lock);
    zpool->rbtree = RB_ROOT;
    /* Add to the pool list */
    for (ret = 0; ret < MAX_ZCACHE_POOLS; ret++)
        if (!zcache.pools[ret])
            break;
    zcache.pools[ret] = zpool;
    zcache.num_pools++;
    pr_info("New pool created id:%d\n", ret);

out_unlock:
    spin_unlock(&zcache.pool_lock);
out:
    return ret;
}

static void zcache_destroy_pool(struct zcache_pool *zpool)
{
    int i;

    if (!zpool)
        return;

    spin_lock(&zcache.pool_lock);
    zcache.num_pools--;
    for (i = 0; i < MAX_ZCACHE_POOLS; i++)
        if (zcache.pools[i] == zpool)
            break;
    zcache.pools[i] = NULL;
    spin_unlock(&zcache.pool_lock);

    if (!RB_EMPTY_ROOT(&zpool->rbtree))
        WARN_ON("Memory leak detected. Freeing non-empty pool!\n");

    zbud_destroy_pool(zpool->pool);
    kfree(zpool);
}
static int zcache_init_fs(size_t pagesize)
{
    int ret;

    if (pagesize != PAGE_SIZE) {
        pr_info("Unsupported page size: %zu\n", pagesize);
        ret = -EINVAL;
        goto out;
    }

    ret = zcache_create_pool();
    if (ret < 0) {
        pr_info("Failed to create new pool\n");
        ret = -ENOMEM;
        goto out;
    }
out:
    return ret;
}

static int zcache_init_shared_fs(char *uuid, size_t pagesize)
{
    /* shared pools are unsupported and map to private */
    return zcache_init_fs(pagesize);
}

static struct cleancache_ops zcache_ops = {
    .put_page = zcache_store_page,
    .get_page = zcache_load_page,
    .invalidate_page = zcache_flush_page,
    .invalidate_inode = zcache_flush_inode,
    .invalidate_fs = zcache_flush_fs,
    .init_shared_fs = zcache_init_shared_fs,
    .init_fs = zcache_init_fs
};
/*
 * Debugfs functions
 */
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static int pool_pages_get(void *_data, u64 *val)
{
    *val = zcache_pages();
    return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(pool_page_fops, pool_pages_get, NULL, "%llu\n");

static struct dentry *zcache_debugfs_root;

static int __init zcache_debugfs_init(void)
{
    if (!debugfs_initialized())
        return -ENODEV;

    zcache_debugfs_root = debugfs_create_dir("zcache", NULL);
    if (!zcache_debugfs_root)
        return -ENOMEM;

    debugfs_create_u64("pool_limit_hit", S_IRUGO, zcache_debugfs_root,
            &zcache_pool_limit_hit);
    debugfs_create_u64("reject_alloc_fail", S_IRUGO, zcache_debugfs_root,
            &zcache_zbud_alloc_fail);
    debugfs_create_u64("duplicate_entry", S_IRUGO, zcache_debugfs_root,
            &zcache_dup_entry);
    debugfs_create_file("pool_pages", S_IRUGO, zcache_debugfs_root, NULL,
            &pool_page_fops);
    debugfs_create_atomic_t("stored_pages", S_IRUGO, zcache_debugfs_root,
            &zcache_stored_pages);
    debugfs_create_atomic_t("stored_zero_pages", S_IRUGO,
            zcache_debugfs_root, &zcache_stored_zero_pages);
    debugfs_create_u64("evicted_zpages", S_IRUGO, zcache_debugfs_root,
            &zcache_evict_zpages);
    debugfs_create_u64("evicted_filepages", S_IRUGO, zcache_debugfs_root,
            &zcache_evict_filepages);
    debugfs_create_u64("reclaim_fail", S_IRUGO, zcache_debugfs_root,
            &zcache_reclaim_fail);
    debugfs_create_u64("inactive_pages_refused", S_IRUGO,
            zcache_debugfs_root, &zcache_inactive_pages_refused);
    debugfs_create_u64("pool_shrink_count", S_IRUGO,
            zcache_debugfs_root, &zcache_pool_shrink);
    debugfs_create_u64("pool_shrink_fail", S_IRUGO,
            zcache_debugfs_root, &zcache_pool_shrink_fail);
    debugfs_create_u64("pool_shrink_pages", S_IRUGO,
            zcache_debugfs_root, &zcache_pool_shrink_pages);
    debugfs_create_u64("store_fail", S_IRUGO,
            zcache_debugfs_root, &zcache_store_failed);
    return 0;
}

static void __exit zcache_debugfs_exit(void)
{
    debugfs_remove_recursive(zcache_debugfs_root);
}
#else
static int __init zcache_debugfs_init(void)
{
    return 0;
}

static void __exit zcache_debugfs_exit(void)
{
}
#endif
/*
 * zcache init and exit
 */
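/*
 * Module entry point. Note that zcache stays dormant unless the "enabled"
 * module parameter above is set (e.g. zcache.enabled=1 on the kernel
 * command line).
 */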
static int __init init_zcache(void)
{
    if (!zcache_enabled)
        return 0;

    pr_info("loading zcache..\n");
    if (zcache_rbnode_cache_create()) {
        pr_err("entry cache creation failed\n");
        goto error;
    }

    if (zcache_comp_init()) {
        pr_err("compressor initialization failed\n");
        goto compfail;
    }
    if (zcache_cpu_init()) {
        pr_err("per-cpu initialization failed\n");
        goto pcpufail;
    }

    spin_lock_init(&zcache.pool_lock);
    cleancache_register_ops(&zcache_ops);
    if (zcache_debugfs_init())
        pr_warn("debugfs initialization failed\n");
    register_shrinker(&zcache_shrinker);
    return 0;

pcpufail:
    zcache_comp_exit();
compfail:
    zcache_rbnode_cache_destroy();
error:
    return -ENOMEM;
}

/* must be late so crypto has time to come up */
late_initcall(init_zcache);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Bob Liu <bob.liu@xxxxxxxxxx>");
MODULE_DESCRIPTION("Compressed cache for clean file pages");