madvise.c

// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vm_write_begin(vma);
	WRITE_ONCE(vma->vm_flags, new_flags);
	vm_write_end(vma);

out:
	return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static void force_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	struct mm_walk walk = {
		.mm = vma->vm_mm,
		.pmd_entry = swapin_walk_pmd_entry,
		.private = vma,
	};

	walk_page_range(start, end, &walk);

	lru_add_drain();	/* Push any new pages onto the LRU now */
}

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	pgoff_t index;
	struct page *page;
	swp_entry_t swap;

	for (; start < end; start += PAGE_SIZE) {
		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

		page = find_get_entry(mapping, index);
		if (!radix_tree_exceptional_entry(page)) {
			if (page)
				put_page(page);
			continue;
		}
		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		force_swapin_readahead(vma, start, end);
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
						file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		/*
		 * If the pte has a swp_entry, just clear the page table to
		 * prevent swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = _vm_normal_page(vma, addr, ptent, true);
		if (!page)
			continue;

		/*
		 * If pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we can't clear
			 * PG_dirty of the page.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so for
			 * portability, re-install the pte as old and clean
			 * after clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static void madvise_free_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct mm_walk free_walk = {
		.pmd_entry = madvise_free_pte_range,
		.mm = vma->vm_mm,
		.private = tlb,
	};

	vm_write_begin(vma);
	tlb_start_vma(tlb, vma);
	walk_page_range(addr, end, &free_walk);
	tlb_end_vma(tlb, vma);
	vm_write_end(vma);
}

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	unsigned long start, end;
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	/* MADV_FREE works only for anonymous vmas at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	start = max(vma->vm_start, start_addr);
	if (start >= vma->vm_end)
		return -EINVAL;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(mm, start, end);
	madvise_free_page_range(&tlb, vma, start, end);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}
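
/*
 * A minimal userspace sketch (illustrative only, not compiled here) of the
 * MADV_DONTNEED semantics described above, assuming a private anonymous
 * mapping and glibc wrappers; error handling is omitted:
 *
 *	#include <sys/mman.h>
 *	#include <string.h>
 *	#include <assert.h>
 *	#include <unistd.h>
 *
 *	size_t len = 16 * sysconf(_SC_PAGESIZE);
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	memset(buf, 0xaa, len);			// dirty the anonymous pages
 *	madvise(buf, len, MADV_DONTNEED);	// discard, don't write to swap
 *	assert(buf[0] == 0);			// range refaults as zero-fill
 */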

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	*prev = vma;
	if (!can_madv_dontneed_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_sem has been dropped, prev is stale */

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_dontneed_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_sem was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_sem was not released by userfaultfd_remove() */
		up_read(&current->mm->mmap_sem);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	down_read(&current->mm->mmap_sem);
	return error;
}
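
/*
 * A minimal userspace sketch (illustrative only, not compiled here) of the
 * hole punch performed above, assuming a shared, writable file mapping;
 * "data.bin" and the offsets are hypothetical and error handling is omitted:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	long page = sysconf(_SC_PAGESIZE);
 *	int fd = open("data.bin", O_RDWR);
 *	char *map = mmap(NULL, 16 * page, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);
 *	madvise(map + 4 * page, 2 * page, MADV_REMOVE);
 *
 * This deallocates the backing blocks for those two pages much like
 * fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 4 * page,
 * 2 * page) would, while the file size is preserved.
 */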

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	struct page *page;
	struct zone *zone;
	unsigned int order;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += PAGE_SIZE << order) {
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page, and order will be 0.
		 */
		order = compound_order(compound_head(page));

		if (PageHWPoison(page)) {
			put_page(page);
			continue;
		}

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
					page_to_pfn(page), start);

			ret = soft_offline_page(page, MF_COUNT_INCREASED);
			if (ret)
				return ret;
			continue;
		}

		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				page_to_pfn(page), start);

		ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
		if (ret)
			return ret;
	}

	/* Ensure that all poisoned pages are removed from per-cpu lists */
	for_each_populated_zone(zone)
		drain_all_pages(zone);

	return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return error;

	if (start & ~PAGE_MASK)
		return error;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (down_write_killable(&current->mm->mmap_sem))
			return -EINTR;
	} else {
		down_read(&current->mm->mmap_sem);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}
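
/*
 * A minimal userspace sketch (illustrative only, not compiled here) of the
 * advisory behaviors documented above madvise(), assuming an open, readable
 * file descriptor "fd" of size "size"; the names are hypothetical and error
 * handling is omitted:
 *
 *	#include <sys/mman.h>
 *
 *	char *map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(map, size, MADV_SEQUENTIAL);	// expect one sequential pass
 *	madvise(map, size, MADV_WILLNEED);	// start readahead now
 *	// ... read through map[0 .. size-1] ...
 *	madvise(map, size, MADV_DONTNEED);	// done; kernel may drop the pages
 *	munmap(map, size);
 *
 * Since the advice is only a hint, a failure here is typically not fatal to
 * the application.
 */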