memory-failure.c

  1. /*
  2. * Copyright (C) 2008, 2009 Intel Corporation
  3. * Authors: Andi Kleen, Fengguang Wu
  4. *
  5. * This software may be redistributed and/or modified under the terms of
  6. * the GNU General Public License ("GPL") version 2 only as published by the
  7. * Free Software Foundation.
  8. *
  9. * High level machine check handler. Handles pages reported by the
  10. * hardware as being corrupted usually due to a multi-bit ECC memory or cache
  11. * failure.
  12. *
  13. * In addition there is a "soft offline" entry point that allows stopping the use
  14. * of not-yet-corrupted (but suspicious) pages without killing anything.
  15. *
  16. * Handles page cache pages in various states. The tricky part
  17. * here is that we can access any page asynchronously with respect to
  18. * other VM users, because memory failures could happen anytime and
  19. * anywhere. This could violate some of their assumptions. This is why
  20. * this code has to be extremely careful. Generally it tries to use
  21. * normal locking rules, as in get the standard locks, even if that means
  22. * the error handling takes potentially a long time.
  23. *
  24. * There are several operations here with exponential complexity because
  25. * of unsuitable VM data structures. For example the operation to map back
  26. * from RMAP chains to processes has to walk the complete process list and
  27. * has non-linear complexity in the number of processes. But since memory corruptions
  28. * are rare we hope to get away with this. This avoids impacting the core
  29. * VM.
  30. */
  31. /*
  32. * Notebook:
  33. * - hugetlb needs more code
  34. * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
  35. * - pass bad pages to kdump next kernel
  36. */
  37. #include <linux/kernel.h>
  38. #include <linux/mm.h>
  39. #include <linux/page-flags.h>
  40. #include <linux/kernel-page-flags.h>
  41. #include <linux/sched.h>
  42. #include <linux/ksm.h>
  43. #include <linux/rmap.h>
  44. #include <linux/pagemap.h>
  45. #include <linux/swap.h>
  46. #include <linux/backing-dev.h>
  47. #include <linux/migrate.h>
  48. #include <linux/page-isolation.h>
  49. #include <linux/suspend.h>
  50. #include <linux/slab.h>
  51. #include <linux/swapops.h>
  52. #include <linux/hugetlb.h>
  53. #include <linux/memory_hotplug.h>
  54. #include <linux/mm_inline.h>
  55. #include "internal.h"
  56. int sysctl_memory_failure_early_kill __read_mostly = 0;
  57. int sysctl_memory_failure_recovery __read_mostly = 1;
  58. atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
  59. #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
  60. u32 hwpoison_filter_enable = 0;
  61. u32 hwpoison_filter_dev_major = ~0U;
  62. u32 hwpoison_filter_dev_minor = ~0U;
  63. u64 hwpoison_filter_flags_mask;
  64. u64 hwpoison_filter_flags_value;
  65. EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
  66. EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
  67. EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
  68. EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
  69. EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
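/*
 * Filter on the backing device: only handle errors on pages whose host
 * inode lives on the configured major:minor device. A value of ~0U for
 * major or minor means "don't filter on that field".
 */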
  70. static int hwpoison_filter_dev(struct page *p)
  71. {
  72. struct address_space *mapping;
  73. dev_t dev;
  74. if (hwpoison_filter_dev_major == ~0U &&
  75. hwpoison_filter_dev_minor == ~0U)
  76. return 0;
  77. /*
  78. * page_mapping() does not accept slab pages.
  79. */
  80. if (PageSlab(p))
  81. return -EINVAL;
  82. mapping = page_mapping(p);
  83. if (mapping == NULL || mapping->host == NULL)
  84. return -EINVAL;
  85. dev = mapping->host->i_sb->s_dev;
  86. if (hwpoison_filter_dev_major != ~0U &&
  87. hwpoison_filter_dev_major != MAJOR(dev))
  88. return -EINVAL;
  89. if (hwpoison_filter_dev_minor != ~0U &&
  90. hwpoison_filter_dev_minor != MINOR(dev))
  91. return -EINVAL;
  92. return 0;
  93. }
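/*
 * Filter on page flags: only handle errors on pages whose stable page
 * flags, masked by hwpoison_filter_flags_mask, equal the configured value.
 */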
  94. static int hwpoison_filter_flags(struct page *p)
  95. {
  96. if (!hwpoison_filter_flags_mask)
  97. return 0;
  98. if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
  99. hwpoison_filter_flags_value)
  100. return 0;
  101. else
  102. return -EINVAL;
  103. }
  104. /*
  105. * This allows stress tests to limit test scope to a collection of tasks
  106. * by putting them under some memcg. This prevents killing unrelated/important
  107. * processes such as /sbin/init. Note that the target task may share clean
  108. * pages with init (eg. libc text), which is harmless. If the target task
  109. * shares _dirty_ pages with another task B, the test scheme must make sure B
  110. * is also included in the memcg. Lastly, due to race conditions this filter
  111. * can only guarantee that the page either belongs to the memcg tasks, or is
  112. * a freed page.
  113. */
  114. #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  115. u64 hwpoison_filter_memcg;
  116. EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
  117. static int hwpoison_filter_task(struct page *p)
  118. {
  119. struct mem_cgroup *mem;
  120. struct cgroup_subsys_state *css;
  121. unsigned long ino;
  122. if (!hwpoison_filter_memcg)
  123. return 0;
  124. mem = try_get_mem_cgroup_from_page(p);
  125. if (!mem)
  126. return -EINVAL;
  127. css = mem_cgroup_css(mem);
  128. /* root_mem_cgroup has NULL dentries */
  129. if (!css->cgroup->dentry)
  130. return -EINVAL;
  131. ino = css->cgroup->dentry->d_inode->i_ino;
  132. css_put(css);
  133. if (ino != hwpoison_filter_memcg)
  134. return -EINVAL;
  135. return 0;
  136. }
  137. #else
  138. static int hwpoison_filter_task(struct page *p) { return 0; }
  139. #endif
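/*
 * Returns 0 if the page should be handled, -EINVAL if any of the
 * configured filters rejects it.
 */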
  140. int hwpoison_filter(struct page *p)
  141. {
  142. if (!hwpoison_filter_enable)
  143. return 0;
  144. if (hwpoison_filter_dev(p))
  145. return -EINVAL;
  146. if (hwpoison_filter_flags(p))
  147. return -EINVAL;
  148. if (hwpoison_filter_task(p))
  149. return -EINVAL;
  150. return 0;
  151. }
  152. #else
  153. int hwpoison_filter(struct page *p)
  154. {
  155. return 0;
  156. }
  157. #endif
  158. EXPORT_SYMBOL_GPL(hwpoison_filter);
  159. /*
  160. * Send all the processes that have the page mapped an ``action optional''
  161. * signal.
  162. */
  163. static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
  164. unsigned long pfn, struct page *page)
  165. {
  166. struct siginfo si;
  167. int ret;
  168. printk(KERN_ERR
  169. "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
  170. pfn, t->comm, t->pid);
  171. si.si_signo = SIGBUS;
  172. si.si_errno = 0;
  173. si.si_code = BUS_MCEERR_AO;
  174. si.si_addr = (void *)addr;
  175. #ifdef __ARCH_SI_TRAPNO
  176. si.si_trapno = trapno;
  177. #endif
  178. si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
  179. /*
  180. * Don't use force here, it's convenient if the signal
  181. * can be temporarily blocked.
  182. * This could cause a loop when the user sets SIGBUS
  183. * to SIG_IGN, but hopefully no one will do that?
  184. */
  185. ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
  186. if (ret < 0)
  187. printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
  188. t->comm, t->pid, ret);
  189. return ret;
  190. }
  191. /*
  192. * When an unknown page type is encountered, drain as many buffers as possible
  193. * in the hope of turning the page into an LRU or free page, which we can handle.
  194. */
  195. void shake_page(struct page *p, int access)
  196. {
  197. if (!PageSlab(p)) {
  198. lru_add_drain_all();
  199. if (PageLRU(p))
  200. return;
  201. drain_all_pages();
  202. if (PageLRU(p) || is_free_buddy_page(p))
  203. return;
  204. }
  205. /*
  206. * Only call shrink_slab here (which would also shrink other caches) if
  207. * access is not potentially fatal.
  208. */
  209. if (access) {
  210. int nr;
  211. do {
  212. struct shrink_control shrink = {
  213. .gfp_mask = GFP_KERNEL,
  214. };
  215. nr = shrink_slab(&shrink, 1000, 1000);
  216. if (page_count(p) == 1)
  217. break;
  218. } while (nr > 10);
  219. }
  220. }
  221. EXPORT_SYMBOL_GPL(shake_page);
  222. /*
  223. * Kill all processes that have a poisoned page mapped and then isolate
  224. * the page.
  225. *
  226. * General strategy:
  227. * Find all processes having the page mapped and kill them.
  228. * But we keep a page reference around so that the page is not
  229. * actually freed yet.
  230. * Then stash the page away
  231. *
  232. * There's no convenient way to get back to mapped processes
  233. * from the VMAs. So do a brute-force search over all
  234. * running processes.
  235. *
  236. * Remember that machine checks are not common (or rather
  237. * if they are common you have other problems), so this shouldn't
  238. * be a performance issue.
  239. *
  240. * Also there are some races possible while we get from the
  241. * error detection to actually handling it.
  242. */
  243. struct to_kill {
  244. struct list_head nd;
  245. struct task_struct *tsk;
  246. unsigned long addr;
  247. char addr_valid;
  248. };
  249. /*
  250. * Failure handling: if we can't find or can't kill a process there's
  251. * not much we can do. We just print a message and otherwise ignore it.
  252. */
  253. /*
  254. * Schedule a process for later kill.
  255. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  256. * TBD would GFP_NOIO be enough?
  257. */
  258. static void add_to_kill(struct task_struct *tsk, struct page *p,
  259. struct vm_area_struct *vma,
  260. struct list_head *to_kill,
  261. struct to_kill **tkc)
  262. {
  263. struct to_kill *tk;
  264. if (*tkc) {
  265. tk = *tkc;
  266. *tkc = NULL;
  267. } else {
  268. tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
  269. if (!tk) {
  270. printk(KERN_ERR
  271. "MCE: Out of memory while machine check handling\n");
  272. return;
  273. }
  274. }
  275. tk->addr = page_address_in_vma(p, vma);
  276. tk->addr_valid = 1;
  277. /*
  278. * In theory we don't have to kill when the page was
  279. * munmapped. But it could also be a mremap. Since that's
  280. * likely very rare, kill anyway just out of paranoia, but use
  281. * SIGKILL because the error is not contained anymore.
  282. */
  283. if (tk->addr == -EFAULT) {
  284. pr_info("MCE: Unable to find user space address %lx in %s\n",
  285. page_to_pfn(p), tsk->comm);
  286. tk->addr_valid = 0;
  287. }
  288. get_task_struct(tsk);
  289. tk->tsk = tsk;
  290. list_add_tail(&tk->nd, to_kill);
  291. }
  292. /*
  293. * Kill the processes that have been collected earlier.
  294. *
  295. * Only do anything when DOIT is set, otherwise just free the list
  296. * (this is used for clean pages which do not need killing).
  297. * Also when FAIL is set, do a forced kill because something went
  298. * wrong earlier.
  299. */
  300. static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
  301. int fail, struct page *page, unsigned long pfn)
  302. {
  303. struct to_kill *tk, *next;
  304. list_for_each_entry_safe (tk, next, to_kill, nd) {
  305. if (doit) {
  306. /*
  307. * In case something went wrong with munmapping
  308. * make sure the process doesn't catch the
  309. * signal and then access the memory. Just kill it.
  310. */
  311. if (fail || tk->addr_valid == 0) {
  312. printk(KERN_ERR
  313. "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
  314. pfn, tk->tsk->comm, tk->tsk->pid);
  315. force_sig(SIGKILL, tk->tsk);
  316. }
  317. /*
  318. * In theory the process could have mapped
  319. * something else on the address in-between. We could
  320. * check for that, but we need to tell the
  321. * process anyway.
  322. */
  323. else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
  324. pfn, page) < 0)
  325. printk(KERN_ERR
  326. "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
  327. pfn, tk->tsk->comm, tk->tsk->pid);
  328. }
  329. put_task_struct(tk->tsk);
  330. kfree(tk);
  331. }
  332. }
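/*
 * Decide whether a task should be killed early (when the error is
 * detected) rather than late (when it actually touches the page).
 * A per-task PF_MCE_PROCESS/PF_MCE_EARLY setting overrides the
 * global sysctl_memory_failure_early_kill default.
 */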
  333. static int task_early_kill(struct task_struct *tsk)
  334. {
  335. if (!tsk->mm)
  336. return 0;
  337. if (tsk->flags & PF_MCE_PROCESS)
  338. return !!(tsk->flags & PF_MCE_EARLY);
  339. return sysctl_memory_failure_early_kill;
  340. }
  341. /*
  342. * Collect processes when the error hit an anonymous page.
  343. */
  344. static void collect_procs_anon(struct page *page, struct list_head *to_kill,
  345. struct to_kill **tkc)
  346. {
  347. struct vm_area_struct *vma;
  348. struct task_struct *tsk;
  349. struct anon_vma *av;
  350. av = page_lock_anon_vma(page);
  351. if (av == NULL) /* Not actually mapped anymore */
  352. return;
  353. read_lock(&tasklist_lock);
  354. for_each_process (tsk) {
  355. struct anon_vma_chain *vmac;
  356. if (!task_early_kill(tsk))
  357. continue;
  358. list_for_each_entry(vmac, &av->head, same_anon_vma) {
  359. vma = vmac->vma;
  360. if (!page_mapped_in_vma(page, vma))
  361. continue;
  362. if (vma->vm_mm == tsk->mm)
  363. add_to_kill(tsk, page, vma, to_kill, tkc);
  364. }
  365. }
  366. read_unlock(&tasklist_lock);
  367. page_unlock_anon_vma(av);
  368. }
  369. /*
  370. * Collect processes when the error hit a file mapped page.
  371. */
  372. static void collect_procs_file(struct page *page, struct list_head *to_kill,
  373. struct to_kill **tkc)
  374. {
  375. struct vm_area_struct *vma;
  376. struct task_struct *tsk;
  377. struct prio_tree_iter iter;
  378. struct address_space *mapping = page->mapping;
  379. mutex_lock(&mapping->i_mmap_mutex);
  380. read_lock(&tasklist_lock);
  381. for_each_process(tsk) {
  382. pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  383. if (!task_early_kill(tsk))
  384. continue;
  385. vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
  386. pgoff) {
  387. /*
  388. * Send an early kill signal to tasks whose vma covers
  389. * the page, even though the corrupted page is not
  390. * necessarily mapped in their ptes.
  391. * Assume applications that requested early kill want
  392. * to be informed of all such data corruptions.
  393. */
  394. if (vma->vm_mm == tsk->mm)
  395. add_to_kill(tsk, page, vma, to_kill, tkc);
  396. }
  397. }
  398. read_unlock(&tasklist_lock);
  399. mutex_unlock(&mapping->i_mmap_mutex);
  400. }
  401. /*
  402. * Collect the processes that have the corrupted page mapped, to kill them.
  403. * This is done in two steps for locking reasons.
  404. * First preallocate one tokill structure outside the spin locks,
  405. * so that we can kill at least one process reasonably reliably.
  406. */
  407. static void collect_procs(struct page *page, struct list_head *tokill)
  408. {
  409. struct to_kill *tk;
  410. if (!page->mapping)
  411. return;
  412. tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
  413. if (!tk)
  414. return;
  415. if (PageAnon(page))
  416. collect_procs_anon(page, tokill, &tk);
  417. else
  418. collect_procs_file(page, tokill, &tk);
  419. kfree(tk);
  420. }
  421. /*
  422. * Error handlers for various types of pages.
  423. */
  424. enum outcome {
  425. IGNORED, /* Error: cannot be handled */
  426. FAILED, /* Error: handling failed */
  427. DELAYED, /* Will be handled later */
  428. RECOVERED, /* Successfully recovered */
  429. };
  430. static const char *action_name[] = {
  431. [IGNORED] = "Ignored",
  432. [FAILED] = "Failed",
  433. [DELAYED] = "Delayed",
  434. [RECOVERED] = "Recovered",
  435. };
  436. /*
  437. * XXX: It is possible that a page is isolated from LRU cache,
  438. * and then kept in swap cache, or fails to be removed from page cache.
  439. * The page count will stop it from being freed by unpoison.
  440. * Stress tests should be aware of this memory leak problem.
  441. */
  442. static int delete_from_lru_cache(struct page *p)
  443. {
  444. if (!isolate_lru_page(p)) {
  445. /*
  446. * Clear sensible page flags, so that the buddy system won't
  447. * complain when the page is unpoison-and-freed.
  448. */
  449. ClearPageActive(p);
  450. ClearPageUnevictable(p);
  451. /*
  452. * drop the page count elevated by isolate_lru_page()
  453. */
  454. page_cache_release(p);
  455. return 0;
  456. }
  457. return -EIO;
  458. }
  459. /*
  460. * Error hit kernel page.
  461. * Do nothing; try to be lucky and not touch this. For a few cases we
  462. * could be more sophisticated.
  463. */
  464. static int me_kernel(struct page *p, unsigned long pfn)
  465. {
  466. return IGNORED;
  467. }
  468. /*
  469. * Page in unknown state. Do nothing.
  470. */
  471. static int me_unknown(struct page *p, unsigned long pfn)
  472. {
  473. printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
  474. return FAILED;
  475. }
  476. /*
  477. * Clean (or cleaned) page cache page.
  478. */
  479. static int me_pagecache_clean(struct page *p, unsigned long pfn)
  480. {
  481. int err;
  482. int ret = FAILED;
  483. struct address_space *mapping;
  484. delete_from_lru_cache(p);
  485. /*
  486. * For anonymous pages we're done; the only reference left
  487. * should be the one m_f() holds.
  488. */
  489. if (PageAnon(p))
  490. return RECOVERED;
  491. /*
  492. * Now truncate the page in the page cache. This is really
  493. * more like a "temporary hole punch".
  494. * Don't do this for block devices when someone else
  495. * has a reference, because it could be file system metadata
  496. * and that's not safe to truncate.
  497. */
  498. mapping = page_mapping(p);
  499. if (!mapping) {
  500. /*
  501. * Page has been torn down in the meantime.
  502. */
  503. return FAILED;
  504. }
  505. /*
  506. * Truncation is a bit tricky. Enable it per file system for now.
  507. *
  508. * Open: to take i_mutex or not for this? Right now we don't.
  509. */
  510. if (mapping->a_ops->error_remove_page) {
  511. err = mapping->a_ops->error_remove_page(mapping, p);
  512. if (err != 0) {
  513. printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
  514. pfn, err);
  515. } else if (page_has_private(p) &&
  516. !try_to_release_page(p, GFP_NOIO)) {
  517. pr_info("MCE %#lx: failed to release buffers\n", pfn);
  518. } else {
  519. ret = RECOVERED;
  520. }
  521. } else {
  522. /*
  523. * If the file system doesn't support it, just invalidate.
  524. * This fails on dirty pages or anything with private buffers.
  525. */
  526. if (invalidate_inode_page(p))
  527. ret = RECOVERED;
  528. else
  529. printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
  530. pfn);
  531. }
  532. return ret;
  533. }
  534. /*
  535. * Dirty pagecache page.
  536. * Issues: when the error hits a hole page the error is not properly
  537. * propagated.
  538. */
  539. static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  540. {
  541. struct address_space *mapping = page_mapping(p);
  542. SetPageError(p);
  543. /* TBD: print more information about the file. */
  544. if (mapping) {
  545. /*
  546. * The IO error will be reported by write(), fsync(), etc.,
  547. * which check the mapping.
  548. * This way the application knows that something went
  549. * wrong with its dirty file data.
  550. *
  551. * There's one open issue:
  552. *
  553. * The EIO will be only reported on the next IO
  554. * operation and then cleared through the IO map.
  555. * Normally Linux has two mechanisms to pass IO error
  556. * first through the AS_EIO flag in the address space
  557. * and then through the PageError flag in the page.
  558. * Since we drop pages on memory failure handling the
  559. * only mechanism open to use is through AS_EIO.
  560. *
  561. * This has the disadvantage that it gets cleared on
  562. * the first operation that returns an error, while
  563. * the PageError bit is more sticky and only cleared
  564. * when the page is reread or dropped. If an
  565. * application assumes it will always get an error on
  566. * fsync, but does other operations on the fd first,
  567. * and the page is dropped in between, then the error
  568. * will not be properly reported.
  569. *
  570. * This can already happen even without hwpoisoned
  571. * pages: first on metadata IO errors (which only
  572. * report through AS_EIO) or when the page is dropped
  573. * at the wrong time.
  574. *
  575. * So right now we assume that the application DTRT on
  576. * the first EIO, but we're not worse than other parts
  577. * of the kernel.
  578. */
  579. mapping_set_error(mapping, EIO);
  580. }
  581. return me_pagecache_clean(p, pfn);
  582. }
  583. /*
  584. * Clean and dirty swap cache.
  585. *
  586. * Dirty swap cache page is tricky to handle. The page could live both in page
  587. * cache and swap cache (i.e. the page was freshly swapped in). So it could be
  588. * referenced concurrently by 2 types of PTEs:
  589. * normal PTEs and swap PTEs. We try to handle them consistently by calling
  590. * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
  591. * and then
  592. * - clear dirty bit to prevent IO
  593. * - remove from LRU
  594. * - but keep in the swap cache, so that when we return to it on
  595. * a later page fault, we know the application is accessing
  596. * corrupted data and shall be killed (we installed simple
  597. * interception code in do_swap_page to catch it).
  598. *
  599. * Clean swap cache pages can be directly isolated. A later page fault will
  600. * bring in the known good data from disk.
  601. */
  602. static int me_swapcache_dirty(struct page *p, unsigned long pfn)
  603. {
  604. ClearPageDirty(p);
  605. /* Trigger EIO in shmem: */
  606. ClearPageUptodate(p);
  607. if (!delete_from_lru_cache(p))
  608. return DELAYED;
  609. else
  610. return FAILED;
  611. }
  612. static int me_swapcache_clean(struct page *p, unsigned long pfn)
  613. {
  614. delete_from_swap_cache(p);
  615. if (!delete_from_lru_cache(p))
  616. return RECOVERED;
  617. else
  618. return FAILED;
  619. }
  620. /*
  621. * Huge pages. Needs work.
  622. * Issues:
  623. * - An error on a hugepage is contained in hugepage units (not in raw page units).
  624. * To narrow down kill region to one page, we need to break up pmd.
  625. */
  626. static int me_huge_page(struct page *p, unsigned long pfn)
  627. {
  628. int res = 0;
  629. struct page *hpage = compound_head(p);
  630. /*
  631. * We can safely recover from error on free or reserved (i.e.
  632. * not in-use) hugepage by dequeuing it from freelist.
  633. * To check whether a hugepage is in-use or not, we can't use
  634. * page->lru because it can be used in other hugepage operations,
  635. * such as __unmap_hugepage_range() and gather_surplus_pages().
  636. * So instead we use page_mapping() and PageAnon().
  637. * We assume that this function is called with page lock held,
  638. * so there is no race between isolation and mapping/unmapping.
  639. */
  640. if (!(page_mapping(hpage) || PageAnon(hpage))) {
  641. res = dequeue_hwpoisoned_huge_page(hpage);
  642. if (!res)
  643. return RECOVERED;
  644. }
  645. return DELAYED;
  646. }
  647. /*
  648. * Various page states we can handle.
  649. *
  650. * A page state is defined by its current page->flags bits.
  651. * The table matches them in order and calls the right handler.
  652. *
  653. * This is quite tricky because we can access the page at any time
  654. * in its life cycle, so all accesses have to be extremely careful.
  655. *
  656. * This is not complete. More states could be added.
  657. * For any missing state don't attempt recovery.
  658. */
  659. #define dirty (1UL << PG_dirty)
  660. #define sc (1UL << PG_swapcache)
  661. #define unevict (1UL << PG_unevictable)
  662. #define mlock (1UL << PG_mlocked)
  663. #define writeback (1UL << PG_writeback)
  664. #define lru (1UL << PG_lru)
  665. #define swapbacked (1UL << PG_swapbacked)
  666. #define head (1UL << PG_head)
  667. #define tail (1UL << PG_tail)
  668. #define compound (1UL << PG_compound)
  669. #define slab (1UL << PG_slab)
  670. #define reserved (1UL << PG_reserved)
  671. static struct page_state {
  672. unsigned long mask;
  673. unsigned long res;
  674. char *msg;
  675. int (*action)(struct page *p, unsigned long pfn);
  676. } error_states[] = {
  677. { reserved, reserved, "reserved kernel", me_kernel },
  678. /*
  679. * free pages are specially detected outside this table:
  680. * PG_buddy pages only make up a small fraction of all free pages.
  681. */
  682. /*
  683. * Could in theory check if slab page is free or if we can drop
  684. * currently unused objects without touching them. But just
  685. * treat it as a standard kernel page for now.
  686. */
  687. { slab, slab, "kernel slab", me_kernel },
  688. #ifdef CONFIG_PAGEFLAGS_EXTENDED
  689. { head, head, "huge", me_huge_page },
  690. { tail, tail, "huge", me_huge_page },
  691. #else
  692. { compound, compound, "huge", me_huge_page },
  693. #endif
  694. { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
  695. { sc|dirty, sc, "swapcache", me_swapcache_clean },
  696. { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
  697. { unevict, unevict, "unevictable LRU", me_pagecache_clean},
  698. { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
  699. { mlock, mlock, "mlocked LRU", me_pagecache_clean },
  700. { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
  701. { lru|dirty, lru, "clean LRU", me_pagecache_clean },
  702. /*
  703. * Catchall entry: must be at end.
  704. */
  705. { 0, 0, "unknown page state", me_unknown },
  706. };
  707. #undef dirty
  708. #undef sc
  709. #undef unevict
  710. #undef mlock
  711. #undef writeback
  712. #undef lru
  713. #undef swapbacked
  714. #undef head
  715. #undef tail
  716. #undef compound
  717. #undef slab
  718. #undef reserved
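/* Log the result of handling one poisoned page. */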
  719. static void action_result(unsigned long pfn, char *msg, int result)
  720. {
  721. struct page *page = pfn_to_page(pfn);
  722. printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
  723. pfn,
  724. PageDirty(page) ? "dirty " : "",
  725. msg, action_name[result]);
  726. }
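/*
 * Run the handler for the matched page state and verify that the page
 * was really freed up, i.e. that no references remain beyond the one
 * held by the memory-failure code itself.
 */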
  727. static int page_action(struct page_state *ps, struct page *p,
  728. unsigned long pfn)
  729. {
  730. int result;
  731. int count;
  732. result = ps->action(p, pfn);
  733. action_result(pfn, ps->msg, result);
  734. count = page_count(p) - 1;
  735. if (ps->action == me_swapcache_dirty && result == DELAYED)
  736. count--;
  737. if (count != 0) {
  738. printk(KERN_ERR
  739. "MCE %#lx: %s page still referenced by %d users\n",
  740. pfn, ps->msg, count);
  741. result = FAILED;
  742. }
  743. /* Could do more checks here if page looks ok */
  744. /*
  745. * Could adjust zone counters here to correct for the missing page.
  746. */
  747. return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
  748. }
  749. /*
  750. * Do all that is necessary to remove user space mappings. Unmap
  751. * the pages and send SIGBUS to the processes if the data was dirty.
  752. */
  753. static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
  754. int trapno)
  755. {
  756. enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
  757. struct address_space *mapping;
  758. LIST_HEAD(tokill);
  759. int ret;
  760. int kill = 1;
  761. struct page *hpage = compound_head(p);
  762. struct page *ppage;
  763. if (PageReserved(p) || PageSlab(p))
  764. return SWAP_SUCCESS;
  765. /*
  766. * This check implies we don't kill processes if their pages
  767. * are in the swap cache early. Those are always late kills.
  768. */
  769. if (!page_mapped(hpage))
  770. return SWAP_SUCCESS;
  771. if (PageKsm(p))
  772. return SWAP_FAIL;
  773. if (PageSwapCache(p)) {
  774. printk(KERN_ERR
  775. "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
  776. ttu |= TTU_IGNORE_HWPOISON;
  777. }
  778. /*
  779. * Propagate the dirty bit from PTEs to struct page first, because we
  780. * need this to decide if we should kill or just drop the page.
  781. * XXX: the dirty test could be racy: set_page_dirty() may not always
  782. * be called inside page lock (it's recommended but not enforced).
  783. */
  784. mapping = page_mapping(hpage);
  785. if (!PageDirty(hpage) && mapping &&
  786. mapping_cap_writeback_dirty(mapping)) {
  787. if (page_mkclean(hpage)) {
  788. SetPageDirty(hpage);
  789. } else {
  790. kill = 0;
  791. ttu |= TTU_IGNORE_HWPOISON;
  792. printk(KERN_INFO
  793. "MCE %#lx: corrupted page was clean: dropped without side effects\n",
  794. pfn);
  795. }
  796. }
  797. /*
  798. * ppage: poisoned page
  799. * if p is a regular (4k) page,
  800. * ppage == the real poisoned page;
  801. * else p is hugetlb or THP, and ppage == the head page.
  802. */
  803. ppage = hpage;
  804. if (PageTransHuge(hpage)) {
  805. /*
  806. * Verify that this isn't a hugetlbfs head page; the check for
  807. * PageAnon is just to avoid tripping a split_huge_page
  808. * internal debug check, as split_huge_page refuses to deal with
  809. * anything that isn't an anon page. PageAnon can't go away from
  810. * under us because we hold a refcount on the hpage. Without a
  811. * refcount on the hpage, split_huge_page can't be safely called
  812. * in the first place, and having a refcount on the tail isn't
  813. * enough to be safe.
  814. */
  815. if (!PageHuge(hpage) && PageAnon(hpage)) {
  816. if (unlikely(split_huge_page(hpage))) {
  817. /*
  818. * FIXME: if splitting the THP fails, it is
  819. * better to stop the following operation rather
  820. * than cause a panic by unmapping. The system might
  821. * survive if the page is freed later.
  822. */
  823. printk(KERN_INFO
  824. "MCE %#lx: failed to split THP\n", pfn);
  825. BUG_ON(!PageHWPoison(p));
  826. return SWAP_FAIL;
  827. }
  828. /* THP is split, so ppage should be the real poisoned page. */
  829. ppage = p;
  830. }
  831. }
  832. /*
  833. * First collect all the processes that have the page
  834. * mapped in dirty form. This has to be done before try_to_unmap,
  835. * because ttu takes the rmap data structures down.
  836. *
  837. * Error handling: We ignore errors here because
  838. * there's nothing that can be done.
  839. */
  840. if (kill)
  841. collect_procs(ppage, &tokill);
  842. if (hpage != ppage)
  843. lock_page(ppage);
  844. ret = try_to_unmap(ppage, ttu);
  845. if (ret != SWAP_SUCCESS)
  846. printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
  847. pfn, page_mapcount(ppage));
  848. if (hpage != ppage)
  849. unlock_page(ppage);
  850. /*
  851. * Now that the dirty bit has been propagated to the
  852. * struct page and all unmaps done we can decide if
  853. * killing is needed or not. Only kill when the page
  854. * was dirty, otherwise the tokill list is merely
  855. * freed. When there was a problem unmapping earlier,
  856. * use a more forceful, uncatchable kill to prevent
  857. * any accesses to the poisoned memory.
  858. */
  859. kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
  860. ret != SWAP_SUCCESS, p, pfn);
  861. return ret;
  862. }
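/* Set PG_hwpoison on every subpage of a huge page (cleared again by the helper below). */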
  863. static void set_page_hwpoison_huge_page(struct page *hpage)
  864. {
  865. int i;
  866. int nr_pages = 1 << compound_trans_order(hpage);
  867. for (i = 0; i < nr_pages; i++)
  868. SetPageHWPoison(hpage + i);
  869. }
  870. static void clear_page_hwpoison_huge_page(struct page *hpage)
  871. {
  872. int i;
  873. int nr_pages = 1 << compound_trans_order(hpage);
  874. for (i = 0; i < nr_pages; i++)
  875. ClearPageHWPoison(hpage + i);
  876. }
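/*
 * Core recovery path: mark the page hwpoisoned, grab a reference,
 * unmap it from user space and then dispatch to the handler matching
 * its page state. memory_failure() below is the public wrapper.
 */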
  877. int __memory_failure(unsigned long pfn, int trapno, int flags)
  878. {
  879. struct page_state *ps;
  880. struct page *p;
  881. struct page *hpage;
  882. int res;
  883. unsigned int nr_pages;
  884. if (!sysctl_memory_failure_recovery)
  885. panic("Memory failure from trap %d on page %lx", trapno, pfn);
  886. if (!pfn_valid(pfn)) {
  887. printk(KERN_ERR
  888. "MCE %#lx: memory outside kernel control\n",
  889. pfn);
  890. return -ENXIO;
  891. }
  892. p = pfn_to_page(pfn);
  893. hpage = compound_head(p);
  894. if (TestSetPageHWPoison(p)) {
  895. printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
  896. return 0;
  897. }
  898. nr_pages = 1 << compound_trans_order(hpage);
  899. atomic_long_add(nr_pages, &mce_bad_pages);
  900. /*
  901. * We can do nothing, and need do nothing, about count=0 pages.
  902. * 1) it's a free page, and therefore in safe hands:
  903. * prep_new_page() will be the gate keeper.
  904. * 2) it's a free hugepage, which is also safe:
  905. * an affected hugepage will be dequeued from hugepage freelist,
  906. * so there's no concern about reusing it ever after.
  907. * 3) it's part of a non-compound high order page.
  908. * Implies some kernel user: cannot stop them from
  909. * R/W the page; let's pray that the page has been
  910. * used and will be freed some time later.
  911. * In fact it's dangerous to directly bump up page count from 0,
  912. * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
  913. */
  914. if (!(flags & MF_COUNT_INCREASED) &&
  915. !get_page_unless_zero(hpage)) {
  916. if (is_free_buddy_page(p)) {
  917. action_result(pfn, "free buddy", DELAYED);
  918. return 0;
  919. } else if (PageHuge(hpage)) {
  920. /*
  921. * Check "just unpoisoned", "filter hit", and
  922. * "race with other subpage."
  923. */
  924. lock_page(hpage);
  925. if (!PageHWPoison(hpage)
  926. || (hwpoison_filter(p) && TestClearPageHWPoison(p))
  927. || (p != hpage && TestSetPageHWPoison(hpage))) {
  928. atomic_long_sub(nr_pages, &mce_bad_pages);
  929. return 0;
  930. }
  931. set_page_hwpoison_huge_page(hpage);
  932. res = dequeue_hwpoisoned_huge_page(hpage);
  933. action_result(pfn, "free huge",
  934. res ? IGNORED : DELAYED);
  935. unlock_page(hpage);
  936. return res;
  937. } else {
  938. action_result(pfn, "high order kernel", IGNORED);
  939. return -EBUSY;
  940. }
  941. }
  942. /*
  943. * We ignore non-LRU pages for good reasons.
  944. * - PG_locked is only well defined for LRU pages and a few others
  945. * - to avoid races with __set_page_locked()
  946. * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
  947. * The check (unnecessarily) ignores LRU pages being isolated and
  948. * walked by the page reclaim code, however that's not a big loss.
  949. */
  950. if (!PageHuge(p) && !PageTransCompound(p)) {
  951. if (!PageLRU(p))
  952. shake_page(p, 0);
  953. if (!PageLRU(p)) {
  954. /*
  955. * shake_page could have turned it free.
  956. */
  957. if (is_free_buddy_page(p)) {
  958. action_result(pfn, "free buddy, 2nd try",
  959. DELAYED);
  960. return 0;
  961. }
  962. action_result(pfn, "non LRU", IGNORED);
  963. put_page(p);
  964. return -EBUSY;
  965. }
  966. }
  967. /*
  968. * Lock the page and wait for writeback to finish.
  969. * It's very difficult to mess with pages currently under IO
  970. * and in many cases impossible, so we just avoid it here.
  971. */
  972. lock_page(hpage);
  973. /*
  974. * unpoison always clears PG_hwpoison inside the page lock
  975. */
  976. if (!PageHWPoison(p)) {
  977. printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
  978. res = 0;
  979. goto out;
  980. }
  981. if (hwpoison_filter(p)) {
  982. if (TestClearPageHWPoison(p))
  983. atomic_long_sub(nr_pages, &mce_bad_pages);
  984. unlock_page(hpage);
  985. put_page(hpage);
  986. return 0;
  987. }
  988. /*
  989. * For an error on a tail page, we should set PG_hwpoison
  990. * on the head page to show that the hugepage is hwpoisoned.
  991. */
  992. if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
  993. action_result(pfn, "hugepage already hardware poisoned",
  994. IGNORED);
  995. unlock_page(hpage);
  996. put_page(hpage);
  997. return 0;
  998. }
  999. /*
  1000. * Set PG_hwpoison on all pages in an error hugepage,
  1001. * because containment is done in hugepage unit for now.
  1002. * Since we have done TestSetPageHWPoison() for the head page with
  1003. * page lock held, we can safely set PG_hwpoison bits on tail pages.
  1004. */
  1005. if (PageHuge(p))
  1006. set_page_hwpoison_huge_page(hpage);
  1007. wait_on_page_writeback(p);
  1008. /*
  1009. * Now take care of user space mappings.
  1010. * Abort on fail: __delete_from_page_cache() assumes unmapped page.
  1011. */
  1012. if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
  1013. printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
  1014. res = -EBUSY;
  1015. goto out;
  1016. }
  1017. /*
  1018. * Torn down by someone else?
  1019. */
  1020. if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
  1021. action_result(pfn, "already truncated LRU", IGNORED);
  1022. res = -EBUSY;
  1023. goto out;
  1024. }
  1025. res = -EBUSY;
  1026. for (ps = error_states;; ps++) {
  1027. if ((p->flags & ps->mask) == ps->res) {
  1028. res = page_action(ps, p, pfn);
  1029. break;
  1030. }
  1031. }
  1032. out:
  1033. unlock_page(hpage);
  1034. return res;
  1035. }
  1036. EXPORT_SYMBOL_GPL(__memory_failure);
  1037. /**
  1038. * memory_failure - Handle memory failure of a page.
  1039. * @pfn: Page Number of the corrupted page
  1040. * @trapno: Trap number reported in the signal to user space.
  1041. *
  1042. * This function is called by the low level machine check code
  1043. * of an architecture when it detects hardware memory corruption
  1044. * of a page. It tries its best to recover, which includes
  1045. * dropping pages, killing processes etc.
  1046. *
  1047. * The function is primarily of use for corruptions that
  1048. * happen outside the current execution context (e.g. when
  1049. * detected by a background scrubber)
  1050. *
  1051. * Must run in process context (e.g. a work queue) with interrupts
  1052. * enabled and no spinlocks held.
  1053. */
  1054. void memory_failure(unsigned long pfn, int trapno)
  1055. {
  1056. __memory_failure(pfn, trapno, 0);
  1057. }
  1058. /**
  1059. * unpoison_memory - Unpoison a previously poisoned page
  1060. * @pfn: Page number of the to be unpoisoned page
  1061. *
  1062. * Software-unpoison a page that has been poisoned by
  1063. * memory_failure() earlier.
  1064. *
  1065. * This is only done at the software level, so it only works
  1066. * for Linux-injected failures, not real hardware failures.
  1067. *
  1068. * Returns 0 for success, otherwise -errno.
  1069. */
  1070. int unpoison_memory(unsigned long pfn)
  1071. {
  1072. struct page *page;
  1073. struct page *p;
  1074. int freeit = 0;
  1075. unsigned int nr_pages;
  1076. if (!pfn_valid(pfn))
  1077. return -ENXIO;
  1078. p = pfn_to_page(pfn);
  1079. page = compound_head(p);
  1080. if (!PageHWPoison(p)) {
  1081. pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
  1082. return 0;
  1083. }
  1084. nr_pages = 1 << compound_trans_order(page);
  1085. if (!get_page_unless_zero(page)) {
  1086. /*
  1087. * Since a HWPoisoned hugepage should have a non-zero refcount,
  1088. * a race between memory failure and unpoison seems to have happened.
  1089. * In such a case unpoison fails and memory failure runs
  1090. * to the end.
  1091. */
  1092. if (PageHuge(page)) {
  1093. pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
  1094. return 0;
  1095. }
  1096. if (TestClearPageHWPoison(p))
  1097. atomic_long_sub(nr_pages, &mce_bad_pages);
  1098. pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
  1099. return 0;
  1100. }
  1101. lock_page(page);
  1102. /*
  1103. * This test is racy because PG_hwpoison is set outside of the page lock.
  1104. * That's acceptable because it won't trigger a kernel panic. Instead,
  1105. * the PG_hwpoison page will be caught and isolated on the entrance to
  1106. * the free buddy page pool.
  1107. */
  1108. if (TestClearPageHWPoison(page)) {
  1109. pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
  1110. atomic_long_sub(nr_pages, &mce_bad_pages);
  1111. freeit = 1;
  1112. if (PageHuge(page))
  1113. clear_page_hwpoison_huge_page(page);
  1114. }
  1115. unlock_page(page);
  1116. put_page(page);
  1117. if (freeit)
  1118. put_page(page);
  1119. return 0;
  1120. }
  1121. EXPORT_SYMBOL(unpoison_memory);
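/*
 * Allocation callback used by the migration code during soft offlining:
 * allocate a replacement page (or huge page) on the same node as the
 * poisoned one.
 */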
  1122. static struct page *new_page(struct page *p, unsigned long private, int **x)
  1123. {
  1124. int nid = page_to_nid(p);
  1125. if (PageHuge(p))
  1126. return alloc_huge_page_node(page_hstate(compound_head(p)),
  1127. nid);
  1128. else
  1129. return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
  1130. }
  1131. /*
  1132. * Safely get reference count of an arbitrary page.
  1133. * Returns 0 for a free page, -EIO for a zero refcount page
  1134. * that is not free, and 1 for any other page type.
  1135. * For 1 the page is returned with increased page count, otherwise not.
  1136. */
  1137. static int get_any_page(struct page *p, unsigned long pfn, int flags)
  1138. {
  1139. int ret;
  1140. if (flags & MF_COUNT_INCREASED)
  1141. return 1;
  1142. /*
  1143. * The lock_memory_hotplug prevents a race with memory hotplug.
  1144. * This is a big hammer; something finer grained would be nicer.
  1145. */
  1146. lock_memory_hotplug();
  1147. /*
  1148. * Isolate the page, so that it doesn't get reallocated if it
  1149. * was free.
  1150. */
  1151. set_migratetype_isolate(p);
  1152. /*
  1153. * When the target page is a free hugepage, just remove it
  1154. * from free hugepage list.
  1155. */
  1156. if (!get_page_unless_zero(compound_head(p))) {
  1157. if (PageHuge(p)) {
  1158. pr_info("get_any_page: %#lx free huge page\n", pfn);
  1159. ret = dequeue_hwpoisoned_huge_page(compound_head(p));
  1160. } else if (is_free_buddy_page(p)) {
  1161. pr_info("get_any_page: %#lx free buddy page\n", pfn);
  1162. /* Set hwpoison bit while page is still isolated */
  1163. SetPageHWPoison(p);
  1164. ret = 0;
  1165. } else {
  1166. pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
  1167. pfn, p->flags);
  1168. ret = -EIO;
  1169. }
  1170. } else {
  1171. /* Not a free page */
  1172. ret = 1;
  1173. }
  1174. unset_migratetype_isolate(p);
  1175. unlock_memory_hotplug();
  1176. return ret;
  1177. }
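/*
 * Soft offline a huge page: migrate its contents to a freshly allocated
 * huge page, then mark the old one hwpoisoned and remove it from the
 * hugepage freelist so it is never reused.
 */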
  1178. static int soft_offline_huge_page(struct page *page, int flags)
  1179. {
  1180. int ret;
  1181. unsigned long pfn = page_to_pfn(page);
  1182. struct page *hpage = compound_head(page);
  1183. LIST_HEAD(pagelist);
  1184. ret = get_any_page(page, pfn, flags);
  1185. if (ret < 0)
  1186. return ret;
  1187. if (ret == 0)
  1188. goto done;
  1189. if (PageHWPoison(hpage)) {
  1190. put_page(hpage);
  1191. pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
  1192. return -EBUSY;
  1193. }
  1194. /* Keep page count to indicate a given hugepage is isolated. */
  1195. list_add(&hpage->lru, &pagelist);
  1196. ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
  1197. true);
  1198. if (ret) {
  1199. struct page *page1, *page2;
  1200. list_for_each_entry_safe(page1, page2, &pagelist, lru)
  1201. put_page(page1);
  1202. pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
  1203. pfn, ret, page->flags);
  1204. if (ret > 0)
  1205. ret = -EIO;
  1206. return ret;
  1207. }
  1208. done:
  1209. if (!PageHWPoison(hpage))
  1210. atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
  1211. set_page_hwpoison_huge_page(hpage);
  1212. dequeue_hwpoisoned_huge_page(hpage);
  1213. /* keep elevated page count for bad page */
  1214. return ret;
  1215. }
  1216. /**
  1217. * soft_offline_page - Soft offline a page.
  1218. * @page: page to offline
  1219. * @flags: flags. Same as memory_failure().
  1220. *
  1221. * Returns 0 on success, otherwise negated errno.
  1222. *
  1223. * Soft offline a page, by migration or invalidation,
  1224. * without killing anything. This is for the case when
  1225. * a page is not corrupted yet (so it's still valid to access),
  1226. * but has had a number of corrected errors and is better taken
  1227. * out.
  1228. *
  1229. * The actual policy on when to do that is maintained by
  1230. * user space.
  1231. *
  1232. * This should never impact any application or cause data loss,
  1233. * however it might take some time.
  1234. *
  1235. * This is not a 100% solution for all memory, but tries to be
  1236. * ``good enough'' for the majority of memory.
  1237. */
  1238. int soft_offline_page(struct page *page, int flags)
  1239. {
  1240. int ret;
  1241. unsigned long pfn = page_to_pfn(page);
  1242. if (PageHuge(page))
  1243. return soft_offline_huge_page(page, flags);
  1244. ret = get_any_page(page, pfn, flags);
  1245. if (ret < 0)
  1246. return ret;
  1247. if (ret == 0)
  1248. goto done;
  1249. /*
  1250. * Page cache page we can handle?
  1251. */
  1252. if (!PageLRU(page)) {
  1253. /*
  1254. * Try to free it.
  1255. */
  1256. put_page(page);
  1257. shake_page(page, 1);
  1258. /*
  1259. * Did it turn free?
  1260. */
  1261. ret = get_any_page(page, pfn, 0);
  1262. if (ret < 0)
  1263. return ret;
  1264. if (ret == 0)
  1265. goto done;
  1266. }
  1267. if (!PageLRU(page)) {
  1268. pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
  1269. pfn, page->flags);
  1270. return -EIO;
  1271. }
  1272. lock_page(page);
  1273. wait_on_page_writeback(page);
  1274. /*
  1275. * Synchronized using the page lock with memory_failure()
  1276. */
  1277. if (PageHWPoison(page)) {
  1278. unlock_page(page);
  1279. put_page(page);
  1280. pr_info("soft offline: %#lx page already poisoned\n", pfn);
  1281. return -EBUSY;
  1282. }
  1283. /*
  1284. * Try to invalidate first. This should work for
  1285. * non dirty unmapped page cache pages.
  1286. */
  1287. ret = invalidate_inode_page(page);
  1288. unlock_page(page);
  1289. /*
  1290. * RED-PEN: it would be better to keep it isolated here, but we
  1291. * would need to fix the isolation locking first.
  1292. */
  1293. if (ret == 1) {
  1294. put_page(page);
  1295. ret = 0;
  1296. pr_info("soft_offline: %#lx: invalidated\n", pfn);
  1297. goto done;
  1298. }
  1299. /*
  1300. * Simple invalidation didn't work.
  1301. * Try to migrate to a new page instead. migrate.c
  1302. * handles a large number of cases for us.
  1303. */
  1304. ret = isolate_lru_page(page);
  1305. /*
  1306. * Drop the page reference that came from get_any_page();
  1307. * a successful isolate_lru_page() already took another one.
  1308. */
  1309. put_page(page);
  1310. if (!ret) {
  1311. LIST_HEAD(pagelist);
  1312. inc_zone_page_state(page, NR_ISOLATED_ANON +
  1313. page_is_file_cache(page));
  1314. list_add(&page->lru, &pagelist);
  1315. ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
  1316. 0, true);
  1317. if (ret) {
  1318. putback_lru_pages(&pagelist);
  1319. pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
  1320. pfn, ret, page->flags);
  1321. if (ret > 0)
  1322. ret = -EIO;
  1323. }
  1324. } else {
  1325. pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
  1326. pfn, ret, page_count(page), page->flags);
  1327. }
  1328. if (ret)
  1329. return ret;
  1330. done:
  1331. atomic_long_add(1, &mce_bad_pages);
  1332. SetPageHWPoison(page);
  1333. /* keep elevated page count for bad page */
  1334. return ret;
  1335. }