nmi.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566
  1. /*
  2. * Copyright (C) 1991, 1992 Linus Torvalds
  3. * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  4. * Copyright (C) 2011 Don Zickus Red Hat, Inc.
  5. *
  6. * Pentium III FXSR, SSE support
  7. * Gareth Hughes <gareth@valinux.com>, May 2000
  8. */
  9. /*
  10. * Handle hardware traps and faults.
  11. */
  12. #include <linux/spinlock.h>
  13. #include <linux/kprobes.h>
  14. #include <linux/kdebug.h>
  15. #include <linux/nmi.h>
  16. #include <linux/debugfs.h>
  17. #include <linux/delay.h>
  18. #include <linux/hardirq.h>
  19. #include <linux/ratelimit.h>
  20. #include <linux/slab.h>
  21. #include <linux/export.h>
  22. #if defined(CONFIG_EDAC)
  23. #include <linux/edac.h>
  24. #endif
  25. #include <linux/atomic.h>
  26. #include <asm/traps.h>
  27. #include <asm/mach_traps.h>
  28. #include <asm/nmi.h>
  29. #include <asm/x86_init.h>
  30. #include <asm/reboot.h>
  31. #include <asm/cache.h>
  32. #define CREATE_TRACE_POINTS
  33. #include <trace/events/nmi.h>
  34. struct nmi_desc {
  35. spinlock_t lock;
  36. struct list_head head;
  37. };
  38. static struct nmi_desc nmi_desc[NMI_MAX] =
  39. {
  40. {
  41. .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
  42. .head = LIST_HEAD_INIT(nmi_desc[0].head),
  43. },
  44. {
  45. .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
  46. .head = LIST_HEAD_INIT(nmi_desc[1].head),
  47. },
  48. {
  49. .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
  50. .head = LIST_HEAD_INIT(nmi_desc[2].head),
  51. },
  52. {
  53. .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
  54. .head = LIST_HEAD_INIT(nmi_desc[3].head),
  55. },
  56. };
  /* Counters describing how NMIs were accounted for (kept per CPU). */
  struct nmi_stats {
  	unsigned int normal;	/* events claimed by NMI_LOCAL handlers */
  	unsigned int unknown;	/* NMIs that reached the unknown-NMI path */
  	unsigned int external;	/* NMIs flagged by the NMI reason port */
  	unsigned int swallow;	/* back-to-back NMIs that were swallowed */
  };
  63. static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
  64. static int ignore_nmis __read_mostly;
  65. int unknown_nmi_panic;
  /*
   * Serializes access to the NMI reason port (0x61) so that it is never
   * touched by two CPUs at once. Only to be taken from the NMI handler.
   */
  static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
  71. static int __init setup_unknown_nmi_panic(char *str)
  72. {
  73. unknown_nmi_panic = 1;
  74. return 1;
  75. }
  76. __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
  /* Map an NMI type to its descriptor; the caller must pass type < NMI_MAX. */
  #define nmi_to_desc(type) (&nmi_desc[(type)])
  78. static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
  79. static int __init nmi_warning_debugfs(void)
  80. {
  81. debugfs_create_u64("nmi_longest_ns", 0644,
  82. arch_debugfs_dir, &nmi_longest_ns);
  83. return 0;
  84. }
  85. fs_initcall(nmi_warning_debugfs);
  86. static void nmi_max_handler(struct irq_work *w)
  87. {
  88. struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
  89. int remainder_ns, decimal_msecs;
  90. u64 whole_msecs = ACCESS_ONCE(a->max_duration);
  91. remainder_ns = do_div(whole_msecs, (1000 * 1000));
  92. decimal_msecs = remainder_ns / 1000;
  93. printk_ratelimited(KERN_INFO
  94. "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
  95. a->handler, whole_msecs, decimal_msecs);
  96. }
  97. static int nmi_handle(unsigned int type, struct pt_regs *regs)
  98. {
  99. struct nmi_desc *desc = nmi_to_desc(type);
  100. struct nmiaction *a;
  101. int handled=0;
  102. rcu_read_lock();
  103. /*
  104. * NMIs are edge-triggered, which means if you have enough
  105. * of them concurrently, you can lose some because only one
  106. * can be latched at any given time. Walk the whole list
  107. * to handle those situations.
  108. */
  109. list_for_each_entry_rcu(a, &desc->head, list) {
  110. int thishandled;
  111. u64 delta;
  112. delta = sched_clock();
  113. thishandled = a->handler(type, regs);
  114. handled += thishandled;
  115. delta = sched_clock() - delta;
  116. trace_nmi_handler(a->handler, (int)delta, thishandled);
  117. if (delta < nmi_longest_ns || delta < a->max_duration)
  118. continue;
  119. a->max_duration = delta;
  120. irq_work_queue(&a->irq_work);
  121. }
  122. rcu_read_unlock();
  123. /* return total number of NMI events handled */
  124. return handled;
  125. }
  126. NOKPROBE_SYMBOL(nmi_handle);
  127. int __register_nmi_handler(unsigned int type, struct nmiaction *action)
  128. {
  129. struct nmi_desc *desc = nmi_to_desc(type);
  130. unsigned long flags;
  131. if (!action->handler)
  132. return -EINVAL;
  133. init_irq_work(&action->irq_work, nmi_max_handler);
  134. spin_lock_irqsave(&desc->lock, flags);
  135. /*
  136. * most handlers of type NMI_UNKNOWN never return because
  137. * they just assume the NMI is theirs. Just a sanity check
  138. * to manage expectations
  139. */
  140. WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
  141. WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
  142. WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
  143. /*
  144. * some handlers need to be executed first otherwise a fake
  145. * event confuses some handlers (kdump uses this flag)
  146. */
  147. if (action->flags & NMI_FLAG_FIRST)
  148. list_add_rcu(&action->list, &desc->head);
  149. else
  150. list_add_tail_rcu(&action->list, &desc->head);
  151. spin_unlock_irqrestore(&desc->lock, flags);
  152. return 0;
  153. }
  154. EXPORT_SYMBOL(__register_nmi_handler);
  155. void unregister_nmi_handler(unsigned int type, const char *name)
  156. {
  157. struct nmi_desc *desc = nmi_to_desc(type);
  158. struct nmiaction *n;
  159. unsigned long flags;
  160. spin_lock_irqsave(&desc->lock, flags);
  161. list_for_each_entry_rcu(n, &desc->head, list) {
  162. /*
  163. * the name passed in to describe the nmi handler
  164. * is used as the lookup key
  165. */
  166. if (!strcmp(n->name, name)) {
  167. WARN(in_nmi(),
  168. "Trying to free NMI (%s) from NMI context!\n", n->name);
  169. list_del_rcu(&n->list);
  170. break;
  171. }
  172. }
  173. spin_unlock_irqrestore(&desc->lock, flags);
  174. synchronize_rcu();
  175. }
  176. EXPORT_SYMBOL_GPL(unregister_nmi_handler);
  177. static void
  178. pci_serr_error(unsigned char reason, struct pt_regs *regs)
  179. {
  180. /* check to see if anyone registered against these types of errors */
  181. if (nmi_handle(NMI_SERR, regs))
  182. return;
  183. pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
  184. reason, smp_processor_id());
  185. /*
  186. * On some machines, PCI SERR line is used to report memory
  187. * errors. EDAC makes use of it.
  188. */
  189. #if defined(CONFIG_EDAC)
  190. if (edac_handler_set()) {
  191. edac_atomic_assert_error();
  192. return;
  193. }
  194. #endif
  195. if (panic_on_unrecovered_nmi)
  196. nmi_panic(regs, "NMI: Not continuing");
  197. pr_emerg("Dazed and confused, but trying to continue\n");
  198. /* Clear and disable the PCI SERR error line. */
  199. reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
  200. outb(reason, NMI_REASON_PORT);
  201. }
  202. NOKPROBE_SYMBOL(pci_serr_error);
  203. static void
  204. io_check_error(unsigned char reason, struct pt_regs *regs)
  205. {
  206. unsigned long i;
  207. /* check to see if anyone registered against these types of errors */
  208. if (nmi_handle(NMI_IO_CHECK, regs))
  209. return;
  210. pr_emerg(
  211. "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
  212. reason, smp_processor_id());
  213. show_regs(regs);
  214. if (panic_on_io_nmi) {
  215. nmi_panic(regs, "NMI IOCK error: Not continuing");
  216. /*
  217. * If we end up here, it means we have received an NMI while
  218. * processing panic(). Simply return without delaying and
  219. * re-enabling NMIs.
  220. */
  221. return;
  222. }
  223. /* Re-enable the IOCK line, wait for a few seconds */
  224. reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
  225. outb(reason, NMI_REASON_PORT);
  226. i = 20000;
  227. while (--i) {
  228. touch_nmi_watchdog();
  229. udelay(100);
  230. }
  231. reason &= ~NMI_REASON_CLEAR_IOCHK;
  232. outb(reason, NMI_REASON_PORT);
  233. }
  234. NOKPROBE_SYMBOL(io_check_error);
  235. static void
  236. unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
  237. {
  238. int handled;
  239. /*
  240. * Use 'false' as back-to-back NMIs are dealt with one level up.
  241. * Of course this makes having multiple 'unknown' handlers useless
  242. * as only the first one is ever run (unless it can actually determine
  243. * if it caused the NMI)
  244. */
  245. handled = nmi_handle(NMI_UNKNOWN, regs);
  246. if (handled) {
  247. __this_cpu_add(nmi_stats.unknown, handled);
  248. return;
  249. }
  250. __this_cpu_add(nmi_stats.unknown, 1);
  251. pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
  252. reason, smp_processor_id());
  253. pr_emerg("Do you have a strange power saving mode enabled?\n");
  254. if (unknown_nmi_panic || panic_on_unrecovered_nmi)
  255. nmi_panic(regs, "NMI: Not continuing");
  256. pr_emerg("Dazed and confused, but trying to continue\n");
  257. }
  258. NOKPROBE_SYMBOL(unknown_nmi_error);
  259. static DEFINE_PER_CPU(bool, swallow_nmi);
  260. static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
  261. static void default_do_nmi(struct pt_regs *regs)
  262. {
  263. unsigned char reason = 0;
  264. int handled;
  265. bool b2b = false;
  266. /*
  267. * CPU-specific NMI must be processed before non-CPU-specific
  268. * NMI, otherwise we may lose it, because the CPU-specific
  269. * NMI can not be detected/processed on other CPUs.
  270. */
  271. /*
  272. * Back-to-back NMIs are interesting because they can either
  273. * be two NMI or more than two NMIs (any thing over two is dropped
  274. * due to NMI being edge-triggered). If this is the second half
  275. * of the back-to-back NMI, assume we dropped things and process
  276. * more handlers. Otherwise reset the 'swallow' NMI behaviour
  277. */
  278. if (regs->ip == __this_cpu_read(last_nmi_rip))
  279. b2b = true;
  280. else
  281. __this_cpu_write(swallow_nmi, false);
  282. __this_cpu_write(last_nmi_rip, regs->ip);
  283. handled = nmi_handle(NMI_LOCAL, regs);
  284. __this_cpu_add(nmi_stats.normal, handled);
  285. if (handled) {
  286. /*
  287. * There are cases when a NMI handler handles multiple
  288. * events in the current NMI. One of these events may
  289. * be queued for in the next NMI. Because the event is
  290. * already handled, the next NMI will result in an unknown
  291. * NMI. Instead lets flag this for a potential NMI to
  292. * swallow.
  293. */
  294. if (handled > 1)
  295. __this_cpu_write(swallow_nmi, true);
  296. return;
  297. }
  298. /*
  299. * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
  300. *
  301. * Another CPU may be processing panic routines while holding
  302. * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
  303. * and if so, call its callback directly. If there is no CPU preparing
  304. * crash dump, we simply loop here.
  305. */
  306. while (!raw_spin_trylock(&nmi_reason_lock)) {
  307. run_crash_ipi_callback(regs);
  308. cpu_relax();
  309. }
  310. reason = x86_platform.get_nmi_reason();
  311. if (reason & NMI_REASON_MASK) {
  312. if (reason & NMI_REASON_SERR)
  313. pci_serr_error(reason, regs);
  314. else if (reason & NMI_REASON_IOCHK)
  315. io_check_error(reason, regs);
  316. #ifdef CONFIG_X86_32
  317. /*
  318. * Reassert NMI in case it became active
  319. * meanwhile as it's edge-triggered:
  320. */
  321. reassert_nmi();
  322. #endif
  323. __this_cpu_add(nmi_stats.external, 1);
  324. raw_spin_unlock(&nmi_reason_lock);
  325. return;
  326. }
  327. raw_spin_unlock(&nmi_reason_lock);
  328. /*
  329. * Only one NMI can be latched at a time. To handle
  330. * this we may process multiple nmi handlers at once to
  331. * cover the case where an NMI is dropped. The downside
  332. * to this approach is we may process an NMI prematurely,
  333. * while its real NMI is sitting latched. This will cause
  334. * an unknown NMI on the next run of the NMI processing.
  335. *
  336. * We tried to flag that condition above, by setting the
  337. * swallow_nmi flag when we process more than one event.
  338. * This condition is also only present on the second half
  339. * of a back-to-back NMI, so we flag that condition too.
  340. *
  341. * If both are true, we assume we already processed this
  342. * NMI previously and we swallow it. Otherwise we reset
  343. * the logic.
  344. *
  345. * There are scenarios where we may accidentally swallow
  346. * a 'real' unknown NMI. For example, while processing
  347. * a perf NMI another perf NMI comes in along with a
  348. * 'real' unknown NMI. These two NMIs get combined into
  349. * one (as descibed above). When the next NMI gets
  350. * processed, it will be flagged by perf as handled, but
  351. * noone will know that there was a 'real' unknown NMI sent
  352. * also. As a result it gets swallowed. Or if the first
  353. * perf NMI returns two events handled then the second
  354. * NMI will get eaten by the logic below, again losing a
  355. * 'real' unknown NMI. But this is the best we can do
  356. * for now.
  357. */
  358. if (b2b && __this_cpu_read(swallow_nmi))
  359. __this_cpu_add(nmi_stats.swallow, 1);
  360. else
  361. unknown_nmi_error(reason, regs);
  362. }
  363. NOKPROBE_SYMBOL(default_do_nmi);
  364. /*
  365. * NMIs can page fault or hit breakpoints which will cause it to lose
  366. * its NMI context with the CPU when the breakpoint or page fault does an IRET.
  367. *
  368. * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
  369. * NMI processing. On x86_64, the asm glue protects us from nested NMIs
  370. * if the outer NMI came from kernel mode, but we can still nest if the
  371. * outer NMI came from user mode.
  372. *
  373. * To handle these nested NMIs, we have three states:
  374. *
  375. * 1) not running
  376. * 2) executing
  377. * 3) latched
  378. *
  379. * When no NMI is in progress, it is in the "not running" state.
  380. * When an NMI comes in, it goes into the "executing" state.
  381. * Normally, if another NMI is triggered, it does not interrupt
  382. * the running NMI and the HW will simply latch it so that when
  383. * the first NMI finishes, it will restart the second NMI.
  384. * (Note, the latch is binary, thus multiple NMIs triggering,
  385. * when one is running, are ignored. Only one NMI is restarted.)
  386. *
  387. * If an NMI executes an iret, another NMI can preempt it. We do not
  388. * want to allow this new NMI to run, but we want to execute it when the
  389. * first one finishes. We set the state to "latched", and the exit of
  390. * the first NMI will perform a dec_return, if the result is zero
  391. * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
  392. * dec_return would have set the state to NMI_EXECUTING (what we want it
  393. * to be when we are running). In this case, we simply jump back to
  394. * rerun the NMI handler again, and restart the 'latched' NMI.
  395. *
  396. * No trap (breakpoint or page fault) should be hit before nmi_restart,
  397. * thus there is no race between the first check of state for NOT_RUNNING
  398. * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
  399. * at this point.
  400. *
  401. * In case the NMI takes a page fault, we need to save off the CR2
  402. * because the NMI could have preempted another page fault and corrupt
  403. * the CR2 that is about to be read. As nested NMIs must be restarted
  404. * and they can not take breakpoints or page faults, the update of the
  405. * CR2 must be done before converting the nmi state back to NOT_RUNNING.
  406. * Otherwise, there would be a race of another nested NMI coming in
  407. * after setting state to NOT_RUNNING but before updating the nmi_cr2.
  408. */
  /*
   * Per-CPU NMI nesting state. The numeric order matters: do_nmi() leaves
   * by decrementing the state, so LATCHED drops to EXECUTING (rerun the
   * handler) and EXECUTING drops to NOT_RUNNING (done).
   */
  enum nmi_states {
  	NMI_NOT_RUNNING	= 0,	/* no NMI in progress */
  	NMI_EXECUTING	= 1,	/* an NMI is being handled */
  	NMI_LATCHED	= 2,	/* another NMI arrived while one was executing */
  };
  414. static DEFINE_PER_CPU(enum nmi_states, nmi_state);
  415. static DEFINE_PER_CPU(unsigned long, nmi_cr2);
  416. #ifdef CONFIG_X86_64
  417. /*
  418. * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
  419. * some care, the inner breakpoint will clobber the outer breakpoint's
  420. * stack.
  421. *
  422. * If a breakpoint is being processed, and the debug stack is being
  423. * used, if an NMI comes in and also hits a breakpoint, the stack
  424. * pointer will be set to the same fixed address as the breakpoint that
  425. * was interrupted, causing that stack to be corrupted. To handle this
  426. * case, check if the stack that was interrupted is the debug stack, and
  427. * if so, change the IDT so that new breakpoints will use the current
  428. * stack and not switch to the fixed address. On return of the NMI,
  429. * switch back to the original IDT.
  430. */
  431. static DEFINE_PER_CPU(int, update_debug_stack);
  432. #endif
  433. dotraplinkage notrace void
  434. do_nmi(struct pt_regs *regs, long error_code)
  435. {
  436. if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
  437. this_cpu_write(nmi_state, NMI_LATCHED);
  438. return;
  439. }
  440. this_cpu_write(nmi_state, NMI_EXECUTING);
  441. this_cpu_write(nmi_cr2, read_cr2());
  442. nmi_restart:
  443. #ifdef CONFIG_X86_64
  444. /*
  445. * If we interrupted a breakpoint, it is possible that
  446. * the nmi handler will have breakpoints too. We need to
  447. * change the IDT such that breakpoints that happen here
  448. * continue to use the NMI stack.
  449. */
  450. if (unlikely(is_debug_stack(regs->sp))) {
  451. debug_stack_set_zero();
  452. this_cpu_write(update_debug_stack, 1);
  453. }
  454. #endif
  455. nmi_enter();
  456. inc_irq_stat(__nmi_count);
  457. if (!ignore_nmis)
  458. default_do_nmi(regs);
  459. nmi_exit();
  460. #ifdef CONFIG_X86_64
  461. if (unlikely(this_cpu_read(update_debug_stack))) {
  462. debug_stack_reset();
  463. this_cpu_write(update_debug_stack, 0);
  464. }
  465. #endif
  466. if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
  467. write_cr2(this_cpu_read(nmi_cr2));
  468. if (this_cpu_dec_return(nmi_state))
  469. goto nmi_restart;
  470. }
  471. NOKPROBE_SYMBOL(do_nmi);
  472. void stop_nmi(void)
  473. {
  474. ignore_nmis++;
  475. }
  476. void restart_nmi(void)
  477. {
  478. ignore_nmis--;
  479. }
  480. /* reset the back-to-back NMI logic */
  481. void local_touch_nmi(void)
  482. {
  483. __this_cpu_write(last_nmi_rip, 0);
  484. }
  485. EXPORT_SYMBOL_GPL(local_touch_nmi);