GHES: Seriously speedup and cleanup NMI handler (Jiri Kosina and Borislav Petkov)

This is the result of us seeing this during boot
 
 [   24.332560] INFO: NMI handler (ghes_notify_nmi) took too long to run: 3.265 msecs
 [   24.332567] INFO: NMI handler (ghes_notify_nmi) took too long to run: 5.946 msecs
 [   24.332568] INFO: NMI handler (ghes_notify_nmi) took too long to run: 5.948 msecs
 
 and a report of people running perf stat and the machine softlocking.
 "hest_disable" was helping in this case, which made us look at that
 particular NMI handler. It was grabbing a lock each time it is run and
 on each CPU. But this is not needed as the GHES sources are global and
 they need only a single reader.
 
 This patchset does that and cleans up the handler in the process.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQIcBAABAgAGBQJVRzqjAAoJEBLB8Bhh3lVKFQUQAKmPPaWp2qPUkRHtrdFfae0n
 Um9PSAHlvO8oHPrr+5j+CEVX1gQ1qUFMuxSVuBYYhmMZnsAVCwKX8gOi3slZmElo
 L0MoqxygP2v3Rc79Jk8KEkov+80zzp46VoB5Yh0mTfzSejeS+c1owwS0RzVIpda3
 Zp1cAThTVsCdDmmWGhnWfc8T/wniPc2zbbVhtjivO/wwSu7ArWUR5/03pbxzex8M
 3xnJY29USB62xO5OJf0mxAIhTqv4a7NSQ6NziBrI2Mt4inHj0oSR7KLVFpeSx4mX
 LLo8dWfDpMIbwd2sZYg5fG6Kz2i3Cc5I252V4j7MI+srvvZzZPBr1aB8qNyQVGPF
 grrQKQuD2Az0WZRo2gK2LvxvlHfmJyOJMaFPW+RMnARbtKMmmAIwaN4+zmPGHoH8
 aQK7cKUyb1JxICpy9iAZjXMvqxc5zbhYra0G+zuLKruhEvmLrwibphjI82BNdaBZ
 rhkK+cw47lPgwKVNRWKjpNGGziwqU2cBlmc50k9OyQhhv5HF2TeYyBRryykOVmLV
 DpsUxO4rCj0YEioyoJEruBO1Ks167HGrg5sbSemO/YeM7wLgr7hMfVNlGM4DI68X
 b3Ie81/7qD8d3nQZ/gkk8ex1D0QhS0FWZthbNKpvkX+Zc/ELbGhfgHxX55HmfGuD
 DUmvP2oXlALAIPiJAYVP
 =Qj+0
 -----END PGP SIGNATURE-----

Merge tag 'ras_for_4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into acpi-apei

Pull GHES changes for 4.2 from Borislav Petkov:

"GHES: Seriously speedup and cleanup NMI handler (Jiri Kosina and Borislav Petkov)

This is the result of us seeing this during boot

[   24.332560] INFO: NMI handler (ghes_notify_nmi) took too long to run: 3.265 msecs
[   24.332567] INFO: NMI handler (ghes_notify_nmi) took too long to run: 5.946 msecs
[   24.332568] INFO: NMI handler (ghes_notify_nmi) took too long to run: 5.948 msecs

and a report of people running perf stat and the machine softlocking.
"hest_disable" was helping in this case, which made us look at that
particular NMI handler. It was grabbing a lock each time it is run and
on each CPU. But this is not needed as the GHES sources are global and
they need only a single reader.

This patchset does that and cleans up the handler in the process."

* tag 'ras_for_4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  GHES: Make NMI handler have a single reader
  GHES: Elliminate double-loop in the NMI handler
  GHES: Panic right after detection
  GHES: Carve out the panic functionality
  GHES: Carve out error queueing in a separate function
This commit is contained in:
Rafael J. Wysocki 2015-05-04 22:34:58 +02:00
commit 20f34165a9

View file

/*
 * NOTE(review): the section below is a web-rendered diff hunk whose +/-
 * prefixes and indentation were stripped, so removed (OLD) and added (NEW)
 * lines appear interleaved. Do not read it as compilable source.
 */
@ -729,10 +729,10 @@ static struct llist_head ghes_estatus_llist;
static struct irq_work ghes_proc_irq_work;
/*
* NMI may be triggered on any CPU, so ghes_nmi_lock is used for
* mutual exclusion.
* NMI may be triggered on any CPU, so ghes_in_nmi is used for
* having only one concurrent reader.
*/
/* OLD (removed by this patch): raw spinlock taken on every NMI, per CPU */
static DEFINE_RAW_SPINLOCK(ghes_nmi_lock);
/* NEW: single-reader gate; one cheap atomic op replaces the lock */
static atomic_t ghes_in_nmi = ATOMIC_INIT(0);
static LIST_HEAD(ghes_nmi);
@ -797,73 +797,75 @@ static void ghes_print_queued_estatus(void)
}
}
/* Save estatus for further processing in IRQ context */
/*
 * NEW helper carved out of ghes_notify_nmi() by this patch. Copies the
 * hardware error status block into an NMI-safe lock-less list so the bulk
 * of the processing can happen later in IRQ context.
 * (Rendered-diff note: kernel indentation was stripped by the web view.)
 */
static void __process_error(struct ghes *ghes)
{
#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
u32 len, node_len;
struct ghes_estatus_node *estatus_node;
struct acpi_hest_generic_status *estatus;
/* Already reported recently — nothing new to queue */
if (ghes_estatus_cached(ghes->estatus))
return;
len = cper_estatus_len(ghes->estatus);
node_len = GHES_ESTATUS_NODE_LEN(len);
/* Allocation from a pre-populated pool is NMI-safe; on failure the
 * error is silently dropped (we cannot sleep or log here). */
estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
if (!estatus_node)
return;
estatus_node->ghes = ghes;
estatus_node->generic = ghes->generic;
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
memcpy(estatus, ghes->estatus, len);
/* llist_add is safe from NMI context; consumed by the irq_work handler */
llist_add(&estatus_node->llnode, &ghes_estatus_llist);
#endif
}
/*
 * NEW helper carved out of ghes_notify_nmi() by this patch: report a fatal
 * hardware error and panic. Ends in panic(), so it does not return.
 */
static void __ghes_panic(struct ghes *ghes)
{
oops_begin();
/* Flush any errors queued earlier so they make it into the log first */
ghes_print_queued_estatus();
__ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
/* reboot to log the error! */
if (panic_timeout == 0)
panic_timeout = ghes_panic_timeout;
panic("Fatal hardware error!");
}
/*
 * NOTE(review): this span is a rendered diff with +/- prefixes stripped —
 * the OLD (lock-based, double-loop) and NEW (atomic-gate, single-loop)
 * implementations of ghes_notify_nmi() are interleaved below. Lines are
 * tagged OLD/NEW accordingly; the function as shown does not compile.
 */
static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
{
/* OLD locals: tracked the max-severity source for a deferred panic */
struct ghes *ghes, *ghes_global = NULL;
int sev, sev_global = -1;
int ret = NMI_DONE;
/* NEW locals: no global-severity tracking needed (panic happens inline) */
struct ghes *ghes;
int sev, ret = NMI_DONE;
/* NEW: single-reader gate — if another CPU is already in the handler,
 * bail out immediately instead of spinning on a lock */
if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
return ret;
/* OLD: every CPU serialized on this raw spinlock on each NMI */
raw_spin_lock(&ghes_nmi_lock);
list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
if (ghes_read_estatus(ghes, 1)) {
ghes_clear_estatus(ghes);
continue;
}
sev = ghes_severity(ghes->estatus->error_severity);
/* OLD: remember the worst source, panic after the loop */
if (sev > sev_global) {
sev_global = sev;
ghes_global = ghes;
}
/* NEW: panic right at detection — no need to finish the scan */
if (sev >= GHES_SEV_PANIC)
__ghes_panic(ghes);
if (!(ghes->flags & GHES_TO_CLEAR))
continue;
/* NEW: queueing moved into the __process_error() helper */
__process_error(ghes);
ghes_clear_estatus(ghes);
/* NEW: report handled only when at least one source had an error */
ret = NMI_HANDLED;
}
/* OLD tail: deferred panic + second pass over the sources */
if (ret == NMI_DONE)
goto out;
if (sev_global >= GHES_SEV_PANIC) {
oops_begin();
ghes_print_queued_estatus();
__ghes_print_estatus(KERN_EMERG, ghes_global->generic,
ghes_global->estatus);
/* reboot to log the error! */
if (panic_timeout == 0)
panic_timeout = ghes_panic_timeout;
panic("Fatal hardware error!");
}
/* OLD: second loop doing inline what __process_error() now does */
list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
u32 len, node_len;
struct ghes_estatus_node *estatus_node;
struct acpi_hest_generic_status *estatus;
#endif
if (!(ghes->flags & GHES_TO_CLEAR))
continue;
#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
if (ghes_estatus_cached(ghes->estatus))
goto next;
/* Save estatus for further processing in IRQ context */
len = cper_estatus_len(ghes->estatus);
node_len = GHES_ESTATUS_NODE_LEN(len);
estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
node_len);
if (estatus_node) {
estatus_node->ghes = ghes;
estatus_node->generic = ghes->generic;
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
memcpy(estatus, ghes->estatus, len);
llist_add(&estatus_node->llnode, &ghes_estatus_llist);
}
next:
#endif
ghes_clear_estatus(ghes);
}
/* Common: kick deferred processing of whatever was queued above */
#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
irq_work_queue(&ghes_proc_irq_work);
#endif
/* OLD exit: drop the spinlock */
out:
raw_spin_unlock(&ghes_nmi_lock);
/* NEW exit: release the single-reader gate */
atomic_dec(&ghes_in_nmi);
return ret;
}