1
0
Fork 0

x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks

We now switch to the kernel stack when a machine check interrupts
during user mode.  This means that we can perform recovery actions
in the tail of do_machine_check()

Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
hifive-unleashed-5.1
Luck, Tony 2015-01-05 16:44:42 -08:00 committed by Andy Lutomirski
parent bced35b65a
commit d4812e169d
4 changed files with 26 additions and 94 deletions

View File

@ -190,7 +190,6 @@ enum mcp_flags {
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
int mce_notify_irq(void);
void mce_notify_process(void);
DECLARE_PER_CPU(struct mce, injectm);

View File

@ -75,7 +75,6 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
@ -100,7 +99,6 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
@ -140,7 +138,7 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
_TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
/* flags to check in __switch_to() */

View File

@ -1003,51 +1003,6 @@ static void mce_clear_state(unsigned long *toclear)
}
}
/*
* Need to save faulting physical address associated with a process
* in the machine check handler some place where we can grab it back
* later in mce_notify_process()
*/
#define MCE_INFO_MAX 16
struct mce_info {
atomic_t inuse;
struct task_struct *t;
__u64 paddr;
int restartable;
} mce_info[MCE_INFO_MAX];
static void mce_save_info(__u64 addr, int c)
{
struct mce_info *mi;
for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
mi->t = current;
mi->paddr = addr;
mi->restartable = c;
return;
}
}
mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}
static struct mce_info *mce_find_info(void)
{
struct mce_info *mi;
for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
if (atomic_read(&mi->inuse) && mi->t == current)
return mi;
return NULL;
}
static void mce_clear_info(struct mce_info *mi)
{
atomic_set(&mi->inuse, 0);
}
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@ -1086,6 +1041,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
u64 recover_paddr = ~0ull;
int flags = MF_ACTION_REQUIRED;
prev_state = ist_enter(regs);
@ -1207,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (no_way_out)
mce_panic("Fatal machine check on current CPU", &m, msg);
if (worst == MCE_AR_SEVERITY) {
/* schedule action before return to userland */
mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
set_thread_flag(TIF_MCE_NOTIFY);
recover_paddr = m.addr;
if (!(m.mcgstatus & MCG_STATUS_RIPV))
flags |= MF_MUST_KILL;
} else if (kill_it) {
force_sig(SIGBUS, current);
}
@ -1220,6 +1177,26 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
sync_core();
if (recover_paddr == ~0ull)
goto done;
pr_err("Uncorrected hardware memory error in user-access at %llx",
recover_paddr);
/*
* We must call memory_failure() here even if the current process is
* doomed. We still need to mark the page as poisoned and alert any
* other users of the page.
*/
ist_begin_non_atomic(regs);
local_irq_enable();
if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
pr_err("Memory error not recovered");
force_sig(SIGBUS, current);
}
local_irq_disable();
ist_end_non_atomic();
done:
ist_exit(regs, prev_state);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@ -1237,42 +1214,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
}
#endif
/*
* Called in process context that interrupted by MCE and marked with
* TIF_MCE_NOTIFY, just before returning to erroneous userland.
* This code is allowed to sleep.
* Attempt possible recovery such as calling the high level VM handler to
* process any corrupted pages, and kill/signal current process if required.
* Action required errors are handled here.
*/
void mce_notify_process(void)
{
unsigned long pfn;
struct mce_info *mi = mce_find_info();
int flags = MF_ACTION_REQUIRED;
if (!mi)
mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
pfn = mi->paddr >> PAGE_SHIFT;
clear_thread_flag(TIF_MCE_NOTIFY);
pr_err("Uncorrected hardware memory error in user-access at %llx",
mi->paddr);
/*
* We must call memory_failure() here even if the current process is
* doomed. We still need to mark the page as poisoned and alert any
* other users of the page.
*/
if (!mi->restartable)
flags |= MF_MUST_KILL;
if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
pr_err("Memory error not recovered");
force_sig(SIGBUS, current);
}
mce_clear_info(mi);
}
/*
* Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check()

View File

@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
user_exit();
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
mce_notify_process();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
if (thread_info_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);