From 1d8ca3be86ebc6a38dad8236f45c7a9c61681e78 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 6 Nov 2018 15:12:29 -0500 Subject: [PATCH 01/29] x86/mm/fault: Allow stack access below %rsp The current x86 page fault handler allows stack access below the stack pointer if it is no more than 64k+256 bytes. Any access beyond the 64k+ limit causes a segmentation fault. The gcc -fstack-check option generates code that probes the stack for large stack allocations to see whether the stack is accessible. Newer gcc versions do this while updating %rsp at the same time; older ones such as gcc4 do not. As a result, an application compiled with an old gcc and the -fstack-check option may fail to start at all: $ cat test.c #include <stdio.h> int main() { char tmp[1024*128]; printf("### ok\n"); return 0; } $ gcc -fstack-check -g -o test test.c $ ./test Segmentation fault Such old binaries worked on older kernels, where expand_stack() happened to be called before the check, but they fail on newer kernels. Besides, the 64k+ limit check is crude and will not catch many kinds of userspace misbehavior anyway; the kernel is not the right place for this kind of test, and we should leave it to userspace instrumentation tools. Remove the 64k+ limit check and just let expand_stack() decide whether a segmentation fault should happen, for example when the RLIMIT_STACK limit is exceeded. Signed-off-by: Waiman Long Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1541535149-31963-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 71d4b9d4d43f..29525cf21100 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1380,18 +1380,6 @@ retry: bad_area(regs, sw_error_code, address); return; } - if (sw_error_code & X86_PF_USER) { - /* - * Accessing the stack below %sp is always a bug. - * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535, $31" pushes - * 32 pointers and then decrements %sp by 65535.) - */ - if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { - bad_area(regs, sw_error_code, address); - return; - } - } if (unlikely(expand_stack(vma, address))) { bad_area(regs, sw_error_code, address); return;
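With the 64k heuristic gone, the only remaining bound on how far the stack may grow is the one expand_stack() enforces, i.e. RLIMIT_STACK. A minimal userspace sketch (illustrative only, not part of this patch; the file name is arbitrary) of how to inspect that limit:

$ cat stacklimit.c
/* Illustrative only: print the RLIMIT_STACK values that expand_stack()
 * consults when deciding whether a fault below %rsp grows the stack
 * or results in SIGSEGV. */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_STACK, &rl) != 0) {
		perror("getrlimit");
		return 1;
	}
	printf("soft limit: %llu bytes\n", (unsigned long long)rl.rlim_cur);
	printf("hard limit: %llu bytes\n", (unsigned long long)rl.rlim_max);
	return 0;
}

The point is that the soft RLIMIT_STACK value, not a hardcoded 64k cushion, is now what decides when stack growth fails.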
From 6344be608c039f3a787f1144c46fcb04c0f76561 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 Nov 2018 14:45:25 -0800 Subject: [PATCH 02/29] x86/fault: Check user_mode(regs) when avoiding an mmap_sem deadlock The fault-handling code that takes mmap_sem needs to avoid a deadlock that could occur if the kernel took a bad (OOPS-worthy) page fault on a user address while holding mmap_sem. This can only happen if the faulting instruction was in the kernel (i.e. !user_mode(regs)). Rather than checking the sw_error_code (which will have the USER bit set if the fault was a USER-permission access *or* if user_mode(regs)), just check user_mode(regs) directly. The old code would have malfunctioned if the kernel executed a bogus WRUSS instruction while holding mmap_sem. Fortunately, that is extremely unlikely in current kernels, which don't use WRUSS. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/4b89b542e8ceba9bd6abde2f386afed6d99244a9.1542667307.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 29525cf21100..8624cb7d8d65 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1344,13 +1344,10 @@ void do_user_addr_fault(struct pt_regs *regs, * Only do the expensive exception table search when we might be at * risk of a deadlock. This happens if we * 1. Failed to acquire mmap_sem, and - * 2. The access did not originate in userspace. Note: either the - * hardware or earlier page fault code may set X86_PF_USER - * in sw_error_code. + * 2. The access did not originate in userspace. */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if (!(sw_error_code & X86_PF_USER) && - !search_exception_tables(regs->ip)) { + if (!user_mode(regs) && !search_exception_tables(regs->ip)) { /* * Fault from code in kernel from * which we do not expect faults. From dae0a10593007d049ea71601357ac41d4f247ee9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 Nov 2018 14:45:27 -0800 Subject: [PATCH 03/29] x86/cpufeatures, x86/fault: Mark SMAP as disabled when configured out Add X86_FEATURE_SMAP to the disabled features mask as appropriate and use cpu_feature_enabled() in the fault code. This lets us get rid of a redundant IS_ENABLED(CONFIG_X86_SMAP). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/fe93332eded3d702f0b0b4cf83928d6830739ba3.1542667307.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/disabled-features.h | 8 +++++++- arch/x86/mm/fault.c | 5 +---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 33833d1909af..a5ea841cc6d2 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -16,6 +16,12 @@ # define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) #endif +#ifdef CONFIG_X86_SMAP +# define DISABLE_SMAP 0 +#else +# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) +#endif + #ifdef CONFIG_X86_INTEL_UMIP # define DISABLE_UMIP 0 #else @@ -68,7 +74,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_MPX) +#define DISABLED_MASK9 (DISABLE_MPX|DISABLE_SMAP) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8624cb7d8d65..39e39cd42097 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1150,10 +1150,7 @@ static int fault_in_kernel_space(unsigned long address) static inline bool smap_violation(int error_code, struct pt_regs *regs) { - if (!IS_ENABLED(CONFIG_X86_SMAP)) - return false; - - if (!static_cpu_has(X86_FEATURE_SMAP)) + if (!cpu_feature_enabled(X86_FEATURE_SMAP)) return false; if (error_code & X86_PF_USER) From a15781b536293edc32bf374233f3b8ad77c3f72b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 Nov 2018 14:45:28 -0800 Subject: [PATCH 04/29] x86/fault: Fold smap_violation() into do_user_addr_fault() smap_violation() has a single caller, and the
contents are a bit nonsensical. I'm going to fix it, but first let's fold it into its caller for ease of comprehension. In this particular case, the user_mode(regs) check is incorrect -- it will cause false positives in the case of a user-initiated kernel-privileged access. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/806c366f6ca861152398ce2c01744d59d9aceb6d.1542667307.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 39e39cd42097..9d092ab74f18 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1148,20 +1148,6 @@ static int fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } -static inline bool smap_violation(int error_code, struct pt_regs *regs) -{ - if (!cpu_feature_enabled(X86_FEATURE_SMAP)) - return false; - - if (error_code & X86_PF_USER) - return false; - - if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) - return false; - - return true; -} - /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that @@ -1249,10 +1235,13 @@ void do_user_addr_fault(struct pt_regs *regs, pgtable_bad(regs, hw_error_code, address); /* - * Check for invalid kernel (supervisor) access to user - * pages in the user address space. + * If SMAP is on, check for invalid kernel (supervisor) + * access to user pages in the user address space. */ - if (unlikely(smap_violation(hw_error_code, regs))) { + if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && + !(hw_error_code & X86_PF_USER) && + (user_mode(regs) || !(regs->flags & X86_EFLAGS_AC)))) + { bad_area_nosemaphore(regs, hw_error_code, address); return; } From e50928d7213e72ee95507221a89ed07d2bb6517b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 Nov 2018 14:45:29 -0800 Subject: [PATCH 05/29] x86/fault: Fix SMAP #PF handling buglet for implicit supervisor accesses Currently, if a user program somehow triggers an implicit supervisor access to a user address (e.g. if the kernel somehow sets LDTR to a user address), it will be incorrectly detected as a SMAP violation if AC is clear and SMAP is enabled. This is incorrect -- the error has nothing to do with SMAP. Fix the condition so that only accesses with the hardware USER bit set are diagnosed as SMAP violations. With the logic fixed, an implicit supervisor access to a user address will hit the code lower in the function that is intended to handle it even if SMAP is enabled. That logic is still a bit buggy, and later patches will clean it up. I *think* this code is still correct for WRUSS, and I've added a comment to that effect. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/d1d1b2e66ef31f884dba172084486ea9423ddcdb.1542667307.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d092ab74f18..7a69b66cf071 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1235,12 +1235,15 @@ void do_user_addr_fault(struct pt_regs *regs, pgtable_bad(regs, hw_error_code, address); /* - * If SMAP is on, check for invalid kernel (supervisor) - * access to user pages in the user address space. + * If SMAP is on, check for invalid kernel (supervisor) access to user + * pages in the user address space. The odd case here is WRUSS, + * which, according to the preliminary documentation, does not respect + * SMAP and will have the USER bit set so, in all cases, SMAP + * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(hw_error_code & X86_PF_USER) && - (user_mode(regs) || !(regs->flags & X86_EFLAGS_AC)))) + !(regs->flags & X86_EFLAGS_AC))) { bad_area_nosemaphore(regs, hw_error_code, address); return; From 6ea59b074f15e7ef4b042a108950861b383e7b02 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 Nov 2018 14:45:30 -0800 Subject: [PATCH 06/29] x86/fault: Improve the condition for signalling vs OOPSing __bad_area_nosemaphore() currently checks the X86_PF_USER bit in the error code to decide whether to send a signal or to treat the fault as a kernel error. This can cause somewhat erratic behavior. The straightforward cases where the CPL agrees with the hardware USER bit are all correct, but the other cases are confusing. - A user instruction accessing a kernel address with supervisor privilege (e.g. a descriptor table access failed). The USER bit will be clear, and we OOPS. This is correct, because it indicates a kernel bug, not a user error. - A user instruction accessing a user address with supervisor privilege (e.g. a descriptor table was incorrectly pointing at user memory). __bad_area_nosemaphore() will be passed a modified error code with the user bit set, and we will send a signal. Sending the signal will work (because the regs and the entry frame genuinely come from user mode), but we really ought to OOPS, as this event indicates a severe kernel bug. - A kernel instruction with user privilege (i.e. WRUSS). This should OOPS or get fixed up. The current code would instead try send a signal and malfunction. Change the logic: a signal should be sent if the faulting context is user mode *and* the access has user privilege. Otherwise it's either a kernel mode fault or a failed implicit access, either of which should end up in no_context(). Note to -stable maintainers: don't backport this unless you backport CET. The bug it fixes is unobservable in current kernels unless something is extremely wrong. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/10e509c43893170e262e82027ea399130ae81159.1542667307.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7a69b66cf071..3c9aed03d18e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -794,7 +794,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ - if (error_code & X86_PF_USER) { + if (user_mode(regs) && (error_code & X86_PF_USER)) { /* * It's possible to have interrupts off here: */ From e49d3cbef0176c182b86206185f137a87f16ab91 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 Nov 2018 14:45:31 -0800 Subject: [PATCH 07/29] x86/fault: Make error_code sanitization more robust The error code in a page fault on a kernel address indicates whether that address is mapped, which should not be revealed in a signal. The normal code path for a page fault on a kernel address sanitizes the bit, but the paths for vsyscall emulation and SIGBUS do not. Both are harmless, but for subtle reasons. SIGBUS is never sent for a kernel address, and vsyscall emulation will never fault on a kernel address per se because it will fail an access_ok() check instead. Make the code more robust by adding a helper that sets the relevant fields and sanitizing the error code in the helper. This also cleans up the code -- we had three copies of roughly the same thing. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/b31159bd55bd0c4fa061a20dfd6c429c094bebaa.1542667307.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3c9aed03d18e..b5ec1ca2f4a0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -631,6 +631,24 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } +static void set_signal_archinfo(unsigned long address, + unsigned long error_code) +{ + struct task_struct *tsk = current; + + /* + * To avoid leaking information about the kernel page + * table layout, pretend that user-mode accesses to + * kernel addresses are always protection faults. + */ + if (address >= TASK_SIZE_MAX) + error_code |= X86_PF_PROT; + + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code | X86_PF_USER; + tsk->thread.cr2 = address; +} + static noinline void no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) @@ -656,9 +674,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * faulting through the emulate_vsyscall() logic. */ if (current->thread.sig_on_uaccess_err && signal) { - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code | X86_PF_USER; - tsk->thread.cr2 = address; + set_signal_archinfo(address, error_code); /* XXX: hwpoison faults will set the wrong code. 
*/ force_sig_fault(signal, si_code, (void __user *)address, @@ -821,9 +837,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_PF; + set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); @@ -937,9 +951,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, if (is_prefetch(regs, error_code, address)) return; - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_PF; + set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { From 1ad33f5aec20f53785dbad44c6fb3b204aefd921 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 Nov 2018 14:45:32 -0800 Subject: [PATCH 08/29] x86/fault: Don't set thread.cr2, etc before OOPSing The fault handling code sets the cr2, trap_nr, and error_code fields in thread_struct before OOPSing. No one reads those fields during an OOPS, so remove the code to set them. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/d418022aa0fad9cb40467aa7acaf4e95be50ee96.1542667307.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b5ec1ca2f4a0..b898a38093a3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -621,10 +621,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code; - if (__die("Bad pagetable", regs, error_code)) sig = 0; @@ -753,10 +749,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n") ; - tsk->thread.cr2 = address; - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code; - sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; From 0ed32f1aa66ee758e6c8164f549f7ff9d399a20e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 21 Nov 2018 15:11:22 -0800 Subject: [PATCH 09/29] x86/fault: Remove sw_error_code All of the fault handling code now correctly checks user_mode(regs) as needed, and nothing depends on the X86_PF_USER bit being munged. Get rid of sw_error_code and use hw_error_code everywhere. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/078f5b8ae6e8c79ff8ee7345b5c476c45003e5ac.1542841400.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 50 ++++++++++----------------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b898a38093a3..82881bc5feef 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1217,7 +1217,6 @@ void do_user_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { - unsigned long sw_error_code; struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; @@ -1262,13 +1261,6 @@ void do_user_addr_fault(struct pt_regs *regs, return; } - /* - * hw_error_code is literally the "page fault error code" passed to - * the kernel directly from the hardware. But, we will shortly be - * modifying it in software, so give it a new name. - */ - sw_error_code = hw_error_code; - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. @@ -1278,26 +1270,6 @@ void do_user_addr_fault(struct pt_regs *regs, */ if (user_mode(regs)) { local_irq_enable(); - /* - * Up to this point, X86_PF_USER set in hw_error_code - * indicated a user-mode access. But, after this, - * X86_PF_USER in sw_error_code will indicate either - * that, *or* an implicit kernel(supervisor)-mode access - * which originated from user mode. - */ - if (!(hw_error_code & X86_PF_USER)) { - /* - * The CPU was in user mode, but the CPU says - * the fault was not a user-mode access. - * Must be an implicit kernel-mode access, - * which we do not expect to happen in the - * user address space. - */ - pr_warn_once("kernel-mode error from user-mode: %lx\n", - hw_error_code); - - sw_error_code |= X86_PF_USER; - } flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1306,9 +1278,9 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (sw_error_code & X86_PF_WRITE) + if (hw_error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (sw_error_code & X86_PF_INSTR) + if (hw_error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 @@ -1321,7 +1293,7 @@ void do_user_addr_fault(struct pt_regs *regs, * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. */ - if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { + if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { if (emulate_vsyscall(regs, address)) return; } @@ -1345,7 +1317,7 @@ void do_user_addr_fault(struct pt_regs *regs, * Fault from code in kernel from * which we do not expect faults. */ - bad_area_nosemaphore(regs, sw_error_code, address); + bad_area_nosemaphore(regs, hw_error_code, address); return; } retry: @@ -1361,17 +1333,17 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } @@ -1380,8 +1352,8 @@ retry: * we can handle it.. 
*/ good_area: - if (unlikely(access_error(sw_error_code, vma))) { - bad_area_access_error(regs, sw_error_code, address, vma); + if (unlikely(access_error(hw_error_code, vma))) { + bad_area_access_error(regs, hw_error_code, address, vma); return; } @@ -1420,13 +1392,13 @@ good_area: return; /* Not returning to user mode? Handle exceptions or die: */ - no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR); + no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR); return; } up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, sw_error_code, address, fault); + mm_fault_error(regs, hw_error_code, address, fault); return; } From ebb53e2597e2dc7637ab213df006e99681b6ee25 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 21 Nov 2018 15:11:23 -0800 Subject: [PATCH 10/29] x86/fault: Don't try to recover from an implicit supervisor access This avoids a situation in which we attempt to apply various fixups that are not intended to handle implicit supervisor accesses from user mode if we screw up in a way that causes this type of fault. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/9999f151d72ff352265f3274c5ab3a4105090f49.1542841400.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 82881bc5feef..ca38bd0472f2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -653,6 +653,15 @@ no_context(struct pt_regs *regs, unsigned long error_code, unsigned long flags; int sig; + if (user_mode(regs)) { + /* + * This is an implicit supervisor-mode access from user + * mode. Bypass all the kernel-mode recovery code and just + * OOPS. + */ + goto oops; + } + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { /* @@ -738,6 +747,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (IS_ENABLED(CONFIG_EFI)) efi_recover_from_page_fault(address); +oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: From d38bc89c72e7235ac889ae64fe7828e2e61a18af Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 21 Nov 2018 15:11:24 -0800 Subject: [PATCH 11/29] x86/oops: Show the correct CS value in show_regs() show_regs() shows the CS in the CPU register instead of the value in regs. This means that we'll probably print "CS: 0010" almost all the time regardless of what was actually in CS when the kernel malfunctioned. This gives a particularly confusing result if we OOPSed due to an implicit supervisor access from user mode. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/4e36812b6e1e95236a812021d35cbf22746b5af6.1542841400.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0e0b4288a4b2..2b8e6324fa20 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -66,7 +66,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; - unsigned int ds, cs, es; + unsigned int ds, es; show_iret_regs(regs); @@ -98,7 +98,6 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) } asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); @@ -114,7 +113,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds, es, cr0); printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); From af2ebdcf044039e89da3cd44c0f04dea317020c5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 21 Nov 2018 15:11:26 -0800 Subject: [PATCH 12/29] x86/vsyscall/64: Use X86_PF constants in the simulated #PF error code Rather than hardcoding 6 with a comment, use the defined constants. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/e023f20352b0d05a8b0205629897917262d2ad68.1542841400.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 85fd85d52ffd..d78bcc03e60e 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -102,7 +102,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { struct thread_struct *thread = &current->thread; - thread->error_code = 6; /* user fault, no page, write */ + thread->error_code = X86_PF_USER | X86_PF_WRITE; thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; From a1a371c468f7238b7826fde55786b02377faf8e2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 21 Nov 2018 15:11:25 -0800 Subject: [PATCH 13/29] x86/fault: Decode page fault OOPSes better One of Linus' favorite hobbies seems to be looking at OOPSes and decoding the error code in his head. This is not one of my favorite hobbies :) Teach the page fault OOPS handler to decode the error code. If it's a !USER fault from user mode, print an explicit note to that effect and print out the addresses of various tables that might cause such an error. 
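As an aside (and not part of this patch), the decoding itself is simple enough to mirror in a standalone program; a sketch using the architectural #PF error-code bit positions:

$ cat pf_decode.c
/* Illustrative only: decode a #PF error code the way the new OOPS
 * code does, using the architectural bit positions
 * (PROT=0x1, WRITE=0x2, USER=0x4, RSVD=0x8, INSTR=0x10, PK=0x20). */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	unsigned long ec = argc > 1 ? strtoul(argv[1], NULL, 0) : 0;
	const struct { unsigned long bit; const char *name; } bits[] = {
		{ 0x01, "PROT" }, { 0x02, "WRITE" }, { 0x04, "USER" },
		{ 0x08, "RSVD" }, { 0x10, "INSTR" }, { 0x20, "PK" },
	};
	unsigned int i;

	for (i = 0; i < sizeof(bits) / sizeof(bits[0]); i++)
		if (ec & bits[i].bit)
			printf("[%s] ", bits[i].name);
	printf("%s\n", ec ? "" : "[normal kernel read fault]");
	return 0;
}

Passing 0x3, for example, prints "[PROT] [WRITE]".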
With this patch applied, if I intentionally point the LDT at 0x0 and run the x86 selftests, I get: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 HW error: normal kernel read fault This was a system access from user code IDT: 0xfffffe0000000000 (limit=0xfff) GDT: 0xfffffe0000001000 (limit=0x7f) LDTR: 0x50 -- base=0x0 limit=0xfff7 TR: 0x40 -- base=0xfffffe0000003000 limit=0x206f PGD 800000000456e067 P4D 800000000456e067 PUD 4623067 PMD 0 SMP PTI CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317 Hardware name: ... RIP: 0033:0x401454 Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/11212acb25980cd1b3030875cd9502414fbb214d.1542841400.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 84 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ca38bd0472f2..f5efbdba2b6d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -27,6 +27,7 @@ #include /* struct vm86 */ #include /* vma_pkey() */ #include /* efi_recover_from_page_fault()*/ +#include /* store_idt(), ... */ #define CREATE_TRACE_POINTS #include @@ -571,10 +572,53 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) +{ + u32 offset = (index >> 3) * sizeof(struct desc_struct); + unsigned long addr; + struct ldttss_desc desc; + + if (index == 0) { + pr_alert("%s: NULL\n", name); + return; + } + + if (offset + sizeof(struct ldttss_desc) >= gdt->size) { + pr_alert("%s: 0x%hx -- out of bounds\n", name, index); + return; + } + + if (probe_kernel_read(&desc, (void *)(gdt->address + offset), + sizeof(struct ldttss_desc))) { + pr_alert("%s: 0x%hx -- GDT entry is not readable\n", + name, index); + return; + } + + addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24); +#ifdef CONFIG_X86_64 + addr |= ((u64)desc.base3 << 32); +#endif + pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", + name, index, addr, (desc.limit0 | (desc.limit1 << 16))); +} + +static void errstr(unsigned long ec, char *buf, unsigned long mask, + const char *txt) +{ + if (ec & mask) { + if (buf[0]) + strcat(buf, " "); + strcat(buf, txt); + } +} + static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { + char errtxt[64]; + if (!oops_may_print()) return; @@ -602,6 +646,46 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", (void *)address); + errtxt[0] = 0; + errstr(error_code, errtxt, X86_PF_PROT, "PROT"); + errstr(error_code, errtxt, X86_PF_WRITE, "WRITE"); + errstr(error_code, errtxt, X86_PF_USER, "USER"); + errstr(error_code, errtxt, X86_PF_RSVD, "RSVD"); + errstr(error_code, errtxt, X86_PF_INSTR, "INSTR"); + errstr(error_code, errtxt, X86_PF_PK, "PK"); + pr_alert("HW error: %s\n", error_code ? errtxt : + "normal kernel read fault"); + if (!(error_code & X86_PF_USER) && user_mode(regs)) { + struct desc_ptr idt, gdt; + u16 ldtr, tr; + + pr_alert("This was a system access from user code\n"); + + /* + * This can happen for quite a few reasons. The more obvious + * ones are faults accessing the GDT, or LDT. 
Perhaps + * surprisingly, if the CPU tries to deliver a benign or + * contributory exception from user code and gets a page fault + * during delivery, the page fault can be delivered as though + * it originated directly from user code. This could happen + * due to wrong permissions on the IDT, GDT, LDT, TSS, or + * kernel or IST stack. + */ + store_idt(&idt); + + /* Usable even on Xen PV -- it's just slow. */ + native_store_gdt(&gdt); + + pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", + idt.address, idt.size, gdt.address, gdt.size); + + store_ldt(ldtr); + show_ldttss(&gdt, "LDTR", ldtr); + + store_tr(tr); + show_ldttss(&gdt, "TR", tr); + } + dump_pagetable(address); } From a2aa52ab16efbee40ad118ebac4a5e438f5b43ee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 22 Nov 2018 09:34:03 +0100 Subject: [PATCH 14/29] x86/fault: Clean up the page fault oops decoder a bit - Make the oops messages a bit less scary (don't mention 'HW errors') - Turn 'PROT USER' (which is visually easily confused with PROT_USER) into individual bit descriptors: "[PROT] [USER]". This also makes "[normal kernel read fault]" more apparent. - De-abbreviate variables to make the code easier to read - Use vertical alignment where appropriate. - Add comment about string size limits and the helper function. - Remove unnecessary line breaks. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f5efbdba2b6d..2ff25ad33233 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -603,10 +603,13 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } -static void errstr(unsigned long ec, char *buf, unsigned long mask, - const char *txt) +/* + * This helper function transforms the #PF error_code bits into + * "[PROT] [USER]" type of descriptive, almost human-readable error strings: + */ +static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) { - if (ec & mask) { + if (error_code & mask) { if (buf[0]) strcat(buf, " "); strcat(buf, txt); @@ -614,10 +617,9 @@ static void errstr(unsigned long ec, char *buf, unsigned long mask, } static void -show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - char errtxt[64]; + char err_txt[64]; if (!oops_may_print()) return; @@ -646,15 +648,21 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", (void *)address); - errtxt[0] = 0; - errstr(error_code, errtxt, X86_PF_PROT, "PROT"); - errstr(error_code, errtxt, X86_PF_WRITE, "WRITE"); - errstr(error_code, errtxt, X86_PF_USER, "USER"); - errstr(error_code, errtxt, X86_PF_RSVD, "RSVD"); - errstr(error_code, errtxt, X86_PF_INSTR, "INSTR"); - errstr(error_code, errtxt, X86_PF_PK, "PK"); - pr_alert("HW error: %s\n", error_code ? errtxt : - "normal kernel read fault"); + err_txt[0] = 0; + + /* + * Note: length of these appended strings including the separation space and the + * zero delimiter must fit into err_txt[]. 
+ */ + err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); + err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); + err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); + err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); + err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); + err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); + + pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); + if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; From c683c37cd13246941924c48f6c6a9863425e0cec Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 4 Dec 2018 13:37:06 -0800 Subject: [PATCH 15/29] generic/pgtable: Make {pmd, pud}_same() unconditionally available In preparation for {pmd,pud}_same() to be used outside of transparent huge page code paths, make them unconditionally available. This enables them to be used in the definition of a new family of set_{pte,pmd,pud,p4d,pgd}_safe() helpers. Signed-off-by: Dan Williams Acked-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154395942644.32119.10238934183949418128.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- include/asm-generic/pgtable.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 359fb935ded6..eea50ef8b8cd 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -375,7 +375,6 @@ static inline int pte_unused(pte_t pte) #endif #ifndef __HAVE_ARCH_PMD_SAME -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); @@ -385,19 +384,6 @@ static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) -{ - BUILD_BUG(); - return 0; -} - -static inline int pud_same(pud_t pud_a, pud_t pud_b) -{ - BUILD_BUG(); - return 0; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_DO_SWAP_PAGE From 0cebbb60f759a709dabb3c87b9704f9844878850 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 4 Dec 2018 13:37:11 -0800 Subject: [PATCH 16/29] generic/pgtable: Introduce {p4d,pgd}_same() In preparation for introducing '_safe' versions of page table entry 'set' helpers, introduce generic versions of p4d_same() and pgd_same(). Signed-off-by: Dan Williams Acked-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154395943153.32119.1733586547359626311.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- include/asm-generic/pgtable.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index eea50ef8b8cd..dae7f98babed 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -386,6 +386,20 @@ static inline int pud_same(pud_t pud_a, pud_t pud_b) } #endif +#ifndef __HAVE_ARCH_P4D_SAME +static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) +{ + return p4d_val(p4d_a) == p4d_val(p4d_b); +} +#endif + +#ifndef __HAVE_ARCH_PGD_SAME +static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) +{ + return pgd_val(pgd_a) == pgd_val(pgd_b); +} +#endif + #ifndef __HAVE_ARCH_DO_SWAP_PAGE /* * Some architectures support metadata associated with a page. When a From 4369deaa2f022ef92da45a0e7eec8a4a52e8e8a4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 4 Dec 2018 13:37:16 -0800 Subject: [PATCH 17/29] generic/pgtable: Introduce set_pte_safe() Commit: f77084d96355 "x86/mm/pat: Disable preemption around __flush_tlb_all()" introduced a warning to capture cases __flush_tlb_all() is called without pre-emption disabled. It triggers a false positive warning in the memory hotplug path. On investigation it was found that the __flush_tlb_all() calls are not necessary. However, they are only "not necessary" in practice provided the ptes are being initially populated from the !present state. Introduce set_pte_safe() as a sanity check that the pte is being updated in a way that does not require a TLB flush. Forgive the macro, the availability of the various of set_pte() levels is hit and miss across architectures. [ mingo: Minor readability edits. ] Suggested-by: Peter Zijlstra Suggested-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Peter Zijlstra (Intel) Acked-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/279dadae-9148-465c-7ec6-3f37e026c6c9@intel.com Signed-off-by: Ingo Molnar --- include/asm-generic/pgtable.h | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index dae7f98babed..a9cac82e9a7a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -400,6 +400,44 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) } #endif +/* + * Use set_p*_safe(), and elide TLB flushing, when confident that *no* + * TLB flush will be required as a result of the "set". For example, use + * in scenarios where it is known ahead of time that the routine is + * setting non-present entries, or re-setting an existing entry to the + * same value. Otherwise, use the typical "set" helpers and flush the + * TLB. 
+ */ +#define set_pte_safe(ptep, pte) \ +({ \ + WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ + set_pte(ptep, pte); \ +}) + +#define set_pmd_safe(pmdp, pmd) \ +({ \ + WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ + set_pmd(pmdp, pmd); \ +}) + +#define set_pud_safe(pudp, pud) \ +({ \ + WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ + set_pud(pudp, pud); \ +}) + +#define set_p4d_safe(p4dp, p4d) \ +({ \ + WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ + set_p4d(p4dp, p4d); \ +}) + +#define set_pgd_safe(pgdp, pgd) \ +({ \ + WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ + set_pgd(pgdp, pgd); \ +}) + #ifndef __HAVE_ARCH_DO_SWAP_PAGE /* * Some architectures support metadata associated with a page. When a From 0a9fe8ca844d43f3f547f0e166122b6048121c8f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 4 Dec 2018 13:37:21 -0800 Subject: [PATCH 18/29] x86/mm: Validate kernel_physical_mapping_init() PTE population The usage of __flush_tlb_all() in the kernel_physical_mapping_init() path is not necessary. In general flushing the TLB is not required when updating an entry from the !present state. However, to give confidence in the future removal of TLB flushing in this path, use the new set_pte_safe() family of helpers to assert that the !present assumption is true in this path. [ mingo: Minor readability edits. ] Suggested-by: Peter Zijlstra Suggested-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Kirill A. Shutemov Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154395944177.32119.8524957429632012270.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgalloc.h | 27 ++++++++++++++++++++++++ arch/x86/mm/init_64.c | 24 ++++++++++----------- include/asm-generic/5level-fixup.h | 1 + include/asm-generic/pgtable-nop4d-hack.h | 1 + include/asm-generic/pgtable-nop4d.h | 1 + include/asm-generic/pgtable-nopud.h | 1 + 6 files changed, 43 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index ec7f43327033..1ea41aaef68b 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -80,6 +80,13 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } +static inline void pmd_populate_kernel_safe(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { @@ -132,6 +139,12 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); } + +static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd))); +} #endif /* CONFIG_X86_PAE */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -141,6 +154,12 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); + set_p4d_safe(p4d, 
__p4d(_PAGE_TABLE | __pa(pud))); +} + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_KERNEL_ACCOUNT; @@ -173,6 +192,14 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } +static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) +{ + if (!pgtable_l5_enabled()) + return; + paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); + set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); +} + static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_KERNEL_ACCOUNT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5fab264948c2..3e25ac2793ef 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -432,7 +432,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pte(pte, __pte(0)); + set_pte_safe(pte, __pte(0)); continue; } @@ -452,7 +452,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } @@ -487,7 +487,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pmd(pmd, __pmd(0)); + set_pmd_safe(pmd, __pmd(0)); continue; } @@ -524,7 +524,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); @@ -536,7 +536,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, pte); + pmd_populate_kernel_safe(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -573,7 +573,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pud(pud, __pud(0)); + set_pud_safe(pud, __pud(0)); continue; } @@ -611,7 +611,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); @@ -624,7 +624,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, page_size_mask, prot); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, pmd); + pud_populate_safe(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); @@ -659,7 +659,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_p4d(p4d, __p4d(0)); + set_p4d_safe(p4d, __p4d(0)); continue; } @@ -677,7 +677,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, page_size_mask); spin_lock(&init_mm.page_table_lock); - p4d_populate(&init_mm, p4d, pud); + p4d_populate_safe(&init_mm, p4d, pud); spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); 
@@ -723,9 +723,9 @@ kernel_physical_mapping_init(unsigned long paddr_start, spin_lock(&init_mm.page_table_lock); if (pgtable_l5_enabled()) - pgd_populate(&init_mm, pgd, p4d); + pgd_populate_safe(&init_mm, pgd, p4d); else - p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); + p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index 73474bb52344..bb6cb347018c 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -26,6 +26,7 @@ #define p4d_clear(p4d) pgd_clear(p4d) #define p4d_val(p4d) pgd_val(p4d) #define p4d_populate(mm, p4d, pud) pgd_populate(mm, p4d, pud) +#define p4d_populate_safe(mm, p4d, pud) pgd_populate(mm, p4d, pud) #define p4d_page(p4d) pgd_page(p4d) #define p4d_page_vaddr(p4d) pgd_page_vaddr(p4d) diff --git a/include/asm-generic/pgtable-nop4d-hack.h b/include/asm-generic/pgtable-nop4d-hack.h index 1d6dd38c0e5e..829bdb0d6327 100644 --- a/include/asm-generic/pgtable-nop4d-hack.h +++ b/include/asm-generic/pgtable-nop4d-hack.h @@ -31,6 +31,7 @@ static inline void pgd_clear(pgd_t *pgd) { } #define pud_ERROR(pud) (pgd_ERROR((pud).pgd)) #define pgd_populate(mm, pgd, pud) do { } while (0) +#define pgd_populate_safe(mm, pgd, pud) do { } while (0) /* * (puds are folded into pgds so this doesn't get actually called, * but the define is needed for a generic inline function.) diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index 04cb913797bc..aebab905e6cd 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -26,6 +26,7 @@ static inline void pgd_clear(pgd_t *pgd) { } #define p4d_ERROR(p4d) (pgd_ERROR((p4d).pgd)) #define pgd_populate(mm, pgd, p4d) do { } while (0) +#define pgd_populate_safe(mm, pgd, p4d) do { } while (0) /* * (p4ds are folded into pgds so this doesn't get actually called, * but the define is needed for a generic inline function.) diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index 9bef475db6fe..c77a1d301155 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -35,6 +35,7 @@ static inline void p4d_clear(p4d_t *p4d) { } #define pud_ERROR(pud) (p4d_ERROR((pud).p4d)) #define p4d_populate(mm, p4d, pud) do { } while (0) +#define p4d_populate_safe(mm, p4d, pud) do { } while (0) /* * (puds are folded into p4ds so this doesn't get actually called, * but the define is needed for a generic inline function.) From ba6f508d0ec4adb09f0a939af6d5e19cdfa8667d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 4 Dec 2018 13:37:27 -0800 Subject: [PATCH 19/29] x86/mm: Drop usage of __flush_tlb_all() in kernel_physical_mapping_init() Commit: f77084d96355 "x86/mm/pat: Disable preemption around __flush_tlb_all()" addressed a case where __flush_tlb_all() is called without preemption being disabled. It also left a warning to catch other cases where preemption is not disabled. That warning triggers for the memory hotplug path which is also used for persistent memory enabling: WARNING: CPU: 35 PID: 911 at ./arch/x86/include/asm/tlbflush.h:460 RIP: 0010:__flush_tlb_all+0x1b/0x3a [..] 
Call Trace: phys_pud_init+0x29c/0x2bb kernel_physical_mapping_init+0xfc/0x219 init_memory_mapping+0x1a5/0x3b0 arch_add_memory+0x2c/0x50 devm_memremap_pages+0x3aa/0x610 pmem_attach_disk+0x585/0x700 [nd_pmem] Andy wondered why a path that can sleep was using __flush_tlb_all() [1] and Dave confirmed the expectation for TLB flush is for modifying / invalidating existing PTE entries, but not initial population [2]. Drop the usage of __flush_tlb_all() in phys_{p4d,pud,pmd}_init() on the expectation that this path is only ever populating empty entries for the linear map. Note, at linear map teardown time there is a call to the all-cpu flush_tlb_all() to invalidate the removed mappings. [1]: https://lkml.kernel.org/r/9DFD717D-857D-493D-A606-B635D72BAC21@amacapital.net [2]: https://lkml.kernel.org/r/749919a4-cdb1-48a3-adb4-adb81a5fa0b5@intel.com [ mingo: Minor readability edits. ] Suggested-by: Dave Hansen Reported-by: Andy Lutomirski Signed-off-by: Dan Williams Acked-by: Peter Zijlstra (Intel) Acked-by: Kirill A. Shutemov Cc: Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: dave.hansen@intel.com Fixes: f77084d96355 ("x86/mm/pat: Disable preemption around __flush_tlb_all()") Link: http://lkml.kernel.org/r/154395944713.32119.15611079023837132638.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3e25ac2793ef..484c1b92f078 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -584,7 +584,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, paddr_end, page_size_mask, prot); - __flush_tlb_all(); continue; } /* @@ -627,7 +626,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, pud_populate_safe(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } - __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); @@ -668,7 +666,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pud_init(pud, paddr, paddr_end, page_size_mask); - __flush_tlb_all(); continue; } @@ -680,7 +677,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, p4d_populate_safe(&init_mm, p4d, pud); spin_unlock(&init_mm.page_table_lock); } - __flush_tlb_all(); return paddr_last; } @@ -733,8 +729,6 @@ kernel_physical_mapping_init(unsigned long paddr_start, if (pgd_changed) sync_global_pgds(vaddr_start, vaddr_end - 1); - __flush_tlb_all(); - return paddr_last; } From ecc729f1f47142ad31741549f400b611435c1af7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:45 +0100 Subject: [PATCH 20/29] x86/mm/cpa: Add ARRAY and PAGES_ARRAY selftests The current pageattr-test code only uses the regular range interface, add code that also tests the array and pages interface. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.162771364@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 08f8f76a4852..b6b6468530f1 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -23,7 +23,8 @@ static __read_mostly int print = 1; enum { - NTEST = 400, + NTEST = 3 * 100, + NPAGES = 100, #ifdef CONFIG_X86_64 LPS = (1 << PMD_SHIFT), #elif defined(CONFIG_X86_PAE) @@ -110,6 +111,9 @@ static int print_split(struct split_state *s) static unsigned long addr[NTEST]; static unsigned int len[NTEST]; +static struct page *pages[NPAGES]; +static unsigned long addrs[NPAGES]; + /* Change the global bit on random pages in the direct mapping */ static int pageattr_test(void) { @@ -137,7 +141,7 @@ static int pageattr_test(void) unsigned long pfn = prandom_u32() % max_pfn_mapped; addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); - len[i] = prandom_u32() % 100; + len[i] = prandom_u32() % NPAGES; len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); if (len[i] == 0) @@ -167,14 +171,30 @@ static int pageattr_test(void) break; } __set_bit(pfn + k, bm); + addrs[k] = addr[i] + k*PAGE_SIZE; + pages[k] = pfn_to_page(pfn + k); } if (!addr[i] || !pte || !k) { addr[i] = 0; continue; } - test_addr = addr[i]; - err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); + switch (i % 3) { + case 0: + test_addr = addr[i]; + err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); + break; + + case 1: + err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + break; + + case 2: + err = cpa_set_pages_array(pages, len[i], PAGE_CPA_TEST); + break; + } + + if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; From 16ebf031e8ab73779a382c9f2b097891da6af923 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:46 +0100 Subject: [PATCH 21/29] x86/mm/cpa: Add __cpa_addr() helper The code to compute the virtual address of a cpa_data is duplicated; introduce a helper before more copies happen. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.229119497@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a1bcde35db4c..6e6900ebea30 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -228,6 +228,23 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn) #endif +static unsigned long __cpa_addr(struct cpa_data *cpa, int idx) +{ + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[idx]; + + if (unlikely(PageHighMem(page))) + return 0; + + return (unsigned long)page_address(page); + } + + if (cpa->flags & CPA_ARRAY) + return cpa->vaddr[idx]; + + return *cpa->vaddr; +} + /* * Flushing functions */ @@ -1476,15 +1493,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_PAGES_ARRAY) { - struct page *page = cpa->pages[cpa->curpage]; - if (unlikely(PageHighMem(page))) - return 0; - address = (unsigned long)page_address(page); - } else if (cpa->flags & CPA_ARRAY) - address = cpa->vaddr[cpa->curpage]; - else - address = *cpa->vaddr; + address = __cpa_addr(cpa, cpa->curpage); repeat: kpte = _lookup_address_cpa(cpa, address, &level); if (!kpte) @@ -1565,16 +1574,7 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_PAGES_ARRAY) { - struct page *page = cpa->pages[cpa->curpage]; - if (unlikely(PageHighMem(page))) - return 0; - vaddr = (unsigned long)page_address(page); - } else if (cpa->flags & CPA_ARRAY) - vaddr = cpa->vaddr[cpa->curpage]; - else - vaddr = *cpa->vaddr; - + vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { From 98bfc9b038cde1ce108f69a50720e394fe774cb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:47 +0100 Subject: [PATCH 22/29] x86/mm/cpa: Make cpa_data::vaddr invariant Currently __change_page_attr_set_clr() will modify cpa->vaddr when !(CPA_ARRAY | CPA_PAGES_ARRAY), whereas in the array cases it will increment cpa->curpage. Change __cpa_addr() such that its @idx argument also works in the !array case and use cpa->curpage increments for all cases. NOTE: since cpa_data::numpages is 'unsigned long' so should cpa_data::curpage be. NOTE: after this only cpa->numpages is still modified. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.295174892@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6e6900ebea30..ce8af3f08628 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -35,11 +35,11 @@ struct cpa_data { pgprot_t mask_set; pgprot_t mask_clr; unsigned long numpages; - int flags; + unsigned long curpage; unsigned long pfn; - unsigned force_split : 1, + unsigned int flags; + unsigned int force_split : 1, force_static_prot : 1; - int curpage; struct page **pages; }; @@ -228,7 +228,7 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn) #endif -static unsigned long __cpa_addr(struct cpa_data *cpa, int idx) +static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) { if (cpa->flags & CPA_PAGES_ARRAY) { struct page *page = cpa->pages[idx]; @@ -242,7 +242,7 @@ static unsigned long __cpa_addr(struct cpa_data *cpa, int idx) if (cpa->flags & CPA_ARRAY) return cpa->vaddr[idx]; - return *cpa->vaddr; + return *cpa->vaddr + idx * PAGE_SIZE; } /* @@ -1581,6 +1581,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa = *cpa; alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + alias_cpa.curpage = 0; ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) @@ -1600,6 +1601,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa = *cpa; alias_cpa.vaddr = &temp_cpa_vaddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + alias_cpa.curpage = 0; /* * The high mapping range is imprecise, so ignore the @@ -1648,11 +1650,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) */ BUG_ON(cpa->numpages > numpages || !cpa->numpages); numpages -= cpa->numpages; - if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) - cpa->curpage++; - else - *cpa->vaddr += cpa->numpages * PAGE_SIZE; - + cpa->curpage += cpa->numpages; } return 0; } From 5fe26b7a8f4693d532c7a3c3632e47e7d7016238 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:48 +0100 Subject: [PATCH 23/29] x86/mm/cpa: Simplify the code after making cpa->vaddr invariant Since cpa->vaddr is invariant, this means we can remove all workarounds that deal with it changing. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.366619025@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 7 ++----- arch/x86/mm/pageattr.c | 13 ++++--------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index b6b6468530f1..facce271e8b9 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -124,7 +124,6 @@ static int pageattr_test(void) unsigned int level; int i, k; int err; - unsigned long test_addr; if (print) printk(KERN_INFO "CPA self-test:\n"); @@ -181,8 +180,7 @@ static int pageattr_test(void) switch (i % 3) { case 0: - test_addr = addr[i]; - err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); + err = change_page_attr_set(&addr[i], len[i], PAGE_CPA_TEST, 0); break; case 1: @@ -226,8 +224,7 @@ static int pageattr_test(void) failed++; continue; } - test_addr = addr[i]; - err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); + err = change_page_attr_clear(&addr[i], len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index ce8af3f08628..afa98b7b6050 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1908,15 +1908,13 @@ EXPORT_SYMBOL_GPL(set_memory_array_wt); int _set_memory_wc(unsigned long addr, int numpages) { int ret; - unsigned long addr_copy = addr; ret = change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); if (!ret) { - ret = change_page_attr_set_clr(&addr_copy, numpages, - cachemode2pgprot( - _PAGE_CACHE_MODE_WC), + ret = change_page_attr_set_clr(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } @@ -2064,7 +2062,6 @@ int set_memory_global(unsigned long addr, int numpages) static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { struct cpa_data cpa; - unsigned long start; int ret; /* Nothing to do if memory encryption is not active */ @@ -2075,8 +2072,6 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; - start = addr; - memset(&cpa, 0, sizeof(cpa)); cpa.vaddr = &addr; cpa.numpages = numpages; @@ -2091,7 +2086,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush_range(start, numpages, 1); + cpa_flush_range(addr, numpages, 1); ret = __change_page_attr_set_clr(&cpa, 1); @@ -2102,7 +2097,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) * in case TLB flushing gets optimized in the cpa_flush_range() * path use the same logic as above. */ - cpa_flush_range(start, numpages, 0); + cpa_flush_range(addr, numpages, 0); return ret; } From 935f5839827ef54b53406e80906f7c355eb73c1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:49 +0100 Subject: [PATCH 24/29] x86/mm/cpa: Optimize cpa_flush_array() TLB invalidation Instead of punting and doing tlb_flush_all(), do the same as flush_tlb_kernel_range() does and use single page invalidations. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.430001980@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/mm_internal.h | 2 ++ arch/x86/mm/pageattr.c | 42 ++++++++++++++++++++++----------------- arch/x86/mm/tlb.c | 4 +++- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 4e1f6e1b8159..319bde386d5f 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -19,4 +19,6 @@ extern int after_bootmem; void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); +extern unsigned long tlb_single_page_flush_ceiling; + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index afa98b7b6050..351874259a71 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -26,6 +26,8 @@ #include #include +#include "mm_internal.h" + /* * The current flushing context - we pass it instead of 5 arguments: */ @@ -346,16 +348,26 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void cpa_flush_array(unsigned long baddr, unsigned long *start, - int numpages, int cache, - int in_flags, struct page **pages) +void __cpa_flush_array(void *data) { - unsigned int i, level; + struct cpa_data *cpa = data; + unsigned int i; - if (__inv_flush_all(cache)) + for (i = 0; i < cpa->numpages; i++) + __flush_tlb_one_kernel(__cpa_addr(cpa, i)); +} + +static void cpa_flush_array(struct cpa_data *cpa, int cache) +{ + unsigned int i; + + if (cpa_check_flush_all(cache)) return; - flush_tlb_all(); + if (cpa->numpages <= tlb_single_page_flush_ceiling) + on_each_cpu(__cpa_flush_array, cpa, 1); + else + flush_tlb_all(); if (!cache) return; @@ -366,15 +378,11 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, * will cause all other CPUs to flush the same * cachelines: */ - for (i = 0; i < numpages; i++) { - unsigned long addr; + for (i = 0; i < cpa->numpages; i++) { + unsigned long addr = __cpa_addr(cpa, i); + unsigned int level; pte_t *pte; - if (in_flags & CPA_PAGES_ARRAY) - addr = (unsigned long)page_address(pages[i]); - else - addr = start[i]; - pte = lookup_address(addr, &level); /* @@ -1771,12 +1779,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, goto out; } - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(baddr, addr, numpages, cache, - cpa.flags, pages); - } else { + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) + cpa_flush_array(&cpa, cache); + else cpa_flush_range(baddr, numpages, cache); - } out: return ret; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 03b6b4c2238d..999d6d8f0bef 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -15,6 +15,8 @@ #include #include +#include "mm_internal.h" + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -721,7 +723,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * * This is in units of pages. 
*/ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, From 83b4e39146aa70913580966e0f2b78b7c3492760 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:50 +0100 Subject: [PATCH 25/29] x86/mm/cpa: Make cpa_data::numpages invariant Make sure __change_page_attr_set_clr() doesn't modify cpa->numpages. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.493000228@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 351874259a71..12b69263e501 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1625,14 +1625,15 @@ static int cpa_process_alias(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { unsigned long numpages = cpa->numpages; - int ret; + unsigned long rempages = numpages; + int ret = 0; - while (numpages) { + while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. */ - cpa->numpages = numpages; + cpa->numpages = rempages; /* for array changes, we can't use large page */ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; @@ -1643,12 +1644,12 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) - return ret; + goto out; if (checkalias) { ret = cpa_process_alias(cpa); if (ret) - return ret; + goto out; } /* @@ -1656,11 +1657,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. Either a large page has been * preserved or a single page update happened. */ - BUG_ON(cpa->numpages > numpages || !cpa->numpages); - numpages -= cpa->numpages; + BUG_ON(cpa->numpages > rempages || !cpa->numpages); + rempages -= cpa->numpages; cpa->curpage += cpa->numpages; } - return 0; + +out: + /* Restore the original numpages */ + cpa->numpages = numpages; + return ret; } /* From fe0937b24ff5d7b343b9922201e469f9a6009d9d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:51 +0100 Subject: [PATCH 26/29] x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single cpa_flush() function Note that the cache flush loop in cpa_flush_*() is identical when we use __cpa_addr(); further observe that flush_tlb_kernel_range() is a special case of to the cpa_flush_array() TLB invalidation code. This then means the two functions are virtually identical. Fold these two functions into a single cpa_flush() call. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.559855600@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 92 +++++++++--------------------------------- 1 file changed, 18 insertions(+), 74 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 12b69263e501..85ef53b86fa0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -304,51 +304,7 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static bool __inv_flush_all(int cache) -{ - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - - if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { - cpa_flush_all(cache); - return true; - } - - return false; -} - -static void cpa_flush_range(unsigned long start, int numpages, int cache) -{ - unsigned int i, level; - unsigned long addr; - - WARN_ON(PAGE_ALIGN(start) != start); - - if (__inv_flush_all(cache)) - return; - - flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - - if (!cache) - return; - - /* - * We only need to flush on one CPU, - * clflush is a MESI-coherent instruction that - * will cause all other CPUs to flush the same - * cachelines: - */ - for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { - pte_t *pte = lookup_address(addr, &level); - - /* - * Only flush present addresses: - */ - if (pte && (pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range((void *) addr, PAGE_SIZE); - } -} - -void __cpa_flush_array(void *data) +void __cpa_flush_tlb(void *data) { struct cpa_data *cpa = data; unsigned int i; @@ -357,33 +313,31 @@ void __cpa_flush_array(void *data) __flush_tlb_one_kernel(__cpa_addr(cpa, i)); } -static void cpa_flush_array(struct cpa_data *cpa, int cache) +static void cpa_flush(struct cpa_data *data, int cache) { + struct cpa_data *cpa = data; unsigned int i; - if (cpa_check_flush_all(cache)) + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); return; + } if (cpa->numpages <= tlb_single_page_flush_ceiling) - on_each_cpu(__cpa_flush_array, cpa, 1); + on_each_cpu(__cpa_flush_tlb, cpa, 1); else flush_tlb_all(); if (!cache) return; - /* - * We only need to flush on one CPU, - * clflush is a MESI-coherent instruction that - * will cause all other CPUs to flush the same - * cachelines: - */ for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; - pte_t *pte; - pte = lookup_address(addr, &level); + pte_t *pte = lookup_address(addr, &level); /* * Only flush present addresses: @@ -1698,7 +1652,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, { struct cpa_data cpa; int ret, cache, checkalias; - unsigned long baddr = 0; memset(&cpa, 0, sizeof(cpa)); @@ -1732,11 +1685,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, */ WARN_ON_ONCE(1); } - /* - * Save address for cache flush. *addr is modified in the call - * to __change_page_attr_set_clr() below. 
- */ - baddr = make_addr_canonical_again(*addr); } /* Must avoid aliasing mappings in the highmem code */ @@ -1784,11 +1732,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, goto out; } - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) - cpa_flush_array(&cpa, cache); - else - cpa_flush_range(baddr, numpages, cache); - + cpa_flush(&cpa, cache); out: return ret; } @@ -2097,18 +2041,18 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush_range(addr, numpages, 1); + cpa_flush(&cpa, 1); ret = __change_page_attr_set_clr(&cpa, 1); /* - * After changing the encryption attribute, we need to flush TLBs - * again in case any speculative TLB caching occurred (but no need - * to flush caches again). We could just use cpa_flush_all(), but - * in case TLB flushing gets optimized in the cpa_flush_range() - * path use the same logic as above. + * After changing the encryption attribute, we need to flush TLBs again + * in case any speculative TLB caching occurred (but no need to flush + * caches again). We could just use cpa_flush_all(), but in case TLB + * flushing gets optimized in the cpa_flush() path use the same logic + * as above. */ - cpa_flush_range(addr, numpages, 0); + cpa_flush(&cpa, 0); return ret; } From c38116bb940ae37f51fccd315b420ee5961dcb76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:52 +0100 Subject: [PATCH 27/29] x86/mm/cpa: Better use CLFLUSHOPT Currently we issue an MFENCE before and after flushing a range. This means that if we flush a bunch of single page ranges -- like with the cpa array, we issue a whole bunch of superfluous MFENCEs. Reorgainze the code a little to avoid this. [ mingo: capitalize instructions, tweak changelog and comments. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.626999883@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 85ef53b86fa0..7d05149995dc 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -251,15 +251,7 @@ static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) * Flushing functions */ -/** - * clflush_cache_range - flush a cache range with clflush - * @vaddr: virtual start address - * @size: number of bytes to flush - * - * clflushopt is an unordered instruction which needs fencing with mfence or - * sfence to avoid ordering issues. - */ -void clflush_cache_range(void *vaddr, unsigned int size) +static void clflush_cache_range_opt(void *vaddr, unsigned int size) { const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); @@ -268,11 +260,22 @@ void clflush_cache_range(void *vaddr, unsigned int size) if (p >= vend) return; - mb(); - for (; p < vend; p += clflush_size) clflushopt(p); +} +/** + * clflush_cache_range - flush a cache range with clflush + * @vaddr: virtual start address + * @size: number of bytes to flush + * + * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or + * SFENCE to avoid ordering issues. 
+ */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + mb(); + clflush_cache_range_opt(vaddr, size); mb(); } EXPORT_SYMBOL_GPL(clflush_cache_range); @@ -333,6 +336,7 @@ static void cpa_flush(struct cpa_data *data, int cache) if (!cache) return; + mb(); for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; @@ -343,8 +347,9 @@ static void cpa_flush(struct cpa_data *data, int cache) * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range((void *)addr, PAGE_SIZE); + clflush_cache_range_opt((void *)addr, PAGE_SIZE); } + mb(); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, From 3c567356dbe0da4fc310cfcffafc39526e1ca43a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:53 +0100 Subject: [PATCH 28/29] x86/mm/cpa: Rename @addrinarray to @numpages The CPA_ARRAY interface works in single pages, and everything, except in these 'few' locations is this variable called 'numpages'. Remove this 'addrinarray' abberation and use 'numpages' consistently. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tom.StDenis@amd.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20181203171043.695039210@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 52 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7d05149995dc..df4340c8e293 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1808,14 +1808,14 @@ out_err: } EXPORT_SYMBOL(set_memory_uc); -static int _set_memory_array(unsigned long *addr, int addrinarray, +static int _set_memory_array(unsigned long *addr, int numpages, enum page_cache_mode new_type) { enum page_cache_mode set_type; int i, j; int ret; - for (i = 0; i < addrinarray; i++) { + for (i = 0; i < numpages; i++) { ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, new_type, NULL); if (ret) @@ -1826,11 +1826,11 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
_PAGE_CACHE_MODE_UC_MINUS : new_type; - ret = change_page_attr_set(addr, addrinarray, + ret = change_page_attr_set(addr, numpages, cachemode2pgprot(set_type), 1); if (!ret && new_type == _PAGE_CACHE_MODE_WC) - ret = change_page_attr_set_clr(addr, addrinarray, + ret = change_page_attr_set_clr(addr, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), @@ -1847,21 +1847,21 @@ out_free: return ret; } -int set_memory_array_uc(unsigned long *addr, int addrinarray) +int set_memory_array_uc(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_memory_array_uc); -int set_memory_array_wc(unsigned long *addr, int addrinarray) +int set_memory_array_wc(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_memory_array_wc); -int set_memory_array_wt(unsigned long *addr, int addrinarray) +int set_memory_array_wt(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WT); } EXPORT_SYMBOL_GPL(set_memory_array_wt); @@ -1941,18 +1941,18 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); -int set_memory_array_wb(unsigned long *addr, int addrinarray) +int set_memory_array_wb(unsigned long *addr, int numpages) { int i; int ret; /* WB cache mode is hard wired to all cache attribute bits being 0 */ - ret = change_page_attr_clear(addr, addrinarray, + ret = change_page_attr_clear(addr, numpages, __pgprot(_PAGE_CACHE_MASK), 1); if (ret) return ret; - for (i = 0; i < addrinarray; i++) + for (i = 0; i < numpages; i++) free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); return 0; @@ -2082,7 +2082,7 @@ int set_pages_uc(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_uc); -static int _set_pages_array(struct page **pages, int addrinarray, +static int _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type) { unsigned long start; @@ -2092,7 +2092,7 @@ static int _set_pages_array(struct page **pages, int addrinarray, int free_idx; int ret; - for (i = 0; i < addrinarray; i++) { + for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; @@ -2105,10 +2105,10 @@ static int _set_pages_array(struct page **pages, int addrinarray, set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
_PAGE_CACHE_MODE_UC_MINUS : new_type; - ret = cpa_set_pages_array(pages, addrinarray, + ret = cpa_set_pages_array(pages, numpages, cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) - ret = change_page_attr_set_clr(NULL, addrinarray, + ret = change_page_attr_set_clr(NULL, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), @@ -2128,21 +2128,21 @@ err_out: return -EINVAL; } -int set_pages_array_uc(struct page **pages, int addrinarray) +int set_pages_array_uc(struct page **pages, int numpages) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); + return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); -int set_pages_array_wc(struct page **pages, int addrinarray) +int set_pages_array_wc(struct page **pages, int numpages) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); + return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); -int set_pages_array_wt(struct page **pages, int addrinarray) +int set_pages_array_wt(struct page **pages, int numpages) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); + return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WT); } EXPORT_SYMBOL_GPL(set_pages_array_wt); @@ -2154,7 +2154,7 @@ int set_pages_wb(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_wb); -int set_pages_array_wb(struct page **pages, int addrinarray) +int set_pages_array_wb(struct page **pages, int numpages) { int retval; unsigned long start; @@ -2162,12 +2162,12 @@ int set_pages_array_wb(struct page **pages, int addrinarray) int i; /* WB cache mode is hard wired to all cache attribute bits being 0 */ - retval = cpa_clear_pages_array(pages, addrinarray, + retval = cpa_clear_pages_array(pages, numpages, __pgprot(_PAGE_CACHE_MASK)); if (retval) return retval; - for (i = 0; i < addrinarray; i++) { + for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; From 6848ac7ca39a226ede5df7af0efcc4ef0611e99c Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 19 Nov 2018 10:43:34 -0500 Subject: [PATCH 29/29] x86/mm/dump_pagetables: Use DEFINE_SHOW_ATTRIBUTE() Use DEFINE_SHOW_ATTRIBUTE() instead of open coding it. 
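For reference, DEFINE_SHOW_ATTRIBUTE() is the seq_file helper from include/linux/seq_file.h: it generates the single_open() boilerplate from a <name>_show() function and names the result <name>_fops, which is also why the handlers below move from the ptdump_show_curknl() naming to ptdump_curknl_show(). A paraphrased sketch of the macro follows; the exact expansion may differ slightly between kernel versions:

#define DEFINE_SHOW_ATTRIBUTE(__name)					\
static int __name ## _open(struct inode *inode, struct file *file)	\
{									\
	/* Hand the matching <name>_show() callback to seq_file. */	\
	return single_open(file, __name ## _show, inode->i_private);	\
}									\
									\
static const struct file_operations __name ## _fops = {		\
	.owner		= THIS_MODULE,					\
	.open		= __name ## _open,				\
	.read		= seq_read,					\
	.llseek		= seq_lseek,					\
	.release	= single_release,				\
}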
Signed-off-by: Yangtao Li Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: keescook@chromium.org Cc: luto@kernel.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/20181119154334.18265-1-tiny.windzz@gmail.com --- arch/x86/mm/debug_pagetables.c | 58 ++++------------------------------ 1 file changed, 7 insertions(+), 51 deletions(-) diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 225fe2f0bfec..cd84f067e41d 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -10,20 +10,9 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); -} +DEFINE_SHOW_ATTRIBUTE(ptdump); -static const struct file_operations ptdump_fops = { - .owner = THIS_MODULE, - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int ptdump_show_curknl(struct seq_file *m, void *v) +static int ptdump_curknl_show(struct seq_file *m, void *v) { if (current->mm->pgd) { down_read(¤t->mm->mmap_sem); @@ -33,23 +22,12 @@ static int ptdump_show_curknl(struct seq_file *m, void *v) return 0; } -static int ptdump_open_curknl(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_curknl, NULL); -} - -static const struct file_operations ptdump_curknl_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_curknl, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); #ifdef CONFIG_PAGE_TABLE_ISOLATION static struct dentry *pe_curusr; -static int ptdump_show_curusr(struct seq_file *m, void *v) +static int ptdump_curusr_show(struct seq_file *m, void *v) { if (current->mm->pgd) { down_read(¤t->mm->mmap_sem); @@ -59,42 +37,20 @@ static int ptdump_show_curusr(struct seq_file *m, void *v) return 0; } -static int ptdump_open_curusr(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_curusr, NULL); -} - -static const struct file_operations ptdump_curusr_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_curusr, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_curusr); #endif #if defined(CONFIG_EFI) && defined(CONFIG_X86_64) static struct dentry *pe_efi; -static int ptdump_show_efi(struct seq_file *m, void *v) +static int ptdump_efi_show(struct seq_file *m, void *v) { if (efi_mm.pgd) ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false); return 0; } -static int ptdump_open_efi(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_efi, NULL); -} - -static const struct file_operations ptdump_efi_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_efi, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_efi); #endif static struct dentry *dir, *pe_knl, *pe_curknl;
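Because the macro derives the fops name from its argument, DEFINE_SHOW_ATTRIBUTE(ptdump_curknl) still produces ptdump_curknl_fops and so on, so the existing debugfs registration in this file keeps working without further changes. A minimal illustration of how such a generated fops is typically hooked up (the file name and mode here are assumptions for the example, not taken from the patch):

	/* Example only: expose the macro-generated ptdump_fops via debugfs. */
	pe_knl = debugfs_create_file("kernel_page_tables", 0400, dir,
				     NULL, &ptdump_fops);
	if (!pe_knl)
		return -ENOMEM;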