diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a89fdbc1f0be..03663740c866 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -421,7 +421,7 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) regs->ip = landing_pad; /* - * Fetch ECX from where the vDSO stashed it. + * Fetch EBP from where the vDSO stashed it. * * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! */ @@ -432,10 +432,10 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) * Micro-optimization: the pointer we're following is explicitly * 32 bits, so it can't be out of range. */ - __get_user(*(u32 *)®s->cx, + __get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp) #else - get_user(*(u32 *)®s->cx, + get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp) #endif ) { diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3eb572ed3d7a..f3b6d54e0042 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -292,7 +292,7 @@ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ - pushl %ecx /* pt_regs->cx */ + pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ @@ -308,8 +308,9 @@ sysenter_past_esp: movl %esp, %eax call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index c3201830a85e..6a1ae3751e82 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -63,7 +63,7 @@ ENTRY(entry_SYSENTER_compat) /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rcx /* pt_regs->sp */ + pushq %rbp /* pt_regs->sp (stashed in bp) */ /* * Push flags. This is nasty. First, interrupts are currently @@ -82,14 +82,14 @@ ENTRY(entry_SYSENTER_compat) pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx (will be overwritten) */ + pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ pushq %r8 /* pt_regs->r8 = 0 */ pushq %r8 /* pt_regs->r9 = 0 */ pushq %r8 /* pt_regs->r10 = 0 */ pushq %r8 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ - pushq %rbp /* pt_regs->rbp */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ pushq %r8 /* pt_regs->r12 = 0 */ pushq %r8 /* pt_regs->r13 = 0 */ pushq %r8 /* pt_regs->r14 = 0 */ @@ -121,8 +121,9 @@ sysenter_flags_fixed: movq %rsp, %rdi call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV jmp sysret32_from_system_call sysenter_fix_flags: @@ -178,7 +179,7 @@ ENTRY(entry_SYSCALL_compat) pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx (will be overwritten) */ + pushq %rbp /* pt_regs->cx (stashed in bp) */ pushq $-ENOSYS /* pt_regs->ax */ xorq %r8,%r8 pushq %r8 /* pt_regs->r8 = 0 */ @@ -186,7 +187,7 @@ ENTRY(entry_SYSCALL_compat) pushq %r8 /* pt_regs->r10 = 0 */ pushq %r8 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ - pushq %rbp /* pt_regs->rbp */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ pushq %r8 /* pt_regs->r12 = 0 */ pushq %r8 /* pt_regs->r13 = 0 */ pushq %r8 /* pt_regs->r14 = 0 */ @@ -200,8 +201,9 @@ ENTRY(entry_SYSCALL_compat) movq %rsp, %rdi call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV /* Opportunistic SYSRET */ sysret32_from_system_call: diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 93bd8452383f..3a1d9297074b 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -1,5 +1,5 @@ /* - * Code for the vDSO. This version uses the old int $0x80 method. + * AT_SYSINFO entry point */ #include @@ -21,35 +21,67 @@ __kernel_vsyscall: /* * Reshuffle regs so that all of any of the entry instructions * will preserve enough state. + * + * A really nice entry sequence would be: + * pushl %edx + * pushl %ecx + * movl %esp, %ecx + * + * Unfortunately, naughty Android versions between July and December + * 2015 actually hardcode the traditional Linux SYSENTER entry + * sequence. That is severely broken for a number of reasons (ask + * anyone with an AMD CPU, for example). Nonetheless, we try to keep + * it working approximately as well as it ever worked. + * + * This link may eludicate some of the history: + * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 + * personally, I find it hard to understand what's going on there. + * + * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE. + * Execute an indirect call to the address in the AT_SYSINFO auxv + * entry. That is the ONLY correct way to make a fast 32-bit system + * call on Linux. (Open-coding int $0x80 is also fine, but it's + * slow.) */ - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx, 0 pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx, 0 - movl %esp, %ecx + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + + #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter" + #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall" #ifdef CONFIG_X86_64 /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ - ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \ - "syscall", X86_FEATURE_SYSCALL32 + ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \ + SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 #else - ALTERNATIVE "", "sysenter", X86_FEATURE_SEP + ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP #endif /* Enter using int $0x80 */ - movl (%esp), %ecx int $0x80 GLOBAL(int80_landing_pad) - /* Restore ECX and EDX in case they were clobbered. */ - popl %ecx - CFI_RESTORE ecx + /* + * Restore EDX and ECX in case they were clobbered. EBP is not + * clobbered (the kernel restores it), but it's cleaner and + * probably faster to pop it than to adjust ESP using addl. + */ + popl %ebp + CFI_RESTORE ebp CFI_ADJUST_CFA_OFFSET -4 popl %edx CFI_RESTORE edx CFI_ADJUST_CFA_OFFSET -4 + popl %ecx + CFI_RESTORE ecx + CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e4f8010f22e0..f7ba9fbf12ee 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -216,6 +216,7 @@ #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 10d0596433f8..c759b3cca663 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -19,6 +19,12 @@ static inline int paravirt_enabled(void) return pv_info.paravirt_enabled; } +static inline int paravirt_has_feature(unsigned int feature) +{ + WARN_ON_ONCE(!pv_info.paravirt_enabled); + return (pv_info.features & feature); +} + static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 31247b5bff7c..3d44191185f8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -70,9 +70,14 @@ struct pv_info { #endif int paravirt_enabled; + unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; +#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) +/* Supported features */ +#define PV_SUPPORTED_RTC (1<<0) + struct pv_init_ops { /* * Patch may replace one of the defined code sequences with diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 67522256c7ff..2d5a50cb61a2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -472,6 +472,7 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid #define paravirt_enabled() 0 +#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 38dd5efdd04c..2bd2292a316d 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -193,20 +193,17 @@ static int __init numachip_system_init(void) case 1: init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); numachip_apic_icr_write = numachip1_apic_icr_write; - x86_init.pci.arch_init = pci_numachip_init; break; case 2: init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE); numachip_apic_icr_write = numachip2_apic_icr_write; - - /* Use MCFG config cycles rather than locked CF8 cycles */ - raw_pci_ops = &pci_mmcfg; break; default: return 0; } x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c5b0d562dbf5..7e8a736d09db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -999,6 +999,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) int flags = MF_ACTION_REQUIRED; int lmce = 0; + /* If this CPU is offline, just bail out. */ + if (cpu_is_offline(smp_processor_id())) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return; + } + } + ist_enter(regs); this_cpu_inc(mce_exception_count); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index cd9685235df9..4af8d063fb36 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void) } #endif + if (paravirt_enabled() && !paravirt_has(RTC)) + return -ENODEV; + platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a0d09f6c6533..a43b2eafc466 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1414,6 +1414,7 @@ __init void lguest_init(void) pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; + pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5774800ff583..b7de78bdc09c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1192,7 +1192,7 @@ static const struct pv_info xen_info __initconst = { #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - + .features = 0, .name = "Xen", }; @@ -1535,6 +1535,8 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; + if (xen_initial_domain()) + pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; pv_apic_ops = xen_apic_ops; if (!xen_pvh_domain()) { @@ -1886,8 +1888,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); static void xen_set_cpu_features(struct cpuinfo_x86 *c) { - if (xen_pv_domain()) + if (xen_pv_domain()) { clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + set_cpu_cap(c, X86_FEATURE_XENPV); + } } const struct hypervisor_x86 x86_hyper_xen = {