From 38480df564cc68f081bb38998927d164b9010995 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:05:59 +0200 Subject: [PATCH 01/25] KVM: arm64: pvtime: steal-time is only supported when configured Don't confuse the guest by saying steal-time is supported when it hasn't been configured by userspace and won't work. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-2-drjones@redhat.com --- arch/arm64/kvm/pvtime.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index f7b52ce1557e..c3ef4ebd6846 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -43,7 +43,8 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) switch (feature) { case ARM_SMCCC_HV_PV_TIME_FEATURES: case ARM_SMCCC_HV_PV_TIME_ST: - val = SMCCC_RET_SUCCESS; + if (vcpu->arch.steal.base != GPA_INVALID) + val = SMCCC_RET_SUCCESS; break; } From 2dbd780e34ac53e79c6c359ce12b89ed665ef562 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:00 +0200 Subject: [PATCH 02/25] KVM: arm64: pvtime: Fix potential loss of stolen time We should only check current->sched_info.run_delay once when updating stolen time. Otherwise there's a chance there could be a change between checks that we miss (preemption disabling comes after vcpu request checks). Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-3-drjones@redhat.com --- arch/arm64/kvm/pvtime.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index c3ef4ebd6846..95f9580275b1 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -13,6 +13,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; + u64 last_steal = vcpu->arch.steal.last_steal; u64 steal; __le64 steal_le; u64 offset; @@ -24,8 +25,8 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) /* Let's do the local bookkeeping */ steal = vcpu->arch.steal.steal; - steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal; - vcpu->arch.steal.last_steal = current->sched_info.run_delay; + vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); + steal += vcpu->arch.steal.last_steal - last_steal; vcpu->arch.steal.steal = steal; steal_le = cpu_to_le64(steal); From 4d2d4ce001f283ed8127173543b4cfb65641e357 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:01 +0200 Subject: [PATCH 03/25] KVM: arm64: Drop type input from kvm_put_guest We can use typeof() to avoid the need for the type input. Suggested-by: Marc Zyngier Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-4-drjones@redhat.com --- arch/arm64/kvm/pvtime.c | 2 +- include/linux/kvm_host.h | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 95f9580275b1..241ded7ee0ad 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -32,7 +32,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) steal_le = cpu_to_le64(steal); idx = srcu_read_lock(&kvm->srcu); offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); - kvm_put_guest(kvm, base + offset, steal_le, u64); + kvm_put_guest(kvm, base + offset, steal_le); srcu_read_unlock(&kvm->srcu, idx); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a23076765b4c..84371fb06209 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -749,25 +749,26 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); -#define __kvm_put_guest(kvm, gfn, offset, value, type) \ +#define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ - type __user *__uaddr = (type __user *)(__addr + offset); \ + typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ - __ret = put_user(value, __uaddr); \ + __ret = put_user(v, __uaddr); \ if (!__ret) \ mark_page_dirty(kvm, gfn); \ __ret; \ }) -#define kvm_put_guest(kvm, gpa, value, type) \ +#define kvm_put_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ + \ __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \ - offset_in_page(__gpa), (value), type); \ + offset_in_page(__gpa), v); \ }) int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); From 53f985584e3c2ebe5f2455530fbf87a001528db8 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:02 +0200 Subject: [PATCH 04/25] KVM: arm64: pvtime: Fix stolen time accounting across migration When updating the stolen time we should always read the current stolen time from the user provided memory, not from a kernel cache. If we use a cache then we'll end up resetting stolen time to zero on the first update after migration. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-5-drjones@redhat.com --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/pvtime.c | 25 ++++++++++--------------- include/linux/kvm_host.h | 20 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 65568b23868a..dd9c3b25aa1e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -368,7 +368,6 @@ struct kvm_vcpu_arch { /* Guest PV state */ struct { - u64 steal; u64 last_steal; gpa_t base; } steal; diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 241ded7ee0ad..75234321d896 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -13,26 +13,22 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - u64 last_steal = vcpu->arch.steal.last_steal; - u64 steal; - __le64 steal_le; - u64 offset; - int idx; u64 base = vcpu->arch.steal.base; + u64 last_steal = vcpu->arch.steal.last_steal; + u64 offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); + u64 steal = 0; + int idx; if (base == GPA_INVALID) return; - /* Let's do the local bookkeeping */ - steal = vcpu->arch.steal.steal; - vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); - steal += vcpu->arch.steal.last_steal - last_steal; - vcpu->arch.steal.steal = steal; - - steal_le = cpu_to_le64(steal); idx = srcu_read_lock(&kvm->srcu); - offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); - kvm_put_guest(kvm, base + offset, steal_le); + if (!kvm_get_guest(kvm, base + offset, steal)) { + steal = le64_to_cpu(steal); + vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); + steal += vcpu->arch.steal.last_steal - last_steal; + kvm_put_guest(kvm, base + offset, cpu_to_le64(steal)); + } srcu_read_unlock(&kvm->srcu, idx); } @@ -66,7 +62,6 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) * Start counting stolen time from the time the guest requests * the feature enabled. */ - vcpu->arch.steal.steal = 0; vcpu->arch.steal.last_steal = current->sched_info.run_delay; idx = srcu_read_lock(&kvm->srcu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84371fb06209..05e3c2fb3ef7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -749,6 +749,26 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); +#define __kvm_get_guest(kvm, gfn, offset, v) \ +({ \ + unsigned long __addr = gfn_to_hva(kvm, gfn); \ + typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ + int __ret = -EFAULT; \ + \ + if (!kvm_is_error_hva(__addr)) \ + __ret = get_user(v, __uaddr); \ + __ret; \ +}) + +#define kvm_get_guest(kvm, gpa, v) \ +({ \ + gpa_t __gpa = gpa; \ + struct kvm *__kvm = kvm; \ + \ + __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT, \ + offset_in_page(__gpa), v); \ +}) + #define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ From 739c7af7daeede8e2ec78392f2617c965ce0342a Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:03 +0200 Subject: [PATCH 05/25] KVM: Documentation: Minor fixups In preparation for documenting a new capability let's fix up the formatting of the current ones. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Reviewed-by: Steven Price Link: https://lore.kernel.org/r/20200804170604.42662-6-drjones@redhat.com --- Documentation/virt/kvm/api.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index eb3a1316f03e..49af23d2b462 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6130,7 +6130,7 @@ HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. 8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH ----------------------------------- -:Architecture: x86 +:Architectures: x86 This capability indicates that KVM running on top of Hyper-V hypervisor enables Direct TLB flush for its guests meaning that TLB flush @@ -6143,16 +6143,17 @@ in CPUID and only exposes Hyper-V identification. In this case, guest thinks it's running on Hyper-V and only use Hyper-V hypercalls. 8.22 KVM_CAP_S390_VCPU_RESETS +----------------------------- -Architectures: s390 +:Architectures: s390 This capability indicates that the KVM_S390_NORMAL_RESET and KVM_S390_CLEAR_RESET ioctls are available. 8.23 KVM_CAP_S390_PROTECTED +--------------------------- -Architecture: s390 - +:Architectures: s390 This capability indicates that the Ultravisor has been initialized and KVM can therefore start protected VMs. From 004a01241c5a0d375266ebf1c72f208de99294e9 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:04 +0200 Subject: [PATCH 06/25] arm64/x86: KVM: Introduce steal-time cap arm64 requires a vcpu fd (KVM_HAS_DEVICE_ATTR vcpu ioctl) to probe support for steal-time. However this is unnecessary, as only a KVM fd is required, and it complicates userspace (userspace may prefer delaying vcpu creation until after feature probing). Introduce a cap that can be checked instead. While x86 can already probe steal-time support with a kvm fd (KVM_GET_SUPPORTED_CPUID), we add the cap there too for consistency. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Reviewed-by: Steven Price Link: https://lore.kernel.org/r/20200804170604.42662-7-drjones@redhat.com --- Documentation/virt/kvm/api.rst | 13 +++++++++++++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 3 +++ arch/arm64/kvm/pvtime.c | 2 +- arch/x86/kvm/x86.c | 3 +++ include/uapi/linux/kvm.h | 1 + 6 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 49af23d2b462..d2b733dc7892 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6160,3 +6160,16 @@ KVM can therefore start protected VMs. This capability governs the KVM_S390_PV_COMMAND ioctl and the KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected guests when the state change is invalid. + +8.24 KVM_CAP_STEAL_TIME +----------------------- + +:Architectures: arm64, x86 + +This capability indicates that KVM supports steal time accounting. +When steal time accounting is supported it may be enabled with +architecture-specific interfaces. This capability and the architecture- +specific interfaces must be consistent, i.e. if one says the feature +is supported, than the other should as well and vice versa. For arm64 +see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL". +For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME". diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index dd9c3b25aa1e..af4989a25bb7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -543,6 +543,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); void kvm_update_stolen_time(struct kvm_vcpu *vcpu); +bool kvm_arm_pvtime_supported(void); int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 691d21e4c717..57876b0b870b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -206,6 +206,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = 1; break; + case KVM_CAP_STEAL_TIME: + r = kvm_arm_pvtime_supported(); + break; default: r = kvm_arch_vm_ioctl_check_extension(kvm, ext); break; diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 75234321d896..920ac43077ad 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -71,7 +71,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) return base; } -static bool kvm_arm_pvtime_supported(void) +bool kvm_arm_pvtime_supported(void) { return !!sched_info_on(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 599d73206299..c44d3a73b8eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3581,6 +3581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SMALLER_MAXPHYADDR: r = (int) allow_smaller_maxphyaddr; break; + case KVM_CAP_STEAL_TIME: + r = sched_info_on(); + break; default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f6d86033c4fa..3d8023474f2a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1035,6 +1035,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_LAST_CPU 184 #define KVM_CAP_SMALLER_MAXPHYADDR 185 #define KVM_CAP_S390_DIAG318 186 +#define KVM_CAP_STEAL_TIME 187 #ifdef KVM_CAP_IRQ_ROUTING From 3fb884ffe921c99483a84b0175f3c03f048e9069 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Sep 2020 11:18:29 +0100 Subject: [PATCH 07/25] KVM: arm64: Do not try to map PUDs when they are folded into PMD For the obscure cases where PMD and PUD are the same size (64kB pages with 42bit VA, for example, which results in only two levels of page tables), we can't map anything as a PUD, because there is... erm... no PUD to speak of. Everything is either a PMD or a PTE. So let's only try and map a PUD when its size is different from that of a PMD. Cc: stable@vger.kernel.org Fixes: b8e0ba7c8bea ("KVM: arm64: Add support for creating PUD hugepages at stage 2") Reported-by: Gavin Shan Reported-by: Eric Auger Reviewed-by: Alexandru Elisei Reviewed-by: Gavin Shan Tested-by: Gavin Shan Tested-by: Eric Auger Tested-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/mmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0121ef2c7c8d..16b8660ddbcc 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1964,7 +1964,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, (fault_status == FSC_PERM && stage2_is_exec(mmu, fault_ipa, vma_pagesize)); - if (vma_pagesize == PUD_SIZE) { + /* + * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and + * all we have is a 2-level page table. Trying to map a PUD in + * this case would be fatally wrong. + */ + if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) { pud_t new_pud = kvm_pfn_pud(pfn, mem_type); new_pud = kvm_pud_mkhuge(new_pud); From 376426b1a953762b00df887e28d29e44ab4ff723 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Sep 2020 11:53:03 +0100 Subject: [PATCH 08/25] KVM: arm64: Fix address truncation in traces Owing to their ARMv7 origins, the trace events are truncating most address values to 32bits. That's not really helpful. Expand the printing of such values to their full glory. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/trace_arm.h | 16 ++++++++-------- arch/arm64/kvm/trace_handle_exit.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index 4691053c5ee4..ff0444352bba 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -23,7 +23,7 @@ TRACE_EVENT(kvm_entry, __entry->vcpu_pc = vcpu_pc; ), - TP_printk("PC: 0x%08lx", __entry->vcpu_pc) + TP_printk("PC: 0x%016lx", __entry->vcpu_pc) ); TRACE_EVENT(kvm_exit, @@ -42,7 +42,7 @@ TRACE_EVENT(kvm_exit, __entry->vcpu_pc = vcpu_pc; ), - TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", + TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%016lx", __print_symbolic(__entry->ret, kvm_arm_exception_type), __entry->esr_ec, __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), @@ -69,7 +69,7 @@ TRACE_EVENT(kvm_guest_fault, __entry->ipa = ipa; ), - TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx", + TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#016lx", __entry->ipa, __entry->hsr, __entry->hxfar, __entry->vcpu_pc) ); @@ -131,7 +131,7 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->cpsr = cpsr; ), - TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)", + TP_printk("Emulate MMIO at: 0x%016lx (instr: %08lx, cpsr: %08lx)", __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); @@ -149,7 +149,7 @@ TRACE_EVENT(kvm_unmap_hva_range, __entry->end = end; ), - TP_printk("mmu notifier unmap range: %#08lx -- %#08lx", + TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", __entry->start, __entry->end) ); @@ -165,7 +165,7 @@ TRACE_EVENT(kvm_set_spte_hva, __entry->hva = hva; ), - TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva) + TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) ); TRACE_EVENT(kvm_age_hva, @@ -182,7 +182,7 @@ TRACE_EVENT(kvm_age_hva, __entry->end = end; ), - TP_printk("mmu notifier age hva: %#08lx -- %#08lx", + TP_printk("mmu notifier age hva: %#016lx -- %#016lx", __entry->start, __entry->end) ); @@ -198,7 +198,7 @@ TRACE_EVENT(kvm_test_age_hva, __entry->hva = hva; ), - TP_printk("mmu notifier test age hva: %#08lx", __entry->hva) + TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) ); TRACE_EVENT(kvm_set_way_flush, diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 2c56d1e0f5bd..8d78acc4fba7 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -22,7 +22,7 @@ TRACE_EVENT(kvm_wfx_arm64, __entry->is_wfe = is_wfe; ), - TP_printk("guest executed wf%c at: 0x%08lx", + TP_printk("guest executed wf%c at: 0x%016lx", __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) ); @@ -42,7 +42,7 @@ TRACE_EVENT(kvm_hvc_arm64, __entry->imm = imm; ), - TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)", + TP_printk("HVC at 0x%016lx (r0: 0x%016lx, imm: 0x%lx)", __entry->vcpu_pc, __entry->r0, __entry->imm) ); @@ -135,7 +135,7 @@ TRACE_EVENT(trap_reg, __entry->write_value = write_value; ), - TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) + TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) ); TRACE_EVENT(kvm_handle_sys_reg, From 7b75cd5128421c673153efb1236705696a1a9812 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 1 Sep 2020 14:33:56 +0100 Subject: [PATCH 09/25] KVM: arm64: Update page shift if stage 2 block mapping not supported Commit 196f878a7ac2e (" KVM: arm/arm64: Signal SIGBUS when stage2 discovers hwpoison memory") modifies user_mem_abort() to send a SIGBUS signal when the fault IPA maps to a hwpoisoned page. Commit 1559b7583ff6 ("KVM: arm/arm64: Re-check VMA on detecting a poisoned page") changed kvm_send_hwpoison_signal() to use the page shift instead of the VMA because at that point the code had already released the mmap lock, which means userspace could have modified the VMA. If userspace uses hugetlbfs for the VM memory, user_mem_abort() tries to map the guest fault IPA using block mappings in stage 2. That is not always possible, if, for example, userspace uses dirty page logging for the VM. Update the page shift appropriately in those cases when we downgrade the stage 2 entry from a block mapping to a page. Fixes: 1559b7583ff6 ("KVM: arm/arm64: Re-check VMA on detecting a poisoned page") Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20200901133357.52640-2-alexandru.elisei@arm.com --- arch/arm64/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 16b8660ddbcc..f58d657a898d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1871,6 +1871,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { force_pte = true; vma_pagesize = PAGE_SIZE; + vma_shift = PAGE_SHIFT; } /* From 43fea4e42599c3eb4109996698f5c25761d3f815 Mon Sep 17 00:00:00 2001 From: Peter Shier Date: Thu, 20 Aug 2020 16:05:45 -0700 Subject: [PATCH 10/25] KVM: nVMX: Update VMCS02 when L2 PAE PDPTE updates detected When L2 uses PAE, L0 intercepts of L2 writes to CR0/CR3/CR4 call load_pdptrs to read the possibly updated PDPTEs from the guest physical address referenced by CR3. It loads them into vcpu->arch.walk_mmu->pdptrs and sets VCPU_EXREG_PDPTR in vcpu->arch.regs_dirty. At the subsequent assumed reentry into L2, the mmu will call vmx_load_mmu_pgd which calls ept_load_pdptrs. ept_load_pdptrs sees VCPU_EXREG_PDPTR set in vcpu->arch.regs_dirty and loads VMCS02.GUEST_PDPTRn from vcpu->arch.walk_mmu->pdptrs[]. This all works if the L2 CRn write intercept always resumes L2. The resume path calls vmx_check_nested_events which checks for exceptions, MTF, and expired VMX preemption timers. If vmx_check_nested_events finds any of these conditions pending it will reflect the corresponding exit into L1. Live migration at this point would also cause a missed immediate reentry into L2. After L1 exits, vmx_vcpu_run calls vmx_register_cache_reset which clears VCPU_EXREG_PDPTR in vcpu->arch.regs_dirty. When L2 next resumes, ept_load_pdptrs finds VCPU_EXREG_PDPTR clear in vcpu->arch.regs_dirty and does not load VMCS02.GUEST_PDPTRn from vcpu->arch.walk_mmu->pdptrs[]. prepare_vmcs02 will then load VMCS02.GUEST_PDPTRn from vmcs12->pdptr0/1/2/3 which contain the stale values stored at last L2 exit. A repro of this bug showed L2 entering triple fault immediately due to the bad VMCS02.GUEST_PDPTRn values. When L2 is in PAE paging mode add a call to ept_load_pdptrs before leaving L2. This will update VMCS02.GUEST_PDPTRn if they are dirty in vcpu->arch.walk_mmu->pdptrs[]. Tested: kvm-unit-tests with new directed test: vmx_mtf_pdpte_test. Verified that test fails without the fix. Also ran Google internal VMM with an Ubuntu 16.04 4.4.0-83 guest running a custom hypervisor with a 32-bit Windows XP L2 guest using PAE. Prior to fix would repro readily. Ran 14 simultaneous L2s for 140 iterations with no failures. Signed-off-by: Peter Shier Reviewed-by: Jim Mattson Message-Id: <20200820230545.2411347-1-pshier@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++++++ arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/vmx/vmx.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 23b58c28a1c9..d7482ccf6a8d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4404,6 +4404,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); + /* + * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between + * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are + * up-to-date before switching to L1. + */ + if (enable_ept && is_pae_paging(vcpu)) + vmx_ept_load_pdptrs(vcpu); + leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 46ba2e03a892..19a599bebd5c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2971,7 +2971,7 @@ static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) vpid_sync_context(to_vmx(vcpu)->vpid); } -static void ept_load_pdptrs(struct kvm_vcpu *vcpu) +void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -3114,7 +3114,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, guest_cr3 = vcpu->arch.cr3; else /* vmcs01.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; - ept_load_pdptrs(vcpu); + vmx_ept_load_pdptrs(vcpu); } else { guest_cr3 = pgd; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 26175a4759fa..a2f82127c170 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -356,6 +356,7 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); +void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 From 0f990222108d214a0924d920e6095b58107d7b59 Mon Sep 17 00:00:00 2001 From: Haiwei Li Date: Tue, 1 Sep 2020 19:41:37 +0800 Subject: [PATCH 11/25] KVM: Check the allocation of pv cpu mask check the allocation of per-cpu __pv_cpu_mask. Initialize ops only when successful. Signed-off-by: Haiwei Li Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 08320b0b2b27..9e7dd3a96873 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -654,7 +654,6 @@ static void __init kvm_guest_init(void) } if (pv_tlb_flush_supported()) { - pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; pv_ops.mmu.tlb_remove_table = tlb_remove_table; pr_info("KVM setup pv remote TLB flush\n"); } @@ -767,6 +766,14 @@ static __init int activate_jump_labels(void) } arch_initcall(activate_jump_labels); +static void kvm_free_pv_cpu_mask(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + free_cpumask_var(per_cpu(__pv_cpu_mask, cpu)); +} + static __init int kvm_alloc_cpumask(void) { int cpu; @@ -785,11 +792,20 @@ static __init int kvm_alloc_cpumask(void) if (alloc) for_each_possible_cpu(cpu) { - zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), - GFP_KERNEL, cpu_to_node(cpu)); + if (!zalloc_cpumask_var_node( + per_cpu_ptr(&__pv_cpu_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu))) { + goto zalloc_cpumask_fail; + } } + apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; return 0; + +zalloc_cpumask_fail: + kvm_free_pv_cpu_mask(); + return -ENOMEM; } arch_initcall(kvm_alloc_cpumask); From f65886606c2d3b562716de030706dfe1bea4ed5e Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Mon, 7 Sep 2020 11:55:35 -0700 Subject: [PATCH 12/25] KVM: fix memory leak in kvm_io_bus_unregister_dev() when kmalloc() fails in kvm_io_bus_unregister_dev(), before removing the bus, we should iterate over all other devices linked to it and call kvm_iodevice_destructor() for them Fixes: 90db10434b16 ("KVM: kvm_io_bus_unregister_dev() should never fail") Cc: stable@vger.kernel.org Reported-and-tested-by: syzbot+f196caa45793d6374707@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=f196caa45793d6374707 Signed-off-by: Rustam Kovhaev Reviewed-by: Vitaly Kuznetsov Message-Id: <20200907185535.233114-1-rkovhaev@gmail.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 67cd0b88a6b6..cf88233b819a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4332,7 +4332,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev) { - int i; + int i, j; struct kvm_io_bus *new_bus, *bus; bus = kvm_get_bus(kvm, bus_idx); @@ -4349,17 +4349,20 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), GFP_KERNEL_ACCOUNT); - if (!new_bus) { + if (new_bus) { + memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); + new_bus->dev_count--; + memcpy(new_bus->range + i, bus->range + i + 1, + (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); + } else { pr_err("kvm: failed to shrink bus, removing it completely\n"); - goto broken; + for (j = 0; j < bus->dev_count; j++) { + if (j == i) + continue; + kvm_iodevice_destructor(bus->range[j].dev); + } } - memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); - new_bus->dev_count--; - memcpy(new_bus->range + i, bus->range + i + 1, - (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); - -broken: rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); From c6b177a3beb9140dc0ba05b61c5142fcec5f2bf7 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Fri, 28 Aug 2020 16:56:21 +0800 Subject: [PATCH 13/25] KVM: nVMX: Fix the update value of nested load IA32_PERF_GLOBAL_CTRL control A minor fix for the update of VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL field in exit_ctls_high. Fixes: 03a8871add95 ("KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL VM-{Entry,Exit} control") Signed-off-by: Chenyi Qiang Reviewed-by: Xiaoyao Li Message-Id: <20200828085622.8365-5-chenyi.qiang@intel.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d7482ccf6a8d..1bb6b31eb646 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4676,7 +4676,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high &= - ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; } } From f6f6195b888c28a0b59ceb0562daff92a2be86c3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 2 Sep 2020 21:54:21 +0800 Subject: [PATCH 14/25] kvm x86/mmu: use KVM_REQ_MMU_SYNC to sync when needed When kvm_mmu_get_page() gets a page with unsynced children, the spt pagetable is unsynchronized with the guest pagetable. But the guest might not issue a "flush" operation on it when the pagetable entry is changed from zero or other cases. The hypervisor has the responsibility to synchronize the pagetables. KVM behaved as above for many years, But commit 8c8560b83390 ("KVM: x86/mmu: Use KVM_REQ_TLB_FLUSH_CURRENT for MMU specific flushes") inadvertently included a line of code to change it without giving any reason in the changelog. It is clear that the commit's intention was to change KVM_REQ_TLB_FLUSH -> KVM_REQ_TLB_FLUSH_CURRENT, so we don't needlessly flush other contexts; however, one of the hunks changed a nearby KVM_REQ_MMU_SYNC instead. This patch changes it back. Link: https://lore.kernel.org/lkml/20200320212833.3507-26-sean.j.christopherson@intel.com/ Cc: Sean Christopherson Cc: Vitaly Kuznetsov Signed-off-by: Lai Jiangshan Message-Id: <20200902135421.31158-1-jiangshanlai@gmail.com> fixes: 8c8560b83390 ("KVM: x86/mmu: Use KVM_REQ_TLB_FLUSH_CURRENT for MMU specific flushes") Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a5d0207e7189..76c5826e29a2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2469,7 +2469,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } if (sp->unsync_children) - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); __clear_sp_write_flooding_count(sp); From 15e9e35cd1dec2bc138464de6bf8ef828df19235 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 10 Sep 2020 18:33:51 +0800 Subject: [PATCH 15/25] KVM: MIPS: Change the definition of kvm type MIPS defines two kvm types: #define KVM_VM_MIPS_TE 0 #define KVM_VM_MIPS_VZ 1 In Documentation/virt/kvm/api.rst it is said that "You probably want to use 0 as machine type", which implies that type 0 be the "automatic" or "default" type. And, in user-space libvirt use the null-machine (with type 0) to detect the kvm capability, which returns "KVM not supported" on a VZ platform. I try to fix it in QEMU but it is ugly: https://lists.nongnu.org/archive/html/qemu-devel/2020-08/msg05629.html And Thomas Huth suggests me to change the definition of kvm type: https://lists.nongnu.org/archive/html/qemu-devel/2020-09/msg03281.html So I define like this: #define KVM_VM_MIPS_AUTO 0 #define KVM_VM_MIPS_VZ 1 #define KVM_VM_MIPS_TE 2 Since VZ and TE cannot co-exists, using type 0 on a TE platform will still return success (so old user-space tools have no problems on new kernels); the advantage is that using type 0 on a VZ platform will not return failure. So, the only problem is "new user-space tools use type 2 on old kernels", but if we treat this as a kernel bug, we can backport this patch to old stable kernels. Signed-off-by: Huacai Chen Message-Id: <1599734031-28746-1-git-send-email-chenhc@lemote.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 2 ++ include/uapi/linux/kvm.h | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7de85d2253ff..0c50ac444222 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -137,6 +137,8 @@ extern void kvm_init_loongson_ipi(struct kvm *kvm); int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { switch (type) { + case KVM_VM_MIPS_AUTO: + break; #ifdef CONFIG_KVM_MIPS_VZ case KVM_VM_MIPS_VZ: #else diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3d8023474f2a..7d8eced6f459 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -790,9 +790,10 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_PPC_HV 1 #define KVM_VM_PPC_PR 2 -/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */ -#define KVM_VM_MIPS_TE 0 +/* on MIPS, 0 indicates auto, 1 forces VZ ASE, 2 forces trap & emulate */ +#define KVM_VM_MIPS_AUTO 0 #define KVM_VM_MIPS_VZ 1 +#define KVM_VM_MIPS_TE 2 #define KVM_S390_SIE_PAGE_OFFSET 1 From 7be74942f184fdfba34ddd19a0d995deb34d4a03 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Aug 2020 12:56:28 -0700 Subject: [PATCH 16/25] KVM: SVM: Periodically schedule when unregistering regions on destroy There may be many encrypted regions that need to be unregistered when a SEV VM is destroyed. This can lead to soft lockups. For example, on a host running 4.15: watchdog: BUG: soft lockup - CPU#206 stuck for 11s! [t_virtual_machi:194348] CPU: 206 PID: 194348 Comm: t_virtual_machi RIP: 0010:free_unref_page_list+0x105/0x170 ... Call Trace: [<0>] release_pages+0x159/0x3d0 [<0>] sev_unpin_memory+0x2c/0x50 [kvm_amd] [<0>] __unregister_enc_region_locked+0x2f/0x70 [kvm_amd] [<0>] svm_vm_destroy+0xa9/0x200 [kvm_amd] [<0>] kvm_arch_destroy_vm+0x47/0x200 [<0>] kvm_put_kvm+0x1a8/0x2f0 [<0>] kvm_vm_release+0x25/0x30 [<0>] do_exit+0x335/0xc10 [<0>] do_group_exit+0x3f/0xa0 [<0>] get_signal+0x1bc/0x670 [<0>] do_signal+0x31/0x130 Although the CLFLUSH is no longer issued on every encrypted region to be unregistered, there are no other changes that can prevent soft lockups for very large SEV VMs in the latest kernel. Periodically schedule if necessary. This still holds kvm->lock across the resched, but since this only happens when the VM is destroyed this is assumed to be acceptable. Signed-off-by: David Rientjes Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 402dc4234e39..7bf7bf734979 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1106,6 +1106,7 @@ void sev_vm_destroy(struct kvm *kvm) list_for_each_safe(pos, q, head) { __unregister_enc_region_locked(kvm, list_entry(pos, struct enc_region, list)); + cond_resched(); } } From d831de177217cd494bfb99f2c849a0d40c2a7890 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 11 Sep 2020 11:31:47 +0200 Subject: [PATCH 17/25] KVM: x86: always allow writing '0' to MSR_KVM_ASYNC_PF_EN Even without in-kernel LAPIC we should allow writing '0' to MSR_KVM_ASYNC_PF_EN as we're not enabling the mechanism. In particular, QEMU with 'kernel-irqchip=off' fails to start a guest with qemu-system-x86_64: error: failed to set MSR 0x4b564d02 to 0x0 Fixes: 9d3c447c72fb2 ("KVM: X86: Fix async pf caused null-ptr-deref") Reported-by: Dr. David Alan Gilbert Signed-off-by: Vitaly Kuznetsov Message-Id: <20200911093147.484565-1-vkuznets@redhat.com> [Actually commit the version proposed by Sean Christopherson. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 432ef34b9ea9..e3de0fe5af37 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2735,7 +2735,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 1; if (!lapic_in_kernel(vcpu)) - return 1; + return data ? 1 : 0; vcpu->arch.apf.msr_en_val = data; From e42c68281b444f9a20d72a062f8c6fd0d31e4de8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sat, 12 Sep 2020 02:16:39 -0400 Subject: [PATCH 18/25] KVM: SVM: avoid emulation with stale next_rip svm->next_rip is reset in svm_vcpu_run() only after calling svm_exit_handlers_fastpath(), which will cause SVM's skip_emulated_instruction() to write a stale RIP. We can move svm_exit_handlers_fastpath towards the end of svm_vcpu_run(). To align VMX with SVM, keep svm_complete_interrupts() close as well. Suggested-by: Sean Christopherson Cc: Paul K. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Wanpeng Li [Also move vmcb_mark_all_clean before any possible write to the VMCB. - Paolo] --- arch/x86/kvm/svm/svm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 03dd7bac8034..8ba4a32843fa 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; @@ -3504,7 +3502,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@ -3518,6 +3515,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + vmcb_mark_all_clean(svm->vmcb); /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) @@ -3537,7 +3535,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); - vmcb_mark_all_clean(svm->vmcb); + svm_complete_interrupts(svm); + exit_fastpath = svm_exit_handlers_fastpath(vcpu); return exit_fastpath; } From 99b82a1437cb31340dbb2c437a2923b9814a7b15 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 19 Aug 2020 16:55:27 +0800 Subject: [PATCH 19/25] KVM: VMX: Don't freeze guest when event delivery causes an APIC-access exit According to SDM 27.2.4, Event delivery causes an APIC-access VM exit. Don't report internal error and freeze guest when event delivery causes an APIC-access exit, it is handleable and the event will be re-injected during the next vmentry. Signed-off-by: Wanpeng Li Message-Id: <1597827327-25055-2-git-send-email-wanpengli@tencent.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 19a599bebd5c..75cd720c9e8c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6054,6 +6054,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_APIC_ACCESS && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; From 244081f9073fe934adbcb2db6496b91b8fc51655 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 8 Sep 2020 15:53:49 +0200 Subject: [PATCH 20/25] x86/kvm: properly use DEFINE_IDTENTRY_SYSVEC() macro DEFINE_IDTENTRY_SYSVEC() already contains irqentry_enter()/ irqentry_exit(). Signed-off-by: Vitaly Kuznetsov Message-Id: <20200908135350.355053-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9e7dd3a96873..02d15485ff1d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -270,9 +270,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); u32 token; - irqentry_state_t state; - - state = irqentry_enter(regs); inc_irq_stat(irq_hv_callback_count); @@ -283,7 +280,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); } - irqentry_exit(regs, state); set_irq_regs(old_regs); } From cc17b22559d9b9c8b7540810df172f3d7af901ce Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 8 Sep 2020 15:53:50 +0200 Subject: [PATCH 21/25] x86/kvm: don't forget to ACK async PF IRQ Merge commit 26d05b368a5c0 ("Merge branch 'kvm-async-pf-int' into HEAD") tried to adapt the new interrupt based async PF mechanism to the newly introduced IDTENTRY magic but unfortunately it missed the fact that DEFINE_IDTENTRY_SYSVEC() doesn't call ack_APIC_irq() on its own and all DEFINE_IDTENTRY_SYSVEC() users have to call it manually. As the result all multi-CPU KVM guest hang on boot when KVM_FEATURE_ASYNC_PF_INT is present. The breakage went unnoticed because no KVM userspace (e.g. QEMU) currently set it (and thus async PF mechanism is currently disabled) but we're about to change that. Fixes: 26d05b368a5c0 ("Merge branch 'kvm-async-pf-int' into HEAD") Signed-off-by: Vitaly Kuznetsov Message-Id: <20200908135350.355053-3-vkuznets@redhat.com> Tested-by: Ingo Molnar Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 02d15485ff1d..1b51b727b140 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -271,6 +271,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) struct pt_regs *old_regs = set_irq_regs(regs); u32 token; + ack_APIC_irq(); + inc_irq_stat(irq_hv_callback_count); if (__this_cpu_read(apf_reason.enabled)) { From 9883764ad0ce037c554ac0ef302dcf671f8d1ccb Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 19:27:18 +0300 Subject: [PATCH 22/25] SVM: nSVM: correctly restore GIF on vmexit from nesting after migration Currently code in svm_set_nested_state copies the current vmcb control area to L1 control area (hsave->control), under assumption that it mostly reflects the defaults that kvm choose, and later qemu overrides these defaults with L2 state using standard KVM interfaces, like KVM_SET_REGS. However nested GIF (which is AMD specific thing) is by default is true, and it is copied to hsave area as such. This alone is not a big deal since on VMexit, GIF is always set to false, regardless of what it was on VM entry. However in nested_svm_vmexit we were first were setting GIF to false, but then we overwrite the control fields with value from the hsave area. (including the nested GIF field itself if GIF virtualization is enabled). Now on normal vm entry this is not a problem, since GIF is usually false prior to normal vm entry, and this is the value that copied to hsave, and then restored, but this is not always the case when the nested state is loaded as explained above. To fix this issue, move svm_set_gif after we restore the L1 control state in nested_svm_vmexit, so that even with wrong GIF in the saved L1 control area, we still clear GIF as the spec says. Signed-off-by: Maxim Levitsky Message-Id: <20200827162720.278690-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index fb68467e6049..95fdf068fe4c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -586,7 +586,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; /* Give the current vmcb to the guest */ - svm_set_gif(svm, false); nested_vmcb->save.es = vmcb->save.es; nested_vmcb->save.cs = vmcb->save.cs; @@ -632,6 +631,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Restore the original control entries */ copy_vmcb_control_area(&vmcb->control, &hsave->control); + /* On vmexit the GIF is set to false */ + svm_set_gif(svm, false); + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; From 772b81bb2f9b191a046ba7bba1f232eb7b109b84 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 19:27:19 +0300 Subject: [PATCH 23/25] SVM: nSVM: setup nested msr permission bitmap on nested state load This code was missing and was forcing the L2 run with L1's msr permission bitmap Signed-off-by: Maxim Levitsky Message-Id: <20200827162720.278690-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 95fdf068fe4c..e90bc436f584 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1134,6 +1134,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, load_nested_vmcb_control(svm, &ctl); nested_prepare_vmcb_control(svm); + if (!nested_svm_vmrun_msrpm(svm)) + return -EINVAL; + out_set_gif: svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); return 0; From 3ebb5d2617fbf45567975f878232178c5b292d58 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 19:27:20 +0300 Subject: [PATCH 24/25] KVM: nSVM: more strict SMM checks when returning to nested guest * check that guest is 64 bit guest, otherwise the SVM related fields in the smm state area are not defined * If the SMM area indicates that SMM interrupted a running guest, check that EFER.SVME which is also saved in this area is set, otherwise the guest might have tampered with SMM save area, and so indicate emulation failure which should triple fault the guest. * Check that that guest CPUID supports SVM (due to the same issue as above) Signed-off-by: Maxim Levitsky Message-Id: <20200827162720.278690-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8ba4a32843fa..5764b87379cf 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3899,21 +3899,28 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *nested_vmcb; struct kvm_host_map map; - u64 guest; - u64 vmcb; int ret = 0; - guest = GET_SMSTATE(u64, smstate, 0x7ed8); - vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); + u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); + u64 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); - if (guest) { - if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) - return 1; - nested_vmcb = map.hva; - ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + if (guest) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + return 1; + + if (!(saved_efer & EFER_SVME)) + return 1; + + if (kvm_vcpu_map(&svm->vcpu, + gpa_to_gfn(vmcb), &map) == -EINVAL) + return 1; + + ret = enter_svm_guest_mode(svm, vmcb, map.hva); + kvm_vcpu_unmap(&svm->vcpu, &map, true); + } } return ret; From 37f66bbef0920429b8cb5eddba849ec4308a9f8e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 20:11:44 +0300 Subject: [PATCH 25/25] KVM: emulator: more strict rsm checks. Don't ignore return values in rsm_load_state_64/32 to avoid loading invalid state from SMM state area if it was tampered with by the guest. This is primarly intended to avoid letting guest set bits in EFER (like EFER.SVME when nesting is disabled) by manipulating SMM save area. Signed-off-by: Maxim Levitsky Message-Id: <20200827171145.374620-8-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d0e2825ae617..1d450d7710d6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2505,9 +2505,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); - ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + return X86EMUL_UNHANDLEABLE; + val = GET_SMSTATE(u32, smstate, 0x7fc8); - ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + return X86EMUL_UNHANDLEABLE; selector = GET_SMSTATE(u32, smstate, 0x7fc4); set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); @@ -2560,16 +2565,23 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; val = GET_SMSTATE(u32, smstate, 0x7f68); - ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + return X86EMUL_UNHANDLEABLE; + val = GET_SMSTATE(u32, smstate, 0x7f60); - ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + return X86EMUL_UNHANDLEABLE; cr0 = GET_SMSTATE(u64, smstate, 0x7f58); cr3 = GET_SMSTATE(u64, smstate, 0x7f50); cr4 = GET_SMSTATE(u64, smstate, 0x7f48); ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); val = GET_SMSTATE(u64, smstate, 0x7ed0); - ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); + + if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA)) + return X86EMUL_UNHANDLEABLE; selector = GET_SMSTATE(u32, smstate, 0x7e90); rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);