From acde25726bc6034b628febb8a4c6c0838736ccbf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 May 2017 16:39:41 +1000 Subject: [PATCH 01/27] KVM: PPC: Book3S HV: Add radix checks in real-mode hypercall handlers POWER9 running a radix guest will take some hypervisor interrupts without going to real mode (turning off the MMU). This means that early hypercall handlers may now be called in virtual mode. Most of the handlers work just fine in both modes, but there are some that can crash the host if called in virtual mode, notably the TCE (IOMMU) hypercalls H_PUT_TCE, H_STUFF_TCE and H_PUT_TCE_INDIRECT. These already have both a real-mode and a virtual-mode version, so we arrange for the real-mode version to return H_TOO_HARD for radix guests, which will result in the virtual-mode version being called. The other hypercall which is sensitive to the MMU mode is H_RANDOM. It doesn't have a virtual-mode version, so this adds code to enable it to be called in either mode. An alternative solution was considered which would refuse to call any of the early hypercall handlers when doing a virtual-mode exit from a radix guest. However, the XICS-on-XIVE code depends on the XICS hypercalls being handled early even for virtual-mode exits, because the handlers need to be called before the XIVE vCPU state has been pulled off the hardware. Therefore that solution would have become quite invasive and complicated, and was rejected in favour of the simpler, though less elegant, solution presented here. Reviewed-by: David Gibson Tested-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio_hv.c | 13 +++++++++++++ arch/powerpc/kvm/book3s_hv_builtin.c | 9 ++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index eda0a8f6fae8..3adfd2f5301c 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -301,6 +301,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + /* For radix, we might be in virtual mode, so punt */ + if (kvm_is_radix(vcpu->kvm)) + return H_TOO_HARD; + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -381,6 +385,10 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, bool prereg = false; struct kvmppc_spapr_tce_iommu_table *stit; + /* For radix, we might be in virtual mode, so punt */ + if (kvm_is_radix(vcpu->kvm)) + return H_TOO_HARD; + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -491,6 +499,10 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, long i, ret; struct kvmppc_spapr_tce_iommu_table *stit; + /* For radix, we might be in virtual mode, so punt */ + if (kvm_is_radix(vcpu->kvm)) + return H_TOO_HARD; + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -527,6 +539,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, return H_SUCCESS; } +/* This can be called in either virtual mode or real mode */ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 88a65923c649..ee4c2558c305 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -207,7 +207,14 @@ EXPORT_SYMBOL_GPL(kvmppc_hwrng_present); long kvmppc_h_random(struct kvm_vcpu *vcpu) { - if (powernv_get_random_real_mode(&vcpu->arch.gpr[4])) + int r; + + /* Only need to do the expensive mfmsr() on radix */ + if (kvm_is_radix(vcpu->kvm) && (mfmsr() & MSR_IR)) + r = powernv_get_random_long(&vcpu->arch.gpr[4]); + else + r = powernv_get_random_real_mode(&vcpu->arch.gpr[4]); + if (r) return H_SUCCESS; return H_HARDWARE; From 67325e988faea735d663799b6d152b5f4254093c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 May 2017 11:33:30 +1000 Subject: [PATCH 02/27] KVM: PPC: Book3S PR: Check copy_to/from_user return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PR KVM implementation of the PAPR HPT hypercalls (H_ENTER etc.) access an image of the HPT in userspace memory using copy_from_user and copy_to_user. Recently, the declarations of those functions were annotated to indicate that the return value must be checked. Since this code doesn't currently check the return value, this causes compile warnings like the ones shown below, and since on PPC the default is to compile arch/powerpc with -Werror, this causes the build to fail. To fix this, we check the return values, and if non-zero, fail the hypercall being processed with a H_FUNCTION error return value. There is really no good error return value to use since PAPR didn't envisage the possibility that the hypervisor may not be able to access the guest's HPT, and H_FUNCTION (function not supported) seems as good as any. The typical compile warnings look like this: CC arch/powerpc/kvm/book3s_pr_papr.o /home/paulus/kernel/kvm/arch/powerpc/kvm/book3s_pr_papr.c: In function ‘kvmppc_h_pr_enter’: /home/paulus/kernel/kvm/arch/powerpc/kvm/book3s_pr_papr.c:53:2: error: ignoring return value of ‘copy_from_user’, declared with attribute warn_unused_result [-Werror=unused-result] copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); ^ /home/paulus/kernel/kvm/arch/powerpc/kvm/book3s_pr_papr.c:74:2: error: ignoring return value of ‘copy_to_user’, declared with attribute warn_unused_result [-Werror=unused-result] copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); ^ ... etc. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_pr_papr.c | 34 +++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index bcbeeb62dd13..a04384adece7 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -50,7 +50,9 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) pteg_addr = get_pteg_addr(vcpu, pte_index); mutex_lock(&vcpu->kvm->arch.hpt_mutex); - copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); + ret = H_FUNCTION; + if (copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg))) + goto done; hpte = pteg; ret = H_PTEG_FULL; @@ -71,7 +73,9 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6)); hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7)); pteg_addr += i * HPTE_SIZE; - copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); + ret = H_FUNCTION; + if (copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE)) + goto done; kvmppc_set_gpr(vcpu, 4, pte_index | i); ret = H_SUCCESS; @@ -93,7 +97,9 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) pteg = get_pteg_addr(vcpu, pte_index); mutex_lock(&vcpu->kvm->arch.hpt_mutex); - copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + ret = H_FUNCTION; + if (copy_from_user(pte, (void __user *)pteg, sizeof(pte))) + goto done; pte[0] = be64_to_cpu((__force __be64)pte[0]); pte[1] = be64_to_cpu((__force __be64)pte[1]); @@ -103,7 +109,9 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) goto done; - copy_to_user((void __user *)pteg, &v, sizeof(v)); + ret = H_FUNCTION; + if (copy_to_user((void __user *)pteg, &v, sizeof(v))) + goto done; rb = compute_tlbie_rb(pte[0], pte[1], pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); @@ -171,7 +179,10 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) } pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX); - copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + if (copy_from_user(pte, (void __user *)pteg, sizeof(pte))) { + ret = H_FUNCTION; + break; + } pte[0] = be64_to_cpu((__force __be64)pte[0]); pte[1] = be64_to_cpu((__force __be64)pte[1]); @@ -184,7 +195,10 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) tsh |= H_BULK_REMOVE_NOT_FOUND; } else { /* Splat the pteg in (userland) hpt */ - copy_to_user((void __user *)pteg, &v, sizeof(v)); + if (copy_to_user((void __user *)pteg, &v, sizeof(v))) { + ret = H_FUNCTION; + break; + } rb = compute_tlbie_rb(pte[0], pte[1], tsh & H_BULK_REMOVE_PTEX); @@ -211,7 +225,9 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) pteg = get_pteg_addr(vcpu, pte_index); mutex_lock(&vcpu->kvm->arch.hpt_mutex); - copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + ret = H_FUNCTION; + if (copy_from_user(pte, (void __user *)pteg, sizeof(pte))) + goto done; pte[0] = be64_to_cpu((__force __be64)pte[0]); pte[1] = be64_to_cpu((__force __be64)pte[1]); @@ -234,7 +250,9 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); pte[0] = (__force u64)cpu_to_be64(pte[0]); pte[1] = (__force u64)cpu_to_be64(pte[1]); - copy_to_user((void __user *)pteg, pte, sizeof(pte)); + ret = H_FUNCTION; + if (copy_to_user((void __user *)pteg, pte, sizeof(pte))) + goto done; ret = H_SUCCESS; done: From 76d837a4c0f905f98088877d780169d7a14a6b29 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 May 2017 14:31:59 +1000 Subject: [PATCH 03/27] KVM: PPC: Book3S PR: Don't include SPAPR TCE code on non-pseries platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e91aa8e6ecd5 ("KVM: PPC: Enable IOMMU_API for KVM_BOOK3S_64 permanently", 2017-03-22) enabled the SPAPR TCE code for all 64-bit Book 3S kernel configurations in order to simplify the code and reduce #ifdefs. However, 64-bit Book 3S PPC platforms other than pseries and powernv don't implement the necessary IOMMU callbacks, leading to build failures like the following (for a pasemi config): scripts/kconfig/conf --silentoldconfig Kconfig warning: (KVM_BOOK3S_64) selects SPAPR_TCE_IOMMU which has unmet direct dependencies (IOMMU_SUPPORT && (PPC_POWERNV || PPC_PSERIES)) ... CC [M] arch/powerpc/kvm/book3s_64_vio.o /home/paulus/kernel/kvm/arch/powerpc/kvm/book3s_64_vio.c: In function ‘kvmppc_clear_tce’: /home/paulus/kernel/kvm/arch/powerpc/kvm/book3s_64_vio.c:363:2: error: implicit declaration of function ‘iommu_tce_xchg’ [-Werror=implicit-function-declaration] iommu_tce_xchg(tbl, entry, &hpa, &dir); ^ To fix this, we make the inclusion of the SPAPR TCE support, and the code that uses it in book3s_vio.c and book3s_vio_hv.c, depend on the inclusion of support for the pseries and/or powernv platforms. This means that when running a 'pseries' guest on those platforms, the guest won't have in-kernel acceleration of the PAPR TCE hypercalls, but at least now they compile. Reviewed-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/Kconfig | 2 +- arch/powerpc/kvm/Makefile | 4 +-- arch/powerpc/kvm/book3s_pr_papr.c | 46 +++++++++++++++++++++---------- arch/powerpc/kvm/powerpc.c | 4 ++- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 24de532c1736..0c52cb5d43f5 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -67,7 +67,7 @@ config KVM_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE - select SPAPR_TCE_IOMMU if IOMMU_SUPPORT + select SPAPR_TCE_IOMMU if IOMMU_SUPPORT && (PPC_SERIES || PPC_POWERNV) ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index d91a2604c496..381a6ec0ff3b 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -46,7 +46,7 @@ kvm-e500mc-objs := \ e500_emulate.o kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \ +kvm-book3s_64-builtin-objs-$(CONFIG_SPAPR_TCE_IOMMU) := \ book3s_64_vio_hv.o kvm-pr-y := \ @@ -90,11 +90,11 @@ kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o +kvm-book3s_64-objs-$(CONFIG_SPAPR_TCE_IOMMU) += book3s_64_vio.o kvm-book3s_64-module-objs := \ $(common-objs-y) \ book3s.o \ - book3s_64_vio.o \ book3s_rtas.o \ $(kvm-book3s_64-objs-y) diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index a04384adece7..8a4205fa774f 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -262,20 +262,6 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) return EMULATE_DONE; } -static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) -{ - unsigned long liobn = kvmppc_get_gpr(vcpu, 4); - unsigned long ioba = kvmppc_get_gpr(vcpu, 5); - unsigned long tce = kvmppc_get_gpr(vcpu, 6); - long rc; - - rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce); - if (rc == H_TOO_HARD) - return EMULATE_FAIL; - kvmppc_set_gpr(vcpu, 3, rc); - return EMULATE_DONE; -} - static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu) { long rc; @@ -298,6 +284,21 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +#ifdef CONFIG_SPAPR_TCE_IOMMU +static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce = kvmppc_get_gpr(vcpu, 6); + long rc; + + rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu) { unsigned long liobn = kvmppc_get_gpr(vcpu, 4); @@ -329,6 +330,23 @@ static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +#else /* CONFIG_SPAPR_TCE_IOMMU */ +static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) +{ + return EMULATE_FAIL; +} + +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu) +{ + return EMULATE_FAIL; +} + +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu) +{ + return EMULATE_FAIL; +} +#endif /* CONFIG_SPAPR_TCE_IOMMU */ + static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { long rc = kvmppc_xics_hcall(vcpu, cmd); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f7cf2cd564ef..7f71ab5fcad1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1749,7 +1749,7 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_enable_cap(kvm, &cap); break; } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_CREATE_SPAPR_TCE_64: { struct kvm_create_spapr_tce_64 create_tce_64; @@ -1780,6 +1780,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); goto out; } +#endif +#ifdef CONFIG_PPC_BOOK3S_64 case KVM_PPC_GET_SMMU_INFO: { struct kvm_ppc_smmu_info info; struct kvm *kvm = filp->private_data; From 01630ab8543f21df30b0dca19087b85744f61aee Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 May 2017 11:04:52 +0100 Subject: [PATCH 04/27] ARM: KVM: Fix tracepoint generation after move to virt/kvm/arm/ Moving most of the shared code to virt/kvm/arm had for consequence that KVM/ARM doesn't build anymore, because the code that used to define the tracepoints is now somewhere else. Fix this by defining CREATE_TRACE_POINTS in coproc.c, and clean-up trace.h as well. Fixes: 35d2d5d490e2 ("KVM: arm/arm64: Move shared files to virt/kvm/arm") Reported-by: Arnd Bergmann Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/coproc.c | 1 + arch/arm/kvm/trace.h | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 2c14b69511e9..ac8d36da4d08 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -32,6 +32,7 @@ #include #include "../vfp/vfpinstr.h" +#define CREATE_TRACE_POINTS #include "trace.h" #include "coproc.h" diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index fc0943776db2..b0d10648c486 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -1,5 +1,5 @@ -#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVM_H +#if !defined(_TRACE_ARM_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ARM_KVM_H #include @@ -74,10 +74,10 @@ TRACE_EVENT(kvm_hvc, __entry->vcpu_pc, __entry->r0, __entry->imm) ); -#endif /* _TRACE_KVM_H */ +#endif /* _TRACE_ARM_KVM_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/arm/kvm +#define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace From cde13b5dad60471886a3bccb4f4134c647c4a9dc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 May 2017 14:30:37 +0100 Subject: [PATCH 05/27] arm64: KVM: Do not use stack-protector to compile EL2 code We like living dangerously. Nothing explicitely forbids stack-protector to be used in the EL2 code, while distributions routinely compile their kernel with it. We're just lucky that no code actually triggers the instrumentation. Let's not try our luck for much longer, and disable stack-protector for code living at EL2. Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index aaf42ae8d8c3..14c4e3b14bcb 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -2,6 +2,8 @@ # Makefile for Kernel-based Virtual Machine module, HYP part # +ccflags-y += -fno-stack-protector + KVM=../../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o From 501ad27c67ed0b90df465f23d33e9aed64058a47 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 May 2017 14:30:38 +0100 Subject: [PATCH 06/27] arm: KVM: Do not use stack-protector to compile HYP code We like living dangerously. Nothing explicitely forbids stack-protector to be used in the HYP code, while distributions routinely compile their kernel with it. We're just lucky that no code actually triggers the instrumentation. Let's not try our luck for much longer, and disable stack-protector for code living at HYP. Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/kvm/hyp/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile index 3023bb530edf..8679405b0b2b 100644 --- a/arch/arm/kvm/hyp/Makefile +++ b/arch/arm/kvm/hyp/Makefile @@ -2,6 +2,8 @@ # Makefile for Kernel-based Virtual Machine module, HYP part # +ccflags-y += -fno-stack-protector + KVM=../../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o From ddf42d068f8802de122bb7efdfcb3179336053f1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 May 2017 14:30:39 +0100 Subject: [PATCH 07/27] KVM: arm/arm64: vgic-v2: Do not use Active+Pending state for a HW interrupt When an interrupt is injected with the HW bit set (indicating that deactivation should be propagated to the physical distributor), special care must be taken so that we never mark the corresponding LR with the Active+Pending state (as the pending state is kept in the physycal distributor). Cc: stable@vger.kernel.org Fixes: 140b086dd197 ("KVM: arm/arm64: vgic-new: Add GICv2 world switch backend") Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-v2.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index a65757aab6d3..504b4bd0d651 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -149,6 +149,13 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (irq->hw) { val |= GICH_LR_HW; val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active && irq_is_pending(irq)) + val &= ~GICH_LR_PENDING_BIT; } else { if (irq->config == VGIC_CONFIG_LEVEL) val |= GICH_LR_EOI; From 3d6e77ad1489650afa20da92bb589c8778baa8da Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 May 2017 14:30:40 +0100 Subject: [PATCH 08/27] KVM: arm/arm64: vgic-v3: Do not use Active+Pending state for a HW interrupt When an interrupt is injected with the HW bit set (indicating that deactivation should be propagated to the physical distributor), special care must be taken so that we never mark the corresponding LR with the Active+Pending state (as the pending state is kept in the physycal distributor). Cc: stable@vger.kernel.org Fixes: 59529f69f504 ("KVM: arm/arm64: vgic-new: Add GICv3 world switch backend") Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic-v3.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 8fa737edde6f..6fe3f003636a 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -127,6 +127,13 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (irq->hw) { val |= ICH_LR_HW; val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active && irq_is_pending(irq)) + val &= ~ICH_LR_PENDING_BIT; } else { if (irq->config == VGIC_CONFIG_LEVEL) val |= ICH_LR_EOI; From 15d2bffdde6268883647c6112970f74d3e1af651 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 May 2017 14:30:41 +0100 Subject: [PATCH 09/27] KVM: arm/arm64: vgic-v3: Use PREbits to infer the number of ICH_APxRn_EL2 registers The GICv3 documentation is extremely confusing, as it talks about the number of priorities represented by the ICH_APxRn_EL2 registers, while it should really talk about the number of preemption levels. This leads to a bug where we may access undefined ICH_APxRn_EL2 registers, since PREbits is allowed to be smaller than PRIbits. Thankfully, nobody seem to have taken this path so far... The fix is to use ICH_VTR_EL2.PREbits instead. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- virt/kvm/arm/hyp/vgic-v3-sr.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index bce6037cf01d..32c3295929b0 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -22,7 +22,7 @@ #include #define vtr_to_max_lr_idx(v) ((v) & 0xf) -#define vtr_to_nr_pri_bits(v) (((u32)(v) >> 29) + 1) +#define vtr_to_nr_pre_bits(v) (((u32)(v) >> 26) + 1) static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) { @@ -135,13 +135,13 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) if (used_lrs) { int i; - u32 nr_pri_bits; + u32 nr_pre_bits; cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); write_gicreg(0, ICH_HCR_EL2); val = read_gicreg(ICH_VTR_EL2); - nr_pri_bits = vtr_to_nr_pri_bits(val); + nr_pre_bits = vtr_to_nr_pre_bits(val); for (i = 0; i < used_lrs; i++) { if (cpu_if->vgic_elrsr & (1 << i)) @@ -152,7 +152,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) __gic_v3_set_lr(0, i); } - switch (nr_pri_bits) { + switch (nr_pre_bits) { case 7: cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2); cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2); @@ -162,7 +162,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2); } - switch (nr_pri_bits) { + switch (nr_pre_bits) { case 7: cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2); cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2); @@ -198,7 +198,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u64 val; - u32 nr_pri_bits; + u32 nr_pre_bits; int i; /* @@ -217,12 +217,12 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) } val = read_gicreg(ICH_VTR_EL2); - nr_pri_bits = vtr_to_nr_pri_bits(val); + nr_pre_bits = vtr_to_nr_pre_bits(val); if (used_lrs) { write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); - switch (nr_pri_bits) { + switch (nr_pre_bits) { case 7: write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2); write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2); @@ -232,7 +232,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2); } - switch (nr_pri_bits) { + switch (nr_pre_bits) { case 7: write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2); write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2); From 6c0d706b563af732adb094c5bf807437e8963e84 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 3 May 2017 15:17:51 +0100 Subject: [PATCH 10/27] kvm: arm/arm64: Fix race in resetting stage2 PGD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In kvm_free_stage2_pgd() we check the stage2 PGD before holding the lock and proceed to take the lock if it is valid. And we unmap the page tables, followed by releasing the lock. We reset the PGD only after dropping this lock, which could cause a race condition where another thread waiting on or even holding the lock, could potentially see that the PGD is still valid and proceed to perform a stage2 operation and later encounter a NULL PGD. [223090.242280] Unable to handle kernel NULL pointer dereference at virtual address 00000040 [223090.262330] PC is at unmap_stage2_range+0x8c/0x428 [223090.262332] LR is at kvm_unmap_hva_handler+0x2c/0x3c [223090.262531] Call trace: [223090.262533] [] unmap_stage2_range+0x8c/0x428 [223090.262535] [] kvm_unmap_hva_handler+0x2c/0x3c [223090.262537] [] handle_hva_to_gpa+0xb0/0x104 [223090.262539] [] kvm_unmap_hva+0x5c/0xbc [223090.262543] [] kvm_mmu_notifier_invalidate_page+0x50/0x8c [223090.262547] [] __mmu_notifier_invalidate_page+0x5c/0x84 [223090.262551] [] try_to_unmap_one+0x1d0/0x4a0 [223090.262553] [] rmap_walk+0x1cc/0x2e0 [223090.262555] [] try_to_unmap+0x74/0xa4 [223090.262557] [] migrate_pages+0x31c/0x5ac [223090.262561] [] compact_zone+0x3fc/0x7ac [223090.262563] [] compact_zone_order+0x94/0xb0 [223090.262564] [] try_to_compact_pages+0x108/0x290 [223090.262569] [] __alloc_pages_direct_compact+0x70/0x1ac [223090.262571] [] __alloc_pages_nodemask+0x434/0x9f4 [223090.262572] [] alloc_pages_vma+0x230/0x254 [223090.262574] [] do_huge_pmd_anonymous_page+0x114/0x538 [223090.262576] [] handle_mm_fault+0xd40/0x17a4 [223090.262577] [] __get_user_pages+0x12c/0x36c [223090.262578] [] get_user_pages_unlocked+0xa4/0x1b8 [223090.262579] [] __gfn_to_pfn_memslot+0x280/0x31c [223090.262580] [] gfn_to_pfn_prot+0x4c/0x5c [223090.262582] [] kvm_handle_guest_abort+0x240/0x774 [223090.262584] [] handle_exit+0x11c/0x1ac [223090.262586] [] kvm_arch_vcpu_ioctl_run+0x31c/0x648 [223090.262587] [] kvm_vcpu_ioctl+0x378/0x768 [223090.262590] [] do_vfs_ioctl+0x324/0x5a4 [223090.262591] [] SyS_ioctl+0x90/0xa4 [223090.262595] [] el0_svc_naked+0x38/0x3c This patch moves the stage2 PGD manipulation under the lock. Reported-by: Alexander Graf Cc: Mark Rutland Cc: Marc Zyngier Cc: Paolo Bonzini Cc: Radim Krčmář Reviewed-by: Christoffer Dall Reviewed-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Christoffer Dall --- virt/kvm/arm/mmu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 313ee646480f..909a1a793b31 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -829,22 +829,22 @@ void stage2_unmap_vm(struct kvm *kvm) * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all * underlying level-2 and level-3 tables before freeing the actual level-1 table * and setting the struct pointer to NULL. - * - * Note we don't need locking here as this is only called when the VM is - * destroyed, which can only be done once. */ void kvm_free_stage2_pgd(struct kvm *kvm) { - if (kvm->arch.pgd == NULL) - return; + void *pgd = NULL; spin_lock(&kvm->mmu_lock); - unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + if (kvm->arch.pgd) { + unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + pgd = kvm->arch.pgd; + kvm->arch.pgd = NULL; + } spin_unlock(&kvm->mmu_lock); /* Free the HW pgd, one page at a time */ - free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE); - kvm->arch.pgd = NULL; + if (pgd) + free_pages_exact(pgd, S2_PGD_SIZE); } static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, From 661e6b02b5aa82db31897f36e96324b77450fd7a Mon Sep 17 00:00:00 2001 From: Zhichao Huang Date: Thu, 11 May 2017 13:46:11 +0100 Subject: [PATCH 11/27] KVM: arm: plug potential guest hardware debug leakage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hardware debugging in guests is not intercepted currently, it means that a malicious guest can bring down the entire machine by writing to the debug registers. This patch enable trapping of all debug registers, preventing the guests to access the debug registers. This includes access to the debug mode(DBGDSCR) in the guest world all the time which could otherwise mess with the host state. Reads return 0 and writes are ignored (RAZ_WI). The result is the guest cannot detect any working hardware based debug support. As debug exceptions are still routed to the guest normal debug using software based breakpoints still works. To support debugging using hardware registers we need to implement a debug register aware world switch as well as special trapping for registers that may affect the host state. Cc: stable@vger.kernel.org Signed-off-by: Zhichao Huang Signed-off-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_coproc.h | 3 +- arch/arm/kvm/coproc.c | 77 +++++++++++++++++++++++-------- arch/arm/kvm/handle_exit.c | 4 +- arch/arm/kvm/hyp/switch.c | 4 +- 4 files changed, 66 insertions(+), 22 deletions(-) diff --git a/arch/arm/include/asm/kvm_coproc.h b/arch/arm/include/asm/kvm_coproc.h index 4917c2f7e459..e74ab0fbab79 100644 --- a/arch/arm/include/asm/kvm_coproc.h +++ b/arch/arm/include/asm/kvm_coproc.h @@ -31,7 +31,8 @@ void kvm_register_target_coproc_table(struct kvm_coproc_target_table *table); int kvm_handle_cp10_id(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_handle_cp_0_13_access(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run); diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index ac8d36da4d08..1403ffb1916b 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -112,12 +112,6 @@ int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } -int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - kvm_inject_undefined(vcpu); - return 1; -} - static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) { /* @@ -533,12 +527,7 @@ static int emulate_cp15(struct kvm_vcpu *vcpu, return 1; } -/** - * kvm_handle_cp15_64 -- handles a mrrc/mcrr trap on a guest CP15 access - * @vcpu: The VCPU pointer - * @run: The kvm_run struct - */ -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +static struct coproc_params decode_64bit_hsr(struct kvm_vcpu *vcpu) { struct coproc_params params; @@ -552,9 +541,38 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) params.Rt2 = (kvm_vcpu_get_hsr(vcpu) >> 10) & 0xf; params.CRm = 0; + return params; +} + +/** + * kvm_handle_cp15_64 -- handles a mrrc/mcrr trap on a guest CP15 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params = decode_64bit_hsr(vcpu); + return emulate_cp15(vcpu, ¶ms); } +/** + * kvm_handle_cp14_64 -- handles a mrrc/mcrr trap on a guest CP14 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params = decode_64bit_hsr(vcpu); + + /* raz_wi cp14 */ + pm_fake(vcpu, ¶ms, NULL); + + /* handled */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 1; +} + static void reset_coproc_regs(struct kvm_vcpu *vcpu, const struct coproc_reg *table, size_t num) { @@ -565,12 +583,7 @@ static void reset_coproc_regs(struct kvm_vcpu *vcpu, table[i].reset(vcpu, &table[i]); } -/** - * kvm_handle_cp15_32 -- handles a mrc/mcr trap on a guest CP15 access - * @vcpu: The VCPU pointer - * @run: The kvm_run struct - */ -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +static struct coproc_params decode_32bit_hsr(struct kvm_vcpu *vcpu) { struct coproc_params params; @@ -584,9 +597,37 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) params.Op2 = (kvm_vcpu_get_hsr(vcpu) >> 17) & 0x7; params.Rt2 = 0; + return params; +} + +/** + * kvm_handle_cp15_32 -- handles a mrc/mcr trap on a guest CP15 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params = decode_32bit_hsr(vcpu); return emulate_cp15(vcpu, ¶ms); } +/** + * kvm_handle_cp14_32 -- handles a mrc/mcr trap on a guest CP14 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params = decode_32bit_hsr(vcpu); + + /* raz_wi cp14 */ + pm_fake(vcpu, ¶ms, NULL); + + /* handled */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 1; +} + /****************************************************************************** * Userspace API *****************************************************************************/ diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 5fd7968cdae9..f86a9aaef462 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -95,9 +95,9 @@ static exit_handle_fn arm_exit_handlers[] = { [HSR_EC_WFI] = kvm_handle_wfx, [HSR_EC_CP15_32] = kvm_handle_cp15_32, [HSR_EC_CP15_64] = kvm_handle_cp15_64, - [HSR_EC_CP14_MR] = kvm_handle_cp14_access, + [HSR_EC_CP14_MR] = kvm_handle_cp14_32, [HSR_EC_CP14_LS] = kvm_handle_cp14_load_store, - [HSR_EC_CP14_64] = kvm_handle_cp14_access, + [HSR_EC_CP14_64] = kvm_handle_cp14_64, [HSR_EC_CP_0_13] = kvm_handle_cp_0_13_access, [HSR_EC_CP10_ID] = kvm_handle_cp10_id, [HSR_EC_HVC] = handle_hvc, diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index 92678b7bd046..624a510d31df 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -48,7 +48,9 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu, u32 *fpexc_host) write_sysreg(HSTR_T(15), HSTR); write_sysreg(HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11), HCPTR); val = read_sysreg(HDCR); - write_sysreg(val | HDCR_TPM | HDCR_TPMCR, HDCR); + val |= HDCR_TPM | HDCR_TPMCR; /* trap performance monitors */ + val |= HDCR_TDRA | HDCR_TDOSA | HDCR_TDA; /* trap debug regs */ + write_sysreg(val, HDCR); } static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) From 9b619a8f08da9f61f166edbbe30ad05c359ec19e Mon Sep 17 00:00:00 2001 From: Zhichao Huang Date: Thu, 11 May 2017 13:46:12 +0100 Subject: [PATCH 12/27] KVM: arm: rename pm_fake handler to trap_raz_wi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pm_fake doesn't quite describe what the handler does (ignoring writes and returning 0 for reads). As we're about to use it (a lot) in a different context, rename it with a (admitedly cryptic) name that make sense for all users. Signed-off-by: Zhichao Huang Reviewed-by: Alex Bennee Acked-by: Christoffer Dall Acked-by: Marc Zyngier Signed-off-by: Alex Bennée Signed-off-by: Christoffer Dall --- arch/arm/kvm/coproc.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 1403ffb1916b..6d1d2e26dfe5 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -279,7 +279,7 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, * must always support PMCCNTR (the cycle counter): we just RAZ/WI for * all PM registers, which doesn't crash the guest kernel at least. */ -static bool pm_fake(struct kvm_vcpu *vcpu, +static bool trap_raz_wi(struct kvm_vcpu *vcpu, const struct coproc_params *p, const struct coproc_reg *r) { @@ -289,19 +289,19 @@ static bool pm_fake(struct kvm_vcpu *vcpu, return read_zero(vcpu, p); } -#define access_pmcr pm_fake -#define access_pmcntenset pm_fake -#define access_pmcntenclr pm_fake -#define access_pmovsr pm_fake -#define access_pmselr pm_fake -#define access_pmceid0 pm_fake -#define access_pmceid1 pm_fake -#define access_pmccntr pm_fake -#define access_pmxevtyper pm_fake -#define access_pmxevcntr pm_fake -#define access_pmuserenr pm_fake -#define access_pmintenset pm_fake -#define access_pmintenclr pm_fake +#define access_pmcr trap_raz_wi +#define access_pmcntenset trap_raz_wi +#define access_pmcntenclr trap_raz_wi +#define access_pmovsr trap_raz_wi +#define access_pmselr trap_raz_wi +#define access_pmceid0 trap_raz_wi +#define access_pmceid1 trap_raz_wi +#define access_pmccntr trap_raz_wi +#define access_pmxevtyper trap_raz_wi +#define access_pmxevcntr trap_raz_wi +#define access_pmuserenr trap_raz_wi +#define access_pmintenset trap_raz_wi +#define access_pmintenclr trap_raz_wi /* Architected CP15 registers. * CRn denotes the primary register number, but is copied to the CRm in the @@ -566,7 +566,7 @@ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run) struct coproc_params params = decode_64bit_hsr(vcpu); /* raz_wi cp14 */ - pm_fake(vcpu, ¶ms, NULL); + trap_raz_wi(vcpu, ¶ms, NULL); /* handled */ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); @@ -621,7 +621,7 @@ int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run) struct coproc_params params = decode_32bit_hsr(vcpu); /* raz_wi cp14 */ - pm_fake(vcpu, ¶ms, NULL); + trap_raz_wi(vcpu, ¶ms, NULL); /* handled */ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); From 4769886baf39b6a307eb8f9e39848823ca6c5939 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 May 2017 22:43:17 +0300 Subject: [PATCH 13/27] kvm: nVMX: off by one in vmx_write_pml_buffer() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are PML_ENTITY_NUM elements in the pml_address[] array so the > should be >= or we write beyond the end of the array when we do: pml_address[vmcs12->guest_pml_index--] = gpa; Fixes: c5f983f6e845 ("nVMX: Implement emulated Page Modification Logging") Signed-off-by: Dan Carpenter Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6f4ad44aa95..7698e8f321bf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11213,7 +11213,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) if (!nested_cpu_has_pml(vmcs12)) return 0; - if (vmcs12->guest_pml_index > PML_ENTITY_NUM) { + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { vmx->nested.pml_full = true; return 1; } From a575813bfe4bc15aba511a5e91e61d242bff8b9d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 11 May 2017 02:58:55 -0700 Subject: [PATCH 14/27] KVM: x86: Fix load damaged SSEx MXCSR register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: BUG: unable to handle kernel paging request at ffffffffc07f6a2e IP: report_bug+0x94/0x120 PGD 348e12067 P4D 348e12067 PUD 348e14067 PMD 3cbd84067 PTE 80000003f7e87161 Oops: 0003 [#1] SMP CPU: 2 PID: 7091 Comm: kvm_load_guest_ Tainted: G OE 4.11.0+ #8 task: ffff92fdfb525400 task.stack: ffffbda6c3d04000 RIP: 0010:report_bug+0x94/0x120 RSP: 0018:ffffbda6c3d07b20 EFLAGS: 00010202 do_trap+0x156/0x170 do_error_trap+0xa3/0x170 ? kvm_load_guest_fpu.part.175+0x12a/0x170 [kvm] ? mark_held_locks+0x79/0xa0 ? retint_kernel+0x10/0x10 ? trace_hardirqs_off_thunk+0x1a/0x1c do_invalid_op+0x20/0x30 invalid_op+0x1e/0x30 RIP: 0010:kvm_load_guest_fpu.part.175+0x12a/0x170 [kvm] ? kvm_load_guest_fpu.part.175+0x1c/0x170 [kvm] kvm_arch_vcpu_ioctl_run+0xed6/0x1b70 [kvm] kvm_vcpu_ioctl+0x384/0x780 [kvm] ? kvm_vcpu_ioctl+0x384/0x780 [kvm] ? sched_clock+0x13/0x20 ? __do_page_fault+0x2a0/0x550 do_vfs_ioctl+0xa4/0x700 ? up_read+0x1f/0x40 ? __do_page_fault+0x2a0/0x550 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x23/0xc2 SDM mentioned that "The MXCSR has several reserved bits, and attempting to write a 1 to any of these bits will cause a general-protection exception(#GP) to be generated". The syzkaller forks' testcase overrides xsave area w/ random values and steps on the reserved bits of MXCSR register. The damaged MXCSR register values of guest will be restored to SSEx MXCSR register before vmentry. This patch fixes it by catching userspace override MXCSR register reserved bits w/ random values and bails out immediately. Reported-by: Andrey Konovalov Reviewed-by: Paolo Bonzini Cc: Paolo Bonzini Cc: Radim Krčmář Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kernel/fpu/init.c | 1 + arch/x86/kvm/x86.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index c2f8dde3255c..d5d44c452624 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -90,6 +90,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 464da936c53d..b54125b590e8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3288,11 +3288,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, } } +#define XSAVE_MXCSR_OFFSET 24 + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* @@ -3300,11 +3303,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~kvm_supported_xcr0()) + if (xstate_bv & ~kvm_supported_xcr0() || + mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE) + if (xstate_bv & ~XFEATURE_MASK_FPSSE || + mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); From fce6ac4c0508b985d497e3d9c8eff28ec8a43182 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 11 May 2017 02:58:56 -0700 Subject: [PATCH 15/27] KVM: VMX: Don't enable EPT A/D feature if EPT feature is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can observe eptad kvm_intel module parameter is still Y even if ept is disabled which is weird. This patch will not enable EPT A/D feature if EPT feature is disabled. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7698e8f321bf..72f78396bc09 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6504,7 +6504,7 @@ static __init int hardware_setup(void) enable_ept_ad_bits = 0; } - if (!cpu_has_vmx_ept_ad_bits()) + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; if (!cpu_has_vmx_unrestricted_guest()) From 0780516a18f87e881e42ed815f189279b0a1743c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 May 2017 13:23:29 +0200 Subject: [PATCH 16/27] KVM: nVMX: fix EPT permissions as reported in exit qualification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the new ept_access_test_read_only and ept_access_test_read_write testcases from vmx.flat. The problem is that gpte_access moves bits around to switch from EPT bit order (XWR) to ACC_*_MASK bit order (RWX). This results in an incorrect exit qualification. To fix this, make pt_access and pte_access operate on raw PTE values (only with NX flipped to mean "can execute") and call gpte_access at the end of the walk. This lets us use pte_access to compute the exit qualification with XWR bit order. Signed-off-by: Paolo Bonzini Reviewed-by: Xiao Guangrong Signed-off-by: Radim Krčmář --- arch/x86/kvm/paging_tmpl.h | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 56241746abbd..b0454c7e4cff 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -283,11 +283,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; + u64 pt_access, pte_access; + unsigned index, accessed_dirty, pte_pkey; unsigned nested_access; gpa_t pte_gpa; bool have_ad; int offset; + u64 walk_nx_mask = 0; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; @@ -302,6 +304,7 @@ retry_walk: have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 + walk_nx_mask = 1ULL << PT64_NX_SHIFT; if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); @@ -313,8 +316,6 @@ retry_walk: walker->max_level = walker->level; ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); - accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0; - /* * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging * by the MOV to CR instruction are treated as reads and do not cause the @@ -322,14 +323,14 @@ retry_walk: */ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; - pt_access = pte_access = ACC_ALL; + pte_access = ~0; ++walker->level; do { gfn_t real_gfn; unsigned long host_addr; - pt_access &= pte_access; + pt_access = pte_access; --walker->level; index = PT_INDEX(addr, walker->level); @@ -371,6 +372,12 @@ retry_walk: trace_kvm_mmu_paging_element(pte, walker->level); + /* + * Inverting the NX it lets us AND it like other + * permission bits. + */ + pte_access = pt_access & (pte ^ walk_nx_mask); + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; @@ -379,14 +386,16 @@ retry_walk: goto error; } - accessed_dirty &= pte; - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); - walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); - errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access); + accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); + walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); + errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; @@ -403,7 +412,7 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - FNAME(protect_clean_gpte)(mmu, &pte_access, pte); + FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. @@ -421,10 +430,8 @@ retry_walk: goto retry_walk; } - walker->pt_access = pt_access; - walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pte_access, pt_access); + __func__, (u64)pte, walker->pte_access, walker->pt_access); return 1; error: @@ -452,7 +459,7 @@ error: */ if (!(errcode & PFERR_RSVD_MASK)) { vcpu->arch.exit_qualification &= 0x187; - vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3; } #endif walker->fault.address = addr; From 2952a6070e07ebdd5896f1f5b861acad677caded Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 16 May 2017 10:34:54 +0100 Subject: [PATCH 17/27] kvm: arm/arm64: Force reading uncached stage2 PGD Make sure we don't use a cached value of the KVM stage2 PGD while resetting the PGD. Cc: Marc Zyngier Cc: stable@vger.kernel.org Signed-off-by: Suzuki K Poulose Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- virt/kvm/arm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 909a1a793b31..704e35f312a4 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -837,7 +837,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); - pgd = kvm->arch.pgd; + pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; } spin_unlock(&kvm->mmu_lock); From 0c428a6a9256fcd66817e12db32a50b405ed2e5c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 16 May 2017 10:34:55 +0100 Subject: [PATCH 18/27] kvm: arm/arm64: Fix use after free of stage2 page table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We yield the kvm->mmu_lock occassionaly while performing an operation (e.g, unmap or permission changes) on a large area of stage2 mappings. However this could possibly cause another thread to clear and free up the stage2 page tables while we were waiting for regaining the lock and thus the original thread could end up in accessing memory that was freed. This patch fixes the problem by making sure that the stage2 pagetable is still valid after we regain the lock. The fact that mmu_notifer->release() could be called twice (via __mmu_notifier_release and mmu_notifier_unregsister) enhances the possibility of hitting this race where there are two threads trying to unmap the entire guest shadow pages. While at it, cleanup the redudant checks around cond_resched_lock in stage2_wp_range(), as cond_resched_lock already does the same checks. Cc: Mark Rutland Cc: Radim Krčmář Cc: andreyknvl@google.com Cc: Paolo Bonzini Cc: stable@vger.kernel.org Acked-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- virt/kvm/arm/mmu.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 704e35f312a4..a2d63247d1bb 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -295,6 +295,13 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) assert_spin_locked(&kvm->mmu_lock); pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { + /* + * Make sure the page table is still active, as another thread + * could have possibly freed the page table, while we released + * the lock. + */ + if (!READ_ONCE(kvm->arch.pgd)) + break; next = stage2_pgd_addr_end(addr, end); if (!stage2_pgd_none(*pgd)) unmap_stage2_puds(kvm, pgd, addr, next); @@ -1170,11 +1177,13 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) * large. Otherwise, we may see kernel panics with * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, * CONFIG_LOCKDEP. Additionally, holding the lock too long - * will also starve other vCPUs. + * will also starve other vCPUs. We have to also make sure + * that the page tables are not freed while we released + * the lock. */ - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - + cond_resched_lock(&kvm->mmu_lock); + if (!READ_ONCE(kvm->arch.pgd)) + break; next = stage2_pgd_addr_end(addr, end); if (stage2_pgd_present(*pgd)) stage2_wp_puds(pgd, addr, next); From b401ee0b85a53e89739ff68a5b1a0667d664afc9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 18 Apr 2017 12:41:18 +0200 Subject: [PATCH 19/27] KVM: x86: lower default for halt_poll_ns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some fio benchmarks, halt_poll_ns=400000 caused CPU utilization to increase heavily even in cases where the performance improvement was small. In particular, bandwidth divided by CPU usage was as much as 60% lower. To some extent this is the expected effect of the patch, and the additional CPU utilization is only visible when running the benchmarks. However, halving the threshold also halves the extra CPU utilization (from +30-130% to +20-70%) and has no negative effect on performance. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9c761fea0c98..695605eb1dfb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,7 +43,7 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_HALT_POLL_NS_DEFAULT 400000 +#define KVM_HALT_POLL_NS_DEFAULT 200000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS From 552c9f47f8d451830a6b47151c6d2db77f77cc3e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 17 May 2017 13:12:51 +0200 Subject: [PATCH 20/27] KVM: arm/arm64: Fix bug when registering redist iodevs If userspace creates the VCPUs after initializing the VGIC, then we end up in a situation where we trigger a bug in kvm_vcpu_get_idx(), because it is called prior to adding the VCPU into the vcpus array on the VM. There is no tight coupling between the VCPU index and the area of the redistributor region used for the VCPU, so we can simply ensure that all creations of redistributors are serialized per VM, and increment an offset when we successfully add a redistributor. The vgic_register_redist_iodev() function can be called from two paths: vgic_redister_all_redist_iodev() which is called via the kvm_vgic_addr() device attribute handler. This patch already holds the kvm->lock mutex. The other path is via kvm_vgic_vcpu_init, which is called through a longer chain from kvm_vm_ioctl_create_vcpu(), which releases the kvm->lock mutex just before calling kvm_arch_vcpu_create(), so we can simply take this mutex again later for our purposes. Fixes: ab6f468c10 ("KVM: arm/arm64: Register iodevs when setting redist base and creating VCPUs") Signed-off-by: Christoffer Dall Tested-by: Jean-Philippe Brucker Reviewed-by: Eric Auger --- include/kvm/arm_vgic.h | 5 ++++- virt/kvm/arm/vgic/vgic-init.c | 5 ++++- virt/kvm/arm/vgic/vgic-mmio-v3.c | 9 ++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 97b8d3728b31..ef718586321c 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -195,7 +195,10 @@ struct vgic_dist { /* either a GICv2 CPU interface */ gpa_t vgic_cpu_base; /* or a number of GICv3 redistributor regions */ - gpa_t vgic_redist_base; + struct { + gpa_t vgic_redist_base; + gpa_t vgic_redist_free_offset; + }; }; /* distributor enabled */ diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index dc68e2e424ab..3a0b8999f011 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -242,8 +242,11 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) * If we are creating a VCPU with a GICv3 we must also register the * KVM io device for the redistributor that belongs to this VCPU. */ - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + mutex_lock(&vcpu->kvm->lock); ret = vgic_register_redist_iodev(vcpu); + mutex_unlock(&vcpu->kvm->lock); + } return ret; } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 99da1a207c19..9b0f6810e7a8 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -586,7 +586,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) if (!vgic_v3_check_base(kvm)) return -EINVAL; - rd_base = vgic->vgic_redist_base + kvm_vcpu_get_idx(vcpu) * SZ_64K * 2; + rd_base = vgic->vgic_redist_base + vgic->vgic_redist_free_offset; sgi_base = rd_base + SZ_64K; kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); @@ -615,11 +615,14 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base, SZ_64K, &sgi_dev->dev); mutex_unlock(&kvm->slots_lock); - if (ret) + if (ret) { kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &rd_dev->dev); + return ret; + } - return ret; + vgic->vgic_redist_free_offset += 2 * SZ_64K; + return 0; } static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) From fa472fa91a5a0b241f5ddae927d2e235d07545df Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 17 May 2017 21:16:09 +0200 Subject: [PATCH 21/27] KVM: arm/arm64: Hold slots_lock when unregistering kvm io bus devices We were not holding the kvm->slots_lock as required when calling kvm_io_bus_unregister_dev() as required. This only affects the error path, but still, let's do our due diligence. Reported by: Eric Auger Signed-off-by: Christoffer Dall Reviewed-by: Eric Auger --- virt/kvm/arm/vgic/vgic-mmio-v3.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 9b0f6810e7a8..201d5e2e973d 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -614,15 +614,16 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base, SZ_64K, &sgi_dev->dev); - mutex_unlock(&kvm->slots_lock); if (ret) { kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &rd_dev->dev); - return ret; + goto out; } vgic->vgic_redist_free_offset += 2 * SZ_64K; - return 0; +out: + mutex_unlock(&kvm->slots_lock); + return ret; } static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) @@ -647,10 +648,12 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm) if (ret) { /* The current c failed, so we start with the previous one. */ + mutex_lock(&kvm->slots_lock); for (c--; c >= 0; c--) { vcpu = kvm_get_vcpu(kvm, c); vgic_unregister_redist_iodev(vcpu); } + mutex_unlock(&kvm->slots_lock); } return ret; From d3e7dec054174fdddae33eaa0032a82c3f42181d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 18 May 2017 10:38:53 +0300 Subject: [PATCH 22/27] KVM: Silence underflow warning in avic_get_physical_id_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smatch complains that we check cap the upper bound of "index" but don't check for negatives. It's a false positive because "index" is never negative. But it's also simple enough to make it unsigned which makes the code easier to audit. Signed-off-by: Dan Carpenter Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c27ac6923a18..183ddb235fb4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1272,7 +1272,8 @@ static void init_vmcb(struct vcpu_svm *svm) } -static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index) +static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, + unsigned int index) { u64 *avic_physical_id_table; struct kvm_arch *vm_data = &vcpu->kvm->arch; From e2c2206a18993bc9f62393d49c7b2066c3845b25 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 11 May 2017 18:12:05 -0700 Subject: [PATCH 23/27] KVM: x86: Fix potential preemption when get the current kvmclock timestamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG: using __this_cpu_read() in preemptible [00000000] code: qemu-system-x86/2809 caller is __this_cpu_preempt_check+0x13/0x20 CPU: 2 PID: 2809 Comm: qemu-system-x86 Not tainted 4.11.0+ #13 Call Trace: dump_stack+0x99/0xce check_preemption_disabled+0xf5/0x100 __this_cpu_preempt_check+0x13/0x20 get_kvmclock_ns+0x6f/0x110 [kvm] get_time_ref_counter+0x5d/0x80 [kvm] kvm_hv_process_stimers+0x2a1/0x8a0 [kvm] ? kvm_hv_process_stimers+0x2a1/0x8a0 [kvm] ? kvm_arch_vcpu_ioctl_run+0xac9/0x1ce0 [kvm] kvm_arch_vcpu_ioctl_run+0x5bf/0x1ce0 [kvm] kvm_vcpu_ioctl+0x384/0x7b0 [kvm] ? kvm_vcpu_ioctl+0x384/0x7b0 [kvm] ? __fget+0xf3/0x210 do_vfs_ioctl+0xa4/0x700 ? __fget+0x114/0x210 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x23/0xc2 RIP: 0033:0x7f9d164ed357 ? __this_cpu_preempt_check+0x13/0x20 This can be reproduced by run kvm-unit-tests/hyperv_stimer.flat w/ CONFIG_PREEMPT and CONFIG_DEBUG_PREEMPT enabled. Safe access to per-CPU data requires a couple of constraints, though: the thread working with the data cannot be preempted and it cannot be migrated while it manipulates per-CPU variables. If the thread is preempted, the thread that replaces it could try to work with the same variables; migration to another CPU could also cause confusion. However there is no preemption disable when reads host per-CPU tsc rate to calculate the current kvmclock timestamp. This patch fixes it by utilizing get_cpu/put_cpu pair to guarantee both __this_cpu_read() and rdtsc() are not preempted. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b54125b590e8..3b5fc7e35f6e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1763,6 +1763,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + u64 ret; spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { @@ -1774,10 +1775,17 @@ u64 get_kvmclock_ns(struct kvm *kvm) hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; spin_unlock(&ka->pvclock_gtod_sync_lock); + /* both __this_cpu_read() and rdtsc() should be on the same cpu */ + get_cpu(); + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - return __pvclock_read_cycles(&hv_clock, rdtsc()); + ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + + put_cpu(); + + return ret; } static void kvm_setup_pvclock_page(struct kvm_vcpu *v) From cbfc6c9184ce71b52df4b1d82af5afc81a709178 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 19 May 2017 02:46:56 -0700 Subject: [PATCH 24/27] KVM: X86: Fix read out-of-bounds vulnerability in kvm pio emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Huawei folks reported a read out-of-bounds vulnerability in kvm pio emulation. - "inb" instruction to access PIT Mod/Command register (ioport 0x43, write only, a read should be ignored) in guest can get a random number. - "rep insb" instruction to access PIT register port 0x43 can control memcpy() in emulator_pio_in_emulated() to copy max 0x400 bytes but only read 1 bytes, which will disclose the unimportant kernel memory in host but no crash. The similar test program below can reproduce the read out-of-bounds vulnerability: void hexdump(void *mem, unsigned int len) { unsigned int i, j; for(i = 0; i < len + ((len % HEXDUMP_COLS) ? (HEXDUMP_COLS - len % HEXDUMP_COLS) : 0); i++) { /* print offset */ if(i % HEXDUMP_COLS == 0) { printf("0x%06x: ", i); } /* print hex data */ if(i < len) { printf("%02x ", 0xFF & ((char*)mem)[i]); } else /* end of block, just aligning for ASCII dump */ { printf(" "); } /* print ASCII dump */ if(i % HEXDUMP_COLS == (HEXDUMP_COLS - 1)) { for(j = i - (HEXDUMP_COLS - 1); j <= i; j++) { if(j >= len) /* end of block, not really printing */ { putchar(' '); } else if(isprint(((char*)mem)[j])) /* printable char */ { putchar(0xFF & ((char*)mem)[j]); } else /* other char */ { putchar('.'); } } putchar('\n'); } } } int main(void) { int i; if (iopl(3)) { err(1, "set iopl unsuccessfully\n"); return -1; } static char buf[0x40]; /* test ioport 0x40,0x41,0x42,0x43,0x44,0x45 */ memset(buf, 0xab, sizeof(buf)); asm volatile("push %rdi;"); asm volatile("mov %0, %%rdi;"::"q"(buf)); asm volatile ("mov $0x40, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x41, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x42, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x43, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x44, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("mov $0x45, %rdx;"); asm volatile ("in %dx,%al;"); asm volatile ("stosb;"); asm volatile ("pop %rdi;"); hexdump(buf, 0x40); printf("\n"); /* ins port 0x40 */ memset(buf, 0xab, sizeof(buf)); asm volatile("push %rdi;"); asm volatile("mov %0, %%rdi;"::"q"(buf)); asm volatile ("mov $0x20, %rcx;"); asm volatile ("mov $0x40, %rdx;"); asm volatile ("rep insb;"); asm volatile ("pop %rdi;"); hexdump(buf, 0x40); printf("\n"); /* ins port 0x43 */ memset(buf, 0xab, sizeof(buf)); asm volatile("push %rdi;"); asm volatile("mov %0, %%rdi;"::"q"(buf)); asm volatile ("mov $0x20, %rcx;"); asm volatile ("mov $0x43, %rdx;"); asm volatile ("rep insb;"); asm volatile ("pop %rdi;"); hexdump(buf, 0x40); printf("\n"); return 0; } The vcpu->arch.pio_data buffer is used by both in/out instrutions emulation w/o clear after using which results in some random datas are left over in the buffer. Guest reads port 0x43 will be ignored since it is write only, however, the function kernel_pio() can't distigush this ignore from successfully reads data from device's ioport. There is no new data fill the buffer from port 0x43, however, emulator_pio_in_emulated() will copy the stale data in the buffer to the guest unconditionally. This patch fixes it by clearing the buffer before in instruction emulation to avoid to grant guest the stale data in the buffer. In addition, string I/O is not supported for in kernel device. So there is no iteration to read ioport %RCX times for string I/O. The function kernel_pio() just reads one round, and then copy the io size * %RCX to the guest unconditionally, actually it copies the one round ioport data w/ other random datas which are left over in the vcpu->arch.pio_data buffer to the guest. This patch fixes it by introducing the string I/O support for in kernel device in order to grant the right ioport datas to the guest. Before the patch: 0x000000: fe 38 93 93 ff ff ab ab .8...... 0x000008: ab ab ab ab ab ab ab ab ........ 0x000010: ab ab ab ab ab ab ab ab ........ 0x000018: ab ab ab ab ab ab ab ab ........ 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 0x000000: f6 00 00 00 00 00 00 00 ........ 0x000008: 00 00 00 00 00 00 00 00 ........ 0x000010: 00 00 00 00 4d 51 30 30 ....MQ00 0x000018: 30 30 20 33 20 20 20 20 00 3 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 0x000000: f6 00 00 00 00 00 00 00 ........ 0x000008: 00 00 00 00 00 00 00 00 ........ 0x000010: 00 00 00 00 4d 51 30 30 ....MQ00 0x000018: 30 30 20 33 20 20 20 20 00 3 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ After the patch: 0x000000: 1e 02 f8 00 ff ff ab ab ........ 0x000008: ab ab ab ab ab ab ab ab ........ 0x000010: ab ab ab ab ab ab ab ab ........ 0x000018: ab ab ab ab ab ab ab ab ........ 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 0x000000: d2 e2 d2 df d2 db d2 d7 ........ 0x000008: d2 d3 d2 cf d2 cb d2 c7 ........ 0x000010: d2 c4 d2 c0 d2 bc d2 b8 ........ 0x000018: d2 b4 d2 b0 d2 ac d2 a8 ........ 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ 0x000000: 00 00 00 00 00 00 00 00 ........ 0x000008: 00 00 00 00 00 00 00 00 ........ 0x000010: 00 00 00 00 00 00 00 00 ........ 0x000018: 00 00 00 00 00 00 00 00 ........ 0x000020: ab ab ab ab ab ab ab ab ........ 0x000028: ab ab ab ab ab ab ab ab ........ 0x000030: ab ab ab ab ab ab ab ab ........ 0x000038: ab ab ab ab ab ab ab ab ........ Reported-by: Moguofang Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Moguofang Signed-off-by: Wanpeng Li Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b5fc7e35f6e..519f3572e48e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4831,16 +4831,20 @@ emul_write: static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) { - /* TODO: String I/O for in kernel device */ - int r; + int r = 0, i; - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); + for (i = 0; i < vcpu->arch.pio.count; i++) { + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); + if (r) + break; + pd += vcpu->arch.pio.size; + } return r; } @@ -4878,6 +4882,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (vcpu->arch.pio.count) goto data_avail; + memset(vcpu->arch.pio_data, 0, size * count); + ret = emulator_pio_in_out(vcpu, size, port, val, count, true); if (ret) { data_avail: From f0367ee1d64d27fa08be2407df5c125442e885e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 18 May 2017 19:37:30 +0200 Subject: [PATCH 25/27] KVM: x86: zero base3 of unusable segments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Static checker noticed that base3 could be used uninitialized if the segment was not present (useable). Random stack values probably would not pass VMCS entry checks. Reported-by: Dan Carpenter Fixes: 1aa366163b8b ("KVM: x86 emulator: consolidate segment accessors") Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 519f3572e48e..02363e37d4a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5067,6 +5067,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, if (var.unusable) { memset(desc, 0, sizeof(*desc)); + if (base3) + *base3 = 0; return false; } From 34b0dadbdf698f9b277a31b2747b625b9a75ea1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 18 May 2017 19:37:31 +0200 Subject: [PATCH 26/27] KVM: x86/vPMU: fix undefined shift in intel_pmu_refresh() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Static analysis noticed that pmu->nr_arch_gp_counters can be 32 (INTEL_PMC_MAX_GENERIC) and therefore cannot be used to shift 'int'. I didn't add BUILD_BUG_ON for it as we have a better checker. Reported-by: Dan Carpenter Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch") Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/pmu_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 9d4a8504a95a..5ab4a364348e 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; From 92ceb7679ab8807d3b7fbcc6daf2279036954ef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 18 May 2017 19:37:32 +0200 Subject: [PATCH 27/27] KVM: x86: prevent uninitialized variable warning in check_svme() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_msr() of MSR_EFER is currently always going to succeed, but static checker doesn't see that far. Don't complicate stuff and just use 0 for the fallback -- it means that the feature is not present. Reported-by: Dan Carpenter Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c25cfaf584e7..0816ab2e8adc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4173,7 +4173,7 @@ static int check_dr_write(struct x86_emulate_ctxt *ctxt) static int check_svme(struct x86_emulate_ctxt *ctxt) { - u64 efer; + u64 efer = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);