From 0d640732dbebed0f10f18526de21652931f0b2f2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 9 Nov 2018 15:07:10 +0000 Subject: [PATCH 01/28] arm64: KVM: Skip MMIO insn after emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we emulate an MMIO instruction, we advance the CPU state within decode_hsr(), before emulating the instruction effects. Having this logic in decode_hsr() is opaque, and advancing the state before emulation is problematic. It gets in the way of applying consistent single-step logic, and it prevents us from being able to fail an MMIO instruction with a synchronous exception. Clean this up by only advancing the CPU state *after* the effects of the instruction are emulated. Cc: Peter Maydell Reviewed-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmio.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index dac7ceb1a677..08443a15e6be 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -117,6 +117,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); } + /* + * The MMIO instruction is emulated and should not be re-executed + * in the guest. + */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 0; } @@ -144,11 +150,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) vcpu->arch.mmio_decode.sign_extend = sign_extend; vcpu->arch.mmio_decode.rt = rt; - /* - * The MMIO instruction is emulated and should not be re-executed - * in the guest. - */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); return 0; } From bd7d95cafb499e24903b7d21f9eeb2c5208160c2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 9 Nov 2018 15:07:11 +0000 Subject: [PATCH 02/28] arm64: KVM: Consistently advance singlestep when emulating instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we emulate a guest instruction, we don't advance the hardware singlestep state machine, and thus the guest will receive a software step exception after a next instruction which is not emulated by the host. We bodge around this in an ad-hoc fashion. Sometimes we explicitly check whether userspace requested a single step, and fake a debug exception from within the kernel. Other times, we advance the HW singlestep state rely on the HW to generate the exception for us. Thus, the observed step behaviour differs for host and guest. Let's make this simpler and consistent by always advancing the HW singlestep state machine when we skip an instruction. Thus we can rely on the hardware to generate the singlestep exception for us, and never need to explicitly check for an active-pending step, nor do we need to fake a debug exception from the guest. Cc: Peter Maydell Reviewed-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 5 --- arch/arm64/include/asm/kvm_emulate.h | 35 ++++++++++++++----- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/debug.c | 21 ------------ arch/arm64/kvm/handle_exit.c | 14 +------- arch/arm64/kvm/hyp/switch.c | 43 +++--------------------- arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 12 +++++-- virt/kvm/arm/arm.c | 2 -- virt/kvm/arm/hyp/vgic-v3-sr.c | 6 +++- 9 files changed, 46 insertions(+), 93 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 5ca5d9af0c26..c5634c6ffcea 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -296,11 +296,6 @@ static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} -static inline bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, - struct kvm_run *run) -{ - return false; -} int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 21247870def7..506386a3edde 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -24,6 +24,7 @@ #include +#include #include #include #include @@ -147,14 +148,6 @@ static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) return true; } -static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - if (vcpu_mode_is_32bit(vcpu)) - kvm_skip_instr32(vcpu, is_wide_instr); - else - *vcpu_pc(vcpu) += 4; -} - static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) { *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT; @@ -424,4 +417,30 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, return data; /* Leave LE untouched */ } +static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) +{ + if (vcpu_mode_is_32bit(vcpu)) + kvm_skip_instr32(vcpu, is_wide_instr); + else + *vcpu_pc(vcpu) += 4; + + /* advance the singlestep state machine */ + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; +} + +/* + * Skip an instruction which has been emulated at hyp while most guest sysregs + * are live. + */ +static inline void __hyp_text __kvm_skip_instr(struct kvm_vcpu *vcpu) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(elr); + vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); + + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + + write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); + write_sysreg_el2(*vcpu_pc(vcpu), elr); +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 52fbc823ff8c..7a5035f9c5c3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -445,7 +445,6 @@ void kvm_arm_init_debug(void); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); -bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 00d422336a45..f39801e4136c 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -236,24 +236,3 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) } } } - - -/* - * After successfully emulating an instruction, we might want to - * return to user space with a KVM_EXIT_DEBUG. We can only do this - * once the emulation is complete, though, so for userspace emulations - * we have to wait until we have re-entered KVM before calling this - * helper. - * - * Return true (and set exit_reason) to return to userspace or false - * if no further action is required. - */ -bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - run->exit_reason = KVM_EXIT_DEBUG; - run->debug.arch.hsr = ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT; - return true; - } - return false; -} diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 35a81bebd02b..b0643f9c4873 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -229,13 +229,6 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) handled = exit_handler(vcpu, run); } - /* - * kvm_arm_handle_step_debug() sets the exit_reason on the kvm_run - * structure if we need to return to userspace. - */ - if (handled > 0 && kvm_arm_handle_step_debug(vcpu, run)) - handled = 0; - return handled; } @@ -269,12 +262,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case ARM_EXCEPTION_IRQ: return 1; case ARM_EXCEPTION_EL1_SERROR: - /* We may still need to return for single-step */ - if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS) - && kvm_arm_handle_step_debug(vcpu, run)) - return 0; - else - return 1; + return 1; case ARM_EXCEPTION_TRAP: return handle_trap_exceptions(vcpu, run); case ARM_EXCEPTION_HYP_GONE: diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 7cc175c88a37..4282f05771c1 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -305,33 +305,6 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -/* Skip an instruction which has been emulated. Returns true if - * execution can continue or false if we need to exit hyp mode because - * single-step was in effect. - */ -static bool __hyp_text __skip_instr(struct kvm_vcpu *vcpu) -{ - *vcpu_pc(vcpu) = read_sysreg_el2(elr); - - if (vcpu_mode_is_32bit(vcpu)) { - vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); - kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); - } else { - *vcpu_pc(vcpu) += 4; - } - - write_sysreg_el2(*vcpu_pc(vcpu), elr); - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - vcpu->arch.fault.esr_el2 = - (ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT) | 0x22; - return false; - } else { - return true; - } -} - static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) { struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; @@ -420,20 +393,12 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (valid) { int ret = __vgic_v2_perform_cpuif_access(vcpu); - if (ret == 1 && __skip_instr(vcpu)) + if (ret == 1) return true; - if (ret == -1) { - /* Promote an illegal access to an - * SError. If we would be returning - * due to single-step clear the SS - * bit so handle_exit knows what to - * do after dealing with the error. - */ - if (!__skip_instr(vcpu)) - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + /* Promote an illegal access to an SError.*/ + if (ret == -1) *exit_code = ARM_EXCEPTION_EL1_SERROR; - } goto exit; } @@ -444,7 +409,7 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { int ret = __vgic_v3_perform_cpuif_access(vcpu); - if (ret == 1 && __skip_instr(vcpu)) + if (ret == 1) return true; } diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 215c7c0eb3b0..9cbdd034a563 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -41,7 +41,7 @@ static bool __hyp_text __is_be(struct kvm_vcpu *vcpu) * Returns: * 1: GICV access successfully performed * 0: Not a GICV access - * -1: Illegal GICV access + * -1: Illegal GICV access successfully performed */ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) { @@ -61,12 +61,16 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) return 0; /* Reject anything but a 32bit access */ - if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) + if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) { + __kvm_skip_instr(vcpu); return -1; + } /* Not aligned? Don't bother */ - if (fault_ipa & 3) + if (fault_ipa & 3) { + __kvm_skip_instr(vcpu); return -1; + } rd = kvm_vcpu_dabt_get_rd(vcpu); addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va; @@ -88,5 +92,7 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) vcpu_set_reg(vcpu, rd, data); } + __kvm_skip_instr(vcpu); + return 1; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 23774970c9df..4adcee5fc126 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -674,8 +674,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_handle_mmio_return(vcpu, vcpu->run); if (ret) return ret; - if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) - return 0; } if (run->immediate_exit) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 616e5a433ab0..9652c453480f 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -1012,8 +1012,10 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) esr = kvm_vcpu_get_hsr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { - if (!kvm_condition_valid(vcpu)) + if (!kvm_condition_valid(vcpu)) { + __kvm_skip_instr(vcpu); return 1; + } sysreg = esr_cp15_to_sysreg(esr); } else { @@ -1123,6 +1125,8 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) rt = kvm_vcpu_sys_get_rt(vcpu); fn(vcpu, vmcr, rt); + __kvm_skip_instr(vcpu); + return 1; } From fb544d1ca65a89f7a3895f7531221ceeed74ada7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 11 Dec 2018 13:23:57 +0100 Subject: [PATCH 03/28] KVM: arm/arm64: Fix VMID alloc race by reverting to lock-less We recently addressed a VMID generation race by introducing a read/write lock around accesses and updates to the vmid generation values. However, kvm_arch_vcpu_ioctl_run() also calls need_new_vmid_gen() but does so without taking the read lock. As far as I can tell, this can lead to the same kind of race: VM 0, VCPU 0 VM 0, VCPU 1 ------------ ------------ update_vttbr (vmid 254) update_vttbr (vmid 1) // roll over read_lock(kvm_vmid_lock); force_vm_exit() local_irq_disable need_new_vmid_gen == false //because vmid gen matches enter_guest (vmid 254) kvm_arch.vttbr = : read_unlock(kvm_vmid_lock); enter_guest (vmid 1) Which results in running two VCPUs in the same VM with different VMIDs and (even worse) other VCPUs from other VMs could now allocate clashing VMID 254 from the new generation as long as VCPU 0 is not exiting. Attempt to solve this by making sure vttbr is updated before another CPU can observe the updated VMID generation. Cc: stable@vger.kernel.org Fixes: f0cf47d939d0 "KVM: arm/arm64: Close VMID generation race" Reviewed-by: Julien Thierry Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/arm.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 4adcee5fc126..d9273f972828 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -66,7 +66,7 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; static unsigned int kvm_vmid_bits __read_mostly; -static DEFINE_RWLOCK(kvm_vmid_lock); +static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; @@ -484,7 +484,9 @@ void force_vm_exit(const cpumask_t *mask) */ static bool need_new_vmid_gen(struct kvm *kvm) { - return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen)); + u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); + smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ + return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen); } /** @@ -499,16 +501,11 @@ static void update_vttbr(struct kvm *kvm) { phys_addr_t pgd_phys; u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; - bool new_gen; - read_lock(&kvm_vmid_lock); - new_gen = need_new_vmid_gen(kvm); - read_unlock(&kvm_vmid_lock); - - if (!new_gen) + if (!need_new_vmid_gen(kvm)) return; - write_lock(&kvm_vmid_lock); + spin_lock(&kvm_vmid_lock); /* * We need to re-check the vmid_gen here to ensure that if another vcpu @@ -516,7 +513,7 @@ static void update_vttbr(struct kvm *kvm) * use the same vmid. */ if (!need_new_vmid_gen(kvm)) { - write_unlock(&kvm_vmid_lock); + spin_unlock(&kvm_vmid_lock); return; } @@ -539,7 +536,6 @@ static void update_vttbr(struct kvm *kvm) kvm_call_hyp(__kvm_flush_vm_context); } - kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen); kvm->arch.vmid = kvm_next_vmid; kvm_next_vmid++; kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; @@ -550,7 +546,10 @@ static void update_vttbr(struct kvm *kvm) vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; - write_unlock(&kvm_vmid_lock); + smp_wmb(); + WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); + + spin_unlock(&kvm_vmid_lock); } static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) From d1878af3a5a6ac00bc8a3edfecf80539ee9c546e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 6 Dec 2018 12:31:44 +0000 Subject: [PATCH 04/28] KVM: arm/arm64: Log PSTATE for unhandled sysregs When KVM traps an unhandled sysreg/coproc access from a guest, it logs the guest PC. To aid debugging, it would be helpful to know which exception level the trap came from, along with other PSTATE/CPSR bits, so let's log the PSTATE/CPSR too. Acked-by: Christoffer Dall Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm/kvm/coproc.c | 4 ++-- arch/arm64/kvm/sys_regs.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index cb094e55dc5f..222c1635bc7a 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -602,8 +602,8 @@ static int emulate_cp15(struct kvm_vcpu *vcpu, } } else { /* If access function fails, it should complain. */ - kvm_err("Unsupported guest CP15 access at: %08lx\n", - *vcpu_pc(vcpu)); + kvm_err("Unsupported guest CP15 access at: %08lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_cp_instr(params); kvm_inject_undefined(vcpu); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 22fbbdbece3c..ecbf67ccabf5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1912,8 +1912,8 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, WARN_ON(1); } - kvm_err("Unsupported guest CP%d access at: %08lx\n", - cp, *vcpu_pc(vcpu)); + kvm_err("Unsupported guest CP%d access at: %08lx [%08lx]\n", + cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_sys_reg_instr(params); kvm_inject_undefined(vcpu); } @@ -2063,8 +2063,8 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, if (likely(r)) { perform_access(vcpu, params, r); } else { - kvm_err("Unsupported guest sys_reg access at: %lx\n", - *vcpu_pc(vcpu)); + kvm_err("Unsupported guest sys_reg access at: %lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_sys_reg_instr(params); kvm_inject_undefined(vcpu); } From 60c3ab30d8c2ff3a52606df03f05af2aae07dc6b Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 11 Dec 2018 12:51:03 +0100 Subject: [PATCH 05/28] KVM: arm/arm64: vgic-v2: Set active_source to 0 when restoring state When restoring the active state from userspace, we don't know which CPU was the source for the active state, and this is not architecturally exposed in any of the register state. Set the active_source to 0 in this case. In the future, we can expand on this and exposse the information as additional information to userspace for GICv2 if anyone cares. Cc: stable@vger.kernel.org Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index f56ff1cf52ec..2b450d49a046 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -338,11 +338,26 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); } else { u32 model = vcpu->kvm->arch.vgic.vgic_model; + u8 active_source; irq->active = active; + + /* + * The GICv2 architecture indicates that the source CPUID for + * an SGI should be provided during an EOI which implies that + * the active state is stored somewhere, but at the same time + * this state is not architecturally exposed anywhere and we + * have no way of knowing the right source. + * + * This may lead to a VCPU not being able to receive + * additional instances of a particular SGI after migration + * for a GICv2 VM on some GIC implementations. Oh well. + */ + active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; + if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && active && vgic_irq_is_sgi(irq->intid)) - irq->active_source = requester_vcpu->vcpu_id; + irq->active_source = active_source; } if (irq->active) From 3f58bf63455588a260b6abc8761c48bc8977b919 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:34 +0000 Subject: [PATCH 06/28] KVM: arm/arm64: Share common code in user_mem_abort() The code for operations such as marking the pfn as dirty, and dcache/icache maintenance during stage 2 fault handling is duplicated between normal pages and PMD hugepages. Instead of creating another copy of the operations when we introduce PUD hugepages, let's share them across the different pagesizes. Signed-off-by: Punit Agrawal Reviewed-by: Suzuki K Poulose Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 49 ++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 5eca48bdb1a6..59595207c5e1 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1475,7 +1475,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long fault_status) { int ret; - bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; + bool write_fault, exec_fault, writable, force_pte = false; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -1484,7 +1484,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); - unsigned long flags = 0; + unsigned long vma_pagesize, flags = 0; write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_iabt(vcpu); @@ -1504,10 +1504,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) { - hugetlb = true; + vma_pagesize = vma_kernel_pagesize(vma); + if (vma_pagesize == PMD_SIZE && !logging_active) { gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { + /* + * Fallback to PTE if it's not one of the Stage 2 + * supported hugepage sizes + */ + vma_pagesize = PAGE_SIZE; + /* * Pages belonging to memslots that don't have the same * alignment for userspace and IPA cannot be mapped using @@ -1573,23 +1579,33 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb && !force_pte) - hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); + if (vma_pagesize == PAGE_SIZE && !force_pte) { + /* + * Only PMD_SIZE transparent hugepages(THP) are + * currently supported. This code will need to be + * updated to support other THP sizes. + */ + if (transparent_hugepage_adjust(&pfn, &fault_ipa)) + vma_pagesize = PMD_SIZE; + } - if (hugetlb) { + if (writable) + kvm_set_pfn_dirty(pfn); + + if (fault_status != FSC_PERM) + clean_dcache_guest_page(pfn, vma_pagesize); + + if (exec_fault) + invalidate_icache_guest_page(pfn, vma_pagesize); + + if (vma_pagesize == PMD_SIZE) { pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); - if (writable) { + if (writable) new_pmd = kvm_s2pmd_mkwrite(new_pmd); - kvm_set_pfn_dirty(pfn); - } - - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, PMD_SIZE); if (exec_fault) { new_pmd = kvm_s2pmd_mkexec(new_pmd); - invalidate_icache_guest_page(pfn, PMD_SIZE); } else if (fault_status == FSC_PERM) { /* Preserve execute if XN was already cleared */ if (stage2_is_exec(kvm, fault_ipa)) @@ -1602,16 +1618,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (writable) { new_pte = kvm_s2pte_mkwrite(new_pte); - kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, PAGE_SIZE); - if (exec_fault) { new_pte = kvm_s2pte_mkexec(new_pte); - invalidate_icache_guest_page(pfn, PAGE_SIZE); } else if (fault_status == FSC_PERM) { /* Preserve execute if XN was already cleared */ if (stage2_is_exec(kvm, fault_ipa)) From 6396b852e46e562f4742ed0a9042b537eb26b8aa Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:35 +0000 Subject: [PATCH 07/28] KVM: arm/arm64: Re-factor setting the Stage 2 entry to exec on fault Stage 2 fault handler marks a page as executable if it is handling an execution fault or if it was a permission fault in which case the executable bit needs to be preserved. The logic to decide if the page should be marked executable is duplicated for PMD and PTE entries. To avoid creating another copy when support for PUD hugepages is introduced refactor the code to share the checks needed to mark a page table entry as executable. Signed-off-by: Punit Agrawal Reviewed-by: Suzuki K Poulose Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 59595207c5e1..6912529946fb 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1475,7 +1475,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long fault_status) { int ret; - bool write_fault, exec_fault, writable, force_pte = false; + bool write_fault, writable, force_pte = false; + bool exec_fault, needs_exec; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -1598,19 +1599,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) invalidate_icache_guest_page(pfn, vma_pagesize); + /* + * If we took an execution fault we have made the + * icache/dcache coherent above and should now let the s2 + * mapping be executable. + * + * Write faults (!exec_fault && FSC_PERM) are orthogonal to + * execute permissions, and we preserve whatever we have. + */ + needs_exec = exec_fault || + (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); + if (vma_pagesize == PMD_SIZE) { pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); if (writable) new_pmd = kvm_s2pmd_mkwrite(new_pmd); - if (exec_fault) { + if (needs_exec) new_pmd = kvm_s2pmd_mkexec(new_pmd); - } else if (fault_status == FSC_PERM) { - /* Preserve execute if XN was already cleared */ - if (stage2_is_exec(kvm, fault_ipa)) - new_pmd = kvm_s2pmd_mkexec(new_pmd); - } ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { @@ -1621,13 +1628,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mark_page_dirty(kvm, gfn); } - if (exec_fault) { + if (needs_exec) new_pte = kvm_s2pte_mkexec(new_pte); - } else if (fault_status == FSC_PERM) { - /* Preserve execute if XN was already cleared */ - if (stage2_is_exec(kvm, fault_ipa)) - new_pte = kvm_s2pte_mkexec(new_pte); - } ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } From f8df73388ee25b5e5f1d26249202e7126ca8139d Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:36 +0000 Subject: [PATCH 08/28] KVM: arm/arm64: Introduce helpers to manipulate page table entries Introduce helpers to abstract architectural handling of the conversion of pfn to page table entries and marking a PMD page table entry as a block entry. The helpers are introduced in preparation for supporting PUD hugepages at stage 2 - which are supported on arm64 but do not exist on arm. Signed-off-by: Punit Agrawal Reviewed-by: Suzuki K Poulose Acked-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 5 +++++ arch/arm64/include/asm/kvm_mmu.h | 5 +++++ virt/kvm/arm/mmu.c | 14 ++++++++------ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 1098ffc3d54b..e6eff8bf5d7f 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -82,6 +82,11 @@ void kvm_clear_hyp_idmap(void); #define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE) #define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; }) +#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) +#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) + +#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= L_PTE_S2_RDWR; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 658657367f2f..13d482710292 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -184,6 +184,11 @@ void kvm_clear_hyp_idmap(void); #define kvm_mk_pgd(pudp) \ __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) +#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) +#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) + +#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= PTE_S2_RDWR; diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 6912529946fb..fb5325f7a1ac 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -607,7 +607,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, addr = start; do { pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, pfn_pte(pfn, prot)); + kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); get_page(virt_to_page(pte)); pfn++; } while (addr += PAGE_SIZE, addr != end); @@ -1202,7 +1202,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, pfn = __phys_to_pfn(pa); for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); + pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); if (writable) pte = kvm_s2pte_mkwrite(pte); @@ -1611,8 +1611,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); if (vma_pagesize == PMD_SIZE) { - pmd_t new_pmd = pfn_pmd(pfn, mem_type); - new_pmd = pmd_mkhuge(new_pmd); + pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); + + new_pmd = kvm_pmd_mkhuge(new_pmd); + if (writable) new_pmd = kvm_s2pmd_mkwrite(new_pmd); @@ -1621,7 +1623,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { - pte_t new_pte = pfn_pte(pfn, mem_type); + pte_t new_pte = kvm_pfn_pte(pfn, mem_type); if (writable) { new_pte = kvm_s2pte_mkwrite(new_pte); @@ -1878,7 +1880,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) * just like a translation fault and clean the cache to the PoC. */ clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = pfn_pte(pfn, PAGE_S2); + stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } From 4ea5af53114091e23a8fc279f25637e6c4e892c6 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:37 +0000 Subject: [PATCH 09/28] KVM: arm64: Support dirty page tracking for PUD hugepages In preparation for creating PUD hugepages at stage 2, add support for write protecting PUD hugepages when they are encountered. Write protecting guest tables is used to track dirty pages when migrating VMs. Also, provide trivial implementations of required kvm_s2pud_* helpers to allow sharing of code with arm32. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replaced BUG() => WARN_ON() in arm32 pud helpers ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 15 +++++++++++++++ arch/arm64/include/asm/kvm_mmu.h | 10 ++++++++++ virt/kvm/arm/mmu.c | 11 +++++++---- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index e6eff8bf5d7f..9fe6c30eb2fc 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -87,6 +87,21 @@ void kvm_clear_hyp_idmap(void); #define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) +/* + * The following kvm_*pud*() functions are provided strictly to allow + * sharing code with arm64. They should never be called in practice. + */ +static inline void kvm_set_s2pud_readonly(pud_t *pud) +{ + WARN_ON(1); +} + +static inline bool kvm_s2pud_readonly(pud_t *pud) +{ + WARN_ON(1); + return false; +} + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= L_PTE_S2_RDWR; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 13d482710292..8da6d1b2a196 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -251,6 +251,16 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); } +static inline void kvm_set_s2pud_readonly(pud_t *pudp) +{ + kvm_set_s2pte_readonly((pte_t *)pudp); +} + +static inline bool kvm_s2pud_readonly(pud_t *pudp) +{ + return kvm_s2pte_readonly((pte_t *)pudp); +} + #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fb5325f7a1ac..1c669c3c1208 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1347,9 +1347,12 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { - /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_wp_pmds(kvm, pud, addr, next); + if (stage2_pud_huge(kvm, *pud)) { + if (!kvm_s2pud_readonly(pud)) + kvm_set_s2pud_readonly(pud); + } else { + stage2_wp_pmds(kvm, pud, addr, next); + } } } while (pud++, addr = next, addr != end); } @@ -1392,7 +1395,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) * * Called to start logging dirty pages after memory region * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns - * all present PMD and PTEs are write protected in the memory region. + * all present PUD, PMD and PTEs are write protected in the memory region. * Afterwards read of dirty page log can be called. * * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, From 86d1c55ea605025f78d026e7fc3a2bb4c3fc2d6a Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:38 +0000 Subject: [PATCH 10/28] KVM: arm64: Support PUD hugepage in stage2_is_exec() In preparation for creating PUD hugepages at stage 2, add support for detecting execute permissions on PUD page table entries. Faults due to lack of execute permissions on page table entries is used to perform i-cache invalidation on first execute. Provide trivial implementations of arm32 helpers to allow sharing of code. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replaced BUG() => WARN_ON(1) in arm32 PUD helpers ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 6 +++ arch/arm64/include/asm/kvm_mmu.h | 5 +++ arch/arm64/include/asm/pgtable-hwdef.h | 2 + virt/kvm/arm/mmu.c | 53 +++++++++++++++++++++++--- 4 files changed, 61 insertions(+), 5 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 9fe6c30eb2fc..a49655fe7cd9 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -102,6 +102,12 @@ static inline bool kvm_s2pud_readonly(pud_t *pud) return false; } +static inline bool kvm_s2pud_exec(pud_t *pud) +{ + WARN_ON(1); + return false; +} + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= L_PTE_S2_RDWR; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 8da6d1b2a196..c755b37b3f92 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -261,6 +261,11 @@ static inline bool kvm_s2pud_readonly(pud_t *pudp) return kvm_s2pte_readonly((pte_t *)pudp); } +static inline bool kvm_s2pud_exec(pud_t *pudp) +{ + return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); +} + #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 1d7d8da2ef9b..336e24cddc87 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -193,6 +193,8 @@ #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ #define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ +#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */ + /* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 1c669c3c1208..8e44dccd1b47 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1083,23 +1083,66 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache return 0; } -static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +/* + * stage2_get_leaf_entry - walk the stage2 VM page tables and return + * true if a valid and present leaf-entry is found. A pointer to the + * leaf-entry is returned in the appropriate level variable - pudpp, + * pmdpp, ptepp. + */ +static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, + pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) { + pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - pmdp = stage2_get_pmd(kvm, NULL, addr); + *pudpp = NULL; + *pmdpp = NULL; + *ptepp = NULL; + + pudp = stage2_get_pud(kvm, NULL, addr); + if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) + return false; + + if (stage2_pud_huge(kvm, *pudp)) { + *pudpp = pudp; + return true; + } + + pmdp = stage2_pmd_offset(kvm, pudp, addr); if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) return false; - if (pmd_thp_or_huge(*pmdp)) - return kvm_s2pmd_exec(pmdp); + if (pmd_thp_or_huge(*pmdp)) { + *pmdpp = pmdp; + return true; + } ptep = pte_offset_kernel(pmdp, addr); if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) return false; - return kvm_s2pte_exec(ptep); + *ptepp = ptep; + return true; +} + +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +{ + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + bool found; + + found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); + if (!found) + return false; + + if (pudp) + return kvm_s2pud_exec(pudp); + else if (pmdp) + return kvm_s2pmd_exec(pmdp); + else + return kvm_s2pte_exec(ptep); } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, From eb3f0624ea082def887acc79e97934e27d0188b7 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:39 +0000 Subject: [PATCH 11/28] KVM: arm64: Support handling access faults for PUD hugepages In preparation for creating larger hugepages at Stage 2, extend the access fault handling at Stage 2 to support PUD hugepages when encountered. Provide trivial helpers for arm32 to allow sharing of code. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replaced BUG() => WARN_ON(1) in PUD helpers ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 9 +++++++++ arch/arm64/include/asm/kvm_mmu.h | 7 +++++++ arch/arm64/include/asm/pgtable.h | 6 ++++++ virt/kvm/arm/mmu.c | 22 +++++++++++----------- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index a49655fe7cd9..3a407204b957 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -85,6 +85,9 @@ void kvm_clear_hyp_idmap(void); #define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) #define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) +#define kvm_pud_pfn(pud) ({ WARN_ON(1); 0; }) + + #define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) /* @@ -108,6 +111,12 @@ static inline bool kvm_s2pud_exec(pud_t *pud) return false; } +static inline pud_t kvm_s2pud_mkyoung(pud_t pud) +{ + BUG(); + return pud; +} + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= L_PTE_S2_RDWR; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index c755b37b3f92..612032bbb428 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -187,6 +187,8 @@ void kvm_clear_hyp_idmap(void); #define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) #define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) +#define kvm_pud_pfn(pud) pud_pfn(pud) + #define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) static inline pte_t kvm_s2pte_mkwrite(pte_t pte) @@ -266,6 +268,11 @@ static inline bool kvm_s2pud_exec(pud_t *pudp) return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); } +static inline pud_t kvm_s2pud_mkyoung(pud_t pud) +{ + return pud_mkyoung(pud); +} + #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 50b1ef8584c0..f51e2271e6a3 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -314,6 +314,11 @@ static inline pte_t pud_pte(pud_t pud) return __pte(pud_val(pud)); } +static inline pud_t pte_pud(pte_t pte) +{ + return __pud(pte_val(pte)); +} + static inline pmd_t pud_pmd(pud_t pud) { return __pmd(pud_val(pud)); @@ -381,6 +386,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 8e44dccd1b47..bd749601195f 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1698,6 +1698,7 @@ out_unlock: */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { + pud_t *pud; pmd_t *pmd; pte_t *pte; kvm_pfn_t pfn; @@ -1707,24 +1708,23 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) spin_lock(&vcpu->kvm->mmu_lock); - pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) goto out; - if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ + if (pud) { /* HugeTLB */ + *pud = kvm_s2pud_mkyoung(*pud); + pfn = kvm_pud_pfn(*pud); + pfn_valid = true; + } else if (pmd) { /* THP, HugeTLB */ *pmd = pmd_mkyoung(*pmd); pfn = pmd_pfn(*pmd); pfn_valid = true; - goto out; + } else { + *pte = pte_mkyoung(*pte); /* Just a page... */ + pfn = pte_pfn(*pte); + pfn_valid = true; } - pte = pte_offset_kernel(pmd, fault_ipa); - if (pte_none(*pte)) /* Nothing there either */ - goto out; - - *pte = pte_mkyoung(*pte); /* Just a page... */ - pfn = pte_pfn(*pte); - pfn_valid = true; out: spin_unlock(&vcpu->kvm->mmu_lock); if (pfn_valid) From 35a63966194dd994f44150f07398c62f8dca011e Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:40 +0000 Subject: [PATCH 12/28] KVM: arm64: Update age handlers to support PUD hugepages In preparation for creating larger hugepages at Stage 2, add support to the age handling notifiers for PUD hugepages when encountered. Provide trivial helpers for arm32 to allow sharing code. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replaced BUG() => WARN_ON(1) for arm32 PUD helpers ] Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 6 +++++ arch/arm64/include/asm/kvm_mmu.h | 5 ++++ arch/arm64/include/asm/pgtable.h | 1 + virt/kvm/arm/mmu.c | 39 ++++++++++++++++---------------- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 3a407204b957..4b4b2146e6a0 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -117,6 +117,12 @@ static inline pud_t kvm_s2pud_mkyoung(pud_t pud) return pud; } +static inline bool kvm_s2pud_young(pud_t pud) +{ + WARN_ON(1); + return false; +} + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= L_PTE_S2_RDWR; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 612032bbb428..9f941f70775c 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -273,6 +273,11 @@ static inline pud_t kvm_s2pud_mkyoung(pud_t pud) return pud_mkyoung(pud); } +static inline bool kvm_s2pud_young(pud_t pud) +{ + return pud_young(pud); +} + #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f51e2271e6a3..bb0f3f17a7a9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -386,6 +386,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) +#define pud_young(pud) pte_young(pud_pte(pud)) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index bd749601195f..3893ea6a50bf 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1225,6 +1225,11 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) return stage2_ptep_test_and_clear_young((pte_t *)pmd); } +static int stage2_pudp_test_and_clear_young(pud_t *pud) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pud); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -1932,42 +1937,38 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { + pud_t *pud; pmd_t *pmd; pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - pmd = stage2_get_pmd(kvm, NULL, gpa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) return 0; - if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + if (pud) + return stage2_pudp_test_and_clear_young(pud); + else if (pmd) return stage2_pmdp_test_and_clear_young(pmd); - - pte = pte_offset_kernel(pmd, gpa); - if (pte_none(*pte)) - return 0; - - return stage2_ptep_test_and_clear_young(pte); + else + return stage2_ptep_test_and_clear_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { + pud_t *pud; pmd_t *pmd; pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - pmd = stage2_get_pmd(kvm, NULL, gpa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) return 0; - if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + if (pud) + return kvm_s2pud_young(*pud); + else if (pmd) return pmd_young(*pmd); - - pte = pte_offset_kernel(pmd, gpa); - if (!pte_none(*pte)) /* Just a page... */ + else return pte_young(*pte); - - return 0; } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) From b8e0ba7c8bea994011aff3b4c35256b180fab874 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 11 Dec 2018 17:10:41 +0000 Subject: [PATCH 13/28] KVM: arm64: Add support for creating PUD hugepages at stage 2 KVM only supports PMD hugepages at stage 2. Now that the various page handling routines are updated, extend the stage 2 fault handling to map in PUD hugepages. Addition of PUD hugepage support enables additional page sizes (e.g., 1G with 4K granule) which can be useful on cores that support mapping larger block sizes in the TLB entries. Signed-off-by: Punit Agrawal Reviewed-by: Christoffer Dall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon [ Replace BUG() => WARN_ON(1) for arm32 PUD helpers ] Signed-off-by: Suzuki Poulose Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_mmu.h | 20 +++++ arch/arm/include/asm/stage2_pgtable.h | 5 ++ arch/arm64/include/asm/kvm_mmu.h | 16 ++++ arch/arm64/include/asm/pgtable-hwdef.h | 2 + arch/arm64/include/asm/pgtable.h | 2 + virt/kvm/arm/mmu.c | 104 +++++++++++++++++++++++-- 6 files changed, 143 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 4b4b2146e6a0..3a875fc1b63c 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -84,11 +84,14 @@ void kvm_clear_hyp_idmap(void); #define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) #define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) +#define kvm_pfn_pud(pfn, prot) (__pud(0)) #define kvm_pud_pfn(pud) ({ WARN_ON(1); 0; }) #define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) +/* No support for pud hugepages */ +#define kvm_pud_mkhuge(pud) ( {WARN_ON(1); pud; }) /* * The following kvm_*pud*() functions are provided strictly to allow @@ -105,6 +108,23 @@ static inline bool kvm_s2pud_readonly(pud_t *pud) return false; } +static inline void kvm_set_pud(pud_t *pud, pud_t new_pud) +{ + WARN_ON(1); +} + +static inline pud_t kvm_s2pud_mkwrite(pud_t pud) +{ + WARN_ON(1); + return pud; +} + +static inline pud_t kvm_s2pud_mkexec(pud_t pud) +{ + WARN_ON(1); + return pud; +} + static inline bool kvm_s2pud_exec(pud_t *pud) { WARN_ON(1); diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index f6a7ea805232..f9017167a8d1 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -68,4 +68,9 @@ stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) #define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) #define stage2_pud_table_empty(kvm, pudp) false +static inline bool kvm_stage2_has_pud(struct kvm *kvm) +{ + return false; +} + #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 9f941f70775c..8af4b1befa42 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -184,12 +184,16 @@ void kvm_clear_hyp_idmap(void); #define kvm_mk_pgd(pudp) \ __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) +#define kvm_set_pud(pudp, pud) set_pud(pudp, pud) + #define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) #define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) +#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot) #define kvm_pud_pfn(pud) pud_pfn(pud) #define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) +#define kvm_pud_mkhuge(pud) pud_mkhuge(pud) static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { @@ -203,6 +207,12 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) return pmd; } +static inline pud_t kvm_s2pud_mkwrite(pud_t pud) +{ + pud_val(pud) |= PUD_S2_RDWR; + return pud; +} + static inline pte_t kvm_s2pte_mkexec(pte_t pte) { pte_val(pte) &= ~PTE_S2_XN; @@ -215,6 +225,12 @@ static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) return pmd; } +static inline pud_t kvm_s2pud_mkexec(pud_t pud) +{ + pud_val(pud) &= ~PUD_S2_XN; + return pud; +} + static inline void kvm_set_s2pte_readonly(pte_t *ptep) { pteval_t old_pteval, pteval; diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 336e24cddc87..6f1c187f1c86 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -193,6 +193,8 @@ #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ #define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ +#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ +#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ #define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */ /* diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bb0f3f17a7a9..576128635f3c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -390,6 +390,8 @@ static inline int pmd_protnone(pmd_t pmd) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) +#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT)) + #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 3893ea6a50bf..2dcff38868d4 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -115,6 +115,25 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) put_page(virt_to_page(pmd)); } +/** + * stage2_dissolve_pud() - clear and flush huge PUD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pud: pud pointer for IPA + * + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all + * pages in the range dirty. + */ +static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) +{ + if (!stage2_pud_huge(kvm, *pudp)) + return; + + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pudp)); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -1022,7 +1041,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); - if (!pud) + if (!pud || stage2_pud_huge(kvm, *pud)) return NULL; if (stage2_pud_none(kvm, *pud)) { @@ -1083,6 +1102,36 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache return 0; } +static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pud_t *new_pudp) +{ + pud_t *pudp, old_pud; + + pudp = stage2_get_pud(kvm, cache, addr); + VM_BUG_ON(!pudp); + + old_pud = *pudp; + + /* + * A large number of vcpus faulting on the same stage 2 entry, + * can lead to a refault due to the + * stage2_pud_clear()/tlb_flush(). Skip updating the page + * tables if there is no change. + */ + if (pud_val(old_pud) == pud_val(*new_pudp)) + return 0; + + if (stage2_pud_present(kvm, old_pud)) { + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pudp)); + } + + kvm_set_pud(pudp, *new_pudp); + return 0; +} + /* * stage2_get_leaf_entry - walk the stage2 VM page tables and return * true if a valid and present leaf-entry is found. A pointer to the @@ -1149,6 +1198,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr, const pte_t *new_pte, unsigned long flags) { + pud_t *pud; pmd_t *pmd; pte_t *pte, old_pte; bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; @@ -1157,7 +1207,31 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, VM_BUG_ON(logging_active && !cache); /* Create stage-2 page table mapping - Levels 0 and 1 */ - pmd = stage2_get_pmd(kvm, cache, addr); + pud = stage2_get_pud(kvm, cache, addr); + if (!pud) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* + * While dirty page logging - dissolve huge PUD, then continue + * on to allocate page. + */ + if (logging_active) + stage2_dissolve_pud(kvm, addr, pud); + + if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return 0; /* ignore calls from kvm_set_spte_hva */ + pmd = mmu_memory_cache_alloc(cache); + stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + + pmd = stage2_pmd_offset(kvm, pud, addr); if (!pmd) { /* * Ignore calls from kvm_set_spte_hva for unallocated @@ -1557,12 +1631,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } vma_pagesize = vma_kernel_pagesize(vma); - if (vma_pagesize == PMD_SIZE && !logging_active) { - gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; + /* + * PUD level may not exist for a VM but PMD is guaranteed to + * exist. + */ + if ((vma_pagesize == PMD_SIZE || + (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) && + !logging_active) { + gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; } else { /* * Fallback to PTE if it's not one of the Stage 2 - * supported hugepage sizes + * supported hugepage sizes or the corresponding level + * doesn't exist */ vma_pagesize = PAGE_SIZE; @@ -1661,7 +1742,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, needs_exec = exec_fault || (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); - if (vma_pagesize == PMD_SIZE) { + if (vma_pagesize == PUD_SIZE) { + pud_t new_pud = kvm_pfn_pud(pfn, mem_type); + + new_pud = kvm_pud_mkhuge(new_pud); + if (writable) + new_pud = kvm_s2pud_mkwrite(new_pud); + + if (needs_exec) + new_pud = kvm_s2pud_mkexec(new_pud); + + ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); + } else if (vma_pagesize == PMD_SIZE) { pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); new_pmd = kvm_pmd_mkhuge(new_pmd); From 2e2f6c3c0b08eed3fcf7de3c7684c940451bdeb1 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Mon, 26 Nov 2018 18:26:44 +0000 Subject: [PATCH 14/28] KVM: arm/arm64: vgic: Do not cond_resched_lock() with IRQs disabled To change the active state of an MMIO, halt is requested for all vcpus of the affected guest before modifying the IRQ state. This is done by calling cond_resched_lock() in vgic_mmio_change_active(). However interrupts are disabled at this point and we cannot reschedule a vcpu. We actually don't need any of this, as kvm_arm_halt_guest ensures that all the other vcpus are out of the guest. Let's just drop that useless code. Signed-off-by: Julien Thierry Suggested-by: Christoffer Dall Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 2b450d49a046..7c2231950c33 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -313,27 +313,6 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, spin_lock_irqsave(&irq->irq_lock, flags); - /* - * If this virtual IRQ was written into a list register, we - * have to make sure the CPU that runs the VCPU thread has - * synced back the LR state to the struct vgic_irq. - * - * As long as the conditions below are true, we know the VCPU thread - * may be on its way back from the guest (we kicked the VCPU thread in - * vgic_change_active_prepare) and still has to sync back this IRQ, - * so we release and re-acquire the spin_lock to let the other thread - * sync back the IRQ. - * - * When accessing VGIC state from user space, requester_vcpu is - * NULL, which is fine, because we guarantee that no VCPUs are running - * when accessing VGIC state from user space so irq->vcpu->cpu is - * always -1. - */ - while (irq->vcpu && /* IRQ may have state in an LR somewhere */ - irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */ - irq->vcpu->cpu != -1) /* VCPU thread is running */ - cond_resched_lock(&irq->irq_lock); - if (irq->hw) { vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); } else { From 6992195cc6c6c4d673a3266ae59cbeeb746d61af Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 6 Nov 2018 13:33:38 +0100 Subject: [PATCH 15/28] KVM: arm64: Clarify explanation of STAGE2_PGTABLE_LEVELS In attempting to re-construct the logic for our stage 2 page table layout I found the reasoning in the comment explaining how we calculate the number of levels used for stage 2 page tables a bit backwards. This commit attempts to clarify the comment, to make it slightly easier to read without having the Arm ARM open on the right page. While we're at it, fixup a typo in a comment that was recently changed. Reviewed-by: Suzuki K Poulose Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/stage2_pgtable.h | 16 +++++++--------- virt/kvm/arm/mmu.c | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index d352f6df8d2c..5412fa40825e 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -30,16 +30,14 @@ #define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) /* - * The hardware supports concatenation of up to 16 tables at stage2 entry level - * and we use the feature whenever possible. + * The hardware supports concatenation of up to 16 tables at stage2 entry + * level and we use the feature whenever possible, which means we resolve 4 + * additional bits of address at the entry level. * - * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). - * On arm64, the smallest PAGE_SIZE supported is 4k, which means - * (PAGE_SHIFT - 3) > 4 holds for all page sizes. - * This implies, the total number of page table levels at stage2 expected - * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) - * in normal translations(e.g, stage1), since we cannot have another level in - * the range (IPA_SHIFT, IPA_SHIFT - 4). + * This implies, the total number of page table levels required for + * IPA_SHIFT at stage2 expected by the hardware can be calculated using + * the same logic used for the (non-collapsable) stage1 page tables but for + * (IPA_SHIFT - 4). */ #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 2dcff38868d4..f605514395a1 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1356,7 +1356,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) struct page *page = pfn_to_page(pfn); /* - * PageTransCompoungMap() returns true for THP and + * PageTransCompoundMap() returns true for THP and * hugetlbfs. Make sure the adjustment is done only for THP * pages. */ From bea2ef803ade3359026d5d357348842bca9edcf1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Dec 2018 17:11:19 +0000 Subject: [PATCH 16/28] KVM: arm/arm64: vgic: Cap SPIs to the VM-defined maximum SPIs should be checked against the VMs specific configuration, and not the architectural maximum. Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 7cfdfbc910e0..8ab0491bcc94 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -108,8 +108,8 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, } /* SPIs */ - if (intid <= VGIC_MAX_SPI) { - intid = array_index_nospec(intid, VGIC_MAX_SPI); + if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; } From c23b2e6fc4ca346018618266bcabd335c0a8a49e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 12 Dec 2018 14:11:23 -0600 Subject: [PATCH 17/28] KVM: arm/arm64: vgic: Fix off-by-one bug in vgic_get_irq() When using the nospec API, it should be taken into account that: "...if the CPU speculates past the bounds check then * array_index_nospec() will clamp the index within the range of [0, * size)." The above is part of the header for macro array_index_nospec() in linux/nospec.h Now, in this particular case, if intid evaluates to exactly VGIC_MAX_SPI or to exaclty VGIC_MAX_PRIVATE, the array_index_nospec() macro ends up returning VGIC_MAX_SPI - 1 or VGIC_MAX_PRIVATE - 1 respectively, instead of VGIC_MAX_SPI or VGIC_MAX_PRIVATE, which, based on the original logic: /* SGIs and PPIs */ if (intid <= VGIC_MAX_PRIVATE) return &vcpu->arch.vgic_cpu.private_irqs[intid]; /* SPIs */ if (intid <= VGIC_MAX_SPI) return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; are valid values for intid. Fix this by calling array_index_nospec() macro with VGIC_MAX_PRIVATE + 1 and VGIC_MAX_SPI + 1 as arguments for its parameter size. Fixes: 41b87599c743 ("KVM: arm/arm64: vgic: fix possible spectre-v1 in vgic_get_irq()") Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. Silva [dropped the SPI part which was fixed separately] Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 8ab0491bcc94..f884a54b2601 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -103,7 +103,7 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, { /* SGIs and PPIs */ if (intid <= VGIC_MAX_PRIVATE) { - intid = array_index_nospec(intid, VGIC_MAX_PRIVATE); + intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); return &vcpu->arch.vgic_cpu.private_irqs[intid]; } From 9009782a4937ad0f4a4be6945080d7fa77fa4092 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 1 Dec 2018 13:21:47 -0800 Subject: [PATCH 18/28] KVM: arm/arm64: vgic: Consider priority and active state for pending irq When checking if there are any pending IRQs for the VM, consider the active state and priority of the IRQs as well. Otherwise we could be continuously scheduling a guest hypervisor without it seeing an IRQ. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index f884a54b2601..a6b135491b6c 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -908,6 +908,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) struct vgic_irq *irq; bool pending = false; unsigned long flags; + struct vgic_vmcr vmcr; if (!vcpu->kvm->arch.vgic.enabled) return false; @@ -915,11 +916,15 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) return true; + vgic_get_vmcr(vcpu, &vmcr); + spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); - pending = irq_is_pending(irq) && irq->enabled; + pending = irq_is_pending(irq) && irq->enabled && + !irq->active && + irq->priority < vmcr.pmr; spin_unlock(&irq->irq_lock); if (pending) From 71a7e47f39a2fb971ef9934e98ba089581eff641 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 3 Dec 2018 21:31:24 +0100 Subject: [PATCH 19/28] KVM: arm/arm64: Fixup the kvm_exit tracepoint The kvm_exit tracepoint strangely always reported exits as being IRQs. This seems to be because either the __print_symbolic or the tracepoint macros use a variable named idx. Take this chance to update the fields in the tracepoint to reflect the concepts in the arm64 architecture that we pass to the tracepoint and move the exception type table to the same location and header files as the exits code. We also clear out the exception code to 0 for IRQ exits (which translates to UNKNOWN in text) to make it slighyly less confusing to parse the trace output. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 4 ---- arch/arm64/include/asm/kvm_asm.h | 6 ++++++ virt/kvm/arm/trace.h | 18 +++++++++--------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 6f602af5263c..9921bb7ab6d8 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -320,10 +320,6 @@ #define PAR_TO_HPFAR(par) \ (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) -#define kvm_arm_exception_type \ - {0, "IRQ" }, \ - {1, "TRAP" } - #define ECN(x) { ESR_ELx_EC_##x, #x } #define kvm_arm_exception_class \ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index aea01a09eb94..b2e12c99db7d 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -34,6 +34,12 @@ /* The hyp-stub will return this for any kvm_call_hyp() call */ #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR +#define kvm_arm_exception_type \ + {ARM_EXCEPTION_IRQ, "IRQ" }, \ + {ARM_EXCEPTION_EL1_SERROR, "SERROR" }, \ + {ARM_EXCEPTION_TRAP, "TRAP" }, \ + {ARM_EXCEPTION_HYP_GONE, "HYP_GONE" } + #ifndef __ASSEMBLY__ #include diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 57b3edebbb40..f21f04f8036d 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -26,25 +26,25 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), - TP_ARGS(idx, exit_reason, vcpu_pc), + TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), + TP_ARGS(ret, esr_ec, vcpu_pc), TP_STRUCT__entry( - __field( int, idx ) - __field( unsigned int, exit_reason ) + __field( int, ret ) + __field( unsigned int, esr_ec ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( - __entry->idx = idx; - __entry->exit_reason = exit_reason; + __entry->ret = ARM_EXCEPTION_CODE(ret); + __entry->esr_ec = (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_TRAP) ? esr_ec : 0; __entry->vcpu_pc = vcpu_pc; ), TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", - __print_symbolic(__entry->idx, kvm_arm_exception_type), - __entry->exit_reason, - __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), + __print_symbolic(__entry->ret, kvm_arm_exception_type), + __entry->esr_ec, + __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), __entry->vcpu_pc) ); From 8a411b060f820140e9894239fde5819bb213bef0 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 27 Nov 2018 13:48:08 +0100 Subject: [PATCH 20/28] KVM: arm/arm64: Remove arch timer workqueue The use of a work queue in the hrtimer expire function for the bg_timer is a leftover from the time when we would inject interrupts when the bg_timer expired. Since we are no longer doing that, we can instead call kvm_vcpu_wake_up() directly from the hrtimer function and remove all workqueue functionality from the arch timer code. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 4 ---- virt/kvm/arm/arch_timer.c | 34 +++++++--------------------------- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6502feb9524b..33771352dcd6 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -21,7 +21,6 @@ #include #include -#include struct arch_timer_context { /* Registers: control register, timer value */ @@ -52,9 +51,6 @@ struct arch_timer_cpu { /* Background timer used when the guest is not running */ struct hrtimer bg_timer; - /* Work queued with the above timer expires */ - struct work_struct expired; - /* Physical timer emulation */ struct hrtimer phys_timer; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 17cecc96f735..da261f5e2a91 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -70,11 +70,9 @@ static void soft_timer_start(struct hrtimer *hrt, u64 ns) HRTIMER_MODE_ABS); } -static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) +static void soft_timer_cancel(struct hrtimer *hrt) { hrtimer_cancel(hrt); - if (work) - cancel_work_sync(work); } static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) @@ -102,23 +100,6 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * Work function for handling the backup timer that we schedule when a vcpu is - * no longer running, but had a timer programmed to fire in the future. - */ -static void kvm_timer_inject_irq_work(struct work_struct *work) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); - - /* - * If the vcpu is blocked we want to wake it up so that it will see - * the timer has expired when entering the guest. - */ - kvm_vcpu_wake_up(vcpu); -} - static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) { u64 cval, now; @@ -188,7 +169,7 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) return HRTIMER_RESTART; } - schedule_work(&timer->expired); + kvm_vcpu_wake_up(vcpu); return HRTIMER_NORESTART; } @@ -300,7 +281,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) * then we also don't need a soft timer. */ if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&timer->phys_timer, NULL); + soft_timer_cancel(&timer->phys_timer); return; } @@ -426,7 +407,7 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) vtimer_restore_state(vcpu); - soft_timer_cancel(&timer->bg_timer, &timer->expired); + soft_timer_cancel(&timer->bg_timer); } static void set_cntvoff(u64 cntvoff) @@ -544,7 +525,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ - soft_timer_cancel(&timer->phys_timer, NULL); + soft_timer_cancel(&timer->phys_timer); /* * The kernel may decide to run userspace after calling vcpu_put, so @@ -637,7 +618,6 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); vcpu_ptimer(vcpu)->cntvoff = 0; - INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->bg_timer.function = kvm_bg_timer_expire; @@ -794,8 +774,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - soft_timer_cancel(&timer->bg_timer, &timer->expired); - soft_timer_cancel(&timer->phys_timer, NULL); + soft_timer_cancel(&timer->bg_timer); + soft_timer_cancel(&timer->phys_timer); kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); } From 6e14ef1d12dc26ab4f6bef9d47e6906002f0a1ba Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 11 Dec 2018 09:54:11 +0100 Subject: [PATCH 21/28] KVM: arm/arm64: arch_timer: Simplify kvm_timer_vcpu_terminate kvm_timer_vcpu_terminate can only be called in two scenarios: 1. As part of cleanup during a failed VCPU create 2. As part of freeing the whole VM (struct kvm refcount == 0) In the first case, we cannot have programmed any timers or mapped any IRQs, and therefore we do not have to cancel anything or unmap anything. In the second case, the VCPU will have gone through kvm_timer_vcpu_put, which will have canceled the emulated physical timer's hrtimer, and we do not need to that here as well. We also do not care if the irq is recorded as mapped or not in the VGIC data structure, because the whole VM is going away. That leaves us only with having to ensure that we cancel the bg_timer if we were blocking the last time we called kvm_timer_vcpu_put(). Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index da261f5e2a91..b07ac4614e1c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -772,11 +772,8 @@ out_free_irq: void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); soft_timer_cancel(&timer->bg_timer); - soft_timer_cancel(&timer->phys_timer); - kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); } static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) From da6f16662a6eba3de0d1a39f3b8913d38754bb2b Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 29 Nov 2018 12:20:01 +0100 Subject: [PATCH 22/28] KVM: arm64: Make vcpu const in vcpu_read_sys_reg vcpu_read_sys_reg should not be modifying the VCPU structure. Eventually, to handle EL2 sysregs for nested virtualization, we will call vcpu_read_sys_reg from places that have a const vcpu pointer, which will complain about the lack of the const modifier on the read path. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/sys_regs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7a5035f9c5c3..c22e50ba3473 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -319,7 +319,7 @@ struct kvm_vcpu_arch { */ #define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)]) -u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg); +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); /* diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ecbf67ccabf5..965c6f1706d6 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -76,7 +76,7 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, return false; } -u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg) +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { if (!vcpu->arch.sysregs_loaded_on_cpu) goto immediate_read; From 599d79dcd18fa0f88ae15b7042c9d6b011a678b2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Dec 2018 10:44:22 +0000 Subject: [PATCH 23/28] arm64: KVM: Add trapped system register access tracepoint We're pretty blind when it comes to system register tracing, and rely on the ESR value displayed by kvm_handle_sys, which isn't much. Instead, let's add an actual name to the sysreg entries, so that we can finally print it as we're about to perform the access itself. The new tracepoint is conveniently called kvm_sys_access. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 ++ arch/arm64/kvm/sys_regs.h | 4 ++++ arch/arm64/kvm/trace.h | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 965c6f1706d6..a1f57fcb1bdb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1850,6 +1850,8 @@ static void perform_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { + trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); + /* * Not having an accessor means that we have configured a trap * that we don't know how to handle. This certainly qualifies diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index cd710f8b63e0..3b1bc7f01d0b 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -35,6 +35,9 @@ struct sys_reg_params { }; struct sys_reg_desc { + /* Sysreg string for debug */ + const char *name; + /* MRS/MSR instruction which accesses it. */ u8 Op0; u8 Op1; @@ -130,6 +133,7 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, #define Op2(_x) .Op2 = _x #define SYS_DESC(reg) \ + .name = #reg, \ Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h index 3b82fb1ddd09..eab91ad0effb 100644 --- a/arch/arm64/kvm/trace.h +++ b/arch/arm64/kvm/trace.h @@ -3,6 +3,7 @@ #define _TRACE_ARM64_KVM_H #include +#include "sys_regs.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -152,6 +153,40 @@ TRACE_EVENT(kvm_handle_sys_reg, TP_printk("HSR 0x%08lx", __entry->hsr) ); +TRACE_EVENT(kvm_sys_access, + TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), + TP_ARGS(vcpu_pc, params, reg), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_write) + __field(const char *, name) + __field(u8, Op0) + __field(u8, Op1) + __field(u8, CRn) + __field(u8, CRm) + __field(u8, Op2) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_write = params->is_write; + __entry->name = reg->name; + __entry->Op0 = reg->Op0; + __entry->Op0 = reg->Op0; + __entry->Op1 = reg->Op1; + __entry->CRn = reg->CRn; + __entry->CRm = reg->CRm; + __entry->Op2 = reg->Op2; + ), + + TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", + __entry->vcpu_pc, __entry->name ?: "UNKN", + __entry->Op0, __entry->Op1, __entry->CRn, + __entry->CRm, __entry->Op2, + __entry->is_write ? "write" : "read") +); + TRACE_EVENT(kvm_set_guest_debug, TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), TP_ARGS(vcpu, guest_debug), From 107352a24900fb458152b92a4e72fbdc83fd5510 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 18 Dec 2018 14:59:09 +0000 Subject: [PATCH 24/28] arm/arm64: KVM: vgic: Force VM halt when changing the active state of GICv3 PPIs/SGIs We currently only halt the guest when a vCPU messes with the active state of an SPI. This is perfectly fine for GICv2, but isn't enough for GICv3, where all vCPUs can access the state of any other vCPU. Let's broaden the condition to include any GICv3 interrupt that has an active state (i.e. all but LPIs). Cc: stable@vger.kernel.org Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 7c2231950c33..ceeda7e04a4d 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -362,14 +362,16 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, */ static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { - if (intid > VGIC_NR_PRIVATE_IRQS) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid > VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } /* See vgic_change_active_prepare */ static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) { - if (intid > VGIC_NR_PRIVATE_IRQS) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid > VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } From 6794ad5443a2118243e13879f94228524a1a2b92 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 2 Nov 2018 08:53:22 +0100 Subject: [PATCH 25/28] KVM: arm/arm64: Fix unintended stage 2 PMD mappings There are two things we need to take care of when we create block mappings in the stage 2 page tables: (1) The alignment within a PMD between the host address range and the guest IPA range must be the same, since otherwise we end up mapping pages with the wrong offset. (2) The head and tail of a memory slot may not cover a full block size, and we have to take care to not map those with block descriptors, since we could expose memory to the guest that the host did not intend to expose. So far, we have been taking care of (1), but not (2), and our commentary describing (1) was somewhat confusing. This commit attempts to factor out the checks of both into a common function, and if we don't pass the check, we won't attempt any PMD mappings for neither hugetlbfs nor THP. Note that we used to only check the alignment for THP, not for hugetlbfs, but as far as I can tell the check needs to be applied to both scenarios. Cc: Ralph Palutke Cc: Lukas Braun Reported-by: Lukas Braun Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 86 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 22 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index f605514395a1..dee3dbd98712 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1595,6 +1595,63 @@ static void kvm_send_hwpoison_signal(unsigned long address, send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); } +static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, + unsigned long hva) +{ + gpa_t gpa_start, gpa_end; + hva_t uaddr_start, uaddr_end; + size_t size; + + size = memslot->npages * PAGE_SIZE; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + gpa_end = gpa_start + size; + + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD for userspace and IPA cannot be mapped with stage-2 + * PMD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SIZE: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 PMDs, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK)) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit PMD size mappings + * for the beginning and end of a non-PMD aligned and non-PMD sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva & S2_PMD_MASK) >= uaddr_start && + (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -1621,6 +1678,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + if (!fault_supports_stage2_pmd_mappings(memslot, hva)) + force_pte = true; + + if (logging_active) + force_pte = true; + /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(¤t->mm->mmap_sem); vma = find_vma_intersection(current->mm, hva, hva + 1); @@ -1637,28 +1700,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ if ((vma_pagesize == PMD_SIZE || (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) && - !logging_active) { + !force_pte) { gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; - } else { - /* - * Fallback to PTE if it's not one of the Stage 2 - * supported hugepage sizes or the corresponding level - * doesn't exist - */ - vma_pagesize = PAGE_SIZE; - - /* - * Pages belonging to memslots that don't have the same - * alignment for userspace and IPA cannot be mapped using - * block descriptors even if the pages belong to a THP for - * the process, because the stage-2 block descriptor will - * cover more than a single THP and we loose atomicity for - * unmapping, updates, and splits of the THP or other pages - * in the stage-2 block range. - */ - if ((memslot->userspace_addr & ~PMD_MASK) != - ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK)) - force_pte = true; } up_read(¤t->mm->mmap_sem); @@ -1697,7 +1740,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * should not be mapped with huge pages (it introduces churn * and performance degradation), so force a pte mapping. */ - force_pte = true; flags |= KVM_S2_FLAG_LOGGING_ACTIVE; /* From df655b75c43fba0f2621680ab261083297fd6d16 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 13 Dec 2018 16:06:14 +0000 Subject: [PATCH 26/28] arm64: KVM: Avoid setting the upper 32 bits of VTCR_EL2 to 1 Although bit 31 of VTCR_EL2 is RES1, we inadvertently end up setting all of the upper 32 bits to 1 as well because we define VTCR_EL2_RES1 as signed, which is sign-extended when assigning to kvm->arch.vtcr. Lucky for us, the architecture currently treats these upper bits as RES0 so, whilst we've been naughty, we haven't set fire to anything yet. Cc: Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 9921bb7ab6d8..9c1a065b78ea 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -104,7 +104,7 @@ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ -#define VTCR_EL2_RES1 (1 << 31) +#define VTCR_EL2_RES1 (1U << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) #define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT From 58466766cd35754a061414c0c93225db2962948e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 19 Dec 2018 08:28:38 +0000 Subject: [PATCH 27/28] arm/arm64: KVM: Add ARM_EXCEPTION_IS_TRAP macro 32 and 64bit use different symbols to identify the traps. 32bit has a fine grained approach (prefetch abort, data abort and HVC), while 64bit is pretty happy with just "trap". This has been fine so far, except that we now need to decode some of that in tracepoints that are common to both architectures. Introduce ARM_EXCEPTION_IS_TRAP which abstracts the trap symbols and make the tracepoint use it. Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_asm.h | 4 ++++ arch/arm64/include/asm/kvm_asm.h | 1 + virt/kvm/arm/trace.h | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 231e87ad45d5..35491af87985 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -23,6 +23,10 @@ #define ARM_EXIT_WITH_ABORT_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) +#define ARM_EXCEPTION_IS_TRAP(x) \ + (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_PREF_ABORT || \ + ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_DATA_ABORT || \ + ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_HVC) #define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) #define ARM_EXCEPTION_RESET 0 diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index b2e12c99db7d..f5b79e995f40 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -25,6 +25,7 @@ #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) +#define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP) #define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT)) #define ARM_EXCEPTION_IRQ 0 diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index f21f04f8036d..3828beab93f2 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -37,7 +37,7 @@ TRACE_EVENT(kvm_exit, TP_fast_assign( __entry->ret = ARM_EXCEPTION_CODE(ret); - __entry->esr_ec = (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_TRAP) ? esr_ec : 0; + __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0; __entry->vcpu_pc = vcpu_pc; ), From 8c33df1afd86c611da8473dc6fc5f3af3dabe984 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 19 Dec 2018 08:31:54 +0000 Subject: [PATCH 28/28] arm: KVM: Add S2_PMD_{MASK,SIZE} constants They were missing, and it turns out that we do need them now. Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/stage2_pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index f9017167a8d1..c4b1d4fb1797 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -73,4 +73,7 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm) return false; } +#define S2_PMD_MASK PMD_MASK +#define S2_PMD_SIZE PMD_SIZE + #endif /* __ARM_S2_PGTABLE_H_ */