diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index f2491a8c68b4..e5dd9f4d6100 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -4,7 +4,17 @@ KVM Lock Overview 1. Acquisition Orders --------------------- -(to be written) +The acquisition orders for mutexes are as follows: + +- kvm->lock is taken outside vcpu->mutex + +- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock + +- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring + them together is quite rare. + +For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything +else is a leaf: no other lock is taken inside the critical sections. 2: Exception ------------ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 07f58cfc1ab9..bebec370324f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -293,7 +293,10 @@ struct kvm_vcpu_arch { /* Host KSEG0 address of the EI/DI offset */ void *kseg0_commpage; - u32 io_gpr; /* GPR used as IO source/target */ + /* Resume PC after MMIO completion */ + unsigned long io_pc; + /* GPR used as IO source/target */ + u32 io_gpr; struct hrtimer comparecount_timer; /* Count timer control KVM register */ @@ -315,8 +318,6 @@ struct kvm_vcpu_arch { /* Bitmask of pending exceptions to be cleared */ unsigned long pending_exceptions_clr; - u32 pending_load_cause; - /* Save/Restore the entryhi register when are are preempted/scheduled back in */ unsigned long preempt_entryhi; diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 8770f32c9e0b..aa0937423e28 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -790,15 +790,15 @@ enum emulation_result kvm_mips_emul_eret(struct kvm_vcpu *vcpu) struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; - if (kvm_read_c0_guest_status(cop0) & ST0_EXL) { + if (kvm_read_c0_guest_status(cop0) & ST0_ERL) { + kvm_clear_c0_guest_status(cop0, ST0_ERL); + vcpu->arch.pc = kvm_read_c0_guest_errorepc(cop0); + } else if (kvm_read_c0_guest_status(cop0) & ST0_EXL) { kvm_debug("[%#lx] ERET to %#lx\n", vcpu->arch.pc, kvm_read_c0_guest_epc(cop0)); kvm_clear_c0_guest_status(cop0, ST0_EXL); vcpu->arch.pc = kvm_read_c0_guest_epc(cop0); - } else if (kvm_read_c0_guest_status(cop0) & ST0_ERL) { - kvm_clear_c0_guest_status(cop0, ST0_ERL); - vcpu->arch.pc = kvm_read_c0_guest_errorepc(cop0); } else { kvm_err("[%#lx] ERET when MIPS_SR_EXL|MIPS_SR_ERL == 0\n", vcpu->arch.pc); @@ -1528,13 +1528,25 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DO_MMIO; + unsigned long curr_pc; u32 op, rt; u32 bytes; rt = inst.i_format.rt; op = inst.i_format.opcode; - vcpu->arch.pending_load_cause = cause; + /* + * Find the resume PC now while we have safe and easy access to the + * prior branch instruction, and save it for + * kvm_mips_complete_mmio_load() to restore later. + */ + curr_pc = vcpu->arch.pc; + er = update_pc(vcpu, cause); + if (er == EMULATE_FAIL) + return er; + vcpu->arch.io_pc = vcpu->arch.pc; + vcpu->arch.pc = curr_pc; + vcpu->arch.io_gpr = rt; switch (op) { @@ -2494,9 +2506,8 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, goto done; } - er = update_pc(vcpu, vcpu->arch.pending_load_cause); - if (er == EMULATE_FAIL) - return er; + /* Restore saved resume PC */ + vcpu->arch.pc = vcpu->arch.io_pc; switch (run->mmio.len) { case 4: @@ -2518,11 +2529,6 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, break; } - if (vcpu->arch.pending_load_cause & CAUSEF_BD) - kvm_debug("[%#lx] Completing %d byte BD Load to gpr %d (0x%08lx) type %d\n", - vcpu->arch.pc, run->mmio.len, vcpu->arch.io_gpr, *gpr, - vcpu->mmio_needed); - done: return er; } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 622037d851a3..06a60b19acfb 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -426,7 +426,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - int cpu = smp_processor_id(); + int i, cpu = smp_processor_id(); unsigned int gasid; /* @@ -442,6 +442,9 @@ static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) vcpu); vcpu->arch.guest_user_asid[cpu] = vcpu->arch.guest_user_mm.context.asid[cpu]; + for_each_possible_cpu(i) + if (i != cpu) + vcpu->arch.guest_user_asid[cpu] = 0; vcpu->arch.last_user_gasid = gasid; } } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 03883ba806e2..3b677c851be0 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -260,13 +260,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - u32 gasid = kvm_read_c0_guest_entryhi(vcpu->arch.cop0) & - KVM_ENTRYHI_ASID; - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); vcpu->arch.guest_user_asid[cpu] = vcpu->arch.guest_user_mm.context.asid[cpu]; - vcpu->arch.last_user_gasid = gasid; newasid++; kvm_debug("[%d]: cpu_context: %#lx\n", cpu, diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c index bd98b7d25200..05c98bb853cf 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kvm/sthyi.c @@ -315,7 +315,7 @@ static void fill_diag(struct sthyi_sctns *sctns) if (r < 0) goto out; - diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA); + diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); if (!diag224_buf || diag224(diag224_buf)) goto out; @@ -378,7 +378,7 @@ static void fill_diag(struct sthyi_sctns *sctns) sctns->par.infpval1 |= PAR_WGHT_VLD; out: - kfree(diag224_buf); + free_page((unsigned long)diag224_buf); vfree(diag204_buf); } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4b20f7304b9c..bdde80731f49 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -948,7 +948,6 @@ struct kvm_x86_ops { int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); bool (*invpcid_supported)(void); - void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -958,8 +957,6 @@ struct kvm_x86_ops { void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); int (*check_intercept)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index c2b8d24a235c..d74747b031ec 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -29,9 +29,20 @@ struct kvm_page_track_notifier_node { * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. + * @node: this node */ void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); + int bytes, struct kvm_page_track_notifier_node *node); + /* + * It is called when memory slot is being moved or removed + * users can drop write-protection for the pages in that memory slot + * + * @kvm: the kvm where memory slot being moved or removed + * @slot: the memory slot being moved or removed + * @node: this node + */ + void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node); }; void kvm_page_track_init(struct kvm *kvm); @@ -58,4 +69,5 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n); void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4e95d3eb2955..cbd7b92585bb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5045,7 +5045,7 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); - if (ctxt->rip_relative) + if (ctxt->rip_relative && likely(ctxt->memopp)) ctxt->memopp->addr.mem.ea = address_mask(ctxt, ctxt->memopp->addr.mem.ea + ctxt->_eip); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e986b4e4..87c5880ba3b7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4405,7 +4405,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) } static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -4617,11 +4618,19 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node) +{ + kvm_mmu_invalidate_zap_all_pages(kvm); +} + void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; node->track_write = kvm_mmu_pte_write; + node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); } diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index b431539c3714..4a1c13eaa518 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -106,6 +106,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); /* * remove the guest page from the tracking pool which stops the interception @@ -135,6 +136,7 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. @@ -181,6 +183,7 @@ kvm_page_track_register_notifier(struct kvm *kvm, hlist_add_head_rcu(&n->node, &head->track_notifier_list); spin_unlock(&kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); /* * stop receiving the event interception. It is the opposed operation of @@ -199,6 +202,7 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } +EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); /* * Notify the node that write access is intercepted and write emulation is @@ -222,6 +226,31 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) - n->track_write(vcpu, gpa, new, bytes); + n->track_write(vcpu, gpa, new, bytes, n); + srcu_read_unlock(&head->track_srcu, idx); +} + +/* + * Notify the node that memory slot is being removed or moved so that it can + * drop write-protection for the pages in the memory slot. + * + * The node should figure out it has any write-protected pages in this slot + * by itself. + */ +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_page_track_notifier_head *head; + struct kvm_page_track_notifier_node *n; + int idx; + + head = &kvm->arch.track_notifier_head; + + if (hlist_empty(&head->track_notifier_list)) + return; + + idx = srcu_read_lock(&head->track_srcu); + hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + if (n->track_flush_slot) + n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f8157a36ab09..8ca1eca5038d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1138,21 +1138,6 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.tsc_offset += adjustment; - if (is_guest_mode(vcpu)) - svm->nested.hsave->control.tsc_offset += adjustment; - else - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset - adjustment, - svm->vmcb->control.tsc_offset); - - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); -} - static void avic_init_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb; @@ -3449,12 +3434,6 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } -static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) -{ - struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); - return vmcb->control.tsc_offset + host_tsc; -} - static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5422,8 +5401,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .has_wbinvd_exit = svm_has_wbinvd_exit, .write_tsc_offset = svm_write_tsc_offset, - .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, - .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cf1b16dbc98a..5382b82462fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -187,6 +187,7 @@ struct vmcs { */ struct loaded_vmcs { struct vmcs *vmcs; + struct vmcs *shadow_vmcs; int cpu; int launched; struct list_head loaded_vmcss_on_cpu_link; @@ -411,7 +412,6 @@ struct nested_vmx { * memory during VMXOFF, VMCLEAR, VMPTRLD. */ struct vmcs12 *cached_vmcs12; - struct vmcs *current_shadow_vmcs; /* * Indicates if the shadow vmcs must be updated with the * data hold by vmcs12 @@ -421,7 +421,6 @@ struct nested_vmx { /* vmcs02_list cache of VMCSs recently used to run L2 guests */ struct list_head vmcs02_pool; int vmcs02_num; - u64 vmcs01_tsc_offset; bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; @@ -1419,6 +1418,8 @@ static void vmcs_clear(struct vmcs *vmcs) static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) { vmcs_clear(loaded_vmcs->vmcs); + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) + vmcs_clear(loaded_vmcs->shadow_vmcs); loaded_vmcs->cpu = -1; loaded_vmcs->launched = 0; } @@ -2604,20 +2605,6 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu) return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; } -/* - * Like guest_read_tsc, but always returns L1's notion of the timestamp - * counter, even if a nested guest (L2) is currently running. - */ -static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) -{ - u64 tsc_offset; - - tsc_offset = is_guest_mode(vcpu) ? - to_vmx(vcpu)->nested.vmcs01_tsc_offset : - vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; -} - /* * writes 'offset' into guest's timestamp counter offset register */ @@ -2631,7 +2618,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * to the newly set TSC to get L2's TSC. */ struct vmcs12 *vmcs12; - to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; /* recalculate vmcs02.TSC_OFFSET: */ vmcs12 = get_vmcs12(vcpu); vmcs_write64(TSC_OFFSET, offset + @@ -2644,19 +2630,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) -{ - u64 offset = vmcs_read64(TSC_OFFSET); - - vmcs_write64(TSC_OFFSET, offset + adjustment); - if (is_guest_mode(vcpu)) { - /* Even when running L2, the adjustment needs to apply to L1 */ - to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; - } else - trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, - offset + adjustment); -} - static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); @@ -3562,6 +3535,7 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs_clear(loaded_vmcs); free_vmcs(loaded_vmcs->vmcs); loaded_vmcs->vmcs = NULL; + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } static void free_kvm_area(void) @@ -6696,6 +6670,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) if (!item) return NULL; item->vmcs02.vmcs = alloc_vmcs(); + item->vmcs02.shadow_vmcs = NULL; if (!item->vmcs02.vmcs) { kfree(item); return NULL; @@ -7072,7 +7047,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ vmcs_clear(shadow_vmcs); - vmx->nested.current_shadow_vmcs = shadow_vmcs; + vmx->vmcs01.shadow_vmcs = shadow_vmcs; } INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); @@ -7174,8 +7149,11 @@ static void free_nested(struct vcpu_vmx *vmx) free_page((unsigned long)vmx->nested.msr_bitmap); vmx->nested.msr_bitmap = NULL; } - if (enable_shadow_vmcs) - free_vmcs(vmx->nested.current_shadow_vmcs); + if (enable_shadow_vmcs) { + vmcs_clear(vmx->vmcs01.shadow_vmcs); + free_vmcs(vmx->vmcs01.shadow_vmcs); + vmx->vmcs01.shadow_vmcs = NULL; + } kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { @@ -7352,7 +7330,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) int i; unsigned long field; u64 field_value; - struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; const unsigned long *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; @@ -7401,7 +7379,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) int i, q; unsigned long field; u64 field_value = 0; - struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; vmcs_load(shadow_vmcs); @@ -7591,7 +7569,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, - __pa(vmx->nested.current_shadow_vmcs)); + __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; } } @@ -7659,7 +7637,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; - if (!(types & (1UL << type))) { + if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); skip_emulated_instruction(vcpu); @@ -7722,7 +7700,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; - if (!(types & (1UL << type))) { + if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); skip_emulated_instruction(vcpu); @@ -9156,6 +9134,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs = &vmx->vmcs01; vmx->loaded_vmcs->vmcs = alloc_vmcs(); + vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) goto free_msrs; if (!vmm_exclusive) @@ -10061,9 +10040,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, - vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + vcpu->arch.tsc_offset + vmcs12->tsc_offset); else - vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -10293,8 +10272,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enter_guest_mode(vcpu); - vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); @@ -10818,7 +10795,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); @@ -11339,8 +11316,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .write_tsc_offset = vmx_write_tsc_offset, - .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, - .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e375235d81c9..7e30c720d0c5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1409,7 +1409,7 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc)); + return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); @@ -1547,7 +1547,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); + kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment); } static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) @@ -1555,7 +1555,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); - kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); + adjust_tsc_offset_guest(vcpu, adjustment); } #ifdef CONFIG_X86_64 @@ -2262,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) @@ -2280,11 +2280,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } else { - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); break; } @@ -7410,10 +7410,12 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { + void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; + kvmclock_reset(vcpu); - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kvm_x86_ops->vcpu_free(vcpu); + free_cpumask_var(wbinvd_dirty_mask); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, @@ -8153,7 +8155,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_page_track_flush_slot(kvm, slot); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f397e9b20370..a29786dd9522 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -42,6 +42,7 @@ #ifdef CONFIG_HAVE_KVM_IRQFD +static struct workqueue_struct *irqfd_cleanup_wq; static void irqfd_inject(struct work_struct *work) @@ -167,7 +168,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) list_del_init(&irqfd->list); - schedule_work(&irqfd->shutdown); + queue_work(irqfd_cleanup_wq, &irqfd->shutdown); } int __attribute__((weak)) kvm_arch_set_irq_inatomic( @@ -554,7 +555,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) * so that we guarantee there will not be any more interrupts on this * gsi once this deassign function returns. */ - flush_work(&irqfd->shutdown); + flush_workqueue(irqfd_cleanup_wq); return 0; } @@ -591,7 +592,7 @@ kvm_irqfd_release(struct kvm *kvm) * Block until we know all outstanding shutdown jobs have completed * since we do not take a kvm* reference. */ - flush_work(&irqfd->shutdown); + flush_workqueue(irqfd_cleanup_wq); } @@ -621,8 +622,23 @@ void kvm_irq_routing_update(struct kvm *kvm) spin_unlock_irq(&kvm->irqfds.lock); } +/* + * create a host-wide workqueue for issuing deferred shutdown requests + * aggregated from all vm* instances. We need our own isolated + * queue to ease flushing work items when a VM exits. + */ +int kvm_irqfd_init(void) +{ + irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + void kvm_irqfd_exit(void) { + destroy_workqueue(irqfd_cleanup_wq); } #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28510e72618a..d92c3d5b0fbe 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3846,7 +3846,12 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, * kvm_arch_init makes sure there's at most one caller * for architectures that support multiple implementations, * like intel and amd on x86. + * kvm_arch_init must be called before kvm_irqfd_init to avoid creating + * conflicts in case kvm is already setup for another implementation. */ + r = kvm_irqfd_init(); + if (r) + goto out_irqfd; if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; @@ -3928,6 +3933,7 @@ out_free_0a: free_cpumask_var(cpus_hardware_enabled); out_free_0: kvm_irqfd_exit(); +out_irqfd: kvm_arch_exit(); out_fail: return r;