diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 6d0517ac18e5..0369f26ab96d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1122,6 +1122,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; case KVM_CAP_MIPS_FPU: /* We don't handle systems with inconsistent cpu_has_fpu */ r = !!raw_cpu_has_fpu; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 013c76a0a03e..d10df677c452 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -309,6 +309,7 @@ struct kvm_arch { #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; struct list_head rtas_tokens; + struct mutex rtas_token_lock; DECLARE_BITMAP(enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); #endif #ifdef CONFIG_KVM_MPIC @@ -325,6 +326,7 @@ struct kvm_arch { #endif struct kvmppc_ops *kvm_ops; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + struct mutex mmu_setup_lock; /* nests inside vcpu mutexes */ u64 l1_ptcr; int max_nested_lpid; struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS]; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 61a212d0daf0..ac5664845aca 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -902,6 +902,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) #ifdef CONFIG_PPC64 INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables); INIT_LIST_HEAD(&kvm->arch.rtas_tokens); + mutex_init(&kvm->arch.rtas_token_lock); #endif return kvm->arch.kvm_ops->init_vm(kvm); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index ab3d484c5e2e..51971311e6c9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -63,7 +63,7 @@ struct kvm_resize_hpt { struct work_struct work; u32 order; - /* These fields protected by kvm->lock */ + /* These fields protected by kvm->arch.mmu_setup_lock */ /* Possible values and their usage: * <0 an error occurred during allocation, @@ -73,7 +73,7 @@ struct kvm_resize_hpt { int error; /* Private to the work thread, until error != -EBUSY, - * then protected by kvm->lock. + * then protected by kvm->arch.mmu_setup_lock. */ struct kvm_hpt_info hpt; }; @@ -139,7 +139,7 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) long err = -EBUSY; struct kvm_hpt_info info; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); if (kvm->arch.mmu_ready) { kvm->arch.mmu_ready = 0; /* order mmu_ready vs. vcpus_running */ @@ -183,7 +183,7 @@ out: /* Ensure that each vcpu will flush its TLB on next entry. */ cpumask_setall(&kvm->arch.need_tlb_flush); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return err; } @@ -1447,7 +1447,7 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize) static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { - if (WARN_ON(!mutex_is_locked(&kvm->lock))) + if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock))) return; if (!resize) @@ -1474,14 +1474,14 @@ static void resize_hpt_prepare_work(struct work_struct *work) if (WARN_ON(resize->error != -EBUSY)) return; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); /* Request is still current? */ if (kvm->arch.resize_hpt == resize) { /* We may request large allocations here: - * do not sleep with kvm->lock held for a while. + * do not sleep with kvm->arch.mmu_setup_lock held for a while. */ - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", resize->order); @@ -1494,9 +1494,9 @@ static void resize_hpt_prepare_work(struct work_struct *work) if (WARN_ON(err == -EBUSY)) err = -EINPROGRESS; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); /* It is possible that kvm->arch.resize_hpt != resize - * after we grab kvm->lock again. + * after we grab kvm->arch.mmu_setup_lock again. */ } @@ -1505,7 +1505,7 @@ static void resize_hpt_prepare_work(struct work_struct *work) if (kvm->arch.resize_hpt != resize) resize_hpt_release(kvm, resize); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); } long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, @@ -1522,7 +1522,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, if (shift && ((shift < 18) || (shift > 46))) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); resize = kvm->arch.resize_hpt; @@ -1565,7 +1565,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, ret = 100; /* estimated time in ms */ out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return ret; } @@ -1588,7 +1588,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, if (shift && ((shift < 18) || (shift > 46))) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); resize = kvm->arch.resize_hpt; @@ -1625,7 +1625,7 @@ out: smp_mb(); out_no_hpt: resize_hpt_release(kvm, resize); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return ret; } @@ -1868,7 +1868,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, return -EINVAL; /* lock out vcpus from running while we're doing this */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); mmu_ready = kvm->arch.mmu_ready; if (mmu_ready) { kvm->arch.mmu_ready = 0; /* temporarily */ @@ -1876,7 +1876,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { kvm->arch.mmu_ready = 1; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return -EBUSY; } } @@ -1963,7 +1963,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, /* Order HPTE updates vs. mmu_ready */ smp_wmb(); kvm->arch.mmu_ready = mmu_ready; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); if (err) return err; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d5fc624e0655..5e840113eda4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -446,12 +446,7 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) { - struct kvm_vcpu *ret; - - mutex_lock(&kvm->lock); - ret = kvm_get_vcpu_by_id(kvm, id); - mutex_unlock(&kvm->lock); - return ret; + return kvm_get_vcpu_by_id(kvm, id); } static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) @@ -1583,7 +1578,6 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, struct kvmppc_vcore *vc = vcpu->arch.vcore; u64 mask; - mutex_lock(&kvm->lock); spin_lock(&vc->lock); /* * If ILE (interrupt little-endian) has changed, update the @@ -1623,7 +1617,6 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, mask &= 0xFFFFFFFF; vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); spin_unlock(&vc->lock); - mutex_unlock(&kvm->lock); } static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, @@ -2338,11 +2331,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, pr_devel("KVM: collision on id %u", id); vcore = NULL; } else if (!vcore) { + /* + * Take mmu_setup_lock for mutual exclusion + * with kvmppc_update_lpcr(). + */ err = -ENOMEM; vcore = kvmppc_vcore_create(kvm, id & ~(kvm->arch.smt_mode - 1)); + mutex_lock(&kvm->arch.mmu_setup_lock); kvm->arch.vcores[core] = vcore; kvm->arch.online_vcores++; + mutex_unlock(&kvm->arch.mmu_setup_lock); } } mutex_unlock(&kvm->lock); @@ -3663,6 +3662,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->in_guest = 0; mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); kvmhv_load_host_pmu(); @@ -3859,7 +3859,7 @@ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) int r = 0; struct kvm *kvm = vcpu->kvm; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); if (!kvm->arch.mmu_ready) { if (!kvm_is_radix(kvm)) r = kvmppc_hv_setup_htab_rma(vcpu); @@ -3869,7 +3869,7 @@ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) kvm->arch.mmu_ready = 1; } } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return r; } @@ -4091,16 +4091,20 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, kvmppc_check_need_tlb_flush(kvm, pcpu, nested); } - trace_hardirqs_on(); guest_enter_irqoff(); srcu_idx = srcu_read_lock(&kvm->srcu); this_cpu_disable_ftrace(); + /* Tell lockdep that we're about to enable interrupts */ + trace_hardirqs_on(); + trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr); vcpu->arch.trap = trap; + trace_hardirqs_off(); + this_cpu_enable_ftrace(); srcu_read_unlock(&kvm->srcu, srcu_idx); @@ -4110,7 +4114,6 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, isync(); } - trace_hardirqs_off(); set_irq_happened(trap); kvmppc_set_host_core(pcpu); @@ -4478,7 +4481,8 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, /* * Update LPCR values in kvm->arch and in vcores. - * Caller must hold kvm->lock. + * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion + * of kvm->arch.lpcr update). */ void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) { @@ -4530,7 +4534,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm) /* * Set up HPT (hashed page table) and RMA (real-mode area). - * Must be called with kvm->lock held. + * Must be called with kvm->arch.mmu_setup_lock held. */ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { @@ -4618,7 +4622,10 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } -/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +/* + * Must be called with kvm->arch.mmu_setup_lock held and + * mmu_ready = 0 and no vcpus running. + */ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) { if (nesting_enabled(kvm)) @@ -4635,7 +4642,10 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) return 0; } -/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +/* + * Must be called with kvm->arch.mmu_setup_lock held and + * mmu_ready = 0 and no vcpus running. + */ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) { int err; @@ -4740,6 +4750,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) char buf[32]; int ret; + mutex_init(&kvm->arch.mmu_setup_lock); + /* Allocate the guest's logical partition ID */ lpid = kvmppc_alloc_lpid(); @@ -5265,7 +5277,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (kvmhv_on_pseries() && !radix) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); if (radix != kvm_is_radix(kvm)) { if (kvm->arch.mmu_ready) { kvm->arch.mmu_ready = 0; @@ -5293,7 +5305,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) err = 0; out_unlock: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return err; } diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 4e178c4c1ea5..b7ae3dfbf00e 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -146,7 +146,7 @@ static int rtas_token_undefine(struct kvm *kvm, char *name) { struct rtas_token_definition *d, *tmp; - lockdep_assert_held(&kvm->lock); + lockdep_assert_held(&kvm->arch.rtas_token_lock); list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { if (rtas_name_matches(d->handler->name, name)) { @@ -167,7 +167,7 @@ static int rtas_token_define(struct kvm *kvm, char *name, u64 token) bool found; int i; - lockdep_assert_held(&kvm->lock); + lockdep_assert_held(&kvm->arch.rtas_token_lock); list_for_each_entry(d, &kvm->arch.rtas_tokens, list) { if (d->token == token) @@ -206,14 +206,14 @@ int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp) if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.rtas_token_lock); if (args.token) rc = rtas_token_define(kvm, args.name, args.token); else rc = rtas_token_undefine(kvm, args.name); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.rtas_token_lock); return rc; } @@ -245,7 +245,7 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) orig_rets = args.rets; args.rets = &args.args[be32_to_cpu(args.nargs)]; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.rtas_token_lock); rc = -ENOENT; list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) { @@ -256,7 +256,7 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) } } - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.rtas_token_lock); if (rc == 0) { args.rets = orig_rets; @@ -282,8 +282,6 @@ void kvmppc_rtas_tokens_free(struct kvm *kvm) { struct rtas_token_definition *d, *tmp; - lockdep_assert_held(&kvm->lock); - list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { list_del(&d->list); kfree(d); diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 4953957333b7..922fd62bcd2a 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -271,14 +271,14 @@ static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio) return rc; } -/* Called with kvm_lock held */ +/* Called with xive->lock held */ static int xive_check_provisioning(struct kvm *kvm, u8 prio) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvm_vcpu *vcpu; int i, rc; - lockdep_assert_held(&kvm->lock); + lockdep_assert_held(&xive->lock); /* Already provisioned ? */ if (xive->qmap & (1 << prio)) @@ -621,9 +621,12 @@ int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, irq, server, priority); /* First, check provisioning of queues */ - if (priority != MASKED) + if (priority != MASKED) { + mutex_lock(&xive->lock); rc = xive_check_provisioning(xive->kvm, xive_prio_from_guest(priority)); + mutex_unlock(&xive->lock); + } if (rc) { pr_devel(" provisioning failure %d !\n", rc); return rc; @@ -1199,7 +1202,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, return -ENOMEM; /* We need to synchronize with queue provisioning */ - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&xive->lock); vcpu->arch.xive_vcpu = xc; xc->xive = xive; xc->vcpu = vcpu; @@ -1283,7 +1286,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00); bail: - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&xive->lock); if (r) { kvmppc_xive_cleanup_vcpu(vcpu); return r; @@ -1527,13 +1530,12 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr) struct kvmppc_xive_src_block *kvmppc_xive_create_src_block( struct kvmppc_xive *xive, int irq) { - struct kvm *kvm = xive->kvm; struct kvmppc_xive_src_block *sb; int i, bid; bid = irq >> KVMPPC_XICS_ICS_SHIFT; - mutex_lock(&kvm->lock); + mutex_lock(&xive->lock); /* block already exists - somebody else got here first */ if (xive->src_blocks[bid]) @@ -1560,7 +1562,7 @@ struct kvmppc_xive_src_block *kvmppc_xive_create_src_block( xive->max_sbid = bid; out: - mutex_unlock(&kvm->lock); + mutex_unlock(&xive->lock); return xive->src_blocks[bid]; } @@ -1670,9 +1672,9 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) /* If we have a priority target the interrupt */ if (act_prio != MASKED) { /* First, check provisioning of queues */ - mutex_lock(&xive->kvm->lock); + mutex_lock(&xive->lock); rc = xive_check_provisioning(xive->kvm, act_prio); - mutex_unlock(&xive->kvm->lock); + mutex_unlock(&xive->lock); /* Target interrupt */ if (rc == 0) @@ -1826,7 +1828,6 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd) { xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); xive_native_configure_irq(hw_num, 0, MASKED, 0); - xive_cleanup_irq_data(xd); } void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) @@ -1840,9 +1841,10 @@ void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) continue; kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data); + xive_cleanup_irq_data(&state->ipi_data); xive_native_free_irq(state->ipi_number); - /* Pass-through, cleanup too */ + /* Pass-through, cleanup too but keep IRQ hw data */ if (state->pt_number) kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data); @@ -1859,21 +1861,10 @@ static void kvmppc_xive_release(struct kvm_device *dev) struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; int i; - int was_ready; pr_devel("Releasing xive device\n"); - debugfs_remove(xive->dentry); - /* - * Clearing mmu_ready temporarily while holding kvm->lock - * is a way of ensuring that no vcpus can enter the guest - * until we drop kvm->lock. Doing kick_all_cpus_sync() - * ensures that any vcpu executing inside the guest has - * exited the guest. Once kick_all_cpus_sync() has finished, - * we know that no vcpu can be executing the XIVE push or - * pull code, or executing a XICS hcall. - * * Since this is the device release function, we know that * userspace does not have any open fd referring to the * device. Therefore there can not be any of the device @@ -1881,9 +1872,8 @@ static void kvmppc_xive_release(struct kvm_device *dev) * and similarly, the connect_vcpu and set/clr_mapped * functions also cannot be being executed. */ - was_ready = kvm->arch.mmu_ready; - kvm->arch.mmu_ready = 0; - kick_all_cpus_sync(); + + debugfs_remove(xive->dentry); /* * We should clean up the vCPU interrupt presenters first. @@ -1892,12 +1882,22 @@ static void kvmppc_xive_release(struct kvm_device *dev) /* * Take vcpu->mutex to ensure that no one_reg get/set ioctl * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently. + * Holding the vcpu->mutex also means that the vcpu cannot + * be executing the KVM_RUN ioctl, and therefore it cannot + * be executing the XIVE push or pull code or accessing + * the XIVE MMIO regions. */ mutex_lock(&vcpu->mutex); kvmppc_xive_cleanup_vcpu(vcpu); mutex_unlock(&vcpu->mutex); } + /* + * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type + * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe + * against xive code getting called during vcpu execution or + * set/get one_reg operations. + */ kvm->arch.xive = NULL; /* Mask and free interrupts */ @@ -1911,8 +1911,6 @@ static void kvmppc_xive_release(struct kvm_device *dev) if (xive->vp_base != XIVE_INVALID_VP) xive_native_free_vp_block(xive->vp_base); - kvm->arch.mmu_ready = was_ready; - /* * A reference of the kvmppc_xive pointer is now kept under * the xive_devices struct of the machine for reuse. It is @@ -1967,6 +1965,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) dev->private = xive; xive->dev = dev; xive->kvm = kvm; + mutex_init(&xive->lock); /* Already there ? */ if (kvm->arch.xive) diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 426146332984..862c2c9650ae 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -141,6 +141,7 @@ struct kvmppc_xive { struct kvmppc_xive_ops *ops; struct address_space *mapping; struct mutex mapping_lock; + struct mutex lock; }; #define KVMPPC_XIVE_Q_COUNT 8 diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 6a8e698c4b6e..5596c8ec221a 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -109,12 +109,12 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, return -EPERM; if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; - if (server_num >= KVM_MAX_VCPUS) { + if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { pr_devel("Out of bounds !\n"); return -EINVAL; } - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&xive->lock); if (kvmppc_xive_find_server(vcpu->kvm, server_num)) { pr_devel("Duplicate !\n"); @@ -159,7 +159,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, /* TODO: reset all queues to a clean state ? */ bail: - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&xive->lock); if (rc) kvmppc_xive_native_cleanup_vcpu(vcpu); @@ -172,6 +172,7 @@ bail: static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq) { struct kvmppc_xive *xive = kvm->arch.xive; + pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2; if (irq >= KVMPPC_XIVE_NR_IRQS) return -EINVAL; @@ -185,7 +186,7 @@ static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq) mutex_lock(&xive->mapping_lock); if (xive->mapping) unmap_mapping_range(xive->mapping, - irq * (2ull << PAGE_SHIFT), + esb_pgoff << PAGE_SHIFT, 2ull << PAGE_SHIFT, 1); mutex_unlock(&xive->mapping_lock); return 0; @@ -535,6 +536,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, struct xive_q *q; gfn_t gfn; unsigned long page_size; + int srcu_idx; /* * Demangle priority/server tuple from the EQ identifier @@ -565,24 +567,6 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, __func__, server, priority, kvm_eq.flags, kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex); - /* - * sPAPR specifies a "Unconditional Notify (n) flag" for the - * H_INT_SET_QUEUE_CONFIG hcall which forces notification - * without using the coalescing mechanisms provided by the - * XIVE END ESBs. This is required on KVM as notification - * using the END ESBs is not supported. - */ - if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) { - pr_err("invalid flags %d\n", kvm_eq.flags); - return -EINVAL; - } - - rc = xive_native_validate_queue_size(kvm_eq.qshift); - if (rc) { - pr_err("invalid queue size %d\n", kvm_eq.qshift); - return rc; - } - /* reset queue and disable queueing */ if (!kvm_eq.qshift) { q->guest_qaddr = 0; @@ -604,26 +588,48 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, return 0; } + /* + * sPAPR specifies a "Unconditional Notify (n) flag" for the + * H_INT_SET_QUEUE_CONFIG hcall which forces notification + * without using the coalescing mechanisms provided by the + * XIVE END ESBs. This is required on KVM as notification + * using the END ESBs is not supported. + */ + if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) { + pr_err("invalid flags %d\n", kvm_eq.flags); + return -EINVAL; + } + + rc = xive_native_validate_queue_size(kvm_eq.qshift); + if (rc) { + pr_err("invalid queue size %d\n", kvm_eq.qshift); + return rc; + } + if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) { pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr, 1ull << kvm_eq.qshift); return -EINVAL; } + srcu_idx = srcu_read_lock(&kvm->srcu); gfn = gpa_to_gfn(kvm_eq.qaddr); page = gfn_to_page(kvm, gfn); if (is_error_page(page)) { + srcu_read_unlock(&kvm->srcu, srcu_idx); pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr); return -EINVAL; } page_size = kvm_host_page_size(kvm, gfn); if (1ull << kvm_eq.qshift > page_size) { + srcu_read_unlock(&kvm->srcu, srcu_idx); pr_warn("Incompatible host page size %lx!\n", page_size); return -EINVAL; } qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK); + srcu_read_unlock(&kvm->srcu, srcu_idx); /* * Backup the queue page guest address to the mark EQ page @@ -772,7 +778,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive) pr_devel("%s\n", __func__); - mutex_lock(&kvm->lock); + mutex_lock(&xive->lock); kvm_for_each_vcpu(i, vcpu, kvm) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -810,7 +816,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive) } } - mutex_unlock(&kvm->lock); + mutex_unlock(&xive->lock); return 0; } @@ -854,6 +860,7 @@ static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; unsigned int prio; + int srcu_idx; if (!xc) return -ENOENT; @@ -865,7 +872,9 @@ static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu) continue; /* Mark EQ page dirty for migration */ + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr)); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); } return 0; } @@ -878,7 +887,7 @@ static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive) pr_devel("%s\n", __func__); - mutex_lock(&kvm->lock); + mutex_lock(&xive->lock); for (i = 0; i <= xive->max_sbid; i++) { struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; @@ -892,7 +901,7 @@ static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive) kvm_for_each_vcpu(i, vcpu, kvm) { kvmppc_xive_native_vcpu_eq_sync(vcpu); } - mutex_unlock(&kvm->lock); + mutex_unlock(&xive->lock); return 0; } @@ -965,7 +974,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev, } /* - * Called when device fd is closed + * Called when device fd is closed. kvm->lock is held. */ static void kvmppc_xive_native_release(struct kvm_device *dev) { @@ -973,21 +982,18 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; int i; - int was_ready; - - debugfs_remove(xive->dentry); pr_devel("Releasing xive native device\n"); /* - * Clearing mmu_ready temporarily while holding kvm->lock - * is a way of ensuring that no vcpus can enter the guest - * until we drop kvm->lock. Doing kick_all_cpus_sync() - * ensures that any vcpu executing inside the guest has - * exited the guest. Once kick_all_cpus_sync() has finished, - * we know that no vcpu can be executing the XIVE push or - * pull code or accessing the XIVE MMIO regions. - * + * Clear the KVM device file address_space which is used to + * unmap the ESB pages when a device is passed-through. + */ + mutex_lock(&xive->mapping_lock); + xive->mapping = NULL; + mutex_unlock(&xive->mapping_lock); + + /* * Since this is the device release function, we know that * userspace does not have any open fd or mmap referring to * the device. Therefore there can not be any of the @@ -996,9 +1002,8 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) * connect_vcpu and set/clr_mapped functions also cannot * be being executed. */ - was_ready = kvm->arch.mmu_ready; - kvm->arch.mmu_ready = 0; - kick_all_cpus_sync(); + + debugfs_remove(xive->dentry); /* * We should clean up the vCPU interrupt presenters first. @@ -1007,12 +1012,22 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) /* * Take vcpu->mutex to ensure that no one_reg get/set ioctl * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done. + * Holding the vcpu->mutex also means that the vcpu cannot + * be executing the KVM_RUN ioctl, and therefore it cannot + * be executing the XIVE push or pull code or accessing + * the XIVE MMIO regions. */ mutex_lock(&vcpu->mutex); kvmppc_xive_native_cleanup_vcpu(vcpu); mutex_unlock(&vcpu->mutex); } + /* + * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type + * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe + * against xive code getting called during vcpu execution or + * set/get one_reg operations. + */ kvm->arch.xive = NULL; for (i = 0; i <= xive->max_sbid; i++) { @@ -1025,8 +1040,6 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) if (xive->vp_base != XIVE_INVALID_VP) xive_native_free_vp_block(xive->vp_base); - kvm->arch.mmu_ready = was_ready; - /* * A reference of the kvmppc_xive pointer is now kept under * the xive_devices struct of the machine for reuse. It is @@ -1060,6 +1073,7 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) xive->kvm = kvm; kvm->arch.xive = xive; mutex_init(&xive->mapping_lock); + mutex_init(&xive->lock); /* * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3393b166817a..aa3a678711be 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -657,6 +657,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_PPC_GET_SMMU_INFO: r = 1; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e5e8eb29e68e..28ebd647784c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -539,6 +539,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: r = KVM_S390_BSCA_CPU_SLOTS; if (!kvm_s390_use_sca_entries()) r = KVM_MAX_VCPUS; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index acb179f78fdc..83aefd759846 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3122,6 +3122,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 90cedebaeb94..7eeebe5e9da2 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -224,6 +224,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; case KVM_CAP_MSI_DEVID: if (!kvm) r = -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 134ec0283a8a..ca54b09adf5b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1795,8 +1795,10 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, if (map->page) kunmap(map->page); +#ifdef CONFIG_HAS_IOMEM else memunmap(map->hva); +#endif if (dirty) { kvm_vcpu_mark_page_dirty(vcpu, map->gfn); @@ -3149,8 +3151,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_MULTI_ADDRESS_SPACE: return KVM_ADDRESS_SPACE_NUM; #endif - case KVM_CAP_MAX_VCPU_ID: - return KVM_MAX_VCPU_ID; case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; default: