From 8d43d57036679a8635952c9ef54989a7b48e8c00 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Mon, 10 Dec 2018 11:15:16 +0100 Subject: [PATCH 001/147] KVM: s390: clarify kvm related kernel message As suggested by our ID dept. here are some kernel message updates. Signed-off-by: Michael Mueller Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7f4bc58a53b9..11b4be3dad15 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -432,7 +432,7 @@ int kvm_arch_init(void *opaque) /* Register floating interrupt controller interface. */ rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { - pr_err("Failed to register FLIC rc=%d\n", rc); + pr_err("A FLIC registration call failed with rc=%d\n", rc); goto out_debug_unreg; } return 0; @@ -4293,12 +4293,12 @@ static int __init kvm_s390_init(void) int i; if (!sclp.has_sief2) { - pr_info("SIE not available\n"); + pr_info("SIE is not available\n"); return -ENODEV; } if (nested && hpage) { - pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently"); + pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n"); return -EINVAL; } From b7d455712927cc8042f97fffba5d51052856ddaf Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:32 +0100 Subject: [PATCH 002/147] KVM: s390: drop obsolete else path The explicit else path specified in set_intercept_indicators_io is not required as the function returns in case the first branch is taken anyway. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-2-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index fcb55b02990e..19d556512452 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -345,7 +345,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) { if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_IO_MASK)) return; - else if (psw_ioint_disabled(vcpu)) + if (psw_ioint_disabled(vcpu)) kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT); else vcpu->arch.sie_block->lctl |= LCTL_CR6; From 689bdf9e9c337121d948d48605de0659c088e6bb Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:33 +0100 Subject: [PATCH 003/147] KVM: s390: make bitmap declaration consistent Use a consistent bitmap declaration throughout the code. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-3-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 2 +- arch/s390/kvm/interrupt.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d5d24889c3bc..3cba08f73dc6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -591,7 +591,7 @@ struct kvm_s390_float_interrupt { struct kvm_s390_mchk_info mchk; struct kvm_s390_ext_info srv_signal; int next_rr_cpu; - unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct mutex ais_lock; u8 simm; u8 nimm; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 19d556512452..167a3068ef84 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2831,7 +2831,7 @@ static void store_local_irq(struct kvm_s390_local_interrupt *li, int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) { int scn; - unsigned long sigp_emerg_pending[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; unsigned long pending_irqs; struct kvm_s390_irq irq; From 246b72183b3538907d5c1e6d383c6956385d068d Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:34 +0100 Subject: [PATCH 004/147] KVM: s390: move bitmap idle_mask into arch struct top level The vcpu idle_mask state is used by but not specific to the emulated floating interruptions. The state is relevant to gisa related interruptions as well. Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Cornelia Huck Acked-by: Halil Pasic Message-Id: <20190131085247.13826-4-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 2 +- arch/s390/kvm/interrupt.c | 11 +++++------ arch/s390/kvm/kvm-s390.h | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3cba08f73dc6..c259a67c4298 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -591,7 +591,6 @@ struct kvm_s390_float_interrupt { struct kvm_s390_mchk_info mchk; struct kvm_s390_ext_info srv_signal; int next_rr_cpu; - DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct mutex ais_lock; u8 simm; u8 nimm; @@ -838,6 +837,7 @@ struct kvm_arch{ /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); struct kvm_s390_gisa *gisa; + DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 167a3068ef84..2a3eb9f076c3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -318,13 +318,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -1726,7 +1726,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) */ static void __floating_irq_kick(struct kvm *kvm, u64 type) { - struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; struct kvm_vcpu *dst_vcpu; int sigcpu, online_vcpus, nr_tries = 0; @@ -1735,11 +1734,11 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) return; /* find idle VCPUs first, then round robin */ - sigcpu = find_first_bit(fi->idle_mask, online_vcpus); + sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus); if (sigcpu == online_vcpus) { do { - sigcpu = fi->next_rr_cpu; - fi->next_rr_cpu = (fi->next_rr_cpu + 1) % online_vcpus; + sigcpu = kvm->arch.float_int.next_rr_cpu++; + kvm->arch.float_int.next_rr_cpu %= online_vcpus; /* avoid endless loops if all vcpus are stopped */ if (nr_tries++ >= online_vcpus) return; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 1f6e36cdce0d..72ef87799593 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -67,7 +67,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) From 672128bfee082d37e7eda630f98c46aacda4963c Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:35 +0100 Subject: [PATCH 005/147] KVM: s390: coding style kvm_s390_gisa_init/clear() The change helps to reduce line length and increases code readability. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-5-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2a3eb9f076c3..005dbe7252e7 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2885,20 +2885,20 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) void kvm_s390_gisa_clear(struct kvm *kvm) { - if (kvm->arch.gisa) { - memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); - kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa; - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); - } + if (!kvm->arch.gisa) + return; + memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); + kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa; + VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); } void kvm_s390_gisa_init(struct kvm *kvm) { - if (css_general_characteristics.aiv) { - kvm->arch.gisa = &kvm->arch.sie_page2->gisa; - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); - kvm_s390_gisa_clear(kvm); - } + if (!css_general_characteristics.aiv) + return; + kvm->arch.gisa = &kvm->arch.sie_page2->gisa; + kvm_s390_gisa_clear(kvm); + VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); } void kvm_s390_gisa_destroy(struct kvm *kvm) From 96723d323a080d98a2b330ef2d7aa020bf589d9e Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:36 +0100 Subject: [PATCH 006/147] KVM: s390: use pending_irqs_no_gisa() where appropriate Interruption types that are not represented in GISA shall use pending_irqs_no_gisa() to test pending interruptions. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-6-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 005dbe7252e7..cb48736867ed 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -353,7 +353,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) { - if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK)) + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_EXT_MASK)) return; if (psw_extint_disabled(vcpu)) kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); @@ -363,7 +363,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) { - if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_MCHK_MASK)) return; if (psw_mchk_disabled(vcpu)) vcpu->arch.sie_block->ictl |= ICTL_LPSW; From bb2fb8cdcf2d9f1845ffaa077013b45e14642019 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:37 +0100 Subject: [PATCH 007/147] KVM: s390: remove kvm_s390_ from gisa static inline functions This will shorten the length of code lines. All GISA related static inline functions are local to interrupt.c. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-7-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index cb48736867ed..942cc7d33766 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -217,22 +217,22 @@ static inline u8 int_word_to_isc(u32 int_word) */ #define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE) -static inline void kvm_s390_gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } -static inline u8 kvm_s390_gisa_get_ipm(struct kvm_s390_gisa *gisa) +static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) { return READ_ONCE(gisa->ipm); } -static inline void kvm_s390_gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } -static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } @@ -246,7 +246,7 @@ static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { return pending_irqs_no_gisa(vcpu) | - kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; + gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; } static inline int isc_to_irq_type(unsigned long isc) @@ -999,7 +999,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, } if (vcpu->kvm->arch.gisa && - kvm_s390_gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) { + gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) { /* * in case an adapter interrupt was not delivered * in SIE context KVM will handle the delivery @@ -1541,10 +1541,10 @@ static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid) if (!kvm->arch.gisa) goto out; - active_mask = (isc_mask & kvm_s390_gisa_get_ipm(kvm->arch.gisa) << 24) << 32; + active_mask = (isc_mask & gisa_get_ipm(kvm->arch.gisa) << 24) << 32; while (active_mask) { isc = __fls(active_mask) ^ (BITS_PER_LONG - 1); - if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) + if (gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) return isc; clear_bit_inv(isc, &active_mask); } @@ -1584,7 +1584,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, /* both types of interrupts present */ if (int_word_to_isc(inti->io.io_int_word) <= isc) { /* classical IO int with higher priority */ - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(kvm->arch.gisa, isc); goto out; } gisa_out: @@ -1596,7 +1596,7 @@ gisa_out: kvm_s390_reinject_io_int(kvm, inti); inti = tmp_inti; } else - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(kvm->arch.gisa, isc); out: return inti; } @@ -1694,7 +1694,7 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(kvm->arch.gisa, isc); kfree(inti); return 0; } @@ -2025,15 +2025,14 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) max_irqs = len / sizeof(struct kvm_s390_irq); - if (kvm->arch.gisa && - kvm_s390_gisa_get_ipm(kvm->arch.gisa)) { + if (kvm->arch.gisa && gisa_get_ipm(kvm->arch.gisa)) { for (i = 0; i <= MAX_ISC; i++) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; goto out_nolock; } - if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { + if (gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { irq = (struct kvm_s390_irq *) &buf[n]; irq->type = KVM_S390_INT_IO(1, 0, 0, 0); irq->u.io.io_int_word = isc_to_int_word(i); From 982cff425959901dfb4df7433622e5c0510f9d37 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:38 +0100 Subject: [PATCH 008/147] KVM: s390: introduce struct kvm_s390_gisa_interrupt Use this struct analog to the kvm interruption structs for kvm emulated floating and local interruptions. GIB handling will add further fields to this structure as required. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-8-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 6 +++- arch/s390/kvm/interrupt.c | 52 ++++++++++++++++++-------------- arch/s390/kvm/kvm-s390.c | 2 +- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index c259a67c4298..0941571d34eb 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -803,6 +803,10 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; +struct kvm_s390_gisa_interrupt { + struct kvm_s390_gisa *origin; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -836,8 +840,8 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); - struct kvm_s390_gisa *gisa; DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); + struct kvm_s390_gisa_interrupt gisa_int; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 942cc7d33766..ee91d1de0036 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -246,7 +246,8 @@ static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { return pending_irqs_no_gisa(vcpu) | - gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; + gisa_get_ipm(vcpu->kvm->arch.gisa_int.origin) << + IRQ_PEND_IO_ISC_7; } static inline int isc_to_irq_type(unsigned long isc) @@ -956,6 +957,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, { struct list_head *isc_list; struct kvm_s390_float_interrupt *fi; + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti = NULL; struct kvm_s390_io_info io; u32 isc; @@ -998,8 +1000,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, goto out; } - if (vcpu->kvm->arch.gisa && - gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) { + if (gi->origin && gisa_tac_ipm_gisc(gi->origin, isc)) { /* * in case an adapter interrupt was not delivered * in SIE context KVM will handle the delivery @@ -1533,18 +1534,19 @@ static struct kvm_s390_interrupt_info *get_top_io_int(struct kvm *kvm, static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; unsigned long active_mask; int isc; if (schid) goto out; - if (!kvm->arch.gisa) + if (!gi->origin) goto out; - active_mask = (isc_mask & gisa_get_ipm(kvm->arch.gisa) << 24) << 32; + active_mask = (isc_mask & gisa_get_ipm(gi->origin) << 24) << 32; while (active_mask) { isc = __fls(active_mask) ^ (BITS_PER_LONG - 1); - if (gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) + if (gisa_tac_ipm_gisc(gi->origin, isc)) return isc; clear_bit_inv(isc, &active_mask); } @@ -1567,6 +1569,7 @@ out: struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 isc_mask, u32 schid) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti, *tmp_inti; int isc; @@ -1584,7 +1587,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, /* both types of interrupts present */ if (int_word_to_isc(inti->io.io_int_word) <= isc) { /* classical IO int with higher priority */ - gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); goto out; } gisa_out: @@ -1596,7 +1599,7 @@ gisa_out: kvm_s390_reinject_io_int(kvm, inti); inti = tmp_inti; } else - gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); out: return inti; } @@ -1685,6 +1688,7 @@ static int __inject_float_mchk(struct kvm *kvm, static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_float_interrupt *fi; struct list_head *list; int isc; @@ -1692,9 +1696,9 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm->stat.inject_io++; isc = int_word_to_isc(inti->io.io_int_word); - if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) { + if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); - gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); kfree(inti); return 0; } @@ -1752,7 +1756,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT); break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (!(type & KVM_S390_INT_IO_AI_MASK && kvm->arch.gisa)) + if (!(type & KVM_S390_INT_IO_AI_MASK && + kvm->arch.gisa_int.origin)) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: @@ -2002,6 +2007,7 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti; struct kvm_s390_float_interrupt *fi; struct kvm_s390_irq *buf; @@ -2025,14 +2031,14 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) max_irqs = len / sizeof(struct kvm_s390_irq); - if (kvm->arch.gisa && gisa_get_ipm(kvm->arch.gisa)) { + if (gi->origin && gisa_get_ipm(gi->origin)) { for (i = 0; i <= MAX_ISC; i++) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; goto out_nolock; } - if (gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { + if (gisa_tac_ipm_gisc(gi->origin, i)) { irq = (struct kvm_s390_irq *) &buf[n]; irq->type = KVM_S390_INT_IO(1, 0, 0, 0); irq->u.io.io_int_word = isc_to_int_word(i); @@ -2884,25 +2890,27 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) void kvm_s390_gisa_clear(struct kvm *kvm) { - if (!kvm->arch.gisa) + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) return; - memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); - kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa; - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); + memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); + gi->origin->next_alert = (u32)(u64)gi->origin; + VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); } void kvm_s390_gisa_init(struct kvm *kvm) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + if (!css_general_characteristics.aiv) return; - kvm->arch.gisa = &kvm->arch.sie_page2->gisa; + gi->origin = &kvm->arch.sie_page2->gisa; kvm_s390_gisa_clear(kvm); - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); + VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } void kvm_s390_gisa_destroy(struct kvm *kvm) { - if (!kvm->arch.gisa) - return; - kvm->arch.gisa = NULL; + kvm->arch.gisa_int.origin = NULL; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 11b4be3dad15..5eaffb3e1738 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2812,7 +2812,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa; + vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; if (vcpu->arch.sie_block->gd && sclp.has_gisaf) vcpu->arch.sie_block->gd |= GISA_FORMAT1; seqcount_init(&vcpu->arch.cputm_seqcount); From 3dec19221788ec29fff91f3a6bebfc7b8132b0a1 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:39 +0100 Subject: [PATCH 009/147] s390/cio: add function chsc_sgib() This patch implements the Set Guest Information Block operation to request association or disassociation of a Guest Information Block (GIB) with the Adapter Interruption Facility. The operation is required to receive GIB alert interrupts for guest adapters in conjunction with AIV and GISA. Signed-off-by: Michael Mueller Reviewed-by: Sebastian Ott Reviewed-by: Pierre Morel Reviewed-by: Christian Borntraeger Acked-by: Halil Pasic Acked-by: Janosch Frank Acked-by: Cornelia Huck Message-Id: <20190131085247.13826-9-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/cio.h | 1 + drivers/s390/cio/chsc.c | 37 +++++++++++++++++++++++++++++++++++++ drivers/s390/cio/chsc.h | 1 + 3 files changed, 39 insertions(+) diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 225667652069..1727180e8ca1 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -331,5 +331,6 @@ extern void css_schedule_reprobe(void); /* Function from drivers/s390/cio/chsc.c */ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); +int chsc_sgib(u32 origin); #endif diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index a0baee25134c..4159c63a5fd2 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -1382,3 +1382,40 @@ int chsc_pnso_brinfo(struct subchannel_id schid, return chsc_error_from_response(brinfo_area->response.code); } EXPORT_SYMBOL_GPL(chsc_pnso_brinfo); + +int chsc_sgib(u32 origin) +{ + struct { + struct chsc_header request; + u16 op; + u8 reserved01[2]; + u8 reserved02:4; + u8 fmt:4; + u8 reserved03[7]; + /* operation data area begin */ + u8 reserved04[4]; + u32 gib_origin; + u8 reserved05[10]; + u8 aix; + u8 reserved06[4029]; + struct chsc_header response; + u8 reserved07[4]; + } *sgib_area; + int ret; + + spin_lock_irq(&chsc_page_lock); + memset(chsc_page, 0, PAGE_SIZE); + sgib_area = chsc_page; + sgib_area->request.length = 0x0fe0; + sgib_area->request.code = 0x0021; + sgib_area->op = 0x1; + sgib_area->gib_origin = origin; + + ret = chsc(sgib_area); + if (ret == 0) + ret = chsc_error_from_response(sgib_area->response.code); + spin_unlock_irq(&chsc_page_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(chsc_sgib); diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h index 78aba8d94eec..e57d68e325a3 100644 --- a/drivers/s390/cio/chsc.h +++ b/drivers/s390/cio/chsc.h @@ -164,6 +164,7 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp); int chsc_ssqd(struct subchannel_id schid, struct chsc_ssqd_area *ssqd); int chsc_sadc(struct subchannel_id schid, struct chsc_scssc_area *scssc, u64 summary_indicator_addr, u64 subchannel_indicator_addr); +int chsc_sgib(u32 origin); int chsc_error_from_response(int response); int chsc_siosl(struct subchannel_id schid); From 1282c21eb3dac324b8531bf2e0e0fbd9d5b6516b Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:40 +0100 Subject: [PATCH 010/147] KVM: s390: add the GIB and its related life-cyle functions The Guest Information Block (GIB) links the GISA of all guests that have adapter interrupts pending. These interrupts cannot be delivered because all vcpus of these guests are currently in WAIT state or have masked the respective Interruption Sub Class (ISC). If enabled, a GIB alert is issued on the host to schedule these guests to run suitable vcpus to consume the pending interruptions. This mechanism allows to process adapter interrupts for currently not running guests. The GIB is created during host initialization and associated with the Adapter Interruption Facility in case an Adapter Interruption Virtualization Facility is available. The GIB initialization and thus the activation of the related code will be done in an upcoming patch of this series. Signed-off-by: Michael Mueller Reviewed-by: Janosch Frank Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: Pierre Morel Acked-by: Halil Pasic Message-Id: <20190131085247.13826-10-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 9 +++++++ arch/s390/kvm/interrupt.c | 43 ++++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 1 + arch/s390/kvm/kvm-s390.h | 2 ++ 4 files changed, 55 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 0941571d34eb..397af0d33539 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -784,6 +784,15 @@ struct kvm_s390_gisa { }; }; +struct kvm_s390_gib { + u32 alert_list_origin; + u32 reserved01; + u8:5; + u8 nisc:3; + u8 reserved03[3]; + u32 reserved04[5]; +}; + /* * sie_page2 has to be allocated as DMA because fac_list, crycb and * gisa need 31bit addresses in the sie control block. diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ee91d1de0036..5efcd9e2cf8f 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -7,6 +7,9 @@ * Author(s): Carsten Otte */ +#define KMSG_COMPONENT "kvm-s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include #include #include @@ -31,6 +34,8 @@ #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 +static struct kvm_s390_gib *gib; + /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { @@ -2914,3 +2919,41 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) { kvm->arch.gisa_int.origin = NULL; } + +void kvm_s390_gib_destroy(void) +{ + if (!gib) + return; + chsc_sgib(0); + free_page((unsigned long)gib); + gib = NULL; +} + +int kvm_s390_gib_init(u8 nisc) +{ + int rc = 0; + + if (!css_general_characteristics.aiv) { + KVM_EVENT(3, "%s", "gib not initialized, no AIV facility"); + goto out; + } + + gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!gib) { + rc = -ENOMEM; + goto out; + } + + gib->nisc = nisc; + if (chsc_sgib((u32)(u64)gib)) { + pr_err("Associating the GIB with the AIV facility failed\n"); + free_page((unsigned long)gib); + gib = NULL; + rc = -EIO; + goto out; + } + + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); +out: + return rc; +} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5eaffb3e1738..ede89172b8f4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -444,6 +444,7 @@ out_debug_unreg: void kvm_arch_exit(void) { + kvm_s390_gib_destroy(); debug_unregister(kvm_s390_dbf); } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 72ef87799593..6d9448dbd052 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -381,6 +381,8 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, void kvm_s390_gisa_init(struct kvm *kvm); void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); +int kvm_s390_gib_init(u8 nisc); +void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); From 25c84dbaec6a5079ab64cf8b633ec811f8e43fdd Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:41 +0100 Subject: [PATCH 011/147] KVM: s390: add kvm reference to struct sie_page2 Adding the kvm reference to struct sie_page2 will allow to determine the kvm a given gisa belongs to: container_of(gisa, struct sie_page2, gisa)->kvm This functionality will be required to process a gisa in gib alert interruption context. Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-11-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 3 ++- arch/s390/kvm/kvm-s390.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 397af0d33539..7077762ab460 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -801,7 +801,8 @@ struct sie_page2 { __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ struct kvm_s390_crypto_cb crycb; /* 0x0800 */ struct kvm_s390_gisa gisa; /* 0x0900 */ - u8 reserved920[0x1000 - 0x920]; /* 0x0920 */ + struct kvm *kvm; /* 0x0920 */ + u8 reserved928[0x1000 - 0x928]; /* 0x0928 */ }; struct kvm_s390_vsie { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ede89172b8f4..0de062e989e2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2210,6 +2210,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.sie_page2) goto out_err; + kvm->arch.sie_page2->kvm = kvm; kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; for (i = 0; i < kvm_s390_fac_size(); i++) { From 6cff2e1046015f2729f6cb68be1c999c4a9b259f Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:42 +0100 Subject: [PATCH 012/147] KVM: s390: add functions to (un)register GISC with GISA Add the Interruption Alert Mask (IAM) to the architecture specific kvm struct. This mask in the GISA is used to define for which ISC a GIB alert will be issued. The functions kvm_s390_gisc_register() and kvm_s390_gisc_unregister() are used to (un)register a GISC (guest ISC) with a virtual machine and its GISA. Upon successful completion, kvm_s390_gisc_register() returns the ISC to be used for GIB alert interruptions. A negative return code indicates an error during registration. Theses functions will be used by other adapter types like AP and PCI to request pass-through interruption support. Signed-off-by: Michael Mueller Acked-by: Pierre Morel Acked-by: Halil Pasic Reviewed-by: Cornelia Huck Message-Id: <20190131085247.13826-12-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 13 ++++ arch/s390/kvm/interrupt.c | 112 +++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 7077762ab460..2cfff617cb21 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -781,6 +781,9 @@ struct kvm_s390_gisa { u8 reserved03[11]; u32 airq_count; } g1; + struct { + u64 word[4]; + } u64; }; }; @@ -813,8 +816,15 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; +struct kvm_s390_gisa_iam { + u8 mask; + spinlock_t ref_lock; + u32 ref_count[MAX_ISC + 1]; +}; + struct kvm_s390_gisa_interrupt { struct kvm_s390_gisa *origin; + struct kvm_s390_gisa_iam alert; }; struct kvm_arch{ @@ -885,6 +895,9 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; +extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); +extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); + static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_check_processor_compat(void *rtn) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5efcd9e2cf8f..040745b23224 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -222,6 +222,33 @@ static inline u8 int_word_to_isc(u32 int_word) */ #define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE) +/** + * gisa_set_iam - change the GISA interruption alert mask + * + * @gisa: gisa to operate on + * @iam: new IAM value to use + * + * Change the IAM atomically with the next alert address and the IPM + * of the GISA if the GISA is not part of the GIB alert list. All three + * fields are located in the first long word of the GISA. + * + * Returns: 0 on success + * -EBUSY in case the gisa is part of the alert list + */ +static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) +{ + u64 word, _word; + + do { + word = READ_ONCE(gisa->u64.word[0]); + if ((u64)gisa != word >> 32) + return -EBUSY; + _word = (word & ~0xffUL) | iam; + } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + + return 0; +} + static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -2911,6 +2938,8 @@ void kvm_s390_gisa_init(struct kvm *kvm) if (!css_general_characteristics.aiv) return; gi->origin = &kvm->arch.sie_page2->gisa; + gi->alert.mask = 0; + spin_lock_init(&gi->alert.ref_lock); kvm_s390_gisa_clear(kvm); VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } @@ -2920,6 +2949,89 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) kvm->arch.gisa_int.origin = NULL; } +/** + * kvm_s390_gisc_register - register a guest ISC + * + * @kvm: the kernel vm to work with + * @gisc: the guest interruption sub class to register + * + * The function extends the vm specific alert mask to use. + * The effective IAM mask in the GISA is updated as well + * in case the GISA is not part of the GIB alert list. + * It will be updated latest when the IAM gets restored + * by gisa_get_ipm_or_restore_iam(). + * + * Returns: the nonspecific ISC (NISC) the gib alert mechanism + * has registered with the channel subsystem. + * -ENODEV in case the vm uses no GISA + * -ERANGE in case the guest ISC is invalid + */ +int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return -ENODEV; + if (gisc > MAX_ISC) + return -ERANGE; + + spin_lock(&gi->alert.ref_lock); + gi->alert.ref_count[gisc]++; + if (gi->alert.ref_count[gisc] == 1) { + gi->alert.mask |= 0x80 >> gisc; + gisa_set_iam(gi->origin, gi->alert.mask); + } + spin_unlock(&gi->alert.ref_lock); + + return gib->nisc; +} +EXPORT_SYMBOL_GPL(kvm_s390_gisc_register); + +/** + * kvm_s390_gisc_unregister - unregister a guest ISC + * + * @kvm: the kernel vm to work with + * @gisc: the guest interruption sub class to register + * + * The function reduces the vm specific alert mask to use. + * The effective IAM mask in the GISA is updated as well + * in case the GISA is not part of the GIB alert list. + * It will be updated latest when the IAM gets restored + * by gisa_get_ipm_or_restore_iam(). + * + * Returns: the nonspecific ISC (NISC) the gib alert mechanism + * has registered with the channel subsystem. + * -ENODEV in case the vm uses no GISA + * -ERANGE in case the guest ISC is invalid + * -EINVAL in case the guest ISC is not registered + */ +int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + int rc = 0; + + if (!gi->origin) + return -ENODEV; + if (gisc > MAX_ISC) + return -ERANGE; + + spin_lock(&gi->alert.ref_lock); + if (gi->alert.ref_count[gisc] == 0) { + rc = -EINVAL; + goto out; + } + gi->alert.ref_count[gisc]--; + if (gi->alert.ref_count[gisc] == 0) { + gi->alert.mask &= ~(0x80 >> gisc); + gisa_set_iam(gi->origin, gi->alert.mask); + } +out: + spin_unlock(&gi->alert.ref_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); + void kvm_s390_gib_destroy(void) { if (!gib) From 174dd4f88875f6ebad7ad1dff3c3e9530a1e3506 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:43 +0100 Subject: [PATCH 013/147] KVM: s390: kvm_s390_gisa_clear() now clears the IPM only Function kvm_s390_gisa_clear() now clears the Interruption Pending Mask of the GISA asap. If the GISA is in the alert list at this time it stays in the list but is removed by process_gib_alert_list(). Signed-off-by: Michael Mueller Acked-by: Halil Pasic Reviewed-by: Pierre Morel Message-Id: <20190131085247.13826-13-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 040745b23224..9faaa8f96fc3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -249,6 +249,25 @@ static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) return 0; } +/** + * gisa_clear_ipm - clear the GISA interruption pending mask + * + * @gisa: gisa to operate on + * + * Clear the IPM atomically with the next alert address and the IAM + * of the GISA unconditionally. All three fields are located in the + * first long word of the GISA. + */ +static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) +{ + u64 word, _word; + + do { + word = READ_ONCE(gisa->u64.word[0]); + _word = word & ~(0xffUL << 24); + } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); +} + static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -2926,8 +2945,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm) if (!gi->origin) return; - memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); - gi->origin->next_alert = (u32)(u64)gi->origin; + gisa_clear_ipm(gi->origin); VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); } @@ -2940,7 +2958,8 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->origin = &kvm->arch.sie_page2->gisa; gi->alert.mask = 0; spin_lock_init(&gi->alert.ref_lock); - kvm_s390_gisa_clear(kvm); + memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); + gi->origin->next_alert = (u32)(u64)gi->origin; VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } From 9f30f62163786a0b80e0886046b5c66e714e7e71 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:44 +0100 Subject: [PATCH 014/147] KVM: s390: add gib_alert_irq_handler() The patch implements a handler for GIB alert interruptions on the host. Its task is to alert guests that interrupts are pending for them. A GIB alert interrupt statistic counter is added as well: $ cat /proc/interrupts CPU0 CPU1 ... GAL: 23 37 [I/O] GIB Alert ... Signed-off-by: Michael Mueller Acked-by: Halil Pasic Reviewed-by: Pierre Morel Message-Id: <20190131085247.13826-14-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/irq.h | 1 + arch/s390/include/asm/isc.h | 1 + arch/s390/include/asm/kvm_host.h | 3 + arch/s390/kernel/irq.c | 1 + arch/s390/kvm/interrupt.c | 169 ++++++++++++++++++++++++++++++- arch/s390/kvm/kvm-s390.c | 2 + 6 files changed, 175 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 2f7f27e5493f..afaf5e3c57fd 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -62,6 +62,7 @@ enum interruption_class { IRQIO_MSI, IRQIO_VIR, IRQIO_VAI, + IRQIO_GAL, NMI_NMI, CPU_RST, NR_ARCH_IRQS diff --git a/arch/s390/include/asm/isc.h b/arch/s390/include/asm/isc.h index 6cb9e2ed05b6..b2cc1ec78d06 100644 --- a/arch/s390/include/asm/isc.h +++ b/arch/s390/include/asm/isc.h @@ -21,6 +21,7 @@ /* Adapter interrupts. */ #define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */ #define PCI_ISC 2 /* PCI I/O subchannels */ +#define GAL_ISC 5 /* GIB alert */ #define AP_ISC 6 /* adjunct processor (crypto) devices */ /* Functions for registration of I/O interruption subclasses */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 2cfff617cb21..c5f51566ecd6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -825,6 +825,9 @@ struct kvm_s390_gisa_iam { struct kvm_s390_gisa_interrupt { struct kvm_s390_gisa *origin; struct kvm_s390_gisa_iam alert; + struct hrtimer timer; + u64 expires; + DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); }; struct kvm_arch{ diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 0e8d68bac82c..0cd5a5f96729 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -88,6 +88,7 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" }, {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, + {.irq = IRQIO_GAL, .name = "GAL", .desc = "[I/O] GIB Alert"}, {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 9faaa8f96fc3..dc446203e5a4 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" @@ -268,6 +269,38 @@ static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); } +/** + * gisa_get_ipm_or_restore_iam - return IPM or restore GISA IAM + * + * @gi: gisa interrupt struct to work on + * + * Atomically restores the interruption alert mask if none of the + * relevant ISCs are pending and return the IPM. + * + * Returns: the relevant pending ISCs + */ +static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) +{ + u8 pending_mask, alert_mask; + u64 word, _word; + + do { + word = READ_ONCE(gi->origin->u64.word[0]); + alert_mask = READ_ONCE(gi->alert.mask); + pending_mask = (u8)(word >> 24) & alert_mask; + if (pending_mask) + return pending_mask; + _word = (word & ~0xffUL) | alert_mask; + } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word); + + return 0; +} + +static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) +{ + return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa; +} + static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -1141,6 +1174,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu) int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) { + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; u64 sltime; vcpu->stat.exit_wait_state++; @@ -1154,6 +1188,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; /* disabled wait */ } + if (gi->origin && + (gisa_get_ipm_or_restore_iam(gi) & + vcpu->arch.sie_block->gcr[6] >> 24)) + return 0; + if (!ckc_interrupts_enabled(vcpu) && !cpu_timer_interrupts_enabled(vcpu)) { VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer"); @@ -2939,6 +2978,93 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) return n; } +static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) +{ + int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + + for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_id); + if (psw_ioint_disabled(vcpu)) + continue; + deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask) { + /* lately kicked but not yet running */ + if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + return; + kvm_s390_vcpu_wakeup(vcpu); + return; + } + } +} + +static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) +{ + struct kvm_s390_gisa_interrupt *gi = + container_of(timer, struct kvm_s390_gisa_interrupt, timer); + struct kvm *kvm = + container_of(gi->origin, struct sie_page2, gisa)->kvm; + u8 pending_mask; + + pending_mask = gisa_get_ipm_or_restore_iam(gi); + if (pending_mask) { + __airqs_kick_single_vcpu(kvm, pending_mask); + hrtimer_forward_now(timer, ns_to_ktime(gi->expires)); + return HRTIMER_RESTART; + }; + + return HRTIMER_NORESTART; +} + +#define NULL_GISA_ADDR 0x00000000UL +#define NONE_GISA_ADDR 0x00000001UL +#define GISA_ADDR_MASK 0xfffff000UL + +static void process_gib_alert_list(void) +{ + struct kvm_s390_gisa_interrupt *gi; + struct kvm_s390_gisa *gisa; + struct kvm *kvm; + u32 final, origin = 0UL; + + do { + /* + * If the NONE_GISA_ADDR is still stored in the alert list + * origin, we will leave the outer loop. No further GISA has + * been added to the alert list by millicode while processing + * the current alert list. + */ + final = (origin & NONE_GISA_ADDR); + /* + * Cut off the alert list and store the NONE_GISA_ADDR in the + * alert list origin to avoid further GAL interruptions. + * A new alert list can be build up by millicode in parallel + * for guests not in the yet cut-off alert list. When in the + * final loop, store the NULL_GISA_ADDR instead. This will re- + * enable GAL interruptions on the host again. + */ + origin = xchg(&gib->alert_list_origin, + (!final) ? NONE_GISA_ADDR : NULL_GISA_ADDR); + /* + * Loop through the just cut-off alert list and start the + * gisa timers to kick idle vcpus to consume the pending + * interruptions asap. + */ + while (origin & GISA_ADDR_MASK) { + gisa = (struct kvm_s390_gisa *)(u64)origin; + origin = gisa->next_alert; + gisa->next_alert = (u32)(u64)gisa; + kvm = container_of(gisa, struct sie_page2, gisa)->kvm; + gi = &kvm->arch.gisa_int; + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + } + } while (!final); + +} + void kvm_s390_gisa_clear(struct kvm *kvm) { struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; @@ -2958,6 +3084,9 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->origin = &kvm->arch.sie_page2->gisa; gi->alert.mask = 0; spin_lock_init(&gi->alert.ref_lock); + gi->expires = 50 * 1000; /* 50 usec */ + hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gi->timer.function = gisa_vcpu_kicker; memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); gi->origin->next_alert = (u32)(u64)gi->origin; VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); @@ -2965,7 +3094,17 @@ void kvm_s390_gisa_init(struct kvm *kvm) void kvm_s390_gisa_destroy(struct kvm *kvm) { - kvm->arch.gisa_int.origin = NULL; + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return; + if (gi->alert.mask) + KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", + kvm, gi->alert.mask); + while (gisa_in_alert_list(gi->origin)) + cpu_relax(); + hrtimer_cancel(&gi->timer); + gi->origin = NULL; } /** @@ -3051,11 +3190,23 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); +static void gib_alert_irq_handler(struct airq_struct *airq) +{ + inc_irq_stat(IRQIO_GAL); + process_gib_alert_list(); +} + +static struct airq_struct gib_alert_irq = { + .handler = gib_alert_irq_handler, + .lsi_ptr = &gib_alert_irq.lsi_mask, +}; + void kvm_s390_gib_destroy(void) { if (!gib) return; chsc_sgib(0); + unregister_adapter_interrupt(&gib_alert_irq); free_page((unsigned long)gib); gib = NULL; } @@ -3075,16 +3226,30 @@ int kvm_s390_gib_init(u8 nisc) goto out; } + gib_alert_irq.isc = nisc; + if (register_adapter_interrupt(&gib_alert_irq)) { + pr_err("Registering the GIB alert interruption handler failed\n"); + rc = -EIO; + goto out_free_gib; + } + gib->nisc = nisc; if (chsc_sgib((u32)(u64)gib)) { pr_err("Associating the GIB with the AIV facility failed\n"); free_page((unsigned long)gib); gib = NULL; rc = -EIO; - goto out; + goto out_unreg_gal; } KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); + goto out; + +out_unreg_gal: + unregister_adapter_interrupt(&gib_alert_irq); +out_free_gib: + free_page((unsigned long)gib); + gib = NULL; out: return rc; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0de062e989e2..0099fbda2e98 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3460,6 +3460,8 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } + clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); From b1d1e76ed9ee5a0f7671da257d4f0595d1f5162e Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:45 +0100 Subject: [PATCH 015/147] KVM: s390: start using the GIB By initializing the GIB, it will be used by the kvm host. Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-15-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0099fbda2e98..2e47c724679e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -435,8 +435,15 @@ int kvm_arch_init(void *opaque) pr_err("A FLIC registration call failed with rc=%d\n", rc); goto out_debug_unreg; } + + rc = kvm_s390_gib_init(GAL_ISC); + if (rc) + goto out_gib_destroy; + return 0; +out_gib_destroy: + kvm_s390_gib_destroy(); out_debug_unreg: debug_unregister(kvm_s390_dbf); return rc; From b9fa6d6ee9b88eedf2d2261f931841664aa6980e Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:46 +0100 Subject: [PATCH 016/147] KVM: s390: fix possible null pointer dereference in pending_irqs() Assure a GISA is in use before accessing the IPM to avoid a null pointer dereference issue. Signed-off-by: Michael Mueller Reported-by: Halil Pasic Reviewed-by: Pierre Morel Reviewed-by: Cornelia Huck Message-Id: <20190131085247.13826-16-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index dc446203e5a4..82162867f378 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -329,9 +329,13 @@ static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { - return pending_irqs_no_gisa(vcpu) | - gisa_get_ipm(vcpu->kvm->arch.gisa_int.origin) << - IRQ_PEND_IO_ISC_7; + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; + unsigned long pending_mask; + + pending_mask = pending_irqs_no_gisa(vcpu); + if (gi->origin) + pending_mask |= gisa_get_ipm(gi->origin) << IRQ_PEND_IO_ISC_7; + return pending_mask; } static inline int isc_to_irq_type(unsigned long isc) From 65ab26e397555d340290a99e6c736e291526ecc0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 31 Jan 2019 23:49:21 +0100 Subject: [PATCH 017/147] selftests: kvm: add selftest for releasing VM file descriptor while in L2 This adds a test for the previous bug. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/vmx_close_while_nested_test.c | 95 +++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 6210ba41c29e..2689d1ea6d7a 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -3,6 +3,7 @@ /x86_64/platform_info_test /x86_64/set_sregs_test /x86_64/sync_regs_test +/x86_64/vmx_close_while_nested_test /x86_64/vmx_tsc_adjust_test /x86_64/state_test /dirty_log_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index f9a0e9938480..3c1f4bdf9000 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -16,6 +16,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c new file mode 100644 index 000000000000..6edec6fd790b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -0,0 +1,95 @@ +/* + * vmx_close_while_nested + * + * Copyright (C) 2019, Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verify that nothing bad happens if a KVM user exits with open + * file descriptors while executing a nested guest. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#define VCPU_ID 5 + +enum { + PORT_L0_EXIT = 0x2000, +}; + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + /* Exit to L0 */ + asm volatile("inb %%dx, %%al" + : : [port] "d" (PORT_L0_EXIT) : "rax"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + uintptr_t save_cr3; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(0); +} + +int main(int argc, char *argv[]) +{ + struct vmx_pages *vmx_pages; + vm_vaddr_t vmx_pages_gva; + struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); + + if (!(entry->ecx & CPUID_VMX)) { + fprintf(stderr, "nested VMX not enabled, skipping test\n"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* Allocate VMX pages and shared descriptors (vmx_pages). */ + vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + if (run->io.port == PORT_L0_EXIT) + break; + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_ASSERT(false, "%s", (const char *)uc.args[0]); + /* NOT REACHED */ + default: + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + } + } +} From 61c08aa9606d4e48a8a50639c956448a720174c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:48 -0800 Subject: [PATCH 018/147] KVM: VMX: Compare only a single byte for VMCS' "launched" in vCPU-run The vCPU-run asm blob does a manual comparison of a VMCS' launched status to execute the correct VM-Enter instruction, i.e. VMLAUNCH vs. VMRESUME. The launched flag is a bool, which is a typedef of _Bool. C99 does not define an exact size for _Bool, stating only that is must be large enough to hold '0' and '1'. Most, if not all, compilers use a single byte for _Bool, including gcc[1]. Originally, 'launched' was of type 'int' and so the asm blob used 'cmpl' to check the launch status. When 'launched' was moved to be stored on a per-VMCS basis, struct vcpu_vmx's "temporary" __launched flag was added in order to avoid having to pass the current VMCS into the asm blob. The new '__launched' was defined as a 'bool' and not an 'int', but the 'cmp' instruction was not updated. This has not caused any known problems, likely due to compilers aligning variables to 4-byte or 8-byte boundaries and KVM zeroing out struct vcpu_vmx during allocation. I.e. vCPU-run accesses "junk" data, it just happens to always be zero and so doesn't affect the result. [1] https://gcc.gnu.org/ml/gcc-patches/2000-10/msg01127.html Fixes: d462b8192368 ("KVM: VMX: Keep list of loaded VMCSs, instead of vcpus") Cc: Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 95d618045001..fdb6305cc971 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6402,7 +6402,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%" _ASM_AX", %%cr2 \n\t" "3: \n\t" /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t" + "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" /* Load guest registers. Don't clobber flags. */ "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" From 1ce072cbfd8dba46f117804850398e0b3040a541 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:49 -0800 Subject: [PATCH 019/147] KVM: nVMX: Check a single byte for VMCS "launched" in nested early checks Nested early checks does a manual comparison of a VMCS' launched status in its asm blob to execute the correct VM-Enter instruction, i.e. VMLAUNCH vs. VMRESUME. The launched flag is a bool, which is a typedef of _Bool. C99 does not define an exact size for _Bool, stating only that is must be large enough to hold '0' and '1'. Most, if not all, compilers use a single byte for _Bool, including gcc[1]. The use of 'cmpl' instead of 'cmpb' was not deliberate, but rather the result of a copy-paste as the asm blob was directly derived from the asm blob for vCPU-run. This has not caused any known problems, likely due to compilers aligning variables to 4-byte or 8-byte boundaries and KVM zeroing out struct vcpu_vmx during allocation. I.e. vCPU-run accesses "junk" data, it just happens to always be zero and so doesn't affect the result. [1] https://gcc.gnu.org/ml/gcc-patches/2000-10/msg01127.html Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Cc: Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d8ea4ebd79e7..50c9e04910ea 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2761,7 +2761,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" + "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" "call vmx_vmenter\n\t" From 0e0ab73c9a0243736bcd779b30b717e23ba9a56d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:50 -0800 Subject: [PATCH 020/147] KVM: VMX: Zero out *all* general purpose registers after VM-Exit ...except RSP, which is restored by hardware as part of VM-Exit. Paolo theorized that restoring registers from the stack after a VM-Exit in lieu of zeroing them could lead to speculative execution with the guest's values, e.g. if the stack accesses miss the L1 cache[1]. Zeroing XORs are dirt cheap, so just be ultra-paranoid. Note that the scratch register (currently RCX) used to save/restore the guest state is also zeroed as its host-defined value is loaded via the stack, just with a MOV instead of a POP. [1] https://patchwork.kernel.org/patch/10771539/#22441255 Fixes: 0cb5b30698fd ("kvm: vmx: Scrub hardware GPRs at VM-exit") Cc: Cc: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fdb6305cc971..10fee67a6dcd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6452,10 +6452,15 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" + /* - * Clear host registers marked as clobbered to prevent - * speculative use. - */ + * Clear all general purpose registers (except RSP, which is loaded by + * the CPU during VM-Exit) to prevent speculative use of the guest's + * values, even those that are saved/loaded via the stack. In theory, + * an L1 cache miss when restoring registers could lead to speculative + * execution with the guest's values. Zeroing XORs are dirt cheap, + * i.e. the extra paranoia is essentially free. + */ "xor %%r8d, %%r8d \n\t" "xor %%r9d, %%r9d \n\t" "xor %%r10d, %%r10d \n\t" @@ -6470,8 +6475,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%eax, %%eax \n\t" "xor %%ebx, %%ebx \n\t" + "xor %%ecx, %%ecx \n\t" + "xor %%edx, %%edx \n\t" "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" + "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" : ASM_CALL_CONSTRAINT : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), From 831a3011294d9aa284afae9194619893d454d708 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:51 -0800 Subject: [PATCH 021/147] KVM: VMX: Modify only RSP when creating a placeholder for guest's RCX In the vCPU-run asm blob, the guest's RCX is temporarily saved onto the stack after VM-Exit as the exit flow must first load a register with a pointer to the vCPU's save area in order to save the guest's registers. RCX is arbitrarily designated as the scratch register. Since the stack usage is to (1)save host, (2)save guest, (3)load host and (4)load guest, the code can't conform to the stack's natural FIFO semantics, i.e. it can't simply do PUSH/POP. Regardless of whether it is done for the host's value or guest's value, at some point the code needs to access the stack using a non-traditional method, e.g. MOV instead of POP. vCPU-run opts to create a placeholder on the stack for guest's RCX (by adjusting RSP) and saves RCX to its place immediately after VM-Exit (via MOV). In other words, the purpose of the first 'PUSH RCX' at the start of the vCPU-run asm blob is to adjust RSP down, i.e. there's no need to actually access memory. Use 'SUB $wordsize, RSP' instead of 'PUSH RCX' to make it more obvious that the intent is simply to create a gap on the stack for the guest's RCX. Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 10fee67a6dcd..cb0dd17c31c6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6378,7 +6378,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" - "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" From f3689e3f17f064fd4cd5f0cb01ae2395c94f39d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:52 -0800 Subject: [PATCH 022/147] KVM: VMX: Save RSI to an unused output in the vCPU-run asm blob RSI is clobbered by the vCPU-run asm blob, but it's not marked as such, probably because GCC doesn't let you mark inputs as clobbered. "Save" RSI to a dummy output so that GCC recognizes it as being clobbered. Fixes: 773e8a0425c9 ("x86/kvm: use Enlightened VMCS when running on Hyper-V") Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb0dd17c31c6..3ae51a09f420 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6481,7 +6481,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - : ASM_CALL_CONSTRAINT + : ASM_CALL_CONSTRAINT, "=S"((int){0}) : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), From ccf447434ee624b64a1c215d88453f1921aaa88a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:53 -0800 Subject: [PATCH 023/147] KVM: VMX: Manually load RDX in vCPU-run asm blob Load RDX with the VMCS.HOST_RSP field encoding on-demand instead of delegating to the compiler via an input constraint. In addition to saving one whole MOV instruction, this allows RDX to be properly clobbered (in a future patch) instead of being saved/loaded to/from the stack. Despite nested_vmx_check_vmentry_hw() having similar code, leave it alone, for now. In that case, RDX is unconditionally used and isn't clobbered, i.e. sending in HOST_RSP as an input is simpler. Note that because HOST_RSP is an enum and not a define, it must be redefined as an immediate instead of using __stringify(HOST_RSP). The naming "conflict" between host_rsp and HOST_RSP is slightly confusing, but the former will be removed in a future patch, at which point HOST_RSP is absolutely what is desired. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3ae51a09f420..b935ba3306aa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6390,6 +6390,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" "jmp 1f \n\t" "2: \n\t" + "mov $%c[HOST_RSP], %%" _ASM_DX " \n\t" __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ @@ -6482,10 +6483,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" : ASM_CALL_CONSTRAINT, "=S"((int){0}) - : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), + : "c"(vmx), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), From 6f7c6d23b71af2e65056e777eeb2f1e6e0195f14 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:54 -0800 Subject: [PATCH 024/147] KVM: VMX: Let the compiler save/load RDX during vCPU-run Per commit c20363006af6 ("KVM: VMX: Let gcc to choose which registers to save (x86_64)"), the only reason RDX is saved/loaded to/from the stack is because it was specified as an input, i.e. couldn't be marked as clobbered (ignoring the fact that "saving" it to a dummy output would indirectly mark it as clobbered). Now that RDX is no longer an input, clobber it. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b935ba3306aa..e9d81cd8421f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6377,7 +6377,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ - "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_BP " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ @@ -6481,7 +6481,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" + "pop %%" _ASM_BP " \n\t" : ASM_CALL_CONSTRAINT, "=S"((int){0}) : "c"(vmx), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), @@ -6509,10 +6509,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi" + , "rax", "rbx", "rdx", "rdi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edi" + , "eax", "ebx", "edx", "edi" #endif ); } From 9ce0a07a6f49822238fd4357c02e0dba060a43cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:55 -0800 Subject: [PATCH 025/147] KVM: nVMX: Remove a rogue "rax" clobber from nested_vmx_check_vmentry_hw() RAX is not touched by nested_vmx_check_vmentry_hw(), directly or indirectly (e.g. vmx_vmenter()). Remove it from the clobber list. Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 50c9e04910ea..bb85e3a05424 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2773,7 +2773,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) - : "rax", "cc", "memory" + : "cc", "memory" ); preempt_enable(); From 98ff2acc91d836277d34558fbbba7678e04281ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:56 -0800 Subject: [PATCH 026/147] KVM: nVMX: Drop STACK_FRAME_NON_STANDARD from nested_vmx_check_vmentry_hw() ...as it doesn't technically actually do anything non-standard with the stack even though it modifies RSP in a weird way. E.g. RSP is loaded with VMCS.HOST_RSP if the VM-Enter gets far enough to trigger VM-Exit, but it's simply reloaded with the current value. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bb85e3a05424..e40df725b952 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2809,8 +2809,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); - static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); From 6c1e7e5b40f23b9e754a47852924115febba35df Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:57 -0800 Subject: [PATCH 027/147] KVM: nVMX: Explicitly reference the scratch reg in nested early checks Using %1 to reference RCX, i.e. the 'vmx' pointer', is obtuse and fragile, e.g. it results in cryptic and infurating compile errors if the output constraints are touched by anything more than a gentle breeze. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e40df725b952..a562ecabc118 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2757,7 +2757,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ From f1727b4954772a778df0b73a93c4b646fd3c21f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:58 -0800 Subject: [PATCH 028/147] KVM: nVMX: Capture VM-Fail to a local var in nested_vmx_check_vmentry_hw() Unlike the primary vCPU-run flow, the nested early checks code doesn't actually want to propagate VM-Fail back to 'vmx'. Yay copy+paste. In additional to eliminating the need to clear vmx->fail before returning, using a local boolean also drops a reference to 'vmx' in the asm blob. Dropping the reference to 'vmx' will save a register in the long run as future patches will shift all pointer references from 'vmx' to 'vmx->loaded_vmcs'. Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a562ecabc118..bfacf9029466 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2718,6 +2718,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; + bool vm_fail; if (!nested_early_check) return 0; @@ -2763,14 +2764,18 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" + /* + * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set + * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail + * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the + * results of VM-Enter is captured via SETBE to vm_fail. + */ "call vmx_vmenter\n\t" - /* Set vmx->fail accordingly */ - "setbe %c[fail](%% " _ASM_CX")\n\t" - : ASM_CALL_CONSTRAINT + "setbe %[fail]\n\t" + : ASM_CALL_CONSTRAINT, [fail]"=qm"(vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" @@ -2783,10 +2788,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) if (vmx->msr_autoload.guest.nr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - if (vmx->fail) { + if (vm_fail) { WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - vmx->fail = 0; return 1; } From bbc0b8239257d91cd43fe219de5552dee8bb9123 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:59 -0800 Subject: [PATCH 029/147] KVM: nVMX: Capture VM-Fail via CC_{SET,OUT} in nested early checks ...to take advantage of __GCC_ASM_FLAG_OUTPUTS__ when possible. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bfacf9029466..67f8fdf568eb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2768,12 +2768,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the - * results of VM-Enter is captured via SETBE to vm_fail. + * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. */ "call vmx_vmenter\n\t" - "setbe %[fail]\n\t" - : ASM_CALL_CONSTRAINT, [fail]"=qm"(vm_fail) + CC_SET(be) + : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), From 74dfa2784e961fae66143b811f45105b43c63046 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:00 -0800 Subject: [PATCH 030/147] KVM: nVMX: Reference vmx->loaded_vmcs->launched directly Temporarily propagating vmx->loaded_vmcs->launched to vmx->__launched is not functionally necessary, but rather was done historically to avoid passing both 'vmx' and 'loaded_vmcs' to the vCPU-run asm blob. Nested early checks inherited this behavior by virtue of copy+paste. A future patch will move HOST_RSP caching to be per-VMCS, i.e. store 'host_rsp' in loaded VMCS. Now that the reference to 'vmx->fail' is also gone from nested early checks, referencing 'loaded_vmcs' directly means we can drop the 'vmx' reference when introducing per-VMCS RSP caching. And it means __launched can be dropped from struct vcpu_vmx if/when vCPU-run receives similar treatment. Note the use of a named register constraint for 'loaded_vmcs'. Using RCX to hold 'vmx' was inherited from vCPU-run. In the vCPU-run case, the scratch register needs to be explicitly defined as it is crushed when loading guest state, i.e. deferring to the compiler would corrupt the pointer. Since nested early checks never loads guests state, it's a-ok to let the compiler pick any register. Naming the constraint avoids the fragility of referencing constraints via %1, %2, etc.., which breaks horribly when modifying constraints, and generally makes the asm blob more readable. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 67f8fdf568eb..47299e10522f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2752,8 +2752,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } - vmx->__launched = vmx->loaded_vmcs->launched; - asm( /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ @@ -2762,7 +2760,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" + "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" /* * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set @@ -2775,7 +2773,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + [loaded_vmcs]"r"(vmx->loaded_vmcs), + [launched]"i"(offsetof(struct loaded_vmcs, launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" From fbda0fd31a6d683637f848ba17956048dd0c7e48 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:01 -0800 Subject: [PATCH 031/147] KVM: nVMX: Let the compiler select the reg for holding HOST_RSP ...and provide an explicit name for the constraint. Naming the input constraint makes the code self-documenting and also avoids the fragility of numerically referring to constraints, e.g. %4 breaks badly whenever the constraints are modified. Explicitly using RDX was inherited from vCPU-run, i.e. completely arbitrary. Even vCPU-run doesn't truly need to explicitly use RDX, but doing so is more robust as vCPU-run needs tight control over its register usage. Note that while the naming "conflict" between host_rsp and HOST_RSP is slightly confusing, the former will be renamed slightly in a future patch, at which point HOST_RSP is absolutely what is desired. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 47299e10522f..23a2c1b91389 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2753,9 +2753,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) } asm( - /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" + __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ @@ -2772,7 +2771,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) - : "c"(vmx), "d"((unsigned long)HOST_RSP), + : "c"(vmx), + [HOST_RSP]"r"((unsigned long)HOST_RSP), [loaded_vmcs]"r"(vmx->loaded_vmcs), [launched]"i"(offsetof(struct loaded_vmcs, launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), From 5a8781607e677eda60b20e0a4c91d2a5f12f9244 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:02 -0800 Subject: [PATCH 032/147] KVM: nVMX: Cache host_rsp on a per-VMCS basis Currently, host_rsp is cached on a per-vCPU basis, i.e. it's stored in struct vcpu_vmx. In non-nested usage the caching is for all intents and purposes 100% effective, e.g. only the first VMLAUNCH needs to synchronize VMCS.HOST_RSP since the call stack to vmx_vcpu_run() is identical each and every time. But when running a nested guest, KVM must invalidate the cache when switching the current VMCS as it can't guarantee the new VMCS has the same HOST_RSP as the previous VMCS. In other words, the cache loses almost all of its efficacy when running a nested VM. Move host_rsp to struct vmcs_host_state, which is per-VMCS, so that it is cached on a per-VMCS basis and restores its 100% hit rate when nested VMs are in play. Note that the host_rsp cache for vmcs02 essentially "breaks" when nested early checks are enabled as nested_vmx_check_vmentry_hw() will see a different RSP at the time of its VM-Enter. While it's possible to avoid even that VMCS.HOST_RSP synchronization, e.g. by employing a dedicated VM-Exit stack, there is little motivation for doing so as the overhead of two VMWRITEs (~55 cycles) is dwarfed by the overhead of the extra VMX transition (600+ cycles) and is a proverbial drop in the ocean relative to the total cost of a nested transtion (10s of thousands of cycles). Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 24 ++++++------------------ arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmx.c | 13 ++++++------- arch/x86/kvm/vmx/vmx.h | 1 - 4 files changed, 13 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 23a2c1b91389..0e67649e39ce 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1979,17 +1979,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) prepare_vmcs02_early_full(vmx, vmcs12); - /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. host_rsp will - * also be written unconditionally by nested_vmx_check_vmentry_hw() - * if we are doing early consistency checks via hardware. - */ - vmx->host_rsp = 0; - /* * PIN CONTROLS */ @@ -2754,8 +2743,11 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) asm( "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ + "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "je 1f \n\t" __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" + "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ @@ -2771,11 +2763,10 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) - : "c"(vmx), - [HOST_RSP]"r"((unsigned long)HOST_RSP), + : [HOST_RSP]"r"((unsigned long)HOST_RSP), [loaded_vmcs]"r"(vmx->loaded_vmcs), [launched]"i"(offsetof(struct loaded_vmcs, launched)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" ); @@ -3912,9 +3903,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_flush_tlb(vcpu, true); } - /* This is needed for same reason as it was needed in prepare_vmcs02 */ - vmx->host_rsp = 0; - /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 6def3ba88e3b..cb6079f8a227 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -34,6 +34,7 @@ struct vmcs_host_state { unsigned long cr4; /* May not match real cr4 */ unsigned long gs_base; unsigned long fs_base; + unsigned long rsp; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e9d81cd8421f..12bb61e7aca6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6381,9 +6381,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" + "cmp %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" /* Avoid VMWRITE when Enlightened VMCS is in use */ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" "jz 2f \n\t" @@ -6482,11 +6482,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=S"((int){0}) - : "c"(vmx), "S"(evmcs_rsp), + : ASM_CALL_CONSTRAINT, "=D"((int){0}), "=S"((int){0}) + : "c"(vmx), "D"(&vmx->loaded_vmcs->host_state.rsp), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), @@ -6509,10 +6508,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdx", "rdi" + , "rax", "rbx", "rdx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edx", "edi" + , "eax", "ebx", "edx" #endif ); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 99328954c2fc..8e203b725928 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -175,7 +175,6 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; - unsigned long host_rsp; u8 fail; u8 msr_bitmap_mode; u32 exit_intr_info; From 47e97c099bbcb3211b22456679991095c0578da2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:03 -0800 Subject: [PATCH 033/147] KVM: VMX: Load/save guest CR2 via C code in __vmx_vcpu_run() ...to eliminate its parameter and struct vcpu_vmx offset definition from the assembly blob. Accessing CR2 from C versus assembly doesn't change the likelihood of taking a page fault (and modifying CR2) while it's loaded with the guest's value, so long as we don't do anything silly between accessing CR2 and VM-Enter/VM-Exit. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 12bb61e7aca6..5e43999ece1d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6375,6 +6375,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); + if (vcpu->arch.cr2 != read_cr2()) + write_cr2(vcpu->arch.cr2); + asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" @@ -6395,13 +6398,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ - /* Reload cr2 if changed */ - "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov %%cr2, %%" _ASM_DX " \n\t" - "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 3f \n\t" - "mov %%" _ASM_AX", %%cr2 \n\t" - "3: \n\t" /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" /* Load guest registers. Don't clobber flags. */ @@ -6471,9 +6467,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%r14d, %%r14d \n\t" "xor %%r15d, %%r15d \n\t" #endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t" - "xor %%eax, %%eax \n\t" "xor %%ebx, %%ebx \n\t" "xor %%ecx, %%ecx \n\t" @@ -6504,7 +6497,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 @@ -6514,6 +6506,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) , "eax", "ebx", "edx" #endif ); + + vcpu->arch.cr2 = read_cr2(); } STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); From c09b03eb7f964370149577fdba425623b7f5b0b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:04 -0800 Subject: [PATCH 034/147] KVM: VMX: Update VMCS.HOST_RSP via helper C function Providing a helper function to update HOST_RSP is visibly easier to read, and more importantly (for the future) eliminates two arguments to the VM-Enter assembly blob. Reducing the number of arguments to the asm blob is for all intents and purposes a prerequisite to moving the code to a proper assembly routine. It's not truly mandatory, but it greatly simplifies the future code, and the cost of the extra CALL+RET is negligible in the grand scheme. Note that although _ASM_ARG[1-3] can be used in the inline asm itself, the intput/output constraints need to be manually defined. gcc will actually compile with _ASM_ARG[1-3] specified as constraints, but what it actually ends up doing with the bogus constraint is unknown. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 51 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5e43999ece1d..4b8a94fedb76 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6363,15 +6363,18 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } +void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) +{ + if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { + vmx->loaded_vmcs->host_state.rsp = host_rsp; + vmcs_writel(HOST_RSP, host_rsp); + } +} + static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - unsigned long evmcs_rsp; - vmx->__launched = vmx->loaded_vmcs->launched; - evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? - (unsigned long)¤t_evmcs->host_rsp : 0; - if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); @@ -6382,21 +6385,14 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Store host registers */ "push %%" _ASM_BP " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ - "push %%" _ASM_CX " \n\t" - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" - "je 1f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" - /* Avoid VMWRITE when Enlightened VMCS is in use */ - "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" - "jz 2f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" - "jmp 1f \n\t" - "2: \n\t" - "mov $%c[HOST_RSP], %%" _ASM_DX " \n\t" - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "1: \n\t" - "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ + "push %%" _ASM_ARG1 " \n\t" + + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ + "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" + "call vmx_update_host_rsp \n\t" + + /* Load the vcpu_vmx pointer to RCX. */ + "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" @@ -6475,11 +6471,16 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=D"((int){0}), "=S"((int){0}) - : "c"(vmx), "D"(&vmx->loaded_vmcs->host_state.rsp), "S"(evmcs_rsp), + : ASM_CALL_CONSTRAINT, +#ifdef CONFIG_X86_64 + "=D"((int){0}) + : "D"(vmx), +#else + "=a"((int){0}) + : "a"(vmx), +#endif [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), @@ -6500,10 +6501,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdx" + , "rax", "rbx", "rcx", "rdx", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edx" + , "ebx", "ecx", "edx", "edi", "esi" #endif ); From c9afc58cc368623aaa34a62e321eea2a59240f6f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:05 -0800 Subject: [PATCH 035/147] KVM: VMX: Pass "launched" directly to the vCPU-run asm blob ...and remove struct vcpu_vmx's temporary __launched variable. Eliminating __launched is a bonus, the real motivation is to get to the point where the only reference to struct vcpu_vmx in the asm code is to vcpu.arch.regs, which will simplify moving the blob to a proper asm file. Note that also means this approach is deliberately different than what is used in nested_vmx_check_vmentry_hw(). Use BL as it is a callee-save register in both 32-bit and 64-bit ABIs, i.e. it can't be modified by vmx_update_host_rsp(), to avoid having to temporarily save/restore the launched flag. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 13 ++++++------- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4b8a94fedb76..996a13ea86cc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6373,8 +6373,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->__launched = vmx->loaded_vmcs->launched; - if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); @@ -6395,7 +6393,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" + "cmpb $0, %%bl \n\t" + /* Load guest registers. Don't clobber flags. */ "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" @@ -6471,7 +6470,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, + : ASM_CALL_CONSTRAINT, "=b"((int){0}), #ifdef CONFIG_X86_64 "=D"((int){0}) : "D"(vmx), @@ -6479,7 +6478,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "=a"((int){0}) : "a"(vmx), #endif - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + "b"(vmx->loaded_vmcs->launched), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), @@ -6501,10 +6500,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rcx", "rdx", "rsi" + , "rax", "rcx", "rdx", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ebx", "ecx", "edx", "edi", "esi" + , "ecx", "edx", "edi", "esi" #endif ); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8e203b725928..6ee6a492efaf 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -208,7 +208,7 @@ struct vcpu_vmx { struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; struct loaded_vmcs *loaded_cpu_state; - bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { struct vmx_msrs guest; struct vmx_msrs host; From 217aaff53c25f03b1d2fc23eff9dc2bae34f690e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:06 -0800 Subject: [PATCH 036/147] KVM: VMX: Invert the ordering of saving guest/host scratch reg at VM-Enter Switching the ordering allows for an out-of-line path for VM-Fail that elides saving guest state but still shares the register clearing with the VM-Exit path. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 996a13ea86cc..79b42197ed7e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6382,7 +6382,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_ARG1 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). */ @@ -6418,11 +6417,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Enter guest mode */ "call vmx_vmenter\n\t" - /* Save guest's RCX to the stack placeholder (see above) */ - "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t" + /* Temporarily save guest's RCX. */ + "push %%" _ASM_CX " \n\t" - /* Load host's RCX, i.e. the vmx_vcpu pointer */ - "pop %%" _ASM_CX " \n\t" + /* Reload the vcpu_vmx pointer to RCX. */ + "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Set vmx->fail based on EFLAGS.{CF,ZF} */ "setbe %c[fail](%%" _ASM_CX ")\n\t" @@ -6469,6 +6468,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" + + /* "POP" the vcpu_vmx pointer. */ + "add $%c[wordsize], %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" : ASM_CALL_CONSTRAINT, "=b"((int){0}), #ifdef CONFIG_X86_64 From f78d0971b7bd5bf4373a1fac27f176af5d5594ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:07 -0800 Subject: [PATCH 037/147] KVM: VMX: Don't save guest registers after VM-Fail A failed VM-Enter (obviously) didn't succeed, meaning the CPU never executed an instrunction in guest mode and so can't have changed the general purpose registers. In addition to saving some instructions in the VM-Fail case, this also provides a separate path entirely and thus an opportunity to propagate the fail condition to vmx->fail via register without introducing undue pain. Using a register, as opposed to directly referencing vmx->fail, eliminates the need to pass the offset of 'fail', which will simplify moving the code to proper assembly in future patches. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 79b42197ed7e..1dcd3f157a70 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6416,6 +6416,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Enter guest mode */ "call vmx_vmenter\n\t" + "jbe 2f \n\t" /* Temporarily save guest's RCX. */ "push %%" _ASM_CX " \n\t" @@ -6423,9 +6424,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Reload the vcpu_vmx pointer to RCX. */ "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" - /* Set vmx->fail based on EFLAGS.{CF,ZF} */ - "setbe %c[fail](%%" _ASM_CX ")\n\t" - /* Save all guest registers, including RCX from the stack */ "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" @@ -6443,15 +6441,22 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" +#endif + + /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ + "xor %%ebx, %%ebx \n\t" /* - * Clear all general purpose registers (except RSP, which is loaded by - * the CPU during VM-Exit) to prevent speculative use of the guest's - * values, even those that are saved/loaded via the stack. In theory, - * an L1 cache miss when restoring registers could lead to speculative - * execution with the guest's values. Zeroing XORs are dirt cheap, - * i.e. the extra paranoia is essentially free. + * Clear all general purpose registers except RSP and RBX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RBX are exempt as RSP is restored by hardware during + * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. */ + "1: \n\t" +#ifdef CONFIG_X86_64 "xor %%r8d, %%r8d \n\t" "xor %%r9d, %%r9d \n\t" "xor %%r10d, %%r10d \n\t" @@ -6462,7 +6467,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%r15d, %%r15d \n\t" #endif "xor %%eax, %%eax \n\t" - "xor %%ebx, %%ebx \n\t" "xor %%ecx, %%ecx \n\t" "xor %%edx, %%edx \n\t" "xor %%esi, %%esi \n\t" @@ -6472,7 +6476,15 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* "POP" the vcpu_vmx pointer. */ "add $%c[wordsize], %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=b"((int){0}), + "jmp 3f \n\t" + + /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ + "2: \n\t" + "mov $1, %%ebx \n\t" + "jmp 1b \n\t" + "3: \n\t" + + : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}) : "D"(vmx), @@ -6481,7 +6493,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) : "a"(vmx), #endif "b"(vmx->loaded_vmcs->launched), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), From d55892049171872a8f1aba542aa0691efe3da52d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:08 -0800 Subject: [PATCH 038/147] KVM: VMX: Use vcpu->arch.regs directly when saving/loading guest state ...now that all other references to struct vcpu_vmx have been removed. Note that 'vmx' still needs to be passed into the asm blob in _ASM_ARG1 as it is consumed by vmx_update_host_rsp(). And similar to that code, use _ASM_ARG2 in the assembly code to prepare for moving to proper asm, while explicitly referencing the exact registers in the clobber list for clarity in the short term and to avoid additional precompiler games. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 53 +++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1dcd3f157a70..cd92568bed26 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6382,13 +6382,18 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" - "push %%" _ASM_ARG1 " \n\t" + + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + "push %%" _ASM_ARG2 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). */ "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" "call vmx_update_host_rsp \n\t" - /* Load the vcpu_vmx pointer to RCX. */ + /* Load RCX with @regs. */ "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ @@ -6421,7 +6426,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Temporarily save guest's RCX. */ "push %%" _ASM_CX " \n\t" - /* Reload the vcpu_vmx pointer to RCX. */ + /* Reload RCX with @regs. */ "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Save all guest registers, including RCX from the stack */ @@ -6486,37 +6491,37 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 - "=D"((int){0}) - : "D"(vmx), + "=D"((int){0}), "=S"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), #else - "=a"((int){0}) - : "a"(vmx), + "=a"((int){0}), "=d"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), #endif "b"(vmx->loaded_vmcs->launched), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), + [rax]"i"(VCPU_REGS_RAX * sizeof(ulong)), + [rbx]"i"(VCPU_REGS_RBX * sizeof(ulong)), + [rcx]"i"(VCPU_REGS_RCX * sizeof(ulong)), + [rdx]"i"(VCPU_REGS_RDX * sizeof(ulong)), + [rsi]"i"(VCPU_REGS_RSI * sizeof(ulong)), + [rdi]"i"(VCPU_REGS_RDI * sizeof(ulong)), + [rbp]"i"(VCPU_REGS_RBP * sizeof(ulong)), #ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), + [r8]"i"(VCPU_REGS_R8 * sizeof(ulong)), + [r9]"i"(VCPU_REGS_R9 * sizeof(ulong)), + [r10]"i"(VCPU_REGS_R10 * sizeof(ulong)), + [r11]"i"(VCPU_REGS_R11 * sizeof(ulong)), + [r12]"i"(VCPU_REGS_R12 * sizeof(ulong)), + [r13]"i"(VCPU_REGS_R13 * sizeof(ulong)), + [r14]"i"(VCPU_REGS_R14 * sizeof(ulong)), + [r15]"i"(VCPU_REGS_R15 * sizeof(ulong)), #endif [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx", "rsi" + , "rax", "rcx", "rdx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ecx", "edx", "edi", "esi" + , "ecx", "edi", "esi" #endif ); From 41a8645ab1c3c37f96955fec3360e123dc06abcd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 4 Feb 2019 19:06:18 +1100 Subject: [PATCH 039/147] KVM: PPC: Book3S PR: Add emulation for slbfee. instruction Recent kernels, since commit e15a4fea4dee ("powerpc/64s/hash: Add some SLB debugging tests", 2018-10-03) use the slbfee. instruction, which PR KVM currently does not have code to emulate. Consequently recent kernels fail to boot under PR KVM. This adds emulation of slbfee., enabling these kernels to boot successfully. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_32_mmu.c | 1 + arch/powerpc/kvm/book3s_64_mmu.c | 14 ++++++++++++++ arch/powerpc/kvm/book3s_emulate.c | 18 ++++++++++++++++++ 4 files changed, 34 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f98f00da2ea..091430339db1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -377,6 +377,7 @@ struct kvmppc_mmu { void (*slbmte)(struct kvm_vcpu *vcpu, u64 rb, u64 rs); u64 (*slbmfee)(struct kvm_vcpu *vcpu, u64 slb_nr); u64 (*slbmfev)(struct kvm_vcpu *vcpu, u64 slb_nr); + int (*slbfee)(struct kvm_vcpu *vcpu, gva_t eaddr, ulong *ret_slb); void (*slbie)(struct kvm_vcpu *vcpu, u64 slb_nr); void (*slbia)(struct kvm_vcpu *vcpu); /* book3s */ diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index 612169988a3d..6f789f674048 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -425,6 +425,7 @@ void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu) mmu->slbmte = NULL; mmu->slbmfee = NULL; mmu->slbmfev = NULL; + mmu->slbfee = NULL; mmu->slbie = NULL; mmu->slbia = NULL; } diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index c92dd25bed23..d4b967f0e8d4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -435,6 +435,19 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT); } +static int kvmppc_mmu_book3s_64_slbfee(struct kvm_vcpu *vcpu, gva_t eaddr, + ulong *ret_slb) +{ + struct kvmppc_slb *slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); + + if (slbe) { + *ret_slb = slbe->origv; + return 0; + } + *ret_slb = 0; + return -ENOENT; +} + static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) { struct kvmppc_slb *slbe; @@ -670,6 +683,7 @@ void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu) mmu->slbmte = kvmppc_mmu_book3s_64_slbmte; mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee; mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev; + mmu->slbfee = kvmppc_mmu_book3s_64_slbfee; mmu->slbie = kvmppc_mmu_book3s_64_slbie; mmu->slbia = kvmppc_mmu_book3s_64_slbia; mmu->xlate = kvmppc_mmu_book3s_64_xlate; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 8c7e933e942e..6ef7c5f00a49 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -47,6 +47,7 @@ #define OP_31_XOP_SLBMFEV 851 #define OP_31_XOP_EIOIO 854 #define OP_31_XOP_SLBMFEE 915 +#define OP_31_XOP_SLBFEE 979 #define OP_31_XOP_TBEGIN 654 #define OP_31_XOP_TABORT 910 @@ -416,6 +417,23 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.mmu.slbia(vcpu); break; + case OP_31_XOP_SLBFEE: + if (!(inst & 1) || !vcpu->arch.mmu.slbfee) { + return EMULATE_FAIL; + } else { + ulong b, t; + ulong cr = kvmppc_get_cr(vcpu) & ~CR0_MASK; + + b = kvmppc_get_gpr(vcpu, rb); + if (!vcpu->arch.mmu.slbfee(vcpu, b, &t)) + cr |= 2 << CR0_SHIFT; + kvmppc_set_gpr(vcpu, rt, t); + /* copy XER[SO] bit to CR0[SO] */ + cr |= (vcpu->arch.regs.xer & 0x80000000) >> + (31 - CR0_SHIFT); + kvmppc_set_cr(vcpu, cr); + } + break; case OP_31_XOP_SLBMFEE: if (!vcpu->arch.mmu.slbmfee) { emulated = EMULATE_FAIL; From 08434ab4694887effbdcae518f29062274871c8d Mon Sep 17 00:00:00 2001 From: wangbo Date: Mon, 7 Jan 2019 20:15:52 +0800 Subject: [PATCH 040/147] KVM: PPC: Book3S HV: Replace kmalloc_node+memset with kzalloc_node Replace kmalloc_node and memset with kzalloc_node Signed-off-by: wangbo Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5a066fc299e1..27112b70aa7a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5342,13 +5342,11 @@ static int kvm_init_subcore_bitmap(void) continue; sibling_subcore_state = - kmalloc_node(sizeof(struct sibling_subcore_state), + kzalloc_node(sizeof(struct sibling_subcore_state), GFP_KERNEL, node); if (!sibling_subcore_state) return -ENOMEM; - memset(sibling_subcore_state, 0, - sizeof(struct sibling_subcore_state)); for (j = 0; j < threads_per_core; j++) { int cpu = first_cpu + j; From f1adb9c48a01779311aff57d96dc578f91f37eb7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 11 Jan 2019 12:22:31 +0900 Subject: [PATCH 041/147] KVM: PPC: Remove -I. header search paths The header search path -I. in kernel Makefiles is very suspicious; it allows the compiler to search for headers in the top of $(srctree), where obviously no header file exists. Commit 46f43c6ee022 ("KVM: powerpc: convert marker probes to event trace") first added these options, but they are completely useless. Signed-off-by: Masahiro Yamada Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 64f1135e7732..3223aec88b2c 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -10,11 +10,6 @@ common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o -CFLAGS_e500_mmu.o := -I. -CFLAGS_e500_mmu_host.o := -I. -CFLAGS_emulate.o := -I. -CFLAGS_emulate_loadstore.o := -I. - common-objs-y += powerpc.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o From 03f953329bd872b176e825584d8c0b50685f16ee Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 4 Feb 2019 22:07:20 +1100 Subject: [PATCH 042/147] KVM: PPC: Book3S: Allow XICS emulation to work in nested hosts using XIVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the KVM code assumes that if the host kernel is using the XIVE interrupt controller (the new interrupt controller that first appeared in POWER9 systems), then the in-kernel XICS emulation will use the XIVE hardware to deliver interrupts to the guest. However, this only works when the host is running in hypervisor mode and has full access to all of the XIVE functionality. It doesn't work in any nested virtualization scenario, either with PR KVM or nested-HV KVM, because the XICS-on-XIVE code calls directly into the native-XIVE routines, which are not initialized and cannot function correctly because they use OPAL calls, and OPAL is not available in a guest. This means that using the in-kernel XICS emulation in a nested hypervisor that is using XIVE as its interrupt controller will cause a (nested) host kernel crash. To fix this, we change most of the places where the current code calls xive_enabled() to select between the XICS-on-XIVE emulation and the plain XICS emulation to call a new function, xics_on_xive(), which returns false in a guest. However, there is a further twist. The plain XICS emulation has some functions which are used in real mode and access the underlying XICS controller (the interrupt controller of the host) directly. In the case of a nested hypervisor, this means doing XICS hypercalls directly. When the nested host is using XIVE as its interrupt controller, these hypercalls will fail. Therefore this also adds checks in the places where the XICS emulation wants to access the underlying interrupt controller directly, and if that is XIVE, makes the code use the virtual mode fallback paths, which call generic kernel infrastructure rather than doing direct XICS access. Signed-off-by: Paul Mackerras Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 14 ++++++++++++++ arch/powerpc/kvm/book3s.c | 10 +++++----- arch/powerpc/kvm/book3s_hv.c | 16 ++++++++-------- arch/powerpc/kvm/book3s_hv_builtin.c | 14 +++++++------- arch/powerpc/kvm/book3s_hv_rm_xics.c | 7 +++++++ arch/powerpc/kvm/book3s_rtas.c | 8 ++++---- arch/powerpc/kvm/powerpc.c | 4 ++-- 7 files changed, 47 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eb0d79f0ca45..b3bf4f61b30c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -36,6 +36,8 @@ #endif #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include +#include +#include #endif /* @@ -616,6 +618,18 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ +#ifdef CONFIG_PPC_POWERNV +static inline bool xics_on_xive(void) +{ + return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE); +} +#else +static inline bool xics_on_xive(void) +{ + return false; +} +#endif + /* * Prototypes for functions called only from assembler code. * Having prototypes reduces sparse errors. diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index bd1a677dd9e4..601c094f15ab 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -635,7 +635,7 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, r = -ENXIO; break; } - if (xive_enabled()) + if (xics_on_xive()) *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu)); else *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); @@ -708,7 +708,7 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, r = -ENXIO; break; } - if (xive_enabled()) + if (xics_on_xive()) r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val)); else r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); @@ -984,7 +984,7 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { - if (xive_enabled()) + if (xics_on_xive()) return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level, line_status); else @@ -1037,7 +1037,7 @@ static int kvmppc_book3s_init(void) #ifdef CONFIG_KVM_XICS #ifdef CONFIG_KVM_XIVE - if (xive_enabled()) { + if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); } else @@ -1050,7 +1050,7 @@ static int kvmppc_book3s_init(void) static void kvmppc_book3s_exit(void) { #ifdef CONFIG_KVM_XICS - if (xive_enabled()) + if (xics_on_xive()) kvmppc_xive_exit_module(); #endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 27112b70aa7a..1860c0bc1395 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -922,7 +922,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_IPOLL: case H_XIRR_X: if (kvmppc_xics_enabled(vcpu)) { - if (xive_enabled()) { + if (xics_on_xive()) { ret = H_NOT_AVAILABLE; return RESUME_GUEST; } @@ -1431,7 +1431,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; r = RESUME_GUEST; - if (!xive_enabled()) + if (!xics_on_xive()) kvmppc_xics_rm_complete(vcpu, 0); break; default: @@ -3649,7 +3649,7 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) #ifdef CONFIG_KVM_XICS static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) { - if (!xive_enabled()) + if (!xics_on_xive()) return false; return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr < vcpu->arch.xive_saved_state.cppr; @@ -4209,7 +4209,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&kvm->srcu, srcu_idx); } else if (r == RESUME_PASSTHROUGH) { - if (WARN_ON(xive_enabled())) + if (WARN_ON(xics_on_xive())) r = H_SUCCESS; else r = kvmppc_xics_rm_complete(vcpu, 0); @@ -4733,7 +4733,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * If xive is enabled, we route 0x500 interrupts directly * to the guest. */ - if (xive_enabled()) + if (xics_on_xive()) lpcr |= LPCR_LPES; } @@ -4969,7 +4969,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (i == pimap->n_mapped) pimap->n_mapped++; - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); else kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); @@ -5010,7 +5010,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -ENODEV; } - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); else kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); @@ -5387,7 +5387,7 @@ static int kvmppc_book3s_init_hv(void) * indirectly, via OPAL. */ #ifdef CONFIG_SMP - if (!xive_enabled() && !kvmhv_on_pseries() && + if (!xics_on_xive() && !kvmhv_on_pseries() && !local_paca->kvm_hstate.xics_phys) { struct device_node *np; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a71e2fc00a4e..b0cf22477e87 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -257,7 +257,7 @@ void kvmhv_rm_send_ipi(int cpu) } /* We should never reach this */ - if (WARN_ON_ONCE(xive_enabled())) + if (WARN_ON_ONCE(xics_on_xive())) return; /* Else poke the target with an IPI */ @@ -577,7 +577,7 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_xirr(vcpu); if (unlikely(!__xive_vm_h_xirr)) @@ -592,7 +592,7 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; vcpu->arch.regs.gpr[5] = get_tb(); - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_xirr(vcpu); if (unlikely(!__xive_vm_h_xirr)) @@ -606,7 +606,7 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_ipoll(vcpu, server); if (unlikely(!__xive_vm_h_ipoll)) @@ -621,7 +621,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_ipi(vcpu, server, mfrr); if (unlikely(!__xive_vm_h_ipi)) @@ -635,7 +635,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_cppr(vcpu, cppr); if (unlikely(!__xive_vm_h_cppr)) @@ -649,7 +649,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_eoi(vcpu, xirr); if (unlikely(!__xive_vm_h_eoi)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index b3f5786b20dc..3b9662a4207e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -144,6 +144,13 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, return; } + if (xive_enabled() && kvmhv_on_pseries()) { + /* No XICS access or hypercalls available, too hard */ + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + return; + } + /* * Check if the core is loaded, * if not, find an available host core to post to wake the VCPU, diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 2d3b2b1cc272..4e178c4c1ea5 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -33,7 +33,7 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) server = be32_to_cpu(args->args[1]); priority = be32_to_cpu(args->args[2]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority); else rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); @@ -56,7 +56,7 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); server = priority = 0; - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority); else rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); @@ -83,7 +83,7 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_int_off(vcpu->kvm, irq); else rc = kvmppc_xics_int_off(vcpu->kvm, irq); @@ -105,7 +105,7 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_int_on(vcpu->kvm, irq); else rc = kvmppc_xics_int_on(vcpu->kvm, irq); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b90a7d154180..8c69af10f91d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -748,7 +748,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); break; case KVMPPC_IRQ_XICS: - if (xive_enabled()) + if (xics_on_xive()) kvmppc_xive_cleanup_vcpu(vcpu); else kvmppc_xics_free_icp(vcpu); @@ -1931,7 +1931,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EPERM; dev = kvm_device_from_filp(f.file); if (dev) { - if (xive_enabled()) + if (xics_on_xive()) r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]); else r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); From 1b6422574e2de4f3164d73c38d40539e19c88849 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 7 Feb 2019 15:56:50 +1100 Subject: [PATCH 043/147] KVM: PPC: Book3S HV: Optimise mmio emulation for devices on FAST_MMIO_BUS Devices on the KVM_FAST_MMIO_BUS by definition have length zero and are thus used for notification purposes rather than data transfer. For example eventfd for virtio devices. This means that when emulating mmio instructions which target devices on this bus we can immediately handle them and return without needing to load the instruction from guest memory. For now we restrict this to stores as this is the only use case at present. For a normal guest the effect is negligible, however for a nested guest we save on the order of 5us per access. Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index bd2dcfbf00cd..be7bc070eae5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -441,6 +441,24 @@ int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, { u32 last_inst; + /* + * Fast path - check if the guest physical address corresponds to a + * device on the FAST_MMIO_BUS, if so we can avoid loading the + * instruction all together, then we can just handle it and return. + */ + if (is_store) { + int idx, ret; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0, + NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + if (!ret) { + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + return RESUME_GUEST; + } + } + /* * If we fail, we just return to the guest and try executing it again. */ From a67614cc05a5052b265ea48196dab2fce11f5f2e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 12 Feb 2019 15:37:45 +1100 Subject: [PATCH 044/147] KVM: PPC: Release all hardware TCE tables attached to a group The SPAPR TCE KVM device references all hardware IOMMU tables assigned to some IOMMU group to ensure that in-kernel KVM acceleration of H_PUT_TCE can work. The tables are references when an IOMMU group gets registered with the VFIO KVM device by the KVM_DEV_VFIO_GROUP_ADD ioctl; KVM_DEV_VFIO_GROUP_DEL calls into the dereferencing code in kvm_spapr_tce_release_iommu_group() which walks through the list of LIOBNs, finds a matching IOMMU table and calls kref_put() when found. However that code stops after the very first successful derefencing leaving other tables referenced till the SPAPR TCE KVM device is destroyed which normally happens on guest reboot or termination so if we do hotplug and unplug in a loop, we are leaking IOMMU tables here. This removes a premature return to let kvm_spapr_tce_release_iommu_group() find and dereference all attached tables. Fixes: 121f80ba68f ("KVM: PPC: VFIO: Add in-kernel acceleration for VFIO") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 532ab79734c7..6630dde56668 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -133,7 +133,6 @@ extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, continue; kref_put(&stit->kref, kvm_spapr_tce_liobn_put); - return; } } } From 8f1f7b9bedbce8d84e0b6b8beac671a6bc8f02c9 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 19 Feb 2019 14:53:45 +1100 Subject: [PATCH 045/147] KVM: PPC: Book3S HV: Add KVM stat largepages_[2M/1G] This adds an entry to the kvm_stats_debugfs directory which provides the number of large (2M or 1G) pages which have been used to setup the guest mappings, for radix guests. Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s.c | 3 +++ arch/powerpc/kvm/book3s_64_mmu_radix.c | 15 ++++++++++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 091430339db1..4af498a53905 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -99,6 +99,8 @@ struct kvm_nested_guest; struct kvm_vm_stat { ulong remote_tlb_flush; + ulong num_2M_pages; + ulong num_1G_pages; }; struct kvm_vcpu_stat { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 601c094f15ab..22a46c64536b 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -39,6 +39,7 @@ #include "book3s.h" #include "trace.h" +#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU /* #define EXIT_DEBUG */ @@ -71,6 +72,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "pthru_all", VCPU_STAT(pthru_all) }, { "pthru_host", VCPU_STAT(pthru_host) }, { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, + { "largepages_2M", VM_STAT(num_2M_pages) }, + { "largepages_1G", VM_STAT(num_1G_pages) }, { NULL } }; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 1b821c6efdef..f55ef071883f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -403,8 +403,13 @@ void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, if (!memslot) return; } - if (shift) + if (shift) { /* 1GB or 2MB page */ page_size = 1ul << shift; + if (shift == PMD_SHIFT) + kvm->stat.num_2M_pages--; + else if (shift == PUD_SHIFT) + kvm->stat.num_1G_pages--; + } gpa &= ~(page_size - 1); hpa = old & PTE_RPN_MASK; @@ -878,6 +883,14 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, put_page(page); } + /* Increment number of large pages if we (successfully) inserted one */ + if (!ret) { + if (level == 1) + kvm->stat.num_2M_pages++; + else if (level == 2) + kvm->stat.num_1G_pages++; + } + return ret; } From ee7930490a8f84570e96268f5736e99570b58307 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 6 Jul 2018 09:11:50 +0100 Subject: [PATCH 046/147] clocksource/arm_arch_timer: Store physical timer IRQ number for KVM on VHE A host running in VHE mode gets the EL2 physical timer as its time source (accessed using the EL1 sysreg accessors, which get re-directed to the EL2 sysregs by VHE). The EL1 physical timer remains unused by the host kernel, allowing us to pass that on directly to a KVM guest and saves us from emulating this timer for the guest on VHE systems. Store the EL1 Physical Timer's IRQ number in struct arch_timer_kvm_info on VHE systems to allow KVM to use it. Acked-by: Daniel Lezcano Signed-off-by: Andre Przywara Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- drivers/clocksource/arm_arch_timer.c | 11 +++++++++-- include/clocksource/arm_arch_timer.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 9a7d4dc00b6e..b9243e2328b4 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1206,6 +1206,13 @@ static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void) return ARCH_TIMER_PHYS_SECURE_PPI; } +static void __init arch_timer_populate_kvm_info(void) +{ + arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; + if (is_kernel_in_hyp_mode()) + arch_timer_kvm_info.physical_irq = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]; +} + static int __init arch_timer_of_init(struct device_node *np) { int i, ret; @@ -1220,7 +1227,7 @@ static int __init arch_timer_of_init(struct device_node *np) for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) arch_timer_ppi[i] = irq_of_parse_and_map(np, i); - arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; + arch_timer_populate_kvm_info(); rate = arch_timer_get_cntfrq(); arch_timer_of_configure_rate(rate, np); @@ -1550,7 +1557,7 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) arch_timer_ppi[ARCH_TIMER_HYP_PPI] = acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI); - arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; + arch_timer_populate_kvm_info(); /* * When probing via ACPI, we have no mechanism to override the sysreg diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 349e5957c949..702967d996bb 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -74,6 +74,7 @@ enum arch_timer_spi_nr { struct arch_timer_kvm_info { struct timecounter timecounter; int virtual_irq; + int physical_irq; }; struct arch_timer_mem_frame { From 7aa8d14641651a61a0b8892314a0bcfaceebe158 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 5 Jan 2019 15:49:50 +0000 Subject: [PATCH 047/147] arm/arm64: KVM: Introduce kvm_call_hyp_ret() Until now, we haven't differentiated between HYP calls that have a return value and those who don't. As we're about to change this, introduce kvm_call_hyp_ret(), and change all call sites that actually make use of a return value. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/debug.c | 2 +- virt/kvm/arm/arm.c | 2 +- virt/kvm/arm/vgic/vgic-v3.c | 4 ++-- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index ca56537b61bc..023c9f2b1eea 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -214,7 +214,10 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); + unsigned long kvm_call_hyp(void *hypfn, ...); +#define kvm_call_hyp_ret(f, ...) kvm_call_hyp(f, ##__VA_ARGS__) + void force_vm_exit(const cpumask_t *mask); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7732d0ba4e60..e54cb7c88a4e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -371,6 +371,7 @@ void kvm_arm_resume_guest(struct kvm *kvm); u64 __kvm_call_hyp(void *hypfn, ...); #define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) +#define kvm_call_hyp_ret(f, ...) kvm_call_hyp(f, ##__VA_ARGS__) void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index f39801e4136c..fd917d6d12af 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -76,7 +76,7 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) void kvm_arm_init_debug(void) { - __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2)); + __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2)); } /** diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 9e350fd34504..4d55f98f97f7 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -765,7 +765,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_vcpu_run_vhe(vcpu); kvm_arm_vhe_guest_exit(); } else { - ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu); + ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu); } vcpu->mode = OUTSIDE_GUEST_MODE; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 9c0dd234ebe8..67f98151c88d 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -589,7 +589,7 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); */ int vgic_v3_probe(const struct gic_kvm_info *info) { - u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); + u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); int ret; /* @@ -679,7 +679,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); + cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); kvm_call_hyp(__vgic_v3_save_aprs, vcpu); From 18fc7bf8e041272f74a3237b99261d59c3ff0388 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 5 Jan 2019 15:57:56 +0000 Subject: [PATCH 048/147] arm64: KVM: Allow for direct call of HYP functions when using VHE When running VHE, there is no need to jump via some stub to perform a "HYP" function call, as there is a single address space. Let's thus change kvm_call_hyp() and co to perform a direct call in this case. Although this results in a bit of code expansion, it allows the compiler to check for type compatibility, something that we are missing so far. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_host.h | 32 +++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e54cb7c88a4e..8b7702bdb219 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -370,8 +370,36 @@ void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); u64 __kvm_call_hyp(void *hypfn, ...); -#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) -#define kvm_call_hyp_ret(f, ...) kvm_call_hyp(f, ##__VA_ARGS__) + +/* + * The couple of isb() below are there to guarantee the same behaviour + * on VHE as on !VHE, where the eret to EL1 acts as a context + * synchronization event. + */ +#define kvm_call_hyp(f, ...) \ + do { \ + if (has_vhe()) { \ + f(__VA_ARGS__); \ + isb(); \ + } else { \ + __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \ + } \ + } while(0) + +#define kvm_call_hyp_ret(f, ...) \ + ({ \ + typeof(f(__VA_ARGS__)) ret; \ + \ + if (has_vhe()) { \ + ret = f(__VA_ARGS__); \ + isb(); \ + } else { \ + ret = __kvm_call_hyp(kvm_ksym_ref(f), \ + ##__VA_ARGS__); \ + } \ + \ + ret; \ + }) void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); From 7cba8a8d0d39c8846d0095fc2d2225c7983f4009 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 5 Jan 2019 16:06:53 +0000 Subject: [PATCH 049/147] arm64: KVM: Drop VHE-specific HYP call stub We now call VHE code directly, without going through any central dispatching function. Let's drop that code. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp.S | 3 --- arch/arm64/kvm/hyp/hyp-entry.S | 12 ------------ 2 files changed, 15 deletions(-) diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 952f6cb9cf72..2845aa680841 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -40,9 +40,6 @@ * arch/arm64/kernel/hyp_stub.S. */ ENTRY(__kvm_call_hyp) -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN hvc #0 ret -alternative_else_nop_endif - b __vhe_hyp_call ENDPROC(__kvm_call_hyp) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 73c1b483ec39..2b1e686772bf 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -43,18 +43,6 @@ ldr lr, [sp], #16 .endm -ENTRY(__vhe_hyp_call) - do_el2_call - /* - * We used to rely on having an exception return to get - * an implicit isb. In the E2H case, we don't have it anymore. - * rather than changing all the leaf functions, just do it here - * before returning to the rest of the kernel. - */ - isb - ret -ENDPROC(__vhe_hyp_call) - el1_sync: // Guest trapped into EL2 mrs x0, esr_el2 From d18232ea8a9480606c53e91d7e2d062c3c151815 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 11 Jan 2019 14:57:58 +0000 Subject: [PATCH 050/147] ARM: KVM: Teach some form of type-safety to kvm_call_hyp Just like on arm64, and for the same reasons, kvm_call_hyp removes any form of type safety when calling into HYP. But we can still try to tell the compiler what we're trying to achieve. Here, we can add code that would do the function call if it wasn't guarded by an always-false predicate. Hopefully, the compiler is dumb enough to do the type checking and clever enough to not emit the corresponding code... Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 31 ++++++++++++++++++++++++++++--- arch/arm/kvm/hyp/hyp-entry.S | 2 +- arch/arm/kvm/interrupts.S | 4 ++-- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 023c9f2b1eea..4b6193f2f0f6 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -215,8 +215,33 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); -unsigned long kvm_call_hyp(void *hypfn, ...); -#define kvm_call_hyp_ret(f, ...) kvm_call_hyp(f, ##__VA_ARGS__) +unsigned long __kvm_call_hyp(void *hypfn, ...); + +/* + * The has_vhe() part doesn't get emitted, but is used for type-checking. + */ +#define kvm_call_hyp(f, ...) \ + do { \ + if (has_vhe()) { \ + f(__VA_ARGS__); \ + } else { \ + __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \ + } \ + } while(0) + +#define kvm_call_hyp_ret(f, ...) \ + ({ \ + typeof(f(__VA_ARGS__)) ret; \ + \ + if (has_vhe()) { \ + ret = f(__VA_ARGS__); \ + } else { \ + ret = __kvm_call_hyp(kvm_ksym_ref(f), \ + ##__VA_ARGS__); \ + } \ + \ + ret; \ + }) void force_vm_exit(const cpumask_t *mask); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, @@ -268,7 +293,7 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, * compliant with the PCS!). */ - kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); + __kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); } static inline void __cpu_init_stage2(void) diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index aa3f9a9837ac..6ed3cf23fe89 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -176,7 +176,7 @@ THUMB( orr lr, lr, #PSR_T_BIT ) msr spsr_cxsf, lr ldr lr, =panic msr ELR_hyp, lr - ldr lr, =kvm_call_hyp + ldr lr, =__kvm_call_hyp clrex eret ENDPROC(__hyp_do_panic) diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 80a1d6cd261c..a08e6419ebe9 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -42,7 +42,7 @@ * r12: caller save * rest: callee save */ -ENTRY(kvm_call_hyp) +ENTRY(__kvm_call_hyp) hvc #0 bx lr -ENDPROC(kvm_call_hyp) +ENDPROC(__kvm_call_hyp) From 32f139551954512bfdf9d558341af453bb8b12b4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 19 Jan 2019 15:29:54 +0000 Subject: [PATCH 051/147] arm/arm64: KVM: Statically configure the host's view of MPIDR We currently eagerly save/restore MPIDR. It turns out to be slightly pointless: - On the host, this value is known as soon as we're scheduled on a physical CPU - In the guest, this value cannot change, as it is set by KVM (and this is a read-only register) The result of the above is that we can perfectly avoid the eager saving of MPIDR_EL1, and only keep the restore. We just have to setup the host contexts appropriately at boot time. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 8 ++++++++ arch/arm/kvm/hyp/cp15-sr.c | 1 - arch/arm64/include/asm/kvm_host.h | 8 ++++++++ arch/arm64/kvm/hyp/sysreg-sr.c | 1 - virt/kvm/arm/arm.c | 1 + 5 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 4b6193f2f0f6..43e343e00fb8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -147,6 +148,13 @@ struct kvm_cpu_context { typedef struct kvm_cpu_context kvm_cpu_context_t; +static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, + int cpu) +{ + /* The host's MPIDR is immutable, so let's set it up at boot time */ + cpu_ctxt->cp15[c0_MPIDR] = cpu_logical_map(cpu); +} + struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; diff --git a/arch/arm/kvm/hyp/cp15-sr.c b/arch/arm/kvm/hyp/cp15-sr.c index c4782812714c..8bf895ec6e04 100644 --- a/arch/arm/kvm/hyp/cp15-sr.c +++ b/arch/arm/kvm/hyp/cp15-sr.c @@ -27,7 +27,6 @@ static u64 *cp15_64(struct kvm_cpu_context *ctxt, int idx) void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) { - ctxt->cp15[c0_MPIDR] = read_sysreg(VMPIDR); ctxt->cp15[c0_CSSELR] = read_sysreg(CSSELR); ctxt->cp15[c1_SCTLR] = read_sysreg(SCTLR); ctxt->cp15[c1_CPACR] = read_sysreg(CPACR); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8b7702bdb219..f497bb31031f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -418,6 +419,13 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); +static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, + int cpu) +{ + /* The host's MPIDR is immutable, so let's set it up at boot time */ + cpu_ctxt->sys_regs[MPIDR_EL1] = cpu_logical_map(cpu); +} + void __kvm_enable_ssbs(void); static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 68d6f7c3b237..2498f86defcb 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -52,7 +52,6 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { - ctxt->sys_regs[MPIDR_EL1] = read_sysreg(vmpidr_el2); ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr); ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 4d55f98f97f7..3dd240ea9e76 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1561,6 +1561,7 @@ static int init_hyp_mode(void) kvm_cpu_context_t *cpu_ctxt; cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu); + kvm_init_host_cpu_context(cpu_ctxt, cpu); err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); if (err) { From e329fb75d519e3dc3eb11b22d5bb846516be3521 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 11 Dec 2018 15:26:31 +0100 Subject: [PATCH 052/147] KVM: arm/arm64: Factor out VMID into struct kvm_vmid In preparation for nested virtualization where we are going to have more than a single VMID per VM, let's factor out the VMID data into a separate VMID data structure and change the VMID allocator to operate on this new structure instead of using a struct kvm. This also means that udate_vttbr now becomes update_vmid, and that the vttbr itself is generated on the fly based on the stage 2 page table base address and the vmid. We cache the physical address of the pgd when allocating the pgd to avoid doing the calculation on every entry to the guest and to avoid calling into potentially non-hyp-mapped code from hyp/EL2. If we wanted to merge the VMID allocator with the arm64 ASID allocator at some point in the future, it should actually become easier to do that after this patch. Note that to avoid mapping the kvm_vmid_bits variable into hyp, we simply forego the masking of the vmid value in kvm_get_vttbr and rely on update_vmid to always assign a valid vmid value (within the supported range). Reviewed-by: Marc Zyngier [maz: minor cleanups] Reviewed-by: Julien Thierry Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 13 ++++--- arch/arm/include/asm/kvm_mmu.h | 9 +++-- arch/arm/kvm/hyp/switch.c | 2 +- arch/arm/kvm/hyp/tlb.c | 4 +-- arch/arm64/include/asm/kvm_host.h | 9 +++-- arch/arm64/include/asm/kvm_hyp.h | 3 +- arch/arm64/include/asm/kvm_mmu.h | 10 ++++-- virt/kvm/arm/arm.c | 57 +++++++++++-------------------- virt/kvm/arm/mmu.c | 7 ++++ 9 files changed, 61 insertions(+), 53 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 43e343e00fb8..8073267dc4a0 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -57,10 +57,13 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_reset_coprocs(struct kvm_vcpu *vcpu); -struct kvm_arch { - /* VTTBR value associated with below pgd and vmid */ - u64 vttbr; +struct kvm_vmid { + /* The VMID generation used for the virt. memory system */ + u64 vmid_gen; + u32 vmid; +}; +struct kvm_arch { /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -70,11 +73,11 @@ struct kvm_arch { */ /* The VMID generation used for the virt. memory system */ - u64 vmid_gen; - u32 vmid; + struct kvm_vmid vmid; /* Stage-2 page table */ pgd_t *pgd; + phys_addr_t pgd_phys; /* Interrupt controller */ struct vgic_dist vgic; diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 3a875fc1b63c..2de96a180166 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -421,9 +421,14 @@ static inline int hyp_map_aux_data(void) static inline void kvm_set_ipa_limit(void) {} -static inline bool kvm_cpu_has_cnp(void) +static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) { - return false; + struct kvm_vmid *vmid = &kvm->arch.vmid; + u64 vmid_field, baddr; + + baddr = kvm->arch.pgd_phys; + vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + return kvm_phys_to_vttbr(baddr) | vmid_field; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index acf1c37fa49c..3b058a5d7c5f 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -77,7 +77,7 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu) { struct kvm *kvm = kern_hyp_va(vcpu->kvm); - write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); write_sysreg(vcpu->arch.midr, VPIDR); } diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c index c0edd450e104..8e4afba73635 100644 --- a/arch/arm/kvm/hyp/tlb.c +++ b/arch/arm/kvm/hyp/tlb.c @@ -41,7 +41,7 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); isb(); write_sysreg(0, TLBIALLIS); @@ -61,7 +61,7 @@ void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm); /* Switch to requested VMID */ - write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); isb(); write_sysreg(0, TLBIALL); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f497bb31031f..444dd1cb1958 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -57,16 +57,19 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); -struct kvm_arch { +struct kvm_vmid { /* The VMID generation used for the virt. memory system */ u64 vmid_gen; u32 vmid; +}; + +struct kvm_arch { + struct kvm_vmid vmid; /* stage2 entry level table */ pgd_t *pgd; + phys_addr_t pgd_phys; - /* VTTBR value associated with above pgd and vmid */ - u64 vttbr; /* VTCR_EL2 value for this VM */ u64 vtcr; diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index a80a7ef57325..4da765f2cca5 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #define __hyp_text __section(.hyp.text) notrace @@ -163,7 +164,7 @@ void __noreturn __hyp_do_panic(unsigned long, ...); static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) { write_sysreg(kvm->arch.vtcr, vtcr_el2); - write_sysreg(kvm->arch.vttbr, vttbr_el2); + write_sysreg(kvm_get_vttbr(kvm), vttbr_el2); /* * ARM erratum 1165522 requires the actual execution of the above diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 8af4b1befa42..c423c8c4fc39 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -591,9 +591,15 @@ static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); } -static inline bool kvm_cpu_has_cnp(void) +static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) { - return system_supports_cnp(); + struct kvm_vmid *vmid = &kvm->arch.vmid; + u64 vmid_field, baddr; + u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; + + baddr = kvm->arch.pgd_phys; + vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } #endif /* __ASSEMBLY__ */ diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 3dd240ea9e76..b77db673bb03 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -65,7 +65,6 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; -static unsigned int kvm_vmid_bits __read_mostly; static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; @@ -142,7 +141,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_vgic_early_init(kvm); /* Mark the initial VMID generation invalid */ - kvm->arch.vmid_gen = 0; + kvm->arch.vmid.vmid_gen = 0; /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->arch.max_vcpus = vgic_present ? @@ -472,37 +471,31 @@ void force_vm_exit(const cpumask_t *mask) /** * need_new_vmid_gen - check that the VMID is still valid - * @kvm: The VM's VMID to check + * @vmid: The VMID to check * * return true if there is a new generation of VMIDs being used * - * The hardware supports only 256 values with the value zero reserved for the - * host, so we check if an assigned value belongs to a previous generation, - * which which requires us to assign a new value. If we're the first to use a - * VMID for the new generation, we must flush necessary caches and TLBs on all - * CPUs. + * The hardware supports a limited set of values with the value zero reserved + * for the host, so we check if an assigned value belongs to a previous + * generation, which which requires us to assign a new value. If we're the + * first to use a VMID for the new generation, we must flush necessary caches + * and TLBs on all CPUs. */ -static bool need_new_vmid_gen(struct kvm *kvm) +static bool need_new_vmid_gen(struct kvm_vmid *vmid) { u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ - return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen); + return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen); } /** - * update_vttbr - Update the VTTBR with a valid VMID before the guest runs - * @kvm The guest that we are about to run - * - * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the - * VM has a valid VMID, otherwise assigns a new one and flushes corresponding - * caches and TLBs. + * update_vmid - Update the vmid with a valid VMID for the current generation + * @kvm: The guest that struct vmid belongs to + * @vmid: The stage-2 VMID information struct */ -static void update_vttbr(struct kvm *kvm) +static void update_vmid(struct kvm_vmid *vmid) { - phys_addr_t pgd_phys; - u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; - - if (!need_new_vmid_gen(kvm)) + if (!need_new_vmid_gen(vmid)) return; spin_lock(&kvm_vmid_lock); @@ -512,7 +505,7 @@ static void update_vttbr(struct kvm *kvm) * already allocated a valid vmid for this vm, then this vcpu should * use the same vmid. */ - if (!need_new_vmid_gen(kvm)) { + if (!need_new_vmid_gen(vmid)) { spin_unlock(&kvm_vmid_lock); return; } @@ -536,18 +529,12 @@ static void update_vttbr(struct kvm *kvm) kvm_call_hyp(__kvm_flush_vm_context); } - kvm->arch.vmid = kvm_next_vmid; + vmid->vmid = kvm_next_vmid; kvm_next_vmid++; - kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; - - /* update vttbr to be used with the new vmid */ - pgd_phys = virt_to_phys(kvm->arch.pgd); - BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)); - vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); - kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; + kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; smp_wmb(); - WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); + WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen)); spin_unlock(&kvm_vmid_lock); } @@ -690,7 +677,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ cond_resched(); - update_vttbr(vcpu->kvm); + update_vmid(&vcpu->kvm->arch.vmid); check_vcpu_requests(vcpu); @@ -739,7 +726,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || + if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) || kvm_request_pending(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ @@ -1417,10 +1404,6 @@ static inline void hyp_cpu_pm_exit(void) static int init_common_resources(void) { - /* set size of VMID supported by CPU */ - kvm_vmid_bits = kvm_get_vmid_bits(); - kvm_info("%d-bit VMID\n", kvm_vmid_bits); - kvm_set_ipa_limit(); return 0; diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fbdf3ac2f001..f8dda452ea24 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -908,6 +908,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, */ int kvm_alloc_stage2_pgd(struct kvm *kvm) { + phys_addr_t pgd_phys; pgd_t *pgd; if (kvm->arch.pgd != NULL) { @@ -920,7 +921,12 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) if (!pgd) return -ENOMEM; + pgd_phys = virt_to_phys(pgd); + if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) + return -EINVAL; + kvm->arch.pgd = pgd; + kvm->arch.pgd_phys = pgd_phys; return 0; } @@ -1008,6 +1014,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; + kvm->arch.pgd_phys = 0; } spin_unlock(&kvm->mmu_lock); From accb99bcd0ca6d3ee412557b0c3f583a3abc0eb6 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 26 Nov 2018 18:21:22 +0100 Subject: [PATCH 053/147] KVM: arm/arm64: Simplify bg_timer programming Instead of calling into kvm_timer_[un]schedule from the main kvm blocking path, test if the VCPU is on the wait queue from the load/put path and perform the background timer setup/cancel in this path. This has the distinct advantage that we no longer race between load/put and schedule/unschedule and programming and canceling of the bg_timer always happens when the timer state is not loaded. Note that we must now remove the checks in kvm_timer_blocking that do not schedule a background timer if one of the timers can fire, because we no longer have a guarantee that kvm_vcpu_check_block() will be called before kvm_timer_blocking. Reported-by: Andre Przywara Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 3 --- virt/kvm/arm/arch_timer.c | 35 ++++++++++++++--------------------- virt/kvm/arm/arm.c | 2 -- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 33771352dcd6..d6e6a45d1d24 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -76,9 +76,6 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); bool kvm_timer_is_pending(struct kvm_vcpu *vcpu); -void kvm_timer_schedule(struct kvm_vcpu *vcpu); -void kvm_timer_unschedule(struct kvm_vcpu *vcpu); - u64 kvm_phys_timer_read(void); void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index b07ac4614e1c..4986028d9829 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -349,22 +349,12 @@ out: * thread is removed from its waitqueue and made runnable when there's a timer * interrupt to handle. */ -void kvm_timer_schedule(struct kvm_vcpu *vcpu) +static void kvm_timer_blocking(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - vtimer_save_state(vcpu); - - /* - * No need to schedule a background timer if any guest timer has - * already expired, because kvm_vcpu_block will return before putting - * the thread to sleep. - */ - if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer)) - return; - /* * If both timers are not capable of raising interrupts (disabled or * masked), then there's no more work for us to do. @@ -373,12 +363,19 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu) return; /* - * The guest timers have not yet expired, schedule a background timer. + * At least one guest time will expire. Schedule a background timer. * Set the earliest expiration time among the guest timers. */ soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); } +static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + soft_timer_cancel(&timer->bg_timer); +} + static void vtimer_restore_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; @@ -401,15 +398,6 @@ out: local_irq_restore(flags); } -void kvm_timer_unschedule(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - vtimer_restore_state(vcpu); - - soft_timer_cancel(&timer->bg_timer); -} - static void set_cntvoff(u64 cntvoff) { u32 low = lower_32_bits(cntvoff); @@ -485,6 +473,8 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) /* Set the background timer for the physical timer emulation. */ phys_timer_emulate(vcpu); + kvm_timer_unblocking(vcpu); + /* If the timer fired while we weren't running, inject it now */ if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); @@ -527,6 +517,9 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) */ soft_timer_cancel(&timer->phys_timer); + if (swait_active(kvm_arch_vcpu_wq(vcpu))) + kvm_timer_blocking(vcpu); + /* * The kernel may decide to run userspace after calling vcpu_put, so * we reset cntvoff to 0 to ensure a consistent read between user diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index b77db673bb03..9fbdb9e1c51f 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -335,13 +335,11 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - kvm_timer_schedule(vcpu); kvm_vgic_v4_enable_doorbell(vcpu); } void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - kvm_timer_unschedule(vcpu); kvm_vgic_v4_disable_doorbell(vcpu); } From b98c079ba480c606b13f6abf844187af09baeaab Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 4 Jan 2019 11:33:42 +0100 Subject: [PATCH 054/147] KVM: arm64: Fix ICH_ELRSR_EL2 sysreg naming We previously incorrectly named the define for this system register. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/arch_gicv3.h | 4 ++-- arch/arm64/include/asm/sysreg.h | 2 +- virt/kvm/arm/hyp/vgic-v3-sr.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 0bd530702118..bdc87700def2 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -54,7 +54,7 @@ #define ICH_VTR __ACCESS_CP15(c12, 4, c11, 1) #define ICH_MISR __ACCESS_CP15(c12, 4, c11, 2) #define ICH_EISR __ACCESS_CP15(c12, 4, c11, 3) -#define ICH_ELSR __ACCESS_CP15(c12, 4, c11, 5) +#define ICH_ELRSR __ACCESS_CP15(c12, 4, c11, 5) #define ICH_VMCR __ACCESS_CP15(c12, 4, c11, 7) #define __LR0(x) __ACCESS_CP15(c12, 4, c12, x) @@ -151,7 +151,7 @@ CPUIF_MAP(ICH_HCR, ICH_HCR_EL2) CPUIF_MAP(ICH_VTR, ICH_VTR_EL2) CPUIF_MAP(ICH_MISR, ICH_MISR_EL2) CPUIF_MAP(ICH_EISR, ICH_EISR_EL2) -CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2) +CPUIF_MAP(ICH_ELRSR, ICH_ELRSR_EL2) CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2) CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2) CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 72dc4c011014..3e5650903d6d 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -426,7 +426,7 @@ #define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) #define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) -#define SYS_ICH_ELSR_EL2 sys_reg(3, 4, 12, 11, 5) +#define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) #define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7) #define __SYS__LR0_EL2(x) sys_reg(3, 4, 12, 12, x) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 9652c453480f..264d92da3240 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -226,7 +226,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) int i; u32 elrsr; - elrsr = read_gicreg(ICH_ELSR_EL2); + elrsr = read_gicreg(ICH_ELRSR_EL2); write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); From 09838de943d4c0ee75a99cd7665940705ab8dcea Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 9 Jan 2019 19:18:40 +0000 Subject: [PATCH 055/147] KVM: arm64: Reuse sys_reg() macro when searching the trap table Instead of having an open-coded macro, reuse the sys_reg() macro that does the exact same thing (the encoding is slightly different, but the ordering property is the same). Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/kvm/sys_regs.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e3e37228ae4e..1a5bea4285e4 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -965,6 +965,10 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } +#define reg_to_encoding(x) \ + sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2); + /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ @@ -1820,30 +1824,19 @@ static const struct sys_reg_desc *get_target_table(unsigned target, } } -#define reg_to_match_value(x) \ - ({ \ - unsigned long val; \ - val = (x)->Op0 << 14; \ - val |= (x)->Op1 << 11; \ - val |= (x)->CRn << 7; \ - val |= (x)->CRm << 3; \ - val |= (x)->Op2; \ - val; \ - }) - static int match_sys_reg(const void *key, const void *elt) { const unsigned long pval = (unsigned long)key; const struct sys_reg_desc *r = elt; - return pval - reg_to_match_value(r); + return pval - reg_to_encoding(r); } static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], unsigned int num) { - unsigned long pval = reg_to_match_value(params); + unsigned long pval = reg_to_encoding(params); return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } From 84135d3d18da2ff17d3ad1a609b2818cc3049552 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 5 Jul 2018 16:48:23 +0100 Subject: [PATCH 056/147] KVM: arm/arm64: consolidate arch timer trap handlers At the moment we have separate system register emulation handlers for each timer register. Actually they are quite similar, and we rely on kvm_arm_timer_[gs]et_reg() for the actual emulation anyways, so let's just merge all of those handlers into one function, which just marshalls the arguments and then hands off to a set of common accessors. This makes extending the emulation to include EL2 timers much easier. Signed-off-by: Andre Przywara [Fixed 32-bit VM breakage and reduced to reworking existing code] Signed-off-by: Christoffer Dall [Fixed 32bit host, general cleanup] Signed-off-by: Marc Zyngier --- arch/arm/kvm/coproc.c | 23 +++--- arch/arm64/include/asm/sysreg.h | 4 + arch/arm64/kvm/sys_regs.c | 73 ++++++++---------- include/kvm/arm_arch_timer.h | 23 ++++++ virt/kvm/arm/arch_timer.c | 130 +++++++++++++++++++++++++++----- 5 files changed, 187 insertions(+), 66 deletions(-) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 222c1635bc7a..51863364f8d1 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -293,15 +293,16 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu, const struct coproc_params *p, const struct coproc_reg *r) { - u64 now = kvm_phys_timer_read(); - u64 val; + u32 val; if (p->is_write) { val = *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val + now); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_TVAL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); - *vcpu_reg(vcpu, p->Rt1) = val - now; + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_TVAL); + *vcpu_reg(vcpu, p->Rt1) = val; } return true; @@ -315,9 +316,11 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu, if (p->is_write) { val = *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, val); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CTL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CTL); *vcpu_reg(vcpu, p->Rt1) = val; } @@ -333,9 +336,11 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, if (p->is_write) { val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; val |= *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CVAL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CVAL); *vcpu_reg(vcpu, p->Rt1) = val; *vcpu_reg(vcpu, p->Rt2) = val >> 32; } diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 3e5650903d6d..6482e8bcf1b8 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -392,6 +392,10 @@ #define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1) #define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2) +#define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0) +#define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1) +#define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0) + #define __PMEV_op2(n) ((n) & 0x7) #define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3)) #define SYS_PMEVCNTRn_EL0(n) sys_reg(3, 3, 14, __CNTR_CRm(n), __PMEV_op2(n)) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1a5bea4285e4..0d348500b24b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -990,44 +990,38 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \ access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } -static bool access_cntp_tval(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) +static bool access_arch_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) { - u64 now = kvm_phys_timer_read(); - u64 cval; + enum kvm_arch_timers tmr; + enum kvm_arch_timer_regs treg; + u64 reg = reg_to_encoding(r); - if (p->is_write) { - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, - p->regval + now); - } else { - cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); - p->regval = cval - now; + switch (reg) { + case SYS_CNTP_TVAL_EL0: + case SYS_AARCH32_CNTP_TVAL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_CNTP_CTL_EL0: + case SYS_AARCH32_CNTP_CTL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_CNTP_CVAL_EL0: + case SYS_AARCH32_CNTP_CVAL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + default: + BUG(); } - return true; -} - -static bool access_cntp_ctl(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ if (p->is_write) - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval); + kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval); else - p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); - - return true; -} - -static bool access_cntp_cval(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - if (p->is_write) - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval); - else - p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); + p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg); return true; } @@ -1392,9 +1386,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, - { SYS_DESC(SYS_CNTP_TVAL_EL0), access_cntp_tval }, - { SYS_DESC(SYS_CNTP_CTL_EL0), access_cntp_ctl }, - { SYS_DESC(SYS_CNTP_CVAL_EL0), access_cntp_cval }, + { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), @@ -1715,10 +1709,9 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, - /* CNTP_TVAL */ - { Op1( 0), CRn(14), CRm( 2), Op2( 0), access_cntp_tval }, - /* CNTP_CTL */ - { Op1( 0), CRn(14), CRm( 2), Op2( 1), access_cntp_ctl }, + /* Arch Tmers */ + { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer }, /* PMEVCNTRn */ PMU_PMEVCNTR(0), @@ -1795,7 +1788,7 @@ static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ - { Op1( 2), CRn( 0), CRm(14), Op2( 0), access_cntp_cval }, + { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, }; /* Target specific emulation tables */ diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index d6e6a45d1d24..d26b7fde9935 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -22,6 +22,19 @@ #include #include +enum kvm_arch_timers { + TIMER_PTIMER, + TIMER_VTIMER, + NR_KVM_TIMERS +}; + +enum kvm_arch_timer_regs { + TIMER_REG_CNT, + TIMER_REG_CVAL, + TIMER_REG_TVAL, + TIMER_REG_CTL, +}; + struct arch_timer_context { /* Registers: control register, timer value */ u32 cnt_ctl; @@ -87,5 +100,15 @@ bool kvm_arch_timer_get_input_level(int vintid); #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) +#define vcpu_get_timer(v,t) \ + (t == TIMER_VTIMER ? vcpu_vtimer(v) : vcpu_ptimer(v)) + +u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg); +void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg, + u64 val); #endif diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 4986028d9829..f7d377448438 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -52,6 +53,13 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx); static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val); +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg); u64 kvm_phys_timer_read(void) { @@ -628,24 +636,25 @@ static void kvm_timer_init_interrupt(void *info) int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - switch (regid) { case KVM_REG_ARM_TIMER_CTL: - vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; + kvm_arm_timer_write(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); break; case KVM_REG_ARM_TIMER_CVAL: - vtimer->cnt_cval = value; + kvm_arm_timer_write(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CVAL, value); break; case KVM_REG_ARM_PTIMER_CTL: - ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; + kvm_arm_timer_write(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CTL, value); break; case KVM_REG_ARM_PTIMER_CVAL: - ptimer->cnt_cval = value; + kvm_arm_timer_write(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CVAL, value); break; default: @@ -672,26 +681,113 @@ static u64 read_timer_ctl(struct arch_timer_context *timer) u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) { - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - switch (regid) { case KVM_REG_ARM_TIMER_CTL: - return read_timer_ctl(vtimer); + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CTL); case KVM_REG_ARM_TIMER_CNT: - return kvm_phys_timer_read() - vtimer->cntvoff; + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CNT); case KVM_REG_ARM_TIMER_CVAL: - return vtimer->cnt_cval; + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CVAL); case KVM_REG_ARM_PTIMER_CTL: - return read_timer_ctl(ptimer); - case KVM_REG_ARM_PTIMER_CVAL: - return ptimer->cnt_cval; + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CTL); case KVM_REG_ARM_PTIMER_CNT: - return kvm_phys_timer_read(); + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CNT); + case KVM_REG_ARM_PTIMER_CVAL: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CVAL); } return (u64)-1; } +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + switch (treg) { + case TIMER_REG_TVAL: + val = kvm_phys_timer_read() - timer->cntvoff - timer->cnt_cval; + break; + + case TIMER_REG_CTL: + val = read_timer_ctl(timer); + break; + + case TIMER_REG_CVAL: + val = timer->cnt_cval; + break; + + case TIMER_REG_CNT: + val = kvm_phys_timer_read() - timer->cntvoff; + break; + + default: + BUG(); + } + + return val; +} + +u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); + + return val; +} + +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val) +{ + switch (treg) { + case TIMER_REG_TVAL: + timer->cnt_cval = val - kvm_phys_timer_read() - timer->cntvoff; + break; + + case TIMER_REG_CTL: + timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT; + break; + + case TIMER_REG_CVAL: + timer->cnt_cval = val; + break; + + default: + BUG(); + } +} + +void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg, + u64 val) +{ + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); +} + static int kvm_timer_starting_cpu(unsigned int cpu) { kvm_timer_init_interrupt(NULL); From e604dd5d45c75c2112424dec74853efb708f4fa6 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 18 Sep 2018 10:08:18 -0700 Subject: [PATCH 057/147] KVM: arm/arm64: timer: Rework data structures for multiple timers Prepare for having 4 timer data structures (2 for now). Move loaded to the cpu data structure and not the individual timer structure, in preparation for assigning the EL1 phys timer as well. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 43 +++++++++++++------------- virt/kvm/arm/arch_timer.c | 58 +++++++++++++++++++----------------- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index d26b7fde9935..ab835112204d 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -36,6 +36,8 @@ enum kvm_arch_timer_regs { }; struct arch_timer_context { + struct kvm_vcpu *vcpu; + /* Registers: control register, timer value */ u32 cnt_ctl; u64 cnt_cval; @@ -43,6 +45,22 @@ struct arch_timer_context { /* Timer IRQ */ struct kvm_irq_level irq; + /* Virtual offset */ + u64 cntvoff; + + /* Emulated Timer (may be unused) */ + struct hrtimer hrtimer; +}; + +struct arch_timer_cpu { + struct arch_timer_context timers[NR_KVM_TIMERS]; + + /* Background timer used when the guest is not running */ + struct hrtimer bg_timer; + + /* Is the timer enabled */ + bool enabled; + /* * We have multiple paths which can save/restore the timer state * onto the hardware, so we need some way of keeping track of @@ -52,23 +70,6 @@ struct arch_timer_context { * loaded == false: State is stored in memory. */ bool loaded; - - /* Virtual offset */ - u64 cntvoff; -}; - -struct arch_timer_cpu { - struct arch_timer_context vtimer; - struct arch_timer_context ptimer; - - /* Background timer used when the guest is not running */ - struct hrtimer bg_timer; - - /* Physical timer emulation */ - struct hrtimer phys_timer; - - /* Is the timer enabled */ - bool enabled; }; int kvm_timer_hyp_init(bool); @@ -98,10 +99,10 @@ void kvm_timer_init_vhe(void); bool kvm_arch_timer_get_input_level(int vintid); -#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) -#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) -#define vcpu_get_timer(v,t) \ - (t == TIMER_VTIMER ? vcpu_vtimer(v) : vcpu_ptimer(v)) +#define vcpu_timer(v) (&(v)->arch.timer_cpu) +#define vcpu_get_timer(v,t) (&vcpu_timer(v)->timers[(t)]) +#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER]) +#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER]) u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index f7d377448438..471f9fd004c9 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -184,13 +184,11 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) { struct arch_timer_context *ptimer; - struct arch_timer_cpu *timer; struct kvm_vcpu *vcpu; u64 ns; - timer = container_of(hrt, struct arch_timer_cpu, phys_timer); - vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); - ptimer = vcpu_ptimer(vcpu); + ptimer = container_of(hrt, struct arch_timer_context, hrtimer); + vcpu = ptimer->vcpu; /* * Check that the timer has really expired from the guest's @@ -209,9 +207,10 @@ static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { + struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu); u64 cval, now; - if (timer_ctx->loaded) { + if (timer->loaded) { u32 cnt_ctl; /* Only the virtual timer can be loaded so far */ @@ -280,7 +279,6 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, /* Schedule the background timer for the emulated timer. */ static void phys_timer_emulate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); /* @@ -289,11 +287,11 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) * then we also don't need a soft timer. */ if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&timer->phys_timer); + soft_timer_cancel(&ptimer->hrtimer); return; } - soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer)); + soft_timer_start(&ptimer->hrtimer, kvm_timer_compute_delta(ptimer)); } /* @@ -303,7 +301,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) */ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); bool level; @@ -329,13 +327,13 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) static void vtimer_save_state(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); unsigned long flags; local_irq_save(flags); - if (!vtimer->loaded) + if (!timer->loaded) goto out; if (timer->enabled) { @@ -347,7 +345,7 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu) write_sysreg_el0(0, cntv_ctl); isb(); - vtimer->loaded = false; + timer->loaded = false; out: local_irq_restore(flags); } @@ -359,7 +357,7 @@ out: */ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); @@ -379,20 +377,20 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); soft_timer_cancel(&timer->bg_timer); } static void vtimer_restore_state(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); unsigned long flags; local_irq_save(flags); - if (vtimer->loaded) + if (timer->loaded) goto out; if (timer->enabled) { @@ -401,7 +399,7 @@ static void vtimer_restore_state(struct kvm_vcpu *vcpu) write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); } - vtimer->loaded = true; + timer->loaded = true; out: local_irq_restore(flags); } @@ -462,7 +460,7 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); @@ -507,7 +505,8 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); if (unlikely(!timer->enabled)) return; @@ -523,7 +522,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ - soft_timer_cancel(&timer->phys_timer); + soft_timer_cancel(&ptimer->hrtimer); if (swait_active(kvm_arch_vcpu_wq(vcpu))) kvm_timer_blocking(vcpu); @@ -559,7 +558,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); if (unlikely(!timer->enabled)) return; @@ -570,7 +569,7 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); @@ -611,22 +610,25 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); /* Synchronize cntvoff across all vtimers of a VM. */ update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); - vcpu_ptimer(vcpu)->cntvoff = 0; + ptimer->cntvoff = 0; hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->bg_timer.function = kvm_bg_timer_expire; - hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - timer->phys_timer.function = kvm_phys_timer_expire; + hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ptimer->hrtimer.function = kvm_phys_timer_expire; vtimer->irq.irq = default_vtimer_irq.irq; ptimer->irq.irq = default_ptimer_irq.irq; + + vtimer->vcpu = vcpu; + ptimer->vcpu = vcpu; } static void kvm_timer_init_interrupt(void *info) @@ -860,7 +862,7 @@ out_free_irq: void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); soft_timer_cancel(&timer->bg_timer); } @@ -904,7 +906,7 @@ bool kvm_arch_timer_get_input_level(int vintid) int kvm_timer_enable(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); int ret; From 9e01dc76be6a3b5768cb02130d2ff0055a68809a Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 19 Feb 2019 14:04:30 +0100 Subject: [PATCH 058/147] KVM: arm/arm64: arch_timer: Assign the phys timer on VHE systems VHE systems don't have to emulate the physical timer, we can simply assign the EL1 physical timer directly to the VM as the host always uses the EL2 timers. In order to minimize the amount of cruft, AArch32 gets definitions for the physical timer too, but is should be generally unused on this architecture. Co-written with Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_hyp.h | 4 + include/kvm/arm_arch_timer.h | 6 + virt/kvm/arm/arch_timer.c | 219 +++++++++++++++++++++++++-------- 3 files changed, 180 insertions(+), 49 deletions(-) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index e93a0cac9add..87bcd18df8d5 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -40,6 +40,7 @@ #define TTBR1 __ACCESS_CP15_64(1, c2) #define VTTBR __ACCESS_CP15_64(6, c2) #define PAR __ACCESS_CP15_64(0, c7) +#define CNTP_CVAL __ACCESS_CP15_64(2, c14) #define CNTV_CVAL __ACCESS_CP15_64(3, c14) #define CNTVOFF __ACCESS_CP15_64(4, c14) @@ -85,6 +86,7 @@ #define TID_PRIV __ACCESS_CP15(c13, 0, c0, 4) #define HTPIDR __ACCESS_CP15(c13, 4, c0, 2) #define CNTKCTL __ACCESS_CP15(c14, 0, c1, 0) +#define CNTP_CTL __ACCESS_CP15(c14, 0, c2, 1) #define CNTV_CTL __ACCESS_CP15(c14, 0, c3, 1) #define CNTHCTL __ACCESS_CP15(c14, 4, c1, 0) @@ -94,6 +96,8 @@ #define read_sysreg_el0(r) read_sysreg(r##_el0) #define write_sysreg_el0(v, r) write_sysreg(v, r##_el0) +#define cntp_ctl_el0 CNTP_CTL +#define cntp_cval_el0 CNTP_CVAL #define cntv_ctl_el0 CNTV_CTL #define cntv_cval_el0 CNTV_CVAL #define cntvoff_el2 CNTVOFF diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index ab835112204d..6d4a33a9c45a 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -50,6 +50,10 @@ struct arch_timer_context { /* Emulated Timer (may be unused) */ struct hrtimer hrtimer; + + /* Duplicated state from arch_timer.c for convenience */ + u32 host_timer_irq; + u32 host_timer_irq_flags; }; struct arch_timer_cpu { @@ -104,6 +108,8 @@ bool kvm_arch_timer_get_input_level(int vintid); #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER]) #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER]) +#define arch_timer_ctx_index(ctx) ((ctx) - vcpu_timer((ctx)->vcpu)->timers) + u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 471f9fd004c9..10c15151c87e 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -35,7 +35,9 @@ static struct timecounter *timecounter; static unsigned int host_vtimer_irq; +static unsigned int host_ptimer_irq; static u32 host_vtimer_irq_flags; +static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); @@ -86,20 +88,24 @@ static void soft_timer_cancel(struct hrtimer *hrt) static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; - struct arch_timer_context *vtimer; + struct arch_timer_context *ctx; /* * We may see a timer interrupt after vcpu_put() has been called which * sets the CPU's vcpu pointer to NULL, because even though the timer - * has been disabled in vtimer_save_state(), the hardware interrupt + * has been disabled in timer_save_state(), the hardware interrupt * signal may not have been retired from the interrupt controller yet. */ if (!vcpu) return IRQ_HANDLED; - vtimer = vcpu_vtimer(vcpu); - if (kvm_timer_should_fire(vtimer)) - kvm_timer_update_irq(vcpu, true, vtimer); + if (irq == host_vtimer_irq) + ctx = vcpu_vtimer(vcpu); + else + ctx = vcpu_ptimer(vcpu); + + if (kvm_timer_should_fire(ctx)) + kvm_timer_update_irq(vcpu, true, ctx); if (userspace_irqchip(vcpu->kvm) && !static_branch_unlikely(&has_gic_active_state)) @@ -208,13 +214,25 @@ static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(timer_ctx); u64 cval, now; if (timer->loaded) { - u32 cnt_ctl; + u32 cnt_ctl = 0; + + switch (index) { + case TIMER_VTIMER: + cnt_ctl = read_sysreg_el0(cntv_ctl); + break; + case TIMER_PTIMER: + cnt_ctl = read_sysreg_el0(cntp_ctl); + break; + case NR_KVM_TIMERS: + /* GCC is braindead */ + cnt_ctl = 0; + break; + } - /* Only the virtual timer can be loaded so far */ - cnt_ctl = read_sysreg_el0(cntv_ctl); return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); @@ -310,7 +328,7 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) return; /* - * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part + * If the timer virtual interrupt is a 'mapped' interrupt, part * of its lifecycle is offloaded to the hardware, and we therefore may * not have lowered the irq.level value before having to signal a new * interrupt, but have to signal an interrupt every time the level is @@ -319,31 +337,55 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) level = kvm_timer_should_fire(vtimer); kvm_timer_update_irq(vcpu, level, vtimer); + if (has_vhe()) { + level = kvm_timer_should_fire(ptimer); + kvm_timer_update_irq(vcpu, level, ptimer); + + return; + } + phys_timer_emulate(vcpu); if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); } -static void vtimer_save_state(struct kvm_vcpu *vcpu) +static void timer_save_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; + if (!timer->enabled) + return; + local_irq_save(flags); if (!timer->loaded) goto out; - if (timer->enabled) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - vtimer->cnt_cval = read_sysreg_el0(cntv_cval); - } + switch (index) { + case TIMER_VTIMER: + ctx->cnt_ctl = read_sysreg_el0(cntv_ctl); + ctx->cnt_cval = read_sysreg_el0(cntv_cval); - /* Disable the virtual timer */ - write_sysreg_el0(0, cntv_ctl); - isb(); + /* Disable the timer */ + write_sysreg_el0(0, cntv_ctl); + isb(); + + break; + case TIMER_PTIMER: + ctx->cnt_ctl = read_sysreg_el0(cntp_ctl); + ctx->cnt_cval = read_sysreg_el0(cntp_cval); + + /* Disable the timer */ + write_sysreg_el0(0, cntp_ctl); + isb(); + + break; + case NR_KVM_TIMERS: + break; /* GCC is braindead */ + } timer->loaded = false; out: @@ -382,21 +424,33 @@ static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) soft_timer_cancel(&timer->bg_timer); } -static void vtimer_restore_state(struct kvm_vcpu *vcpu) +static void timer_restore_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; + if (!timer->enabled) + return; + local_irq_save(flags); if (timer->loaded) goto out; - if (timer->enabled) { - write_sysreg_el0(vtimer->cnt_cval, cntv_cval); + switch (index) { + case TIMER_VTIMER: + write_sysreg_el0(ctx->cnt_cval, cntv_cval); isb(); - write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); + write_sysreg_el0(ctx->cnt_ctl, cntv_ctl); + break; + case TIMER_PTIMER: + write_sysreg_el0(ctx->cnt_cval, cntp_cval); + isb(); + write_sysreg_el0(ctx->cnt_ctl, cntp_ctl); + break; + case NR_KVM_TIMERS: + break; /* GCC is braindead */ } timer->loaded = true; @@ -419,23 +473,23 @@ static void set_cntvoff(u64 cntvoff) kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); } -static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active) +static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) { int r; - r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active); + r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active); WARN_ON(r); } -static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu) +static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct kvm_vcpu *vcpu = ctx->vcpu; bool phys_active; if (irqchip_in_kernel(vcpu->kvm)) - phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); + phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); else - phys_active = vtimer->irq.level; - set_vtimer_irq_phys_active(vcpu, phys_active); + phys_active = ctx->irq.level; + set_timer_irq_phys_active(ctx, phys_active); } static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) @@ -467,14 +521,22 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) if (unlikely(!timer->enabled)) return; - if (static_branch_likely(&has_gic_active_state)) - kvm_timer_vcpu_load_gic(vcpu); - else + if (static_branch_likely(&has_gic_active_state)) { + kvm_timer_vcpu_load_gic(vtimer); + if (has_vhe()) + kvm_timer_vcpu_load_gic(ptimer); + } else { kvm_timer_vcpu_load_nogic(vcpu); + } set_cntvoff(vtimer->cntvoff); - vtimer_restore_state(vcpu); + timer_restore_state(vtimer); + + if (has_vhe()) { + timer_restore_state(ptimer); + return; + } /* Set the background timer for the physical timer emulation. */ phys_timer_emulate(vcpu); @@ -506,12 +568,17 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); if (unlikely(!timer->enabled)) return; - vtimer_save_state(vcpu); + timer_save_state(vtimer); + if (has_vhe()) { + timer_save_state(ptimer); + return; + } /* * Cancel the physical timer emulation, because the only case where we @@ -534,8 +601,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * counter of non-VHE case. For VHE, the virtual counter uses a fixed * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. */ - if (!has_vhe()) - set_cntvoff(0); + set_cntvoff(0); } /* @@ -550,7 +616,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) if (!kvm_timer_should_fire(vtimer)) { kvm_timer_update_irq(vcpu, false, vtimer); if (static_branch_likely(&has_gic_active_state)) - set_vtimer_irq_phys_active(vcpu, false); + set_timer_irq_phys_active(vtimer, false); else enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } @@ -625,7 +691,12 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) ptimer->hrtimer.function = kvm_phys_timer_expire; vtimer->irq.irq = default_vtimer_irq.irq; + vtimer->host_timer_irq = host_vtimer_irq; + vtimer->host_timer_irq_flags = host_vtimer_irq_flags; + ptimer->irq.irq = default_ptimer_irq.irq; + ptimer->host_timer_irq = host_ptimer_irq; + ptimer->host_timer_irq_flags = host_ptimer_irq_flags; vtimer->vcpu = vcpu; ptimer->vcpu = vcpu; @@ -634,6 +705,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) static void kvm_timer_init_interrupt(void *info) { enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); } int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) @@ -815,6 +887,8 @@ int kvm_timer_hyp_init(bool has_gic) return -ENODEV; } + /* First, do the virtual EL1 timer irq */ + if (info->virtual_irq <= 0) { kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", info->virtual_irq); @@ -825,15 +899,15 @@ int kvm_timer_hyp_init(bool has_gic) host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { - kvm_err("Invalid trigger for IRQ%d, assuming level low\n", + kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n", host_vtimer_irq); host_vtimer_irq_flags = IRQF_TRIGGER_LOW; } err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, - "kvm guest timer", kvm_get_running_vcpus()); + "kvm guest vtimer", kvm_get_running_vcpus()); if (err) { - kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", + kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n", host_vtimer_irq, err); return err; } @@ -851,6 +925,43 @@ int kvm_timer_hyp_init(bool has_gic) kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); + /* Now let's do the physical EL1 timer irq */ + + if (info->physical_irq > 0) { + host_ptimer_irq = info->physical_irq; + host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq); + if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH && + host_ptimer_irq_flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n", + host_ptimer_irq); + host_ptimer_irq_flags = IRQF_TRIGGER_LOW; + } + + err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler, + "kvm guest ptimer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", + host_ptimer_irq, err); + return err; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_ptimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } + } + + kvm_debug("physical timer IRQ%d\n", host_ptimer_irq); + } else if (has_vhe()) { + kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", + info->physical_irq); + err = -ENODEV; + goto out_free_irq; + } + cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, "kvm/arm/timer:starting", kvm_timer_starting_cpu, kvm_timer_dying_cpu); @@ -898,8 +1009,10 @@ bool kvm_arch_timer_get_input_level(int vintid) if (vintid == vcpu_vtimer(vcpu)->irq.irq) timer = vcpu_vtimer(vcpu); + else if (vintid == vcpu_ptimer(vcpu)->irq.irq) + timer = vcpu_ptimer(vcpu); else - BUG(); /* We only map the vtimer so far */ + BUG(); return kvm_timer_should_fire(timer); } @@ -908,6 +1021,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); int ret; if (timer->enabled) @@ -930,14 +1044,21 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (ret) return ret; + if (has_vhe()) { + ret = kvm_vgic_map_phys_irq(vcpu, host_ptimer_irq, ptimer->irq.irq, + kvm_arch_timer_get_input_level); + if (ret) + return ret; + } + no_vgic: timer->enabled = 1; return 0; } /* - * On VHE system, we only need to configure trap on physical timer and counter - * accesses in EL0 and EL1 once, not for every world switch. + * On VHE system, we only need to configure the EL2 timer trap register once, + * not for every world switch. * The host kernel runs at EL2 with HCR_EL2.TGE == 1, * and this makes those bits have no effect for the host kernel execution. */ @@ -948,11 +1069,11 @@ void kvm_timer_init_vhe(void) u64 val; /* - * Disallow physical timer access for the guest. - * Physical counter access is allowed. + * VHE systems allow the guest direct access to the EL1 physical + * timer/counter. */ val = read_sysreg(cnthctl_el2); - val &= ~(CNTHCTL_EL1PCEN << cnthctl_shift); + val |= (CNTHCTL_EL1PCEN << cnthctl_shift); val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); write_sysreg(val, cnthctl_el2); } From bee038a67487598ebbe995f85bf60c3a5b2e9099 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 4 Jan 2019 13:31:22 +0100 Subject: [PATCH 059/147] KVM: arm/arm64: Rework the timer code to use a timer_map We are currently emulating two timers in two different ways. When we add support for nested virtualization in the future, we are going to be emulating either two timers in two diffferent ways, or four timers in a single way. We need a unified data structure to keep track of how we map virtual state to physical state and we need to cleanup some of the timer code to operate more independently on a struct arch_timer_context instead of trying to consider the global state of the VCPU and recomputing all state. Co-written with Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_arch_timer.h | 23 +-- virt/kvm/arm/arch_timer.c | 295 +++++++++++++++++++---------------- virt/kvm/arm/trace.h | 105 +++++++++++++ 3 files changed, 278 insertions(+), 145 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6d4a33a9c45a..05a18dd265b5 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -51,11 +51,24 @@ struct arch_timer_context { /* Emulated Timer (may be unused) */ struct hrtimer hrtimer; + /* + * We have multiple paths which can save/restore the timer state onto + * the hardware, so we need some way of keeping track of where the + * latest state is. + */ + bool loaded; + /* Duplicated state from arch_timer.c for convenience */ u32 host_timer_irq; u32 host_timer_irq_flags; }; +struct timer_map { + struct arch_timer_context *direct_vtimer; + struct arch_timer_context *direct_ptimer; + struct arch_timer_context *emul_ptimer; +}; + struct arch_timer_cpu { struct arch_timer_context timers[NR_KVM_TIMERS]; @@ -64,16 +77,6 @@ struct arch_timer_cpu { /* Is the timer enabled */ bool enabled; - - /* - * We have multiple paths which can save/restore the timer state - * onto the hardware, so we need some way of keeping track of - * where the latest state is. - * - * loaded == true: State is loaded on the hardware registers. - * loaded == false: State is stored in memory. - */ - bool loaded; }; int kvm_timer_hyp_init(bool); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 10c15151c87e..17f9de73cc8a 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -68,6 +68,21 @@ u64 kvm_phys_timer_read(void) return timecounter->cc->read(timecounter->cc); } +static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + if (has_vhe()) { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_ptimer = NULL; + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = NULL; + map->emul_ptimer = vcpu_ptimer(vcpu); + } + + trace_kvm_get_timer_map(vcpu->vcpu_id, map); +} + static inline bool userspace_irqchip(struct kvm *kvm) { return static_branch_unlikely(&userspace_irqchip_in_use) && @@ -89,6 +104,7 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; struct arch_timer_context *ctx; + struct timer_map map; /* * We may see a timer interrupt after vcpu_put() has been called which @@ -99,10 +115,12 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) if (!vcpu) return IRQ_HANDLED; + get_timer_map(vcpu, &map); + if (irq == host_vtimer_irq) - ctx = vcpu_vtimer(vcpu); + ctx = map.direct_vtimer; else - ctx = vcpu_ptimer(vcpu); + ctx = map.direct_ptimer; if (kvm_timer_should_fire(ctx)) kvm_timer_update_irq(vcpu, true, ctx); @@ -136,7 +154,9 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) { - return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && + WARN_ON(timer_ctx && timer_ctx->loaded); + return timer_ctx && + !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); } @@ -146,21 +166,22 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) */ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) { - u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + u64 min_delta = ULLONG_MAX; + int i; - if (kvm_timer_irq_can_fire(vtimer)) - min_virt = kvm_timer_compute_delta(vtimer); + for (i = 0; i < NR_KVM_TIMERS; i++) { + struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; - if (kvm_timer_irq_can_fire(ptimer)) - min_phys = kvm_timer_compute_delta(ptimer); + WARN(ctx->loaded, "timer %d loaded\n", i); + if (kvm_timer_irq_can_fire(ctx)) + min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); + } /* If none of timers can fire, then return 0 */ - if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX)) + if (min_delta == ULLONG_MAX) return 0; - return min(min_virt, min_phys); + return min_delta; } static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) @@ -187,37 +208,45 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } -static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) +static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) { - struct arch_timer_context *ptimer; + struct arch_timer_context *ctx; struct kvm_vcpu *vcpu; u64 ns; - ptimer = container_of(hrt, struct arch_timer_context, hrtimer); - vcpu = ptimer->vcpu; + ctx = container_of(hrt, struct arch_timer_context, hrtimer); + vcpu = ctx->vcpu; + + trace_kvm_timer_hrtimer_expire(ctx); /* * Check that the timer has really expired from the guest's * PoV (NTP on the host may have forced it to expire * early). If not ready, schedule for a later time. */ - ns = kvm_timer_compute_delta(ptimer); + ns = kvm_timer_compute_delta(ctx); if (unlikely(ns)) { hrtimer_forward_now(hrt, ns_to_ktime(ns)); return HRTIMER_RESTART; } - kvm_timer_update_irq(vcpu, true, ptimer); + kvm_timer_update_irq(vcpu, true, ctx); return HRTIMER_NORESTART; } static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { - struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu); - enum kvm_arch_timers index = arch_timer_ctx_index(timer_ctx); + struct arch_timer_cpu *timer; + enum kvm_arch_timers index; u64 cval, now; - if (timer->loaded) { + if (!timer_ctx) + return false; + + timer = vcpu_timer(timer_ctx->vcpu); + index = arch_timer_ctx_index(timer_ctx); + + if (timer_ctx->loaded) { u32 cnt_ctl = 0; switch (index) { @@ -249,13 +278,13 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; - if (kvm_timer_should_fire(vtimer)) - return true; + get_timer_map(vcpu, &map); - return kvm_timer_should_fire(ptimer); + return kvm_timer_should_fire(map.direct_vtimer) || + kvm_timer_should_fire(map.direct_ptimer) || + kvm_timer_should_fire(map.emul_ptimer); } /* @@ -294,60 +323,28 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, } } -/* Schedule the background timer for the emulated timer. */ -static void phys_timer_emulate(struct kvm_vcpu *vcpu) +static void timer_emulate(struct arch_timer_context *ctx) { - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + bool should_fire = kvm_timer_should_fire(ctx); + + trace_kvm_timer_emulate(ctx, should_fire); + + if (should_fire) { + kvm_timer_update_irq(ctx->vcpu, true, ctx); + return; + } /* * If the timer can fire now, we don't need to have a soft timer * scheduled for the future. If the timer cannot fire at all, * then we also don't need a soft timer. */ - if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&ptimer->hrtimer); + if (!kvm_timer_irq_can_fire(ctx)) { + soft_timer_cancel(&ctx->hrtimer); return; } - soft_timer_start(&ptimer->hrtimer, kvm_timer_compute_delta(ptimer)); -} - -/* - * Check if there was a change in the timer state, so that we should either - * raise or lower the line level to the GIC or schedule a background timer to - * emulate the physical timer. - */ -static void kvm_timer_update_state(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - bool level; - - if (unlikely(!timer->enabled)) - return; - - /* - * If the timer virtual interrupt is a 'mapped' interrupt, part - * of its lifecycle is offloaded to the hardware, and we therefore may - * not have lowered the irq.level value before having to signal a new - * interrupt, but have to signal an interrupt every time the level is - * asserted. - */ - level = kvm_timer_should_fire(vtimer); - kvm_timer_update_irq(vcpu, level, vtimer); - - if (has_vhe()) { - level = kvm_timer_should_fire(ptimer); - kvm_timer_update_irq(vcpu, level, ptimer); - - return; - } - - phys_timer_emulate(vcpu); - - if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) - kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); + soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); } static void timer_save_state(struct arch_timer_context *ctx) @@ -361,7 +358,7 @@ static void timer_save_state(struct arch_timer_context *ctx) local_irq_save(flags); - if (!timer->loaded) + if (!ctx->loaded) goto out; switch (index) { @@ -384,10 +381,12 @@ static void timer_save_state(struct arch_timer_context *ctx) break; case NR_KVM_TIMERS: - break; /* GCC is braindead */ + BUG(); } - timer->loaded = false; + trace_kvm_timer_save_state(ctx); + + ctx->loaded = false; out: local_irq_restore(flags); } @@ -400,14 +399,17 @@ out: static void kvm_timer_blocking(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); /* - * If both timers are not capable of raising interrupts (disabled or + * If no timers are capable of raising interrupts (disabled or * masked), then there's no more work for us to do. */ - if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer)) + if (!kvm_timer_irq_can_fire(map.direct_vtimer) && + !kvm_timer_irq_can_fire(map.direct_ptimer) && + !kvm_timer_irq_can_fire(map.emul_ptimer)) return; /* @@ -435,7 +437,7 @@ static void timer_restore_state(struct arch_timer_context *ctx) local_irq_save(flags); - if (timer->loaded) + if (ctx->loaded) goto out; switch (index) { @@ -450,10 +452,12 @@ static void timer_restore_state(struct arch_timer_context *ctx) write_sysreg_el0(ctx->cnt_ctl, cntp_ctl); break; case NR_KVM_TIMERS: - break; /* GCC is braindead */ + BUG(); } - timer->loaded = true; + trace_kvm_timer_restore_state(ctx); + + ctx->loaded = true; out: local_irq_restore(flags); } @@ -515,37 +519,31 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; if (unlikely(!timer->enabled)) return; + get_timer_map(vcpu, &map); + if (static_branch_likely(&has_gic_active_state)) { - kvm_timer_vcpu_load_gic(vtimer); - if (has_vhe()) - kvm_timer_vcpu_load_gic(ptimer); + kvm_timer_vcpu_load_gic(map.direct_vtimer); + if (map.direct_ptimer) + kvm_timer_vcpu_load_gic(map.direct_ptimer); } else { kvm_timer_vcpu_load_nogic(vcpu); } - set_cntvoff(vtimer->cntvoff); - - timer_restore_state(vtimer); - - if (has_vhe()) { - timer_restore_state(ptimer); - return; - } - - /* Set the background timer for the physical timer emulation. */ - phys_timer_emulate(vcpu); + set_cntvoff(map.direct_vtimer->cntvoff); kvm_timer_unblocking(vcpu); - /* If the timer fired while we weren't running, inject it now */ - if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) - kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); + timer_restore_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_restore_state(map.direct_ptimer); + + if (map.emul_ptimer) + timer_emulate(map.emul_ptimer); } bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) @@ -568,20 +566,19 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; if (unlikely(!timer->enabled)) return; - timer_save_state(vtimer); - if (has_vhe()) { - timer_save_state(ptimer); - return; - } + get_timer_map(vcpu, &map); + + timer_save_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_save_state(map.direct_ptimer); /* - * Cancel the physical timer emulation, because the only case where we + * Cancel soft timer emulation, because the only case where we * need it after a vcpu_put is in the context of a sleeping VCPU, and * in that case we already factor in the deadline for the physical * timer when scheduling the bg_timer. @@ -589,7 +586,8 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ - soft_timer_cancel(&ptimer->hrtimer); + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); if (swait_active(kvm_arch_vcpu_wq(vcpu))) kvm_timer_blocking(vcpu); @@ -636,8 +634,9 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 @@ -645,12 +644,22 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) * resets the timer to be disabled and unmasked and is compliant with * the ARMv7 architecture. */ - vtimer->cnt_ctl = 0; - ptimer->cnt_ctl = 0; - kvm_timer_update_state(vcpu); + vcpu_vtimer(vcpu)->cnt_ctl = 0; + vcpu_ptimer(vcpu)->cnt_ctl = 0; - if (timer->enabled && irqchip_in_kernel(vcpu->kvm)) - kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq); + if (timer->enabled) { + kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu)); + kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu)); + + if (irqchip_in_kernel(vcpu->kvm)) { + kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq); + if (map.direct_ptimer) + kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq); + } + } + + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); return 0; } @@ -687,15 +696,18 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->bg_timer.function = kvm_bg_timer_expire; + hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ptimer->hrtimer.function = kvm_phys_timer_expire; + vtimer->hrtimer.function = kvm_hrtimer_expire; + ptimer->hrtimer.function = kvm_hrtimer_expire; vtimer->irq.irq = default_vtimer_irq.irq; - vtimer->host_timer_irq = host_vtimer_irq; - vtimer->host_timer_irq_flags = host_vtimer_irq_flags; - ptimer->irq.irq = default_ptimer_irq.irq; + + vtimer->host_timer_irq = host_vtimer_irq; ptimer->host_timer_irq = host_ptimer_irq; + + vtimer->host_timer_irq_flags = host_vtimer_irq_flags; ptimer->host_timer_irq_flags = host_ptimer_irq_flags; vtimer->vcpu = vcpu; @@ -710,32 +722,39 @@ static void kvm_timer_init_interrupt(void *info) int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) { + struct arch_timer_context *timer; + bool level; + switch (regid) { case KVM_REG_ARM_TIMER_CTL: - kvm_arm_timer_write(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CTL, value); + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: + timer = vcpu_vtimer(vcpu); update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); break; case KVM_REG_ARM_TIMER_CVAL: - kvm_arm_timer_write(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CVAL, value); + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); break; case KVM_REG_ARM_PTIMER_CTL: - kvm_arm_timer_write(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CTL, value); + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_PTIMER_CVAL: - kvm_arm_timer_write(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CVAL, value); + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); break; default: return -1; } - kvm_timer_update_state(vcpu); + level = kvm_timer_should_fire(timer); + kvm_timer_update_irq(vcpu, level, timer); + timer_emulate(timer); + return 0; } @@ -1020,8 +1039,7 @@ bool kvm_arch_timer_get_input_level(int vintid) int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; int ret; if (timer->enabled) @@ -1039,18 +1057,25 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) return -EINVAL; } - ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq, + get_timer_map(vcpu, &map); + + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_vtimer->host_timer_irq, + map.direct_vtimer->irq.irq, kvm_arch_timer_get_input_level); if (ret) return ret; - if (has_vhe()) { - ret = kvm_vgic_map_phys_irq(vcpu, host_ptimer_irq, ptimer->irq.irq, + if (map.direct_ptimer) { + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_ptimer->host_timer_irq, + map.direct_ptimer->irq.irq, kvm_arch_timer_get_input_level); - if (ret) - return ret; } + if (ret) + return ret; + no_vgic: timer->enabled = 1; return 0; diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 3828beab93f2..54bb059243b9 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -2,6 +2,7 @@ #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVM_H +#include #include #undef TRACE_SYSTEM @@ -262,6 +263,110 @@ TRACE_EVENT(kvm_timer_update_irq, __entry->vcpu_id, __entry->irq, __entry->level) ); +TRACE_EVENT(kvm_get_timer_map, + TP_PROTO(unsigned long vcpu_id, struct timer_map *map), + TP_ARGS(vcpu_id, map), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( int, direct_vtimer ) + __field( int, direct_ptimer ) + __field( int, emul_ptimer ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); + __entry->direct_ptimer = + (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; + __entry->emul_ptimer = + (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1; + ), + + TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d", + __entry->vcpu_id, + __entry->direct_vtimer, + __entry->direct_ptimer, + __entry->emul_ptimer) +); + +TRACE_EVENT(kvm_timer_save_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_restore_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_hrtimer_expire, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_emulate, + TP_PROTO(struct arch_timer_context *ctx, bool should_fire), + TP_ARGS(ctx, should_fire), + + TP_STRUCT__entry( + __field( int, timer_idx ) + __field( bool, should_fire ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + __entry->should_fire = should_fire; + ), + + TP_printk("arch_timer_ctx_index: %d (should_fire: %d)", + __entry->timer_idx, __entry->should_fire) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH From 64cf98fa5544aee6c547786ee32f92b796b30635 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 1 May 2016 22:29:58 +0200 Subject: [PATCH 060/147] KVM: arm/arm64: Move kvm_is_write_fault to header file Move this little function to the header files for arm/arm64 so other code can make use of it directly. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_emulate.h | 8 ++++++++ arch/arm64/include/asm/kvm_emulate.h | 8 ++++++++ virt/kvm/arm/mmu.c | 8 -------- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 77121b713bef..8927cae7c966 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -265,6 +265,14 @@ static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu) } } +static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_trap_is_iabt(vcpu)) + return false; + + return kvm_vcpu_dabt_iswrite(vcpu); +} + static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) { return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 506386a3edde..a0d1ce9ae12b 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -331,6 +331,14 @@ static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) return ESR_ELx_SYS64_ISS_RT(esr); } +static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_trap_is_iabt(vcpu)) + return false; + + return kvm_vcpu_dabt_iswrite(vcpu); +} + static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index f8dda452ea24..e3e5a26b4845 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1403,14 +1403,6 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) return false; } -static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) -{ - if (kvm_vcpu_trap_is_iabt(vcpu)) - return false; - - return kvm_vcpu_dabt_iswrite(vcpu); -} - /** * stage2_wp_ptes - write protect PMD range * @pmd: pointer to pmd entry From f7f2b15c3d42fa5754131b34a0f7ad5a5c3f777f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 31 Jan 2019 14:17:17 +0100 Subject: [PATCH 061/147] arm64: KVM: Expose sanitised cache type register to guest We currently permit CPUs in the same system to deviate in the exact topology of the caches, and we subsequently hide this fact from user space by exposing a sanitised value of the cache type register CTR_EL0. However, guests running under KVM see the bare value of CTR_EL0, which could potentially result in issues with, e.g., JITs or other pieces of code that are sensitive to misreported cache line sizes. So let's start trapping cache ID instructions if there is a mismatch, and expose the sanitised version of CTR_EL0 to guests. Note that CTR_EL0 is treated as an invariant to KVM user space, so update that part as well. Acked-by: Christoffer Dall Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 3 ++ arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kvm/sys_regs.c | 59 +++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index a0d1ce9ae12b..9acccb1e3746 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -77,6 +77,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) */ if (!vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID3; + + if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID2; } static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6482e8bcf1b8..5b267dec6194 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -361,6 +361,7 @@ #define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0) +#define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0) #define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1) #define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0d348500b24b..45a07cfc57ed 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1146,6 +1146,49 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(rd, uaddr, true); } +static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0); + return true; +} + +static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = read_sysreg(clidr_el1); + return true; +} + +static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, r->reg); + else + p->regval = vcpu_read_sys_reg(vcpu, r->reg); + return true; +} + +static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 csselr; + + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); + p->regval = get_ccsidr(csselr); + return true; +} + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1363,7 +1406,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, - { SYS_DESC(SYS_CSSELR_EL1), NULL, reset_unknown, CSSELR_EL1 }, + { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, + { SYS_DESC(SYS_CLIDR_EL1), access_clidr }, + { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, + { SYS_DESC(SYS_CTR_EL0), access_ctr }, { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, }, { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, @@ -1663,6 +1709,7 @@ static const struct sys_reg_desc cp14_64_regs[] = { * register). */ static const struct sys_reg_desc cp15_regs[] = { + { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr }, { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR }, { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, @@ -1779,6 +1826,10 @@ static const struct sys_reg_desc cp15_regs[] = { PMU_PMEVTYPER(30), /* PMCCFILTR */ { Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper }, + + { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, + { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, + { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, c0_CSSELR }, }; static const struct sys_reg_desc cp15_64_regs[] = { @@ -2192,11 +2243,15 @@ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, } FUNCTION_INVARIANT(midr_el1) -FUNCTION_INVARIANT(ctr_el0) FUNCTION_INVARIANT(revidr_el1) FUNCTION_INVARIANT(clidr_el1) FUNCTION_INVARIANT(aidr_el1) +static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) +{ + ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); +} + /* ->val is filled in by kvm_sys_reg_table_init() */ static struct sys_reg_desc invariant_sys_regs[] = { { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, From 793acf870ea3e492c6bb3edb5f8657d9c4f4903f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 31 Jan 2019 14:17:18 +0100 Subject: [PATCH 062/147] arm64: KVM: Describe data or unified caches as having 1 set and 1 way On SMP ARM systems, cache maintenance by set/way should only ever be done in the context of onlining or offlining CPUs, which is typically done by bare metal firmware and never in a virtual machine. For this reason, we trap set/way cache maintenance operations and replace them with conditional flushing of the entire guest address space. Due to this trapping, the set/way arguments passed into the set/way ops are completely ignored, and thus irrelevant. This also means that the set/way geometry is equally irrelevant, and we can simply report it as 1 set and 1 way, so that legacy 32-bit ARM system software (i.e., the kind that only receives odd fixes) doesn't take a performance hit due to the trapping when iterating over the cachelines. Acked-by: Christoffer Dall Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 3 ++- arch/arm64/kvm/sys_regs.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 9acccb1e3746..d3842791e1c4 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -78,7 +78,8 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) if (!vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID3; - if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE)) + if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || + vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID2; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 45a07cfc57ed..81a342679e60 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1186,6 +1186,21 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); p->regval = get_ccsidr(csselr); + + /* + * Guests should not be doing cache operations by set/way at all, and + * for this reason, we trap them and attempt to infer the intent, so + * that we can flush the entire guest's address space at the appropriate + * time. + * To prevent this trapping from causing performance problems, let's + * expose the geometry of all data and unified caches (which are + * guaranteed to be PIPT and thus non-aliasing) as 1 set and 1 way. + * [If guests should attempt to infer aliasing properties from the + * geometry (which is not permitted by the architecture), they would + * only do so for virtually indexed caches.] + */ + if (!(csselr & 1)) // data or unified cache + p->regval &= ~GENMASK(27, 3); return true; } From bae561c0cff7e2cde99990ac3b5a58f815d6789d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 20 Jan 2019 20:32:31 +0000 Subject: [PATCH 063/147] KVM: arm/arm64: arch_timer: Mark physical interrupt active when a virtual interrupt is pending When a guest gets scheduled, KVM performs a "load" operation, which for the timer includes evaluating the virtual "active" state of the interrupt, and replicating it on the physical side. This ensures that the deactivation in the guest will also take place in the physical GIC distributor. If the interrupt is not yet active, we flag it as inactive on the physical side. This means that on restoring the timer registers, if the timer has expired, we'll immediately take an interrupt. That's absolutely fine, as the interrupt will then be flagged as active on the physical side. What this assumes though is that we'll enter the guest right after having taken the interrupt, and that the guest will quickly ACK the interrupt, making it active at on the virtual side. It turns out that quite often, this assumption doesn't really hold. The guest may be preempted on the back on this interrupt, either from kernel space or whilst running at EL1 when a host interrupt fires. When this happens, we repeat the whole sequence on the next load (interrupt marked as inactive, timer registers restored, interrupt fires). And if it takes a really long time for a guest to activate the interrupt (as it does with nested virt), we end-up with many such events in quick succession, leading to the guest only making very slow progress. This can also be seen with the number of virtual timer interrupt on the host being far greater than the same number in the guest. An easy way to fix this is to evaluate the timer state when performing the "load" operation, just like we do when the interrupt actually fires. If the timer has a pending virtual interrupt at this stage, then we can safely flag the physical interrupt as being active, which prevents spurious exits. Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 17f9de73cc8a..af8f2f1d01cc 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -487,12 +487,21 @@ static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, boo static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { struct kvm_vcpu *vcpu = ctx->vcpu; - bool phys_active; + bool phys_active = false; + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. + */ + kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); if (irqchip_in_kernel(vcpu->kvm)) phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); - else - phys_active = ctx->irq.level; + + phys_active |= ctx->irq.level; + set_timer_irq_phys_active(ctx, phys_active); } From 49dfe94fe5ad97d380f81544b6d3626b076a1ef6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 16:57:28 +0900 Subject: [PATCH 064/147] KVM: arm/arm64: Fix TRACE_INCLUDE_PATH As the comment block in include/trace/define_trace.h says, TRACE_INCLUDE_PATH should be a relative path to the define_trace.h ../../virt/kvm/arm is the correct relative path. ../../../virt/kvm/arm is working by coincidence because the top Makefile adds -I$(srctree)/arch/$(SRCARCH)/include as a header search path, but we should not rely on it. Acked-by: Christoffer Dall Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- virt/kvm/arm/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 54bb059243b9..204d210d01c2 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -370,7 +370,7 @@ TRACE_EVENT(kvm_timer_emulate, #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm +#define TRACE_INCLUDE_PATH ../../virt/kvm/arm #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace From 3644a35b0244ddaeafa170a371144c4f12682c98 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 16:57:29 +0900 Subject: [PATCH 065/147] KVM: arm/arm64: Remove -I. header search paths The header search path -I. in kernel Makefiles is very suspicious; it allows the compiler to search for headers in the top of $(srctree), where obviously no header file exists. I was able to build without these extra header search paths. Acked-by: Christoffer Dall Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- arch/arm/kvm/Makefile | 3 +-- arch/arm64/kvm/Makefile | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 48de846f2246..bca775e222c3 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -9,8 +9,7 @@ ifeq ($(plus_virt),+virt) endif ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic -CFLAGS_arm.o := -I. $(plus_virt_def) -CFLAGS_mmu.o := -I. +CFLAGS_arm.o := $(plus_virt_def) AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 0f2a135ba15b..3089b3159135 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -4,8 +4,6 @@ # ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic -CFLAGS_arm.o := -I. -CFLAGS_mmu.o := -I. KVM=../../../virt/kvm From 05277f368c33fdc12d91464f25bb5656b808ec8a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 16:57:30 +0900 Subject: [PATCH 066/147] KVM: arm/arm64: Prefix header search paths with $(srctree)/ Currently, the Kbuild core manipulates header search paths in a crazy way [1]. To fix this mess, I want all Makefiles to add explicit $(srctree)/ to the search paths in the srctree. Some Makefiles are already written in that way, but not all. The goal of this work is to make the notation consistent, and finally get rid of the gross hacks. Having whitespaces after -I does not matter since commit 48f6e3cf5bc6 ("kbuild: do not drop -I without parameter"). [1]: https://patchwork.kernel.org/patch/9632347/ Acked-by: Christoffer Dall Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- arch/arm/kvm/Makefile | 2 +- arch/arm64/kvm/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index bca775e222c3..531e59f5be9c 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -8,7 +8,7 @@ ifeq ($(plus_virt),+virt) plus_virt_def := -DREQUIRES_VIRT=1 endif -ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic +ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic CFLAGS_arm.o := $(plus_virt_def) AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 3089b3159135..690e033a91c0 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -3,7 +3,7 @@ # Makefile for Kernel-based Virtual Machine module # -ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic +ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic KVM=../../../virt/kvm From 2b59066902548c17ddf5443745dbf8ce50d0c037 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 12 Feb 2019 10:07:47 +0000 Subject: [PATCH 067/147] KVM: arm/arm64: Update MAINTAINERS entries For historical reasons, KVM/arm and KVM/arm64 have had different entries in the MAINTAINER file. This makes little sense, as they are maintained together. On top of that, we have a bunch of talented people helping with the reviewing, and they deserve to be mentioned in the consolidated entry. Acked-by: Christoffer Dall Acked-by: Suzuki K Poulose Acked-by: James Morse Signed-off-by: Marc Zyngier --- MAINTAINERS | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 51029a425dbe..5d68f050d155 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8297,29 +8297,25 @@ S: Maintained F: arch/x86/include/asm/svm.h F: arch/x86/kvm/svm.c -KERNEL VIRTUAL MACHINE FOR ARM (KVM/arm) +KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64) M: Christoffer Dall M: Marc Zyngier +R: James Morse +R: Julien Thierry +R: Suzuki K Pouloze L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: kvmarm@lists.cs.columbia.edu W: http://systems.cs.columbia.edu/projects/kvm-arm T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git -S: Supported +S: Maintained F: arch/arm/include/uapi/asm/kvm* F: arch/arm/include/asm/kvm* F: arch/arm/kvm/ -F: virt/kvm/arm/ -F: include/kvm/arm_* - -KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) -M: Christoffer Dall -M: Marc Zyngier -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -L: kvmarm@lists.cs.columbia.edu -S: Maintained F: arch/arm64/include/uapi/asm/kvm* F: arch/arm64/include/asm/kvm* F: arch/arm64/kvm/ +F: virt/kvm/arm/ +F: include/kvm/arm_* KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips) M: James Hogan From a37f0c3c46d2d5f4e94bb1e80e3f6f1f8fd89b40 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 17 Feb 2019 22:36:26 +0000 Subject: [PATCH 068/147] KVM: arm/arm64: fix spelling mistake: "auxilary" -> "auxiliary" There is a spelling mistake in a kvm_err error message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Marc Zyngier --- virt/kvm/arm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 9fbdb9e1c51f..8de55041e7ba 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1553,7 +1553,7 @@ static int init_hyp_mode(void) err = hyp_map_aux_data(); if (err) - kvm_err("Cannot map host auxilary data: %d\n", err); + kvm_err("Cannot map host auxiliary data: %d\n", err); return 0; From 1b44471b555930cd8d03ae9be86bd2c32fbf0a92 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 14 Feb 2019 01:45:46 +0000 Subject: [PATCH 069/147] KVM: arm64: Fix comment for KVM_PHYS_SHIFT Since Suzuki K Poulose's work on Dynamic IPA support, KVM_PHYS_SHIFT will be used only when machine_type's bits[7:0] equal to 0 (by default). Thus the outdated comment should be fixed. Reviewed-by: Suzuki K Poulose Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index c423c8c4fc39..b0742a16c6c9 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -138,7 +138,8 @@ static inline unsigned long __kern_hyp_va(unsigned long v) }) /* - * We currently only support a 40bit IPA. + * We currently support using a VM-specified IPA size. For backward + * compatibility, the default IPA size is fixed to 40bits. */ #define KVM_PHYS_SHIFT (40) From c2be79a0bcf34ed975787e03a129936df17ec0dc Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 19 Feb 2019 17:22:21 +0800 Subject: [PATCH 070/147] KVM: arm/arm64: Remove unused gpa_end variable The 'gpa_end' local variable is never used and let's remove it. Cc: Christoffer Dall Signed-off-by: Shaokun Zhang Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index e3e5a26b4845..aae68da2717f 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1597,14 +1597,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, unsigned long hva) { - gpa_t gpa_start, gpa_end; + gpa_t gpa_start; hva_t uaddr_start, uaddr_end; size_t size; size = memslot->npages * PAGE_SIZE; gpa_start = memslot->base_gfn << PAGE_SHIFT; - gpa_end = gpa_start + size; uaddr_start = memslot->userspace_addr; uaddr_end = uaddr_start + size; From 95c7b77d6e40a407ca5bd21d7ba2e1c28ad8e85a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:09 -0800 Subject: [PATCH 071/147] KVM: x86: Explicitly #define the VCPU_REGS_* indices Declaring the VCPU_REGS_* as enums allows for more robust C code, but it prevents using the values in assembly files. Expliciting #define the indices in an asm-friendly file to prepare for VMX moving its transition code to a proper assembly file, but keep the enums for general usage. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 33 ++++++++++++++-------------- arch/x86/include/asm/kvm_vcpu_regs.h | 25 +++++++++++++++++++++ 2 files changed, 42 insertions(+), 16 deletions(-) create mode 100644 arch/x86/include/asm/kvm_vcpu_regs.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4660ce90de7f..0e2ef41efb9d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #define KVM_MAX_VCPUS 288 @@ -137,23 +138,23 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define ASYNC_PF_PER_VCPU 64 enum kvm_reg { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, + VCPU_REGS_RAX = __VCPU_REGS_RAX, + VCPU_REGS_RCX = __VCPU_REGS_RCX, + VCPU_REGS_RDX = __VCPU_REGS_RDX, + VCPU_REGS_RBX = __VCPU_REGS_RBX, + VCPU_REGS_RSP = __VCPU_REGS_RSP, + VCPU_REGS_RBP = __VCPU_REGS_RBP, + VCPU_REGS_RSI = __VCPU_REGS_RSI, + VCPU_REGS_RDI = __VCPU_REGS_RDI, #ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, + VCPU_REGS_R8 = __VCPU_REGS_R8, + VCPU_REGS_R9 = __VCPU_REGS_R9, + VCPU_REGS_R10 = __VCPU_REGS_R10, + VCPU_REGS_R11 = __VCPU_REGS_R11, + VCPU_REGS_R12 = __VCPU_REGS_R12, + VCPU_REGS_R13 = __VCPU_REGS_R13, + VCPU_REGS_R14 = __VCPU_REGS_R14, + VCPU_REGS_R15 = __VCPU_REGS_R15, #endif VCPU_REGS_RIP, NR_VCPU_REGS diff --git a/arch/x86/include/asm/kvm_vcpu_regs.h b/arch/x86/include/asm/kvm_vcpu_regs.h new file mode 100644 index 000000000000..1af2cb59233b --- /dev/null +++ b/arch/x86/include/asm/kvm_vcpu_regs.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_KVM_VCPU_REGS_H +#define _ASM_X86_KVM_VCPU_REGS_H + +#define __VCPU_REGS_RAX 0 +#define __VCPU_REGS_RCX 1 +#define __VCPU_REGS_RDX 2 +#define __VCPU_REGS_RBX 3 +#define __VCPU_REGS_RSP 4 +#define __VCPU_REGS_RBP 5 +#define __VCPU_REGS_RSI 6 +#define __VCPU_REGS_RDI 7 + +#ifdef CONFIG_X86_64 +#define __VCPU_REGS_R8 8 +#define __VCPU_REGS_R9 9 +#define __VCPU_REGS_R10 10 +#define __VCPU_REGS_R11 11 +#define __VCPU_REGS_R12 12 +#define __VCPU_REGS_R13 13 +#define __VCPU_REGS_R14 14 +#define __VCPU_REGS_R15 15 +#endif + +#endif /* _ASM_X86_KVM_VCPU_REGS_H */ From c14f9dd50b01b55834a757dd50af35b8e168512d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:10 -0800 Subject: [PATCH 072/147] KVM: VMX: Use #defines in place of immediates in VM-Enter inline asm ...to prepare for moving the inline asm to a proper asm sub-routine. Eliminating the immediates allows a nearly verbatim move, e.g. quotes, newlines, tabs and __stringify need to be dropped, but other than those cosmetic changes the only function change will be to replace the final "jmp" with a "ret". Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 113 ++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 52 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cd92568bed26..12fb342218ad 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,6 +6371,33 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +#ifdef CONFIG_X86_64 +#define WORD_SIZE 8 +#else +#define WORD_SIZE 4 +#endif + +#define _WORD_SIZE __stringify(WORD_SIZE) + +#define VCPU_RAX __stringify(__VCPU_REGS_RAX * WORD_SIZE) +#define VCPU_RCX __stringify(__VCPU_REGS_RCX * WORD_SIZE) +#define VCPU_RDX __stringify(__VCPU_REGS_RDX * WORD_SIZE) +#define VCPU_RBX __stringify(__VCPU_REGS_RBX * WORD_SIZE) +/* Intentionally omit %RSP as it's context switched by hardware */ +#define VCPU_RBP __stringify(__VCPU_REGS_RBP * WORD_SIZE) +#define VCPU_RSI __stringify(__VCPU_REGS_RSI * WORD_SIZE) +#define VCPU_RDI __stringify(__VCPU_REGS_RDI * WORD_SIZE) +#ifdef CONFIG_X86_64 +#define VCPU_R8 __stringify(__VCPU_REGS_R8 * WORD_SIZE) +#define VCPU_R9 __stringify(__VCPU_REGS_R9 * WORD_SIZE) +#define VCPU_R10 __stringify(__VCPU_REGS_R10 * WORD_SIZE) +#define VCPU_R11 __stringify(__VCPU_REGS_R11 * WORD_SIZE) +#define VCPU_R12 __stringify(__VCPU_REGS_R12 * WORD_SIZE) +#define VCPU_R13 __stringify(__VCPU_REGS_R13 * WORD_SIZE) +#define VCPU_R14 __stringify(__VCPU_REGS_R14 * WORD_SIZE) +#define VCPU_R15 __stringify(__VCPU_REGS_R15 * WORD_SIZE) +#endif + static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6390,7 +6417,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "push %%" _ASM_ARG2 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). */ - "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" + "lea -" _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" "call vmx_update_host_rsp \n\t" /* Load RCX with @regs. */ @@ -6400,24 +6427,24 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "cmpb $0, %%bl \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" - "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t" - "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t" - "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t" - "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t" + "mov " VCPU_RAX "(%%" _ASM_CX "), %%" _ASM_AX " \n\t" + "mov " VCPU_RBX "(%%" _ASM_CX "), %%" _ASM_BX " \n\t" + "mov " VCPU_RDX "(%%" _ASM_CX "), %%" _ASM_DX " \n\t" + "mov " VCPU_RSI "(%%" _ASM_CX "), %%" _ASM_SI " \n\t" + "mov " VCPU_RDI "(%%" _ASM_CX "), %%" _ASM_DI " \n\t" + "mov " VCPU_RBP "(%%" _ASM_CX "), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 - "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t" - "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t" - "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t" - "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t" - "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t" - "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t" - "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t" - "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t" + "mov " VCPU_R8 "(%%" _ASM_CX "), %%r8 \n\t" + "mov " VCPU_R9 "(%%" _ASM_CX "), %%r9 \n\t" + "mov " VCPU_R10 "(%%" _ASM_CX "), %%r10 \n\t" + "mov " VCPU_R11 "(%%" _ASM_CX "), %%r11 \n\t" + "mov " VCPU_R12 "(%%" _ASM_CX "), %%r12 \n\t" + "mov " VCPU_R13 "(%%" _ASM_CX "), %%r13 \n\t" + "mov " VCPU_R14 "(%%" _ASM_CX "), %%r14 \n\t" + "mov " VCPU_R15 "(%%" _ASM_CX "), %%r15 \n\t" #endif /* Load guest RCX. This kills the vmx_vcpu pointer! */ - "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t" + "mov " VCPU_RCX"(%%" _ASM_CX "), %%" _ASM_CX " \n\t" /* Enter guest mode */ "call vmx_vmenter\n\t" @@ -6427,25 +6454,25 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "push %%" _ASM_CX " \n\t" /* Reload RCX with @regs. */ - "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" + "mov " _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Save all guest registers, including RCX from the stack */ - "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" - __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_AX ", " VCPU_RAX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_BX ", " VCPU_RBX "(%%" _ASM_CX ") \n\t" + __ASM_SIZE(pop) " " VCPU_RCX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_DX ", " VCPU_RDX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_SI ", " VCPU_RSI "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_DI ", " VCPU_RDI "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_BP ", " VCPU_RBP "(%%" _ASM_CX ") \n\t" #ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t" - "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t" - "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t" - "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t" - "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t" - "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" - "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" - "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" + "mov %%r8, " VCPU_R8 "(%%" _ASM_CX ") \n\t" + "mov %%r9, " VCPU_R9 "(%%" _ASM_CX ") \n\t" + "mov %%r10, " VCPU_R10 "(%%" _ASM_CX ") \n\t" + "mov %%r11, " VCPU_R11 "(%%" _ASM_CX ") \n\t" + "mov %%r12, " VCPU_R12 "(%%" _ASM_CX ") \n\t" + "mov %%r13, " VCPU_R13 "(%%" _ASM_CX ") \n\t" + "mov %%r14, " VCPU_R14 "(%%" _ASM_CX ") \n\t" + "mov %%r15, " VCPU_R15 "(%%" _ASM_CX ") \n\t" #endif /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ @@ -6479,7 +6506,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%ebp, %%ebp \n\t" /* "POP" the vcpu_vmx pointer. */ - "add $%c[wordsize], %%" _ASM_SP " \n\t" + "add $" _WORD_SIZE ", %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" "jmp 3f \n\t" @@ -6497,25 +6524,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "=a"((int){0}), "=d"((int){0}) : "a"(vmx), "d"(&vcpu->arch.regs), #endif - "b"(vmx->loaded_vmcs->launched), - [rax]"i"(VCPU_REGS_RAX * sizeof(ulong)), - [rbx]"i"(VCPU_REGS_RBX * sizeof(ulong)), - [rcx]"i"(VCPU_REGS_RCX * sizeof(ulong)), - [rdx]"i"(VCPU_REGS_RDX * sizeof(ulong)), - [rsi]"i"(VCPU_REGS_RSI * sizeof(ulong)), - [rdi]"i"(VCPU_REGS_RDI * sizeof(ulong)), - [rbp]"i"(VCPU_REGS_RBP * sizeof(ulong)), -#ifdef CONFIG_X86_64 - [r8]"i"(VCPU_REGS_R8 * sizeof(ulong)), - [r9]"i"(VCPU_REGS_R9 * sizeof(ulong)), - [r10]"i"(VCPU_REGS_R10 * sizeof(ulong)), - [r11]"i"(VCPU_REGS_R11 * sizeof(ulong)), - [r12]"i"(VCPU_REGS_R12 * sizeof(ulong)), - [r13]"i"(VCPU_REGS_R13 * sizeof(ulong)), - [r14]"i"(VCPU_REGS_R14 * sizeof(ulong)), - [r15]"i"(VCPU_REGS_R15 * sizeof(ulong)), -#endif - [wordsize]"i"(sizeof(ulong)) + "b"(vmx->loaded_vmcs->launched) : "cc", "memory" #ifdef CONFIG_X86_64 , "rax", "rcx", "rdx" From 63c73aa07fcabc090661a586f7ae5200a0fc5cb4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:11 -0800 Subject: [PATCH 073/147] KVM: VMX: Create a stack frame in vCPU-run ...in preparation for moving to a proper assembly sub-routnine. vCPU-run isn't a leaf function since it calls vmx_update_host_rsp() and vmx_vmenter(). And since we need to save/restore RBP anyways, unconditionally creating the frame costs a single MOV, i.e. don't bother keying off CONFIG_FRAME_POINTER or using FRAME_BEGIN, etc... Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 12fb342218ad..52fea89732db 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6407,8 +6407,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) write_cr2(vcpu->arch.cr2); asm( - /* Store host registers */ "push %%" _ASM_BP " \n\t" + "mov %%" _ASM_SP ", %%" _ASM_BP " \n\t" /* * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and From 5e0781df18990497f4cd96a959ca583d83b6d1e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:12 -0800 Subject: [PATCH 074/147] KVM: VMX: Move vCPU-run code to a proper assembly routine As evidenced by the myriad patches leading up to this moment, using an inline asm blob for vCPU-run is nothing short of horrific. It's also been called "unholy", "an abomination" and likely a whole host of other names that would violate the Code of Conduct if recorded here and now. The code is relocated nearly verbatim, e.g. quotes, newlines, tabs and __stringify need to be dropped, but other than those cosmetic changes the only functional changees are to add the "call" and replace the final "jmp" with a "ret". Note that STACK_FRAME_NON_STANDARD is also dropped from __vmx_vcpu_run(). Suggested-by: Andi Kleen Suggested-by: Josh Poimboeuf Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 144 +++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 138 +---------------------------------- 2 files changed, 145 insertions(+), 137 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bcef2c7e9bc4..e64617f3b196 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -1,6 +1,30 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include +#include + +#define WORD_SIZE (BITS_PER_LONG / 8) + +#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE +#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE +#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE +#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +/* Intentionally omit RSP as it's context switched by hardware */ +#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE +#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE +#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE + +#ifdef CONFIG_X86_64 +#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE +#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE +#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE +#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE +#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE +#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE +#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE +#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#endif .text @@ -55,3 +79,123 @@ ENDPROC(vmx_vmenter) ENTRY(vmx_vmexit) ret ENDPROC(vmx_vmexit) + +/** + * ____vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode + * @vmx: struct vcpu_vmx * + * @regs: unsigned long * (to guest registers) + * %RBX: VMCS launched status (non-zero indicates already launched) + * + * Returns: + * %RBX is 0 on VM-Exit, 1 on VM-Fail + */ +ENTRY(____vmx_vcpu_run) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + push %_ASM_ARG2 + + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ + lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 + call vmx_update_host_rsp + + /* Load @regs to RCX. */ + mov (%_ASM_SP), %_ASM_CX + + /* Check if vmlaunch or vmresume is needed */ + cmpb $0, %bl + + /* Load guest registers. Don't clobber flags. */ + mov VCPU_RAX(%_ASM_CX), %_ASM_AX + mov VCPU_RBX(%_ASM_CX), %_ASM_BX + mov VCPU_RDX(%_ASM_CX), %_ASM_DX + mov VCPU_RSI(%_ASM_CX), %_ASM_SI + mov VCPU_RDI(%_ASM_CX), %_ASM_DI + mov VCPU_RBP(%_ASM_CX), %_ASM_BP +#ifdef CONFIG_X86_64 + mov VCPU_R8 (%_ASM_CX), %r8 + mov VCPU_R9 (%_ASM_CX), %r9 + mov VCPU_R10(%_ASM_CX), %r10 + mov VCPU_R11(%_ASM_CX), %r11 + mov VCPU_R12(%_ASM_CX), %r12 + mov VCPU_R13(%_ASM_CX), %r13 + mov VCPU_R14(%_ASM_CX), %r14 + mov VCPU_R15(%_ASM_CX), %r15 +#endif + /* Load guest RCX. This kills the vmx_vcpu pointer! */ + mov VCPU_RCX(%_ASM_CX), %_ASM_CX + + /* Enter guest mode */ + call vmx_vmenter + + /* Jump on VM-Fail. */ + jbe 2f + + /* Temporarily save guest's RCX. */ + push %_ASM_CX + + /* Reload @regs to RCX. */ + mov WORD_SIZE(%_ASM_SP), %_ASM_CX + + /* Save all guest registers, including RCX from the stack */ + mov %_ASM_AX, VCPU_RAX(%_ASM_CX) + mov %_ASM_BX, VCPU_RBX(%_ASM_CX) + __ASM_SIZE(pop) VCPU_RCX(%_ASM_CX) + mov %_ASM_DX, VCPU_RDX(%_ASM_CX) + mov %_ASM_SI, VCPU_RSI(%_ASM_CX) + mov %_ASM_DI, VCPU_RDI(%_ASM_CX) + mov %_ASM_BP, VCPU_RBP(%_ASM_CX) +#ifdef CONFIG_X86_64 + mov %r8, VCPU_R8 (%_ASM_CX) + mov %r9, VCPU_R9 (%_ASM_CX) + mov %r10, VCPU_R10(%_ASM_CX) + mov %r11, VCPU_R11(%_ASM_CX) + mov %r12, VCPU_R12(%_ASM_CX) + mov %r13, VCPU_R13(%_ASM_CX) + mov %r14, VCPU_R14(%_ASM_CX) + mov %r15, VCPU_R15(%_ASM_CX) +#endif + + /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ + xor %ebx, %ebx + + /* + * Clear all general purpose registers except RSP and RBX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RBX are exempt as RSP is restored by hardware during + * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. + */ +1: +#ifdef CONFIG_X86_64 + xor %r8d, %r8d + xor %r9d, %r9d + xor %r10d, %r10d + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d +#endif + xor %eax, %eax + xor %ecx, %ecx + xor %edx, %edx + xor %esi, %esi + xor %edi, %edi + xor %ebp, %ebp + + /* "POP" @regs. */ + add $WORD_SIZE, %_ASM_SP + pop %_ASM_BP + ret + + /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ +2: mov $1, %ebx + jmp 1b +ENDPROC(____vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 52fea89732db..0e6e2dd6265e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,33 +6371,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -#ifdef CONFIG_X86_64 -#define WORD_SIZE 8 -#else -#define WORD_SIZE 4 -#endif - -#define _WORD_SIZE __stringify(WORD_SIZE) - -#define VCPU_RAX __stringify(__VCPU_REGS_RAX * WORD_SIZE) -#define VCPU_RCX __stringify(__VCPU_REGS_RCX * WORD_SIZE) -#define VCPU_RDX __stringify(__VCPU_REGS_RDX * WORD_SIZE) -#define VCPU_RBX __stringify(__VCPU_REGS_RBX * WORD_SIZE) -/* Intentionally omit %RSP as it's context switched by hardware */ -#define VCPU_RBP __stringify(__VCPU_REGS_RBP * WORD_SIZE) -#define VCPU_RSI __stringify(__VCPU_REGS_RSI * WORD_SIZE) -#define VCPU_RDI __stringify(__VCPU_REGS_RDI * WORD_SIZE) -#ifdef CONFIG_X86_64 -#define VCPU_R8 __stringify(__VCPU_REGS_R8 * WORD_SIZE) -#define VCPU_R9 __stringify(__VCPU_REGS_R9 * WORD_SIZE) -#define VCPU_R10 __stringify(__VCPU_REGS_R10 * WORD_SIZE) -#define VCPU_R11 __stringify(__VCPU_REGS_R11 * WORD_SIZE) -#define VCPU_R12 __stringify(__VCPU_REGS_R12 * WORD_SIZE) -#define VCPU_R13 __stringify(__VCPU_REGS_R13 * WORD_SIZE) -#define VCPU_R14 __stringify(__VCPU_REGS_R14 * WORD_SIZE) -#define VCPU_R15 __stringify(__VCPU_REGS_R15 * WORD_SIZE) -#endif - static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6407,115 +6380,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) write_cr2(vcpu->arch.cr2); asm( - "push %%" _ASM_BP " \n\t" - "mov %%" _ASM_SP ", %%" _ASM_BP " \n\t" - - /* - * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and - * @regs is needed after VM-Exit to save the guest's register values. - */ - "push %%" _ASM_ARG2 " \n\t" - - /* Adjust RSP to account for the CALL to vmx_vmenter(). */ - "lea -" _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" - "call vmx_update_host_rsp \n\t" - - /* Load RCX with @regs. */ - "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" - - /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %%bl \n\t" - - /* Load guest registers. Don't clobber flags. */ - "mov " VCPU_RAX "(%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov " VCPU_RBX "(%%" _ASM_CX "), %%" _ASM_BX " \n\t" - "mov " VCPU_RDX "(%%" _ASM_CX "), %%" _ASM_DX " \n\t" - "mov " VCPU_RSI "(%%" _ASM_CX "), %%" _ASM_SI " \n\t" - "mov " VCPU_RDI "(%%" _ASM_CX "), %%" _ASM_DI " \n\t" - "mov " VCPU_RBP "(%%" _ASM_CX "), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov " VCPU_R8 "(%%" _ASM_CX "), %%r8 \n\t" - "mov " VCPU_R9 "(%%" _ASM_CX "), %%r9 \n\t" - "mov " VCPU_R10 "(%%" _ASM_CX "), %%r10 \n\t" - "mov " VCPU_R11 "(%%" _ASM_CX "), %%r11 \n\t" - "mov " VCPU_R12 "(%%" _ASM_CX "), %%r12 \n\t" - "mov " VCPU_R13 "(%%" _ASM_CX "), %%r13 \n\t" - "mov " VCPU_R14 "(%%" _ASM_CX "), %%r14 \n\t" - "mov " VCPU_R15 "(%%" _ASM_CX "), %%r15 \n\t" -#endif - /* Load guest RCX. This kills the vmx_vcpu pointer! */ - "mov " VCPU_RCX"(%%" _ASM_CX "), %%" _ASM_CX " \n\t" - - /* Enter guest mode */ - "call vmx_vmenter\n\t" - "jbe 2f \n\t" - - /* Temporarily save guest's RCX. */ - "push %%" _ASM_CX " \n\t" - - /* Reload RCX with @regs. */ - "mov " _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_CX " \n\t" - - /* Save all guest registers, including RCX from the stack */ - "mov %%" _ASM_AX ", " VCPU_RAX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BX ", " VCPU_RBX "(%%" _ASM_CX ") \n\t" - __ASM_SIZE(pop) " " VCPU_RCX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DX ", " VCPU_RDX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_SI ", " VCPU_RSI "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DI ", " VCPU_RDI "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BP ", " VCPU_RBP "(%%" _ASM_CX ") \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, " VCPU_R8 "(%%" _ASM_CX ") \n\t" - "mov %%r9, " VCPU_R9 "(%%" _ASM_CX ") \n\t" - "mov %%r10, " VCPU_R10 "(%%" _ASM_CX ") \n\t" - "mov %%r11, " VCPU_R11 "(%%" _ASM_CX ") \n\t" - "mov %%r12, " VCPU_R12 "(%%" _ASM_CX ") \n\t" - "mov %%r13, " VCPU_R13 "(%%" _ASM_CX ") \n\t" - "mov %%r14, " VCPU_R14 "(%%" _ASM_CX ") \n\t" - "mov %%r15, " VCPU_R15 "(%%" _ASM_CX ") \n\t" -#endif - - /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ - "xor %%ebx, %%ebx \n\t" - - /* - * Clear all general purpose registers except RSP and RBX to prevent - * speculative use of the guest's values, even those that are reloaded - * via the stack. In theory, an L1 cache miss when restoring registers - * could lead to speculative execution with the guest's values. - * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially - * free. RSP and RBX are exempt as RSP is restored by hardware during - * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. - */ - "1: \n\t" -#ifdef CONFIG_X86_64 - "xor %%r8d, %%r8d \n\t" - "xor %%r9d, %%r9d \n\t" - "xor %%r10d, %%r10d \n\t" - "xor %%r11d, %%r11d \n\t" - "xor %%r12d, %%r12d \n\t" - "xor %%r13d, %%r13d \n\t" - "xor %%r14d, %%r14d \n\t" - "xor %%r15d, %%r15d \n\t" -#endif - "xor %%eax, %%eax \n\t" - "xor %%ecx, %%ecx \n\t" - "xor %%edx, %%edx \n\t" - "xor %%esi, %%esi \n\t" - "xor %%edi, %%edi \n\t" - "xor %%ebp, %%ebp \n\t" - - /* "POP" the vcpu_vmx pointer. */ - "add $" _WORD_SIZE ", %%" _ASM_SP " \n\t" - "pop %%" _ASM_BP " \n\t" - "jmp 3f \n\t" - - /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ - "2: \n\t" - "mov $1, %%ebx \n\t" - "jmp 1b \n\t" - "3: \n\t" - + "call ____vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}) @@ -6536,7 +6401,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) vcpu->arch.cr2 = read_cr2(); } -STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { From c823dd5c0f3fafa595ed51cc72c6e006efc20ad3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:13 -0800 Subject: [PATCH 075/147] KVM: VMX: Fold __vmx_vcpu_run() back into vmx_vcpu_run() ...now that the code is no longer tagged with STACK_FRAME_NON_STANDARD. Arguably, providing __vmx_vcpu_run() to break up vmx_vcpu_run() is valuable on its own, but the previous split was purposely made as small as possible to limit the effects STACK_FRAME_NON_STANDARD. In other words, the current split is now completely arbitrary and likely not the most logical. This also allows renaming ____vmx_vcpu_run() to __vmx_vcpu_run() in a future patch. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 59 +++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0e6e2dd6265e..e61bb8da9767 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,37 +6371,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) -{ - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - - if (vcpu->arch.cr2 != read_cr2()) - write_cr2(vcpu->arch.cr2); - - asm( - "call ____vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), -#ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), -#else - "=a"((int){0}), "=d"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), -#endif - "b"(vmx->loaded_vmcs->launched) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ecx", "edi", "esi" -#endif - ); - - vcpu->arch.cr2 = read_cr2(); -} - static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6469,7 +6438,33 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - __vmx_vcpu_run(vcpu, vmx); + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + + if (vcpu->arch.cr2 != read_cr2()) + write_cr2(vcpu->arch.cr2); + + asm( + "call ____vmx_vcpu_run \n\t" + : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), +#ifdef CONFIG_X86_64 + "=D"((int){0}), "=S"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), +#else + "=a"((int){0}), "=d"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), +#endif + "b"(vmx->loaded_vmcs->launched) + : "cc", "memory" +#ifdef CONFIG_X86_64 + , "rax", "rcx", "rdx" + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "ecx", "edi", "esi" +#endif + ); + + vcpu->arch.cr2 = read_cr2(); /* * We do not use IBRS in the kernel. If this vCPU has used the From ee2fc635ef714b1d46da3616bbec972067f0d187 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:14 -0800 Subject: [PATCH 076/147] KVM: VMX: Rename ____vmx_vcpu_run() to __vmx_vcpu_run() ...now that the name is no longer usurped by a defunct helper function. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 6 +++--- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e64617f3b196..fa6e03b36348 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -81,7 +81,7 @@ ENTRY(vmx_vmexit) ENDPROC(vmx_vmexit) /** - * ____vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode + * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode * @vmx: struct vcpu_vmx * * @regs: unsigned long * (to guest registers) * %RBX: VMCS launched status (non-zero indicates already launched) @@ -89,7 +89,7 @@ ENDPROC(vmx_vmexit) * Returns: * %RBX is 0 on VM-Exit, 1 on VM-Fail */ -ENTRY(____vmx_vcpu_run) +ENTRY(__vmx_vcpu_run) push %_ASM_BP mov %_ASM_SP, %_ASM_BP @@ -198,4 +198,4 @@ ENTRY(____vmx_vcpu_run) /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ 2: mov $1, %ebx jmp 1b -ENDPROC(____vmx_vcpu_run) +ENDPROC(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e61bb8da9767..8bc8f09eaf0c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6445,7 +6445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) write_cr2(vcpu->arch.cr2); asm( - "call ____vmx_vcpu_run \n\t" + "call __vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}) From a62fd5a76c99dd96c74c6638408961b7ff3c71c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:15 -0800 Subject: [PATCH 077/147] KVM: VMX: Use RAX as the scratch register during vCPU-run ...to prepare for making the sub-routine callable from C code. That means returning the result in RAX. Since RAX will be used to return the result, use it as the scratch register as well to make the code readable and to document that the scratch register is more or less arbitrary. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 76 +++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index fa6e03b36348..7d8b09abcdec 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -103,31 +103,31 @@ ENTRY(__vmx_vcpu_run) lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp - /* Load @regs to RCX. */ - mov (%_ASM_SP), %_ASM_CX + /* Load @regs to RAX. */ + mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ cmpb $0, %bl /* Load guest registers. Don't clobber flags. */ - mov VCPU_RAX(%_ASM_CX), %_ASM_AX - mov VCPU_RBX(%_ASM_CX), %_ASM_BX - mov VCPU_RDX(%_ASM_CX), %_ASM_DX - mov VCPU_RSI(%_ASM_CX), %_ASM_SI - mov VCPU_RDI(%_ASM_CX), %_ASM_DI - mov VCPU_RBP(%_ASM_CX), %_ASM_BP + mov VCPU_RBX(%_ASM_AX), %_ASM_BX + mov VCPU_RCX(%_ASM_AX), %_ASM_CX + mov VCPU_RDX(%_ASM_AX), %_ASM_DX + mov VCPU_RSI(%_ASM_AX), %_ASM_SI + mov VCPU_RDI(%_ASM_AX), %_ASM_DI + mov VCPU_RBP(%_ASM_AX), %_ASM_BP #ifdef CONFIG_X86_64 - mov VCPU_R8 (%_ASM_CX), %r8 - mov VCPU_R9 (%_ASM_CX), %r9 - mov VCPU_R10(%_ASM_CX), %r10 - mov VCPU_R11(%_ASM_CX), %r11 - mov VCPU_R12(%_ASM_CX), %r12 - mov VCPU_R13(%_ASM_CX), %r13 - mov VCPU_R14(%_ASM_CX), %r14 - mov VCPU_R15(%_ASM_CX), %r15 + mov VCPU_R8 (%_ASM_AX), %r8 + mov VCPU_R9 (%_ASM_AX), %r9 + mov VCPU_R10(%_ASM_AX), %r10 + mov VCPU_R11(%_ASM_AX), %r11 + mov VCPU_R12(%_ASM_AX), %r12 + mov VCPU_R13(%_ASM_AX), %r13 + mov VCPU_R14(%_ASM_AX), %r14 + mov VCPU_R15(%_ASM_AX), %r15 #endif - /* Load guest RCX. This kills the vmx_vcpu pointer! */ - mov VCPU_RCX(%_ASM_CX), %_ASM_CX + /* Load guest RAX. This kills the vmx_vcpu pointer! */ + mov VCPU_RAX(%_ASM_AX), %_ASM_AX /* Enter guest mode */ call vmx_vmenter @@ -135,29 +135,29 @@ ENTRY(__vmx_vcpu_run) /* Jump on VM-Fail. */ jbe 2f - /* Temporarily save guest's RCX. */ - push %_ASM_CX + /* Temporarily save guest's RAX. */ + push %_ASM_AX - /* Reload @regs to RCX. */ - mov WORD_SIZE(%_ASM_SP), %_ASM_CX + /* Reload @regs to RAX. */ + mov WORD_SIZE(%_ASM_SP), %_ASM_AX - /* Save all guest registers, including RCX from the stack */ - mov %_ASM_AX, VCPU_RAX(%_ASM_CX) - mov %_ASM_BX, VCPU_RBX(%_ASM_CX) - __ASM_SIZE(pop) VCPU_RCX(%_ASM_CX) - mov %_ASM_DX, VCPU_RDX(%_ASM_CX) - mov %_ASM_SI, VCPU_RSI(%_ASM_CX) - mov %_ASM_DI, VCPU_RDI(%_ASM_CX) - mov %_ASM_BP, VCPU_RBP(%_ASM_CX) + /* Save all guest registers, including RAX from the stack */ + __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_CX, VCPU_RCX(%_ASM_AX) + mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_SI, VCPU_RSI(%_ASM_AX) + mov %_ASM_DI, VCPU_RDI(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) #ifdef CONFIG_X86_64 - mov %r8, VCPU_R8 (%_ASM_CX) - mov %r9, VCPU_R9 (%_ASM_CX) - mov %r10, VCPU_R10(%_ASM_CX) - mov %r11, VCPU_R11(%_ASM_CX) - mov %r12, VCPU_R12(%_ASM_CX) - mov %r13, VCPU_R13(%_ASM_CX) - mov %r14, VCPU_R14(%_ASM_CX) - mov %r15, VCPU_R15(%_ASM_CX) + mov %r8, VCPU_R8 (%_ASM_AX) + mov %r9, VCPU_R9 (%_ASM_AX) + mov %r10, VCPU_R10(%_ASM_AX) + mov %r11, VCPU_R11(%_ASM_AX) + mov %r12, VCPU_R12(%_ASM_AX) + mov %r13, VCPU_R13(%_ASM_AX) + mov %r14, VCPU_R14(%_ASM_AX) + mov %r15, VCPU_R15(%_ASM_AX) #endif /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ From 77df549559dbe7f265ab19bd444d6acb3a718b4d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:16 -0800 Subject: [PATCH 078/147] KVM: VMX: Pass @launched to the vCPU-run asm via standard ABI regs ...to prepare for making the sub-routine callable from C code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 5 ++++- arch/x86/kvm/vmx/vmx.c | 13 ++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 7d8b09abcdec..a3d9a8e062f9 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -84,7 +84,7 @@ ENDPROC(vmx_vmexit) * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode * @vmx: struct vcpu_vmx * * @regs: unsigned long * (to guest registers) - * %RBX: VMCS launched status (non-zero indicates already launched) + * @launched: %true if the VMCS has been launched * * Returns: * %RBX is 0 on VM-Exit, 1 on VM-Fail @@ -99,6 +99,9 @@ ENTRY(__vmx_vcpu_run) */ push %_ASM_ARG2 + /* Copy @launched to BL, _ASM_ARG3 is volatile. */ + mov %_ASM_ARG3B, %bl + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8bc8f09eaf0c..1b73a82444a2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6448,19 +6448,18 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) "call __vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), + "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) #else - "=a"((int){0}), "=d"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), + "=a"((int){0}), "=d"((int){0}), "=c"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) #endif - "b"(vmx->loaded_vmcs->launched) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx" + , "rax", "rcx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ecx", "edi", "esi" + , "edi", "esi" #endif ); From e75c3c3a0487da878cbfa7f125dcd080a8606eaf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:17 -0800 Subject: [PATCH 079/147] KVM: VMX: Return VM-Fail from vCPU-run assembly via standard ABI reg ...to prepare for making the assembly sub-routine callable from C code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 16 ++++++++-------- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index a3d9a8e062f9..e06a3f33311e 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -87,7 +87,7 @@ ENDPROC(vmx_vmexit) * @launched: %true if the VMCS has been launched * * Returns: - * %RBX is 0 on VM-Exit, 1 on VM-Fail + * 0 on VM-Exit, 1 on VM-Fail */ ENTRY(__vmx_vcpu_run) push %_ASM_BP @@ -163,17 +163,17 @@ ENTRY(__vmx_vcpu_run) mov %r15, VCPU_R15(%_ASM_AX) #endif - /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ - xor %ebx, %ebx + /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ + xor %eax, %eax /* - * Clear all general purpose registers except RSP and RBX to prevent + * Clear all general purpose registers except RSP and RAX to prevent * speculative use of the guest's values, even those that are reloaded * via the stack. In theory, an L1 cache miss when restoring registers * could lead to speculative execution with the guest's values. * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially - * free. RSP and RBX are exempt as RSP is restored by hardware during - * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. + * free. RSP and RAX are exempt as RSP is restored by hardware during + * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. */ 1: #ifdef CONFIG_X86_64 @@ -186,7 +186,7 @@ ENTRY(__vmx_vcpu_run) xor %r14d, %r14d xor %r15d, %r15d #endif - xor %eax, %eax + xor %ebx, %ebx xor %ecx, %ecx xor %edx, %edx xor %esi, %esi @@ -199,6 +199,6 @@ ENTRY(__vmx_vcpu_run) ret /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ -2: mov $1, %ebx +2: mov $1, %eax jmp 1b ENDPROC(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1b73a82444a2..9a1d27e77684 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6446,20 +6446,20 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm( "call __vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), + : ASM_CALL_CONSTRAINT, "=a"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) #else - "=a"((int){0}), "=d"((int){0}), "=c"((int){0}) + "=d"((int){0}), "=c"((int){0}) : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) #endif : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx" + , "rbx", "rcx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "edi", "esi" + , "ebx", "edi", "esi" #endif ); From 3b895ef48615382db03adcf125e0db8437b9acbe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:18 -0800 Subject: [PATCH 080/147] KVM: VMX: Preserve callee-save registers in vCPU-run asm sub-routine ...to make it callable from C code. Note that because KVM chooses to be ultra paranoid about guest register values, all callee-save registers are still cleared after VM-Exit even though the host's values are now reloaded from the stack. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 21 +++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 5 +---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e06a3f33311e..d325f1d6110b 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -92,6 +92,16 @@ ENDPROC(vmx_vmexit) ENTRY(__vmx_vcpu_run) push %_ASM_BP mov %_ASM_SP, %_ASM_BP +#ifdef CONFIG_X86_64 + push %r15 + push %r14 + push %r13 + push %r12 +#else + push %edi + push %esi +#endif + push %_ASM_BX /* * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and @@ -195,6 +205,17 @@ ENTRY(__vmx_vcpu_run) /* "POP" @regs. */ add $WORD_SIZE, %_ASM_SP + pop %_ASM_BX + +#ifdef CONFIG_X86_64 + pop %r12 + pop %r13 + pop %r14 + pop %r15 +#else + pop %esi + pop %edi +#endif pop %_ASM_BP ret diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9a1d27e77684..43723d0007be 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6456,10 +6456,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif : "cc", "memory" #ifdef CONFIG_X86_64 - , "rbx", "rcx" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "esi" + , "rcx", "r8", "r9", "r10", "r11" #endif ); From fc2ba5a27a1aaa16b664e64f85e0e1307d2bde3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:19 -0800 Subject: [PATCH 081/147] KVM: VMX: Call vCPU-run asm sub-routine from C and remove clobbering ...now that the sub-routine follows standard calling conventions. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 43723d0007be..c39f1c38b878 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,6 +6371,8 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); + static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6444,21 +6446,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.cr2 != read_cr2()) write_cr2(vcpu->arch.cr2); - asm( - "call __vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=a"(vmx->fail), -#ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) -#else - "=d"((int){0}), "=c"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) -#endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rcx", "r8", "r9", "r10", "r11" -#endif - ); + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); vcpu->arch.cr2 = read_cr2(); From 4f44c4eec5b72020c6d99e9e4f3d0505874c98c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:20 -0800 Subject: [PATCH 082/147] KVM: VMX: Reorder clearing of registers in the vCPU-run assembly flow Move the clearing of the common registers (not 64-bit-only) to the start of the flow that clears registers holding guest state. This is purely a cosmetic change so that the label doesn't point at a blank line and a #define. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index d325f1d6110b..7b272738c576 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -185,7 +185,12 @@ ENTRY(__vmx_vcpu_run) * free. RSP and RAX are exempt as RSP is restored by hardware during * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. */ -1: +1: xor %ebx, %ebx + xor %ecx, %ecx + xor %edx, %edx + xor %esi, %esi + xor %edi, %edi + xor %ebp, %ebp #ifdef CONFIG_X86_64 xor %r8d, %r8d xor %r9d, %r9d @@ -196,12 +201,6 @@ ENTRY(__vmx_vcpu_run) xor %r14d, %r14d xor %r15d, %r15d #endif - xor %ebx, %ebx - xor %ecx, %ecx - xor %edx, %edx - xor %esi, %esi - xor %edi, %edi - xor %ebp, %ebp /* "POP" @regs. */ add $WORD_SIZE, %_ASM_SP From b5179ec4187251a751832193693d6e474d3445ac Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Sat, 26 Jan 2019 12:49:56 -0500 Subject: [PATCH 083/147] x86/kvmclock: set offset for kvm unstable clock VMs may show incorrect uptime and dmesg printk offsets on hypervisors with unstable clock. The problem is produced when VM is rebooted without exiting from qemu. The fix is to calculate clock offset not only for stable clock but for unstable clock as well, and use kvm_sched_clock_read() which substracts the offset for both clocks. This is safe, because pvclock_clocksource_read() does the right thing and makes sure that clock always goes forward, so once offset is calculated with unstable clock, we won't get new reads that are smaller than offset, and thus won't get negative results. Thank you Jon DeVree for helping to reproduce this issue. Fixes: 857baa87b642 ("sched/clock: Enable sched clock early") Cc: stable@vger.kernel.org Reported-by: Dominique Martinet Signed-off-by: Pavel Tatashin Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e811d4d1c824..d908a37bf3f3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -104,12 +104,8 @@ static u64 kvm_sched_clock_read(void) static inline void kvm_sched_clock_init(bool stable) { - if (!stable) { - pv_ops.time.sched_clock = kvm_clock_read; + if (!stable) clear_sched_clock_stable(); - return; - } - kvm_sched_clock_offset = kvm_clock_read(); pv_ops.time.sched_clock = kvm_sched_clock_read; From 90952cd388595317170ad22a6bcee3cb8cde4942 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 30 Jan 2019 17:07:47 +0100 Subject: [PATCH 084/147] kvm: Use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kmalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 585845203db8..791ced69dd0b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3714,8 +3714,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; - new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * - sizeof(struct kvm_io_range)), GFP_KERNEL); + new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), + GFP_KERNEL); if (!new_bus) return -ENOMEM; @@ -3760,8 +3760,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, if (i == bus->dev_count) return; - new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * - sizeof(struct kvm_io_range)), GFP_KERNEL); + new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), + GFP_KERNEL); if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); goto broken; From 98d90582be2e08246a70af31e09950ecb8876252 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Tue, 29 Jan 2019 08:08:42 +0000 Subject: [PATCH 085/147] svm: Fix AVIC DFR and LDR handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current SVM AVIC driver makes two incorrect assumptions: 1. APIC LDR register cannot be zero 2. APIC DFR for all vCPUs must be the same LDR=0 means the local APIC does not support logical destination mode. Therefore, the driver should mark any previously assigned logical APIC ID table entry as invalid, and return success. Also, DFR is specific to a particular local APIC, and can be different among all vCPUs (as observed on Windows 10). These incorrect assumptions cause Windows 10 and FreeBSD VMs to fail to boot with AVIC enabled. So, instead of flush the whole logical APIC ID table, handle DFR and LDR for each vCPU independently. Fixes: 18f40c53e10f ('svm: Add VMEXIT handlers for AVIC') Cc: Radim Krčmář Cc: Paolo Bonzini Reported-by: Julian Stecklina Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 64 ++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f13a3a24d360..01b66bbcdd7a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -145,7 +145,6 @@ struct kvm_svm { /* Struct members for AVIC */ u32 avic_vm_id; - u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; struct hlist_node hnode; @@ -236,6 +235,7 @@ struct vcpu_svm { bool nrips_enabled : 1; u32 ldr_reg; + u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; bool avic_is_running; @@ -2106,6 +2106,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) INIT_LIST_HEAD(&svm->ir_list); spin_lock_init(&svm->ir_list_lock); + svm->dfr_reg = APIC_DFR_FLAT; return ret; } @@ -4565,8 +4566,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) return &logical_apic_id_table[index]; } -static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, - bool valid) +static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) { bool flat; u32 *entry, new_entry; @@ -4579,31 +4579,39 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, new_entry = READ_ONCE(*entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); - if (valid) - new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; - else - new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); return 0; } +static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool flat = svm->dfr_reg == APIC_DFR_FLAT; + u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + + if (entry) + WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK); +} + static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) { - int ret; + int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); - if (!ldr) - return 1; + if (ldr == svm->ldr_reg) + return 0; - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); - if (ret && svm->ldr_reg) { - avic_ldr_write(vcpu, 0, svm->ldr_reg, false); - svm->ldr_reg = 0; - } else { + avic_invalidate_logical_id_entry(vcpu); + + if (ldr) + ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + + if (!ret) svm->ldr_reg = ldr; - } + return ret; } @@ -4637,27 +4645,16 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) return 0; } -static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) +static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); - u32 mod = (dfr >> 28) & 0xf; - /* - * We assume that all local APICs are using the same type. - * If this changes, we need to flush the AVIC logical - * APID id table. - */ - if (kvm_svm->ldr_mode == mod) - return 0; + if (svm->dfr_reg == dfr) + return; - clear_page(page_address(kvm_svm->avic_logical_id_table_page)); - kvm_svm->ldr_mode = mod; - - if (svm->ldr_reg) - avic_handle_ldr_update(vcpu); - return 0; + avic_invalidate_logical_id_entry(vcpu); + svm->dfr_reg = dfr; } static int avic_unaccel_trap_write(struct vcpu_svm *svm) @@ -6163,8 +6160,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) return; - if (avic_handle_dfr_update(vcpu) != 0) - return; + avic_handle_dfr_update(vcpu); avic_handle_ldr_update(vcpu); } From f7589cca50ef8970dda56cdced8d96d05aa0a777 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 30 Jan 2019 17:18:59 +0100 Subject: [PATCH 086/147] KVM: x86: cull apicv code when userspace irqchip is requested Currently apicv_active can be true even if in-kernel LAPIC emulation is disabled. Avoid this by properly initializing it in kvm_arch_vcpu_init, and then do not do anything to deactivate APICv when it is actually not used (Currently APICv is only deactivated by SynIC code that in turn is only reachable when in-kernel LAPIC is in use. However, it is cleaner if kvm_vcpu_deactivate_apicv avoids relying on this. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e67ecf25e690..a1fb99f21317 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7055,6 +7055,13 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) { + if (!lapic_in_kernel(vcpu)) { + WARN_ON_ONCE(vcpu->arch.apicv_active); + return; + } + if (!vcpu->arch.apicv_active) + return; + vcpu->arch.apicv_active = false; kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } @@ -9005,7 +9012,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) struct page *page; int r; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -9026,6 +9032,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_pio_data; if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; From c57cd3c89ecf2812976f53e494580a396f93efd2 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Tue, 29 Jan 2019 08:09:46 +0000 Subject: [PATCH 087/147] svm: Fix improper check when deactivate AVIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function svm_refresh_apicv_exec_ctrl() always returning prematurely as kvm_vcpu_apicv_active() always return false when calling from the function arch/x86/kvm/x86.c:kvm_vcpu_deactivate_apicv(). This is because the apicv_active is set to false just before calling refresh_apicv_exec_ctrl(). Also, we need to mark VMCB_AVIC bit as dirty instead of VMCB_INTR. So, fix svm_refresh_apicv_exec_ctrl() to properly deactivate AVIC. Fixes: 67034bb9dd5e ('KVM: SVM: Add irqchip_split() checks before enabling AVIC') Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 01b66bbcdd7a..74ceda470eae 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5122,11 +5122,11 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!kvm_vcpu_apicv_active(&svm->vcpu)) - return; - - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; - mark_dirty(vmcb, VMCB_INTR); + if (kvm_vcpu_apicv_active(vcpu)) + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + else + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + mark_dirty(vmcb, VMCB_AVIC); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) From 946c522b603f281195af1df91837a1d4d1eb3bc9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jan 2019 14:39:23 -0800 Subject: [PATCH 088/147] KVM: nVMX: Sign extend displacements of VMX instr's mem operands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VMCS.EXIT_QUALIFCATION field reports the displacements of memory operands for various instructions, including VMX instructions, as a naturally sized unsigned value, but masks the value by the addr size, e.g. given a ModRM encoded as -0x28(%ebp), the -0x28 displacement is reported as 0xffffffd8 for a 32-bit address size. Despite some weird wording regarding sign extension, the SDM explicitly states that bits beyond the instructions address size are undefined: In all cases, bits of this field beyond the instruction’s address size are undefined. Failure to sign extend the displacement results in KVM incorrectly treating a negative displacement as a large positive displacement when the address size of the VMX instruction is smaller than KVM's native size, e.g. a 32-bit address size on a 64-bit KVM. The very original decoding, added by commit 064aea774768 ("KVM: nVMX: Decoding memory operands of VMX instructions"), sort of modeled sign extension by truncating the final virtual/linear address for a 32-bit address size. I.e. it messed up the effective address but made it work by adjusting the final address. When segmentation checks were added, the truncation logic was kept as-is and no sign extension logic was introduced. In other words, it kept calculating the wrong effective address while mostly generating the correct virtual/linear address. As the effective address is what's used in the segment limit checks, this results in KVM incorreclty injecting #GP/#SS faults due to non-existent segment violations when a nested VMM uses negative displacements with an address size smaller than KVM's native address size. Using the -0x28(%ebp) example, an EBP value of 0x1000 will result in KVM using 0x100000fd8 as the effective address when checking for a segment limit violation. This causes a 100% failure rate when running a 32-bit KVM build as L1 on top of a 64-bit KVM L0. Fixes: f9eb4af67c9d ("KVM: nVMX: VMX instructions: add checks for #GP/#SS exceptions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0e67649e39ce..d531f4c91a34 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4020,6 +4020,10 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ off = exit_qualification; /* holds the displacement */ + if (addr_size == 1) + off = (gva_t)sign_extend64(off, 31); + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) From 8570f9e881e3fde98801bb3a47eef84dd934d405 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jan 2019 14:39:24 -0800 Subject: [PATCH 089/147] KVM: nVMX: Apply addr size mask to effective address for VMX instructions The address size of an instruction affects the effective address, not the virtual/linear address. The final address may still be truncated, e.g. to 32-bits outside of long mode, but that happens irrespective of the address size, e.g. a 32-bit address size can yield a 64-bit virtual address when using FS/GS with a non-zero base. Fixes: 064aea774768 ("KVM: nVMX: Decoding memory operands of VMX instructions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d531f4c91a34..34081cc8cdeb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4029,20 +4029,41 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, if (index_is_valid) off += kvm_register_read(vcpu, index_reg)< Date: Wed, 23 Jan 2019 14:39:25 -0800 Subject: [PATCH 090/147] KVM: nVMX: Ignore limit checks on VMX instructions using flat segments Regarding segments with a limit==0xffffffff, the SDM officially states: When the effective limit is FFFFFFFFH (4 GBytes), these accesses may or may not cause the indicated exceptions. Behavior is implementation-specific and may vary from one execution to another. In practice, all CPUs that support VMX ignore limit checks for "flat segments", i.e. an expand-up data or code segment with base=0 and limit=0xffffffff. This is subtly different than wrapping the effective address calculation based on the address size, as the flat segment behavior also applies to accesses that would wrap the 4g boundary, e.g. a 4-byte access starting at 0xffffffff will access linear addresses 0xffffffff, 0x0, 0x1 and 0x2. Fixes: f9eb4af67c9d ("KVM: nVMX: VMX instructions: add checks for #GP/#SS exceptions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 34081cc8cdeb..0050c179c5d3 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4087,10 +4087,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); - /* Protected mode: #GP(0)/#SS(0) if the memory - * operand is outside the segment limit. + + /* + * Protected mode: #GP(0)/#SS(0) if the memory operand is + * outside the segment limit. All CPUs that support VMX ignore + * limit checks for flat segments, i.e. segments with base==0, + * limit==0xffffffff and of type expand-up data or code. */ - exn = exn || (off + sizeof(u64) > s.limit); + if (!(s.base == 0 && s.limit == 0xffffffff && + ((s.type & 8) || !(s.type & 4)))) + exn = exn || (off + sizeof(u64) > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, From e0dfacbfe91ad7947f0c44829c0358ac2e17d3c6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 30 Jan 2019 17:25:38 +0100 Subject: [PATCH 091/147] KVM: nVMX: remove useless is_protmode check VMX is only accessible in protected mode, remove a confusing check that causes the conditional to lack a final "else" branch. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0050c179c5d3..be9b2c8fd32e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4056,7 +4056,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * destination for long mode! */ exn = is_noncanonical_address(*ret, vcpu); - } else if (is_protmode(vcpu)) { + } else { /* * When not in long mode, the virtual/linear address is * unconditionally truncated to 32 bits regardless of the From 8acc0993e3f9a04a407ff1507dd329455f340121 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jan 2019 17:28:40 +1300 Subject: [PATCH 092/147] kvm, x86, mmu: Use kernel generic dynamic physical address mask AMD's SME/SEV is no longer the only case which reduces supported physical address bits, since Intel introduced Multi-key Total Memory Encryption (MKTME), which repurposes high bits of physical address as keyID, thus effectively shrinks supported physical address bits. To cover both cases (and potential similar future features), kernel MM introduced generic dynamaic physical address mask instead of hard-coded __PHYSICAL_MASK in 'commit 94d49eb30e854 ("x86/mm: Decouple dynamic __PHYSICAL_MASK from AMD SME")'. KVM should use that too. Change PT64_BASE_ADDR_MASK to use kernel dynamic physical address mask when it is enabled, instead of sme_clr. PT64_DIR_BASE_ADDR_MASK is also deleted since it is not used at all. Signed-off-by: Kai Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index da9c42349b1f..45eb988aa411 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -109,9 +109,11 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif #define PT64_LVL_ADDR_MASK(level) \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT64_LEVEL_BITS))) - 1)) From 74f2370bb64f5c1c418f115e338f20f11b75f954 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Tue, 6 Nov 2018 13:55:27 +0800 Subject: [PATCH 093/147] KVM: x86: expose MOVDIRI CPU feature into VM. MOVDIRI moves doubleword or quadword from register to memory through direct store which is implemented by using write combining (WC) for writing data directly into memory without caching the data. Availability of the MOVDIRI instruction is indicated by the presence of the CPUID feature flag MOVDIRI(CPUID.0x07.0x0:ECX[bit 27]). This patch exposes the movdiri feature to the guest. The release document ref below link: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf Signed-off-by: Liu Jingqi Cc: Xu Tao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbffa6c54697..8045434093f8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -404,7 +404,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE); + F(CLDEMOTE) | F(MOVDIRI); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = From c029b5deb0b5d7e5090317b835f21c5d93999db7 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Tue, 6 Nov 2018 13:55:28 +0800 Subject: [PATCH 094/147] KVM: x86: expose MOVDIR64B CPU feature into VM. MOVDIR64B moves 64-bytes as direct-store with 64-bytes write atomicity. Direct store is implemented by using write combining (WC) for writing data directly into memory without caching the data. Availability of the MOVDIR64B instruction is indicated by the presence of the CPUID feature flag MOVDIR64B (CPUID.0x07.0x0:ECX[bit 28]). This patch exposes the movdir64b feature to the guest. The release document ref below link: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf Signed-off-by: Liu Jingqi Cc: Xu Tao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8045434093f8..e4644ba9c3da 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -404,7 +404,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI); + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = From 81b016676e1c8f58027bd4d2b1d8a981776b36fe Mon Sep 17 00:00:00 2001 From: Luwei Kang Date: Thu, 31 Jan 2019 16:52:02 +0800 Subject: [PATCH 095/147] KVM: x86: Sync the pending Posted-Interrupts Some Posted-Interrupts from passthrough devices may be lost or overwritten when the vCPU is in runnable state. The SN (Suppress Notification) of PID (Posted Interrupt Descriptor) will be set when the vCPU is preempted (vCPU in KVM_MP_STATE_RUNNABLE state but not running on physical CPU). If a posted interrupt coming at this time, the irq remmaping facility will set the bit of PIR (Posted Interrupt Requests) without ON (Outstanding Notification). So this interrupt can't be sync to APIC virtualization register and will not be handled by Guest because ON is zero. Signed-off-by: Luwei Kang [Eliminate the pi_clear_sn fast path. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 26 +++++++++++--------------- arch/x86/kvm/vmx/vmx.h | 16 ++++++++-------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c39f1c38b878..c81dc632464a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1193,21 +1193,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; - /* - * First handle the simple case where no cmpxchg is necessary; just - * allow posting non-urgent interrupts. - * - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block will do it for us and the wakeup_handler - * expects the VCPU to be on the blocked_vcpu_list that matches - * PI.NDST. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || - vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - return; - } - /* The full case. */ do { old.control = new.control = pi_desc->control; @@ -1222,6 +1207,17 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.sn = 0; } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); + + /* + * Clear SN before reading the bitmap; this ensures that any + * interrupt that comes after the bitmap is read sets ON. The + * VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 + * in the spec), so it doesn't really have a memory barrier that + * pairs with this. However, we cannot do that and we need one. + */ + smp_mb__after_atomic(); + if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + pi_set_on(pi_desc); } /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 6ee6a492efaf..ec23b4d65fb7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -336,16 +336,16 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - static inline void pi_set_sn(struct pi_desc *pi_desc) { - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); } static inline void pi_clear_on(struct pi_desc *pi_desc) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1fb99f21317..96f87d356c79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7808,7 +7808,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * 1) We should set ->mode before checking ->requests. Please see * the comment in kvm_vcpu_exiting_guest_mode(). * - * 2) For APICv, we should set ->mode before checking PIR.ON. This + * 2) For APICv, we should set ->mode before checking PID.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on * (see vmx_deliver_posted_interrupt). * From b4b65b5642d6edd50e35a159f20e793dc478d686 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Jan 2019 19:12:35 +0100 Subject: [PATCH 096/147] KVM: x86: cleanup freeing of nested state Ensure that the VCPU free path goes through vmx_leave_nested and thus nested_vmx_vmexit, so that the cancellation of the timer does not have to be in free_nested. In addition, because some paths through nested_vmx_vmexit do not go through sync_vmcs12, the cancellation of the timer is moved there. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index be9b2c8fd32e..523d30e935e1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - hrtimer_cancel(&vmx->nested.preemption_timer); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; free_vpid(vmx->nested.vpid02); @@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); + vmx_leave_nested(vcpu); vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); free_nested(vcpu); vcpu_put(vcpu); @@ -3438,13 +3438,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + if (nested_cpu_has_preemption_timer(vmcs12) && + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) vmcs12->vmx_preemption_timer_value = vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -3852,6 +3849,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); + if (nested_cpu_has_preemption_timer(vmcs12)) + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c81dc632464a..de6a9e7c6471 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6546,7 +6546,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); - leave_guest_mode(vcpu); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); From d92935979adba274b1099e67b7f713f6d8413121 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Thu, 31 Jan 2019 11:26:39 +0800 Subject: [PATCH 097/147] kvm: vmx: Fix typos in vmentry/vmexit control setting Previously, 'commit f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work mode")' work mode' offered framework to support Intel PT virtualization. However, the patch has some typos in vmx_vmentry_ctrl() and vmx_vmexit_ctrl(), e.g. used wrong flags and wrong variable, which will cause the VM entry failure later. Fixes: 'commit f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work mode")' Signed-off-by: Yu Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ec23b4d65fb7..d7d1048d221b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -444,7 +444,8 @@ static inline u32 vmx_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; if (pt_mode == PT_MODE_SYSTEM) - vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmentry_ctrl & ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); @@ -454,9 +455,10 @@ static inline u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; if (pt_mode == PT_MODE_SYSTEM) - vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmcs_config.vmexit_ctrl & + return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } From 359a6c3ddc5184122989d5152a4b975c1c262d33 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Jan 2019 19:14:46 +0100 Subject: [PATCH 098/147] KVM: nVMX: do not start the preemption timer hrtimer unnecessarily The preemption timer can be started even if there is a vmentry failure during or after loading guest state. That is pointless, move the call after all conditions have been checked. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 523d30e935e1..11431bb4c7ed 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2278,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); - /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. @@ -3018,6 +3014,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) if (unlikely(evaluate_pending_interrupts)) kvm_make_request(KVM_REQ_EVENT, vcpu); + /* + * Do not start the preemption timer hrtimer until after we know + * we are successful, so that only nested_vmx_vmexit needs to cancel + * the timer. + */ + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet From b12ce36a43f29dbff0bca14c5a51c276aea5662f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:49 -0800 Subject: [PATCH 099/147] kvm: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. There remain a few allocations which should be charged to the VM's cgroup but are not. In they include: vcpu->run kvm->coalesced_mmio_ring There allocations are unaccounted in this patch because they are mapped to userspace, and accounting them to a cgroup causes problems. This should be addressed in a future patch. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- virt/kvm/coalesced_mmio.c | 3 ++- virt/kvm/eventfd.c | 7 ++++--- virt/kvm/irqchip.c | 4 ++-- virt/kvm/kvm_main.c | 29 +++++++++++++++-------------- virt/kvm/vfio.c | 4 ++-- 5 files changed, 25 insertions(+), 22 deletions(-) diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 6855cce3e528..5294abb3f178 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -144,7 +144,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, if (zone->pio != 1 && zone->pio != 0) return -EINVAL; - dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), + GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index b20b751286fc..4325250afd72 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -297,7 +297,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) if (!kvm_arch_intc_initialized(kvm)) return -EAGAIN; - irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT); if (!irqfd) return -ENOMEM; @@ -345,7 +345,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } if (!irqfd->resampler) { - resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); + resampler = kzalloc(sizeof(*resampler), + GFP_KERNEL_ACCOUNT); if (!resampler) { ret = -ENOMEM; mutex_unlock(&kvm->irqfds.resampler_lock); @@ -797,7 +798,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, if (IS_ERR(eventfd)) return PTR_ERR(eventfd); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT); if (!p) { ret = -ENOMEM; goto fail; diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index b1286c4e0712..3547b0d8c91e 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -196,7 +196,7 @@ int kvm_set_irq_routing(struct kvm *kvm, nr_rt_entries += 1; new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; @@ -208,7 +208,7 @@ int kvm_set_irq_routing(struct kvm *kvm, for (i = 0; i < nr; ++i) { r = -ENOMEM; - e = kzalloc(sizeof(*e), GFP_KERNEL); + e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT); if (!e) goto out; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 791ced69dd0b..0a0ea8f4bb1b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -525,7 +525,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void) int i; struct kvm_memslots *slots; - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); if (!slots) return NULL; @@ -601,12 +601,12 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, sizeof(*kvm->debugfs_stat_data), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!kvm->debugfs_stat_data) return -ENOMEM; for (p = debugfs_entries; p->name; p++) { - stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); + stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) return -ENOMEM; @@ -671,7 +671,7 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err_no_irq_srcu; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], - kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); + kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); if (!kvm->buses[i]) goto out_err; } @@ -789,7 +789,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); + memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT); if (!memslot->dirty_bitmap) return -ENOMEM; @@ -1018,7 +1018,7 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); if (!slots) goto out_free; memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); @@ -2683,7 +2683,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_regs *kvm_regs; r = -ENOMEM; - kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); if (!kvm_regs) goto out; r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); @@ -2711,7 +2711,8 @@ out_free1: break; } case KVM_GET_SREGS: { - kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), + GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!kvm_sregs) goto out; @@ -2803,7 +2804,7 @@ out_free1: break; } case KVM_GET_FPU: { - fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!fpu) goto out; @@ -2980,7 +2981,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, if (test) return 0; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; @@ -3715,7 +3716,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, return -ENOSPC; new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new_bus) return -ENOMEM; @@ -3761,7 +3762,7 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, return; new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); goto broken; @@ -4029,7 +4030,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) active = kvm_active_vms; spin_unlock(&kvm_lock); - env = kzalloc(sizeof(*env), GFP_KERNEL); + env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); if (!env) return; @@ -4045,7 +4046,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) add_uevent_var(env, "PID=%d", kvm->userspace_pid); if (kvm->debugfs_dentry) { - char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); + char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); if (p) { tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d99850c462a1..524cbd20379f 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -219,7 +219,7 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) } } - kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); + kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT); if (!kvg) { mutex_unlock(&kv->lock); kvm_vfio_group_put_external_user(vfio_group); @@ -405,7 +405,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) if (tmp->ops == &kvm_vfio_ops) return -EBUSY; - kv = kzalloc(sizeof(*kv), GFP_KERNEL); + kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT); if (!kv) return -ENOMEM; From 254272ce6505948ecc0b4bf5cd0aa61cdd815994 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:50 -0800 Subject: [PATCH 100/147] kvm: x86: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. There remain a few allocations which should be charged to the VM's cgroup but are not. In x86, they include: vcpu->arch.pio_data There allocations are unaccounted in this patch because they are mapped to userspace, and accounting them to a cgroup causes problems. This should be addressed in a future patch. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/i8259.c | 2 +- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/lapic.c | 7 ++++--- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/page_track.c | 2 +- arch/x86/kvm/x86.c | 16 +++++++++------- 8 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 89d20ed1d2e8..27c43525a05f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index af192895b1fc..4a6dc54cc12b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_t pid_nr; int ret; - pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); + pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT); if (!pit) return NULL; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index bdcd4139eca9..8b38bb4868a6 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm) struct kvm_pic *s; int ret; - s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT); if (!s) return -ENOMEM; spin_lock_init(&s->lock); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 4e822ad363f3..1add1bc881e2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm) struct kvm_ioapic *ioapic; int ret; - ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT); if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b6c2da7265c..991fdf7fc17f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), + GFP_KERNEL_ACCOUNT); if (!new) goto out; @@ -2259,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) ASSERT(vcpu != NULL); apic_debug("apic_init %d\n", vcpu->vcpu_id); - apic = kzalloc(sizeof(*apic), GFP_KERNEL); + apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) goto nomem; vcpu->arch.apic = apic; - apic->regs = (void *)get_zeroed_page(GFP_KERNEL); + apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 45eb988aa411..415d0e62cb3e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -961,7 +961,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); if (!obj) return cache->nobjs >= min ? 0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -3702,7 +3702,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) u64 *lm_root; - lm_root = (void*)get_zeroed_page(GFP_KERNEL); + lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (lm_root == NULL) return 1; @@ -5499,7 +5499,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) * Therefore we need to allocate shadow page tables in the first * 4GB of memory, which happens to fit the DMA32 zone. */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 3052a59a3065..fd04d462fdae 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96f87d356c79..3de586f89730 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3879,7 +3879,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), + GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.lapic) @@ -4066,7 +4067,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) break; @@ -4090,7 +4091,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xcrs) break; @@ -9040,14 +9041,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) static_key_slow_inc(&kvm_no_apic_vcpu); vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks) { r = -ENOMEM; goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, + GFP_KERNEL_ACCOUNT)) { r = -ENOMEM; goto fail_free_mce_banks; } @@ -9306,13 +9308,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->arch.rmap[i] = kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL); + linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; From 1ec696470c860e2b1b95e178e368e180ad2b040f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:51 -0800 Subject: [PATCH 101/147] kvm: svm: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 74ceda470eae..b5b128a0a051 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1795,9 +1795,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = vmalloc(size); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); else - pages = kmalloc(size, GFP_KERNEL); + pages = kmalloc(size, GFP_KERNEL_ACCOUNT); if (!pages) return NULL; @@ -1865,7 +1866,9 @@ static void __unregister_enc_region_locked(struct kvm *kvm, static struct kvm *svm_vm_alloc(void) { - struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm)); + struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm), + GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); return &kvm_svm->kvm; } @@ -1940,7 +1943,7 @@ static int avic_vm_init(struct kvm *kvm) return 0; /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL); + p_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!p_page) goto free_avic; @@ -1948,7 +1951,7 @@ static int avic_vm_init(struct kvm *kvm) clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL); + l_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!l_page) goto free_avic; @@ -2120,13 +2123,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!svm) { err = -ENOMEM; goto out; } - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); if (!svm->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; @@ -2138,19 +2142,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto free_svm; err = -ENOMEM; - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) goto uninit; - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!msrpm_pages) goto free_page1; - nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) goto free_page2; - hsave_page = alloc_page(GFP_KERNEL); + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!hsave_page) goto free_page3; @@ -5192,7 +5196,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -6307,7 +6311,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) if (ret) return ret; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6357,7 +6361,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - start = kzalloc(sizeof(*start), GFP_KERNEL); + start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT); if (!start) return -ENOMEM; @@ -6454,7 +6458,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6531,7 +6535,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, measure, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6593,7 +6597,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6614,7 +6618,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6642,7 +6646,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, struct sev_data_dbg *data; int ret; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6897,7 +6901,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) } ret = -ENOMEM; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) goto e_unpin_memory; @@ -7003,7 +7007,7 @@ static int svm_register_enc_region(struct kvm *kvm, if (range->addr > ULONG_MAX || range->size > ULONG_MAX) return -EINVAL; - region = kzalloc(sizeof(*region), GFP_KERNEL); + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT); if (!region) return -ENOMEM; From 4183683918efc3549b5ebddde4ed5edfdac45c17 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:52 -0800 Subject: [PATCH 102/147] kvm: vmx: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++++-- arch/x86/kvm/vmx/vmx.c | 27 ++++++++++++++++++--------- arch/x86/kvm/vmx/vmx.h | 5 +++-- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 11431bb4c7ed..a4b1b5a50489 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4166,11 +4166,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (r < 0) goto out_vmcs02; - vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5715,6 +5715,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) { + /* + * The vmx_bitmap is not tied to a VM and so should + * not be charged to a memcg. + */ vmx_bitmap[i] = (unsigned long *) __get_free_page(GFP_KERNEL); if (!vmx_bitmap[i]) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index de6a9e7c6471..4950bb20e06a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -246,6 +246,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + /* + * This allocation for vmx_l1d_flush_pages is not tied to a VM + * lifetime and so should not be charged to a memcg. + */ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); if (!page) return -ENOMEM; @@ -2386,13 +2390,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = __alloc_pages_node(node, flags, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -2439,7 +2443,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs_init(loaded_vmcs); if (cpu_has_vmx_msr_bitmap()) { - loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + loaded_vmcs->msr_bitmap = (unsigned long *) + __get_free_page(GFP_KERNEL_ACCOUNT); if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); @@ -2480,7 +2485,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(false, cpu); + vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -6530,7 +6535,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) static struct kvm *vmx_vm_alloc(void) { - struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); + struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), + GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); return &kvm_vmx->kvm; } @@ -6557,14 +6564,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + struct vcpu_vmx *vmx; unsigned long *msr_bitmap; int cpu; + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx) return ERR_PTR(-ENOMEM); - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); if (!vmx->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; @@ -6586,12 +6595,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. */ if (enable_pml) { - vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) goto uninit_vcpu; } - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) > PAGE_SIZE); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d7d1048d221b..1554cb45b393 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -479,7 +479,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu); +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -488,7 +488,8 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); static inline struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id(), + GFP_KERNEL_ACCOUNT); } u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); From 152482580a1b0accb60676063a1ac57b2d12daf6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 12:54:17 -0800 Subject: [PATCH 103/147] KVM: Call kvm_arch_memslots_updated() before updating memslots kvm_arch_memslots_updated() is at this point in time an x86-specific hook for handling MMIO generation wraparound. x86 stashes 19 bits of the memslots generation number in its MMIO sptes in order to avoid full page fault walks for repeat faults on emulated MMIO addresses. Because only 19 bits are used, wrapping the MMIO generation number is possible, if unlikely. kvm_arch_memslots_updated() alerts x86 that the generation has changed so that it can invalidate all MMIO sptes in case the effective MMIO generation has wrapped so as to avoid using a stale spte, e.g. a (very) old spte that was created with generation==0. Given that the purpose of kvm_arch_memslots_updated() is to prevent consuming stale entries, it needs to be called before the new generation is propagated to memslots. Invalidating the MMIO sptes after updating memslots means that there is a window where a vCPU could dereference the new memslots generation, e.g. 0, and incorrectly reuse an old MMIO spte that was created with (pre-wrap) generation==0. Fixes: e59dbe09f8e6 ("KVM: Introduce kvm_arch_memslots_updated()") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/s390/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 2 +- virt/kvm/arm/mmu.c | 2 +- virt/kvm/kvm_main.c | 7 +++++-- 9 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index d2abd98471e8..41204a49cf95 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -1134,7 +1134,7 @@ static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f98f00da2ea..19693b8add93 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -837,7 +837,7 @@ struct kvm_vcpu_arch { static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d5d24889c3bc..c2b8c8c6c9be 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -878,7 +878,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0e2ef41efb9d..c4758e1a8843 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 415d0e62cb3e..a53a0e7ad9e6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5893,13 +5893,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { /* * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { + if (unlikely((gen & MMIO_GEN_MASK) == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3de586f89730..03d26ffb29cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9357,13 +9357,13 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ - kvm_mmu_invalidate_mmio_sptes(kvm, slots); + kvm_mmu_invalidate_mmio_sptes(kvm, gen); } int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c38cc5eb7e73..cf761ff58224 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -634,7 +634,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages); -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots); +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fbdf3ac2f001..e0355e0f8712 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -2350,7 +2350,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; } -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0a0ea8f4bb1b..d54f6578a849 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -874,6 +874,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, int as_id, struct kvm_memslots *slots) { struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); + u64 gen; /* * Set the low bit in the generation, which disables SPTE caching @@ -896,9 +897,11 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, * space 0 will use generations 0, 4, 8, ... while * address space 1 will * use generations 2, 6, 10, 14, ... */ - slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; + gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1; - kvm_arch_memslots_updated(kvm, slots); + kvm_arch_memslots_updated(kvm, gen); + + slots->generation = gen; return old_memslots; } From e1359e2beb8b0a1188abc997273acbaedc8ee791 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:12 -0800 Subject: [PATCH 104/147] KVM: x86/mmu: Detect MMIO generation wrap in any address space The check to detect a wrap of the MMIO generation explicitly looks for a generation number of zero. Now that unique memslots generation numbers are assigned to each address space, only address space 0 will get a generation number of exactly zero when wrapping. E.g. when address space 1 goes from 0x7fffe to 0x80002, the MMIO generation number will wrap to 0x2. Adjust the MMIO generation to strip the address space modifier prior to checking for a wrap. Fixes: 4bd518f1598d ("KVM: use separate generations for each address space") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a53a0e7ad9e6..c2f2c9de63ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5895,11 +5895,28 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { + gen &= MMIO_GEN_MASK; + /* - * The very rare case: if the generation-number is round, + * Shift to eliminate the "update in-progress" flag, which isn't + * included in the spte's generation number. + */ + gen >>= 1; + + /* + * Generation numbers are incremented in multiples of the number of + * address spaces in order to provide unique generations across all + * address spaces. Strip what is effectively the address space + * modifier prior to checking for a wrap of the MMIO generation so + * that a wrap in any address space is detected. + */ + gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); + + /* + * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. */ - if (unlikely((gen & MMIO_GEN_MASK) == 0)) { + if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } From ddfd1730fd829743e41213e32ccc8b4aa6dc8325 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:13 -0800 Subject: [PATCH 105/147] KVM: x86/mmu: Do not cache MMIO accesses while memslots are in flux When installing new memslots, KVM sets bit 0 of the generation number to indicate that an update is in-progress. Until the update is complete, there are no guarantees as to whether a vCPU will see the old or the new memslots. Explicity prevent caching MMIO accesses so as to avoid using an access cached from the old memslots after the new memslots have been installed. Note that it is unclear whether or not disabling caching during the update window is strictly necessary as there is no definitive documentation as to what ordering guarantees KVM provides with respect to updating memslots. That being said, the MMIO spte code does not allow reusing sptes created while an update is in-progress, and the associated documentation explicitly states: We do not want to use an MMIO sptes created with an odd generation number, ... If KVM is unlucky and creates an MMIO spte while the low bit is 1, the next access to the spte will always be a cache miss. At the very least, disabling the per-vCPU MMIO cache during updates will make its behavior consistent with the MMIO spte behavior and documentation. Fixes: 56f17dd3fbc4 ("kvm: x86: fix stale mmio cache bug") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 224cd0a47568..20ede17202bf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -181,6 +181,11 @@ static inline bool emul_is_noncanonical_address(u64 la, static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, unsigned access) { + u64 gen = kvm_memslots(vcpu->kvm)->generation; + + if (unlikely(gen & 1)) + return; + /* * If this is a shadow nested page table, the "GVA" is * actually a nGPA. @@ -188,7 +193,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; - vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; + vcpu->arch.mmio_gen = gen; } static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) From 361209e054a2c9f34da090ee1ee4c1e8bfe76a64 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:14 -0800 Subject: [PATCH 106/147] KVM: Explicitly define the "memslot update in-progress" bit KVM uses bit 0 of the memslots generation as an "update in-progress" flag, which is used by x86 to prevent caching MMIO access while the memslots are changing. Although the intended behavior is flag-like, e.g. MMIO sptes intentionally drop the in-progress bit so as to avoid caching data from in-flux memslots, the implementation oftentimes treats the bit as part of the generation number itself, e.g. incrementing the generation increments twice, once to set the flag and once to clear it. Prior to commit 4bd518f1598d ("KVM: use separate generations for each address space"), incorporating the "update in-progress" bit into the generation number largely made sense, e.g. "real" generations are even, "bogus" generations are odd, most code doesn't need to be aware of the bit, etc... Now that unique memslots generation numbers are assigned to each address space, stealthing the in-progress status into the generation number results in a wide variety of subtle code, e.g. kvm_create_vm() jumps over bit 0 when initializing the memslots generation without any hint as to why. Explicitly define the flag and convert as much code as possible (which isn't much) to actually treat it like a flag. This paves the way for eventually using a different bit for "update in-progress" so that it can be a flag in truth instead of a awkward extension to the generation number. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 2 +- include/linux/kvm_host.h | 21 +++++++++++++++++++++ virt/kvm/kvm_main.c | 26 +++++++++++++------------- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 20ede17202bf..28406aa1136d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -183,7 +183,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, { u64 gen = kvm_memslots(vcpu->kvm)->generation; - if (unlikely(gen & 1)) + if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) return; /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cf761ff58224..5e1cb74922b3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -48,6 +48,27 @@ */ #define KVM_MEMSLOT_INVALID (1UL << 16) +/* + * Bit 0 of the memslot generation number is an "update in-progress flag", + * e.g. is temporarily set for the duration of install_new_memslots(). + * This flag effectively creates a unique generation number that is used to + * mark cached memslot data, e.g. MMIO accesses, as potentially being stale, + * i.e. may (or may not) have come from the previous memslots generation. + * + * This is necessary because the actual memslots update is not atomic with + * respect to the generation number update. Updating the generation number + * first would allow a vCPU to cache a spte from the old memslots using the + * new generation number, and updating the generation number after switching + * to the new memslots would allow cache hits using the old generation number + * to reference the defunct memslots. + * + * This mechanism is used to prevent getting hits in KVM's caches while a + * memslot update is in-progress, and to prevent cache hits *after* updating + * the actual generation number against accesses that were inserted into the + * cache *before* the memslots were updated. + */ +#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(0) + /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d54f6578a849..0f1f1c7c7a36 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -874,30 +874,30 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, int as_id, struct kvm_memslots *slots) { struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); - u64 gen; + u64 gen = old_memslots->generation; - /* - * Set the low bit in the generation, which disables SPTE caching - * until the end of synchronize_srcu_expedited. - */ - WARN_ON(old_memslots->generation & 1); - slots->generation = old_memslots->generation + 1; + WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; rcu_assign_pointer(kvm->memslots[as_id], slots); synchronize_srcu_expedited(&kvm->srcu); /* - * Increment the new memslot generation a second time. This prevents - * vm exits that race with memslot updates from caching a memslot - * generation that will (potentially) be valid forever. - * + * Increment the new memslot generation a second time, dropping the + * update in-progress flag and incrementing then generation based on + * the number of address spaces. This provides a unique and easily + * identifiable generation number while the memslots are in flux. + */ + gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + + /* * Generations must be unique even across address spaces. We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address - * space 0 will use generations 0, 4, 8, ... while * address space 1 will + * space 0 will use generations 0, 4, 8, ... while address space 1 will * use generations 2, 6, 10, 14, ... */ - gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1; + gen += KVM_ADDRESS_SPACE_NUM * 2; kvm_arch_memslots_updated(kvm, gen); From 5192f9b976f9687569a90602b8a6c053da4498f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:15 -0800 Subject: [PATCH 107/147] KVM: x86: Use a u64 when passing the MMIO gen around KVM currently uses an 'unsigned int' for the MMIO generation number despite it being derived from the 64-bit memslots generation and being propagated to (potentially) 64-bit sptes. There is no hidden agenda behind using an 'unsigned int', it's done simply because the MMIO generation will never set bits above bit 19. Passing a u64 will allow the "update in-progress" flag to be relocated from bit 0 to bit 63 and removes the need to cast the generation back to a u64 when propagating it to a spte. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c2f2c9de63ed..54bb706e40e8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -348,20 +348,20 @@ static inline bool is_access_track_spte(u64 spte) #define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) -static u64 generation_mmio_spte_mask(unsigned int gen) +static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; WARN_ON(gen & ~MMIO_GEN_MASK); mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; - mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + mask |= (gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; return mask; } -static unsigned int get_mmio_spte_generation(u64 spte) +static u64 get_mmio_spte_generation(u64 spte) { - unsigned int gen; + u64 gen; spte &= ~shadow_mmio_mask; @@ -370,7 +370,7 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) +static u64 kvm_current_mmio_generation(struct kvm_vcpu *vcpu) { return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; } @@ -378,7 +378,7 @@ static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(vcpu); + u64 gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -426,7 +426,7 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { - unsigned int kvm_gen, spte_gen; + u64 kvm_gen, spte_gen; kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); From cae7ed3c2cb06680400adab632a243c5e5f42637 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:16 -0800 Subject: [PATCH 108/147] KVM: x86: Refactor the MMIO SPTE generation handling The code to propagate the memslots generation number into MMIO sptes is a bit convoluted. The "what" is relatively straightfoward, e.g. the comment explaining which bits go where is quite readable, but the "how" requires a lot of staring to understand what is happening. For example, 'MMIO_GEN_LOW_SHIFT' is actually used to calculate the high bits of the spte, while 'MMIO_SPTE_GEN_LOW_SHIFT' is used to calculate the low bits. Refactor the code to: - use #defines whose values align with the bits defined in the comment - use consistent code for both the high and low mask - explicitly highlight the handling of bit 0 (update in-progress flag) - explicitly call out that the defines are for MMIO sptes (to avoid confusion with the per-vCPU MMIO cache, which uses the full memslots generation) In addition to making the code a little less magical, this paves the way for moving the update in-progress flag to bit 63 without having to simultaneously rewrite all of the MMIO spte code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 72 ++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 54bb706e40e8..364b2a737d94 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -332,30 +332,41 @@ static inline bool is_access_track_spte(u64 spte) } /* - * the low bit of the generation number is always presumed to be zero. - * This disables mmio caching during memslot updates. The concept is - * similar to a seqcount but instead of retrying the access we just punt - * and ignore the cache. + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: * - * spte bits 3-11 are used as bits 1-9 of the generation number, - * the bits 52-61 are used as bits 10-19 of the generation number. + * Bits 1-9 of the memslot generation are propagated to spte bits 3-11 + * Bits 10-19 of the memslot generation are propagated to spte bits 52-61 + * + * The MMIO generation starts at bit 1 of the memslots generation in order to + * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag. Including + * the flag would require stealing a bit from the "real" generation number and + * thus effectively halve the maximum number of MMIO generations that can be + * handled before encountering a wrap (which requires a full MMU zap). The + * flag is instead explicitly queried when checking for MMIO spte cache hits. */ -#define MMIO_SPTE_GEN_LOW_SHIFT 2 -#define MMIO_SPTE_GEN_HIGH_SHIFT 52 +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(19, 1) +#define MMIO_SPTE_GEN_SHIFT 1 -#define MMIO_GEN_SHIFT 20 -#define MMIO_GEN_LOW_SHIFT 10 -#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) -#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) +#define MMIO_SPTE_GEN_HIGH_START 52 +#define MMIO_SPTE_GEN_HIGH_END 61 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; - WARN_ON(gen & ~MMIO_GEN_MASK); + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; - mask |= (gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + gen >>= MMIO_SPTE_GEN_SHIFT; + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; return mask; } @@ -365,20 +376,15 @@ static u64 get_mmio_spte_generation(u64 spte) spte &= ~shadow_mmio_mask; - gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; - gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; - return gen; -} - -static u64 kvm_current_mmio_generation(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen << MMIO_SPTE_GEN_SHIFT; } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - u64 gen = kvm_current_mmio_generation(vcpu); + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -409,7 +415,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; + u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask; return (spte & ~mask) & ~PAGE_MASK; } @@ -426,9 +432,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { - u64 kvm_gen, spte_gen; + u64 kvm_gen, spte_gen, gen; - kvm_gen = kvm_current_mmio_generation(vcpu); + gen = kvm_vcpu_memslots(vcpu)->generation; + if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) + return false; + + kvm_gen = gen & MMIO_SPTE_GEN_MASK; spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -5895,13 +5905,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - gen &= MMIO_GEN_MASK; + gen &= MMIO_SPTE_GEN_MASK; /* - * Shift to eliminate the "update in-progress" flag, which isn't - * included in the spte's generation number. + * Shift to adjust for the "update in-progress" flag, which isn't + * included in the MMIO generation number. */ - gen >>= 1; + gen >>= MMIO_SPTE_GEN_SHIFT; /* * Generation numbers are incremented in multiples of the number of From 0e32958ec449a9bb63c031ed04ac7a494ea1bc1c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:17 -0800 Subject: [PATCH 109/147] KVM: Remove the hack to trigger memslot generation wraparound x86 captures a subset of the memslot generation (19 bits) in its MMIO sptes so that it can expedite emulated MMIO handling by checking only the releveant spte, i.e. doesn't need to do a full page fault walk. Because the MMIO sptes capture only 19 bits (due to limited space in the sptes), there is a non-zero probability that the MMIO generation could wrap, e.g. after 500k memslot updates. Since normal usage is extremely unlikely to result in 500k memslot updates, a hack was added by commit 69c9ea93eaea ("KVM: MMU: init kvm generation close to mmio wrap-around value") to offset the MMIO generation in order to trigger a wraparound, e.g. after 150 memslot updates. When separate memslot generation sequences were assigned to each address space, commit 00f034a12fdd ("KVM: do not bias the generation number in kvm_current_mmio_generation") moved the offset logic into the initialization of the memslot generation itself so that the per-address space bit(s) were not dropped/corrupted by the MMIO shenanigans. Remove the offset hack for three reasons: - While it does exercise x86's kvm_mmu_invalidate_mmio_sptes(), simply wrapping the generation doesn't actually test the interesting case of having stale MMIO sptes with the new generation number, e.g. old sptes with a generation number of 0. - Triggering kvm_mmu_invalidate_mmio_sptes() prematurely makes its performance rather important since the probability of invalidating MMIO sptes jumps from "effectively never" to "fairly likely". This limits what can be done in future patches, e.g. to simplify the invalidation code, as doing so without proper caution could lead to a noticeable performance regression. - Forcing the memslots generation, which is a 64-bit number, to wrap prevents KVM from assuming the memslots generation will never wrap. This in turn prevents KVM from using an arbitrary bit for the "update in-progress" flag, e.g. using bit 63 would immediately collide with using a large value as the starting generation number. The "update in-progress" flag is effectively forced into bit 0 so that it's (subtly) taken into account when incrementing the generation. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0f1f1c7c7a36..5c2e7e173a46 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -656,12 +656,8 @@ static struct kvm *kvm_create_vm(unsigned long type) struct kvm_memslots *slots = kvm_alloc_memslots(); if (!slots) goto out_err_no_srcu; - /* - * Generations must be different for each address space. - * Init kvm generation close to the maximum to easily test the - * code of handling generation number wrap-around. - */ - slots->generation = i * 2 - 150; + /* Generations must be different for each address space. */ + slots->generation = i * 2; rcu_assign_pointer(kvm->memslots[i], slots); } From 164bf7e56c5a73f2f819c39ba7e0f20e0f97dc7b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:18 -0800 Subject: [PATCH 110/147] KVM: Move the memslot update in-progress flag to bit 63 ...now that KVM won't explode by moving it out of bit 0. Using bit 63 eliminates the need to jump over bit 0, e.g. when calculating a new memslots generation or when propagating the memslots generation to an MMIO spte. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/mmu.txt | 13 ++++++++----- arch/x86/kvm/mmu.c | 31 ++++++++++++------------------- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 8 ++++---- 4 files changed, 26 insertions(+), 30 deletions(-) diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index e507a9e0421e..367a952f50ab 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -452,13 +452,16 @@ stored into the MMIO spte. Thus, the MMIO spte might be created based on out-of-date information, but with an up-to-date generation number. To avoid this, the generation number is incremented again after synchronize_srcu -returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a +returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a memslot update, while some SRCU readers might be using the old copy. We do not want to use an MMIO sptes created with an odd generation number, and we can do -this without losing a bit in the MMIO spte. The low bit of the generation -is not stored in MMIO spte, and presumed zero when it is extracted out of the -spte. If KVM is unlucky and creates an MMIO spte while the low bit is 1, -the next access to the spte will always be a cache miss. +this without losing a bit in the MMIO spte. The "update in-progress" bit of the +generation is not stored in MMIO spte, and is so is implicitly zero when the +generation is extracted out of the spte. If KVM is unlucky and creates an MMIO +spte while an update is in-progress, the next access to the spte will always be +a cache miss. For example, a subsequent access during the update window will +miss due to the in-progress flag diverging, while an access after the update +window closes will have a higher generation number (as compared to the spte). Further reading diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 364b2a737d94..bcf62e1e1ff7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -335,18 +335,17 @@ static inline bool is_access_track_spte(u64 spte) * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * - * Bits 1-9 of the memslot generation are propagated to spte bits 3-11 - * Bits 10-19 of the memslot generation are propagated to spte bits 52-61 + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 * - * The MMIO generation starts at bit 1 of the memslots generation in order to - * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag. Including - * the flag would require stealing a bit from the "real" generation number and - * thus effectively halve the maximum number of MMIO generations that can be - * handled before encountering a wrap (which requires a full MMU zap). The - * flag is instead explicitly queried when checking for MMIO spte cache hits. + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(19, 1) -#define MMIO_SPTE_GEN_SHIFT 1 +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 11 @@ -363,8 +362,6 @@ static u64 generation_mmio_spte_mask(u64 gen) WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - gen >>= MMIO_SPTE_GEN_SHIFT; - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; return mask; @@ -378,7 +375,7 @@ static u64 get_mmio_spte_generation(u64 spte) gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen << MMIO_SPTE_GEN_SHIFT; + return gen; } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, @@ -5905,13 +5902,9 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - gen &= MMIO_SPTE_GEN_MASK; + WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); - /* - * Shift to adjust for the "update in-progress" flag, which isn't - * included in the MMIO generation number. - */ - gen >>= MMIO_SPTE_GEN_SHIFT; + gen &= MMIO_SPTE_GEN_MASK; /* * Generation numbers are incremented in multiples of the number of diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5e1cb74922b3..85c0c00d5159 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -49,7 +49,7 @@ #define KVM_MEMSLOT_INVALID (1UL << 16) /* - * Bit 0 of the memslot generation number is an "update in-progress flag", + * Bit 63 of the memslot generation number is an "update in-progress flag", * e.g. is temporarily set for the duration of install_new_memslots(). * This flag effectively creates a unique generation number that is used to * mark cached memslot data, e.g. MMIO accesses, as potentially being stale, @@ -67,7 +67,7 @@ * the actual generation number against accesses that were inserted into the * cache *before* the memslots were updated. */ -#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(0) +#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(63) /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5c2e7e173a46..c9d0bc01f8cb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -657,7 +657,7 @@ static struct kvm *kvm_create_vm(unsigned long type) if (!slots) goto out_err_no_srcu; /* Generations must be different for each address space. */ - slots->generation = i * 2; + slots->generation = i; rcu_assign_pointer(kvm->memslots[i], slots); } @@ -890,10 +890,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, * Generations must be unique even across address spaces. We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address - * space 0 will use generations 0, 4, 8, ... while address space 1 will - * use generations 2, 6, 10, 14, ... + * space 0 will use generations 0, 2, 4, ... while address space 1 will + * use generations 1, 3, 5, ... */ - gen += KVM_ADDRESS_SPACE_NUM * 2; + gen += KVM_ADDRESS_SPACE_NUM; kvm_arch_memslots_updated(kvm, gen); From 85875a133ea3aca5e4ea423c7cb991ad53f4866e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:19 -0800 Subject: [PATCH 111/147] KVM: x86/mmu: Move slot_level_*() helper functions up a few lines ...so that kvm_mmu_invalidate_zap_pages_in_memslot() can utilize the helpers in future patches. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 139 +++++++++++++++++++++++---------------------- 1 file changed, 70 insertions(+), 69 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bcf62e1e1ff7..5a8af2bcd891 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5487,6 +5487,76 @@ void kvm_disable_tdp(void) } EXPORT_SYMBOL_GPL(kvm_disable_tdp); + +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); + +/* The caller should hold mmu-lock before calling this function. */ +static __always_inline bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) +{ + struct slot_rmap_walk_iterator iterator; + bool flush = false; + + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } + + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + + return flush; +} + +static __always_inline bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} + +static __always_inline bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static __always_inline bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static __always_inline bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} + static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu->pae_root); @@ -5561,75 +5631,6 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); } -/* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); - -/* The caller should hold mmu-lock before calling this function. */ -static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) -{ - struct slot_rmap_walk_iterator iterator; - bool flush = false; - - for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - cond_resched_lock(&kvm->mmu_lock); - } - } - - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - - return flush; -} - -static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - bool lock_flush_tlb) -{ - return slot_handle_level_range(kvm, memslot, fn, start_level, - end_level, memslot->base_gfn, - memslot->base_gfn + memslot->npages - 1, - lock_flush_tlb); -} - -static __always_inline bool -slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool -slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_PAGE_TABLE_LEVEL, lock_flush_tlb); -} - void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; From a21136345cb6f1a5b7f576701b6a454da5b6e606 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:20 -0800 Subject: [PATCH 112/147] KVM: x86/mmu: Split remote_flush+zap case out of kvm_mmu_flush_or_zap() ...and into a separate helper, kvm_mmu_remote_flush_or_zap(), that does not require a vcpu so that the code can be (re)used by kvm_mmu_invalidate_zap_pages_in_memslot(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5a8af2bcd891..1cce120f06ae 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2240,18 +2240,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return true; } +static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, + struct list_head *invalid_list, + bool remote_flush) +{ + if (!remote_flush && !list_empty(invalid_list)) + return false; + + if (!list_empty(invalid_list)) + kvm_mmu_commit_zap_page(kvm, invalid_list); + else + kvm_flush_remote_tlbs(kvm); + return true; +} + static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, struct list_head *invalid_list, bool remote_flush, bool local_flush) { - if (!list_empty(invalid_list)) { - kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); + if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) return; - } - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) + if (local_flush) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } From 4e103134b862314dc2f2f18f2fb0ab972adc3f5f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:21 -0800 Subject: [PATCH 113/147] KVM: x86/mmu: Zap only the relevant pages when removing a memslot Modify kvm_mmu_invalidate_zap_pages_in_memslot(), a.k.a. the x86 MMU's handler for kvm_arch_flush_shadow_memslot(), to zap only the pages/PTEs that actually belong to the memslot being removed. This improves performance, especially why the deleted memslot has only a few shadow entries, or even no entries. E.g. a microbenchmark to access regular memory while concurrently reading PCI ROM to trigger memslot deletion showed a 5% improvement in throughput. Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1cce120f06ae..b81e2cad0237 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5622,7 +5622,38 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) { - kvm_mmu_invalidate_zap_all_pages(kvm); + struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); + unsigned long i; + bool flush; + gfn_t gfn; + + spin_lock(&kvm->mmu_lock); + + if (list_empty(&kvm->arch.active_mmu_pages)) + goto out_unlock; + + flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false); + + for (i = 0; i < slot->npages; i++) { + gfn = slot->base_gfn + i; + + for_each_valid_sp(kvm, sp, gfn) { + if (sp->gfn != gfn) + continue; + + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + flush = false; + cond_resched_lock(&kvm->mmu_lock); + } + } + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + +out_unlock: + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_init_vm(struct kvm *kvm) From a592a3b8fc62af25a6e76aebde97a5d5f6f13e0f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:22 -0800 Subject: [PATCH 114/147] Revert "KVM: MMU: document fast invalidate all pages" Remove x86 KVM's fast invalidate mechanism, i.e. revert all patches from the original series[1]. Though not explicitly stated, for all intents and purposes the fast invalidate mechanism was added to speed up the scenario where removing a memslot, e.g. as part of accessing reading PCI ROM, caused KVM to flush all shadow entries[1]. Now that the memslot case flushes only shadow entries belonging to the memslot, i.e. doesn't use the fast invalidate mechanism, the only remaining usage of the mechanism are when the VM is being destroyed and when the MMIO generation rolls over. When a VM is being destroyed, either there are no active vcpus, i.e. there's no lock contention, or the VM has ungracefully terminated, in which case we want to reclaim its pages as quickly as possible, i.e. not release the MMU lock if there are still CPUs executing in the VM. The MMIO generation scenario is almost literally a one-in-a-million occurrence, i.e. is not a performance sensitive scenario. Given that lock-breaking is not desirable (VM teardown) or irrelevant (MMIO generation overflow), remove the fast invalidate mechanism to simplify the code (a small amount) and to discourage future code from zapping all pages as using such a big hammer should be a last restort. This reverts commit f6f8adeef542a18b1cb26a0b772c9781a10bb477. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/mmu.txt | 28 +--------------------------- arch/x86/include/asm/kvm_host.h | 3 --- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 367a952f50ab..f365102c80f5 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -224,10 +224,6 @@ Shadow pages contain the following information: A bitmap indicating which sptes in spt point (directly or indirectly) at pages that may be unsynchronized. Used to quickly locate all unsychronized pages reachable from a given page. - mmu_valid_gen: - Generation number of the page. It is compared with kvm->arch.mmu_valid_gen - during hash table lookup, and used to skip invalidated shadow pages (see - "Zapping all pages" below.) clear_spte_count: Only present on 32-bit hosts, where a 64-bit spte cannot be written atomically. The reader uses this while running out of the MMU lock @@ -402,27 +398,6 @@ causes its disallow_lpage to be incremented, thus preventing instantiation of a large spte. The frames at the end of an unaligned memory slot have artificially inflated ->disallow_lpages so they can never be instantiated. -Zapping all pages (page generation count) -========================================= - -For the large memory guests, walking and zapping all pages is really slow -(because there are a lot of pages), and also blocks memory accesses of -all VCPUs because it needs to hold the MMU lock. - -To make it be more scalable, kvm maintains a global generation number -which is stored in kvm->arch.mmu_valid_gen. Every shadow page stores -the current global generation-number into sp->mmu_valid_gen when it -is created. Pages with a mismatching generation number are "obsolete". - -When KVM need zap all shadow pages sptes, it just simply increases the global -generation-number then reload root shadow pages on all vcpus. As the VCPUs -create new shadow page tables, the old pages are not used because of the -mismatching generation number. - -KVM then walks through all pages and zaps obsolete pages. While the zap -operation needs to take the MMU lock, the lock can be released periodically -so that the VCPUs can make progress. - Fast invalidation of MMIO sptes =============================== @@ -435,8 +410,7 @@ shadow pages, and is made more scalable with a similar technique. MMIO sptes have a few spare bits, which are used to store a generation number. The global generation number is stored in kvm_memslots(kvm)->generation, and increased whenever guest memory info -changes. This generation number is distinct from the one described in -the previous section. +changes. When KVM finds an MMIO spte, it checks the generation number of the spte. If the generation number of the spte does not equal the global generation diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c4758e1a8843..69daa57b08a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -333,10 +333,7 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - - /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ unsigned long mmu_valid_gen; - DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 From 4771450c345dc5e3e3417d82aff62e0d88e7eee6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:23 -0800 Subject: [PATCH 115/147] Revert "KVM: MMU: drop kvm_mmu_zap_mmio_sptes" Revert back to a dedicated (and slower) mechanism for handling the scenario where all MMIO shadow PTEs need to be zapped due to overflowing the MMIO generation number. The MMIO generation scenario is almost literally a one-in-a-million occurrence, i.e. is not a performance sensitive scenario. Restoring kvm_mmu_zap_mmio_sptes() leaves VM teardown as the only user of kvm_mmu_invalidate_zap_all_pages() and paves the way for removing the fast invalidate mechanism altogether. This reverts commit a8eca9dcc656a405a28ffba43f3d86a1ff0eb331. Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69daa57b08a7..aeca3fb1cf63 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -319,6 +319,7 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; bool unsync; + bool mmio_cached; /* * The following two entries are used to key the shadow page in the diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b81e2cad0237..d80c1558b23c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -391,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; + page_header(__pa(sptep))->mmio_cached = true; + trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -5942,6 +5944,24 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + + spin_lock(&kvm->mmu_lock); +restart: + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (!sp->mmio_cached) + continue; + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + } + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -5963,7 +5983,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_mmu_zap_mmio_sptes(kvm); } } From 571c5af06e303b4a69016193fd6b5afbc96eac40 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:24 -0800 Subject: [PATCH 116/147] KVM: x86/mmu: Voluntarily reschedule as needed when zapping MMIO sptes Call cond_resched_lock() when zapping MMIO to reschedule if needed or to release and reacquire mmu_lock in case of contention. There is no need to flush or zap when temporarily dropping mmu_lock as zapping MMIO sptes is done when holding the memslots lock and with the "update in-progress" bit set in the memslots generation, which disables MMIO spte caching. The walk does need to be restarted if mmu_lock is dropped as the active pages list may be modified. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d80c1558b23c..2190679eda39 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5954,7 +5954,8 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + cond_resched_lock(&kvm->mmu_lock)) goto restart; } From 5ff0568374ed2e585376a3832857ade5daccd381 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:25 -0800 Subject: [PATCH 117/147] KVM: x86/mmu: Remove is_obsolete() call Unwinding usage of is_obsolete() is a step towards removing x86's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This is a partial revert of commit 05988d728dcd ("KVM: MMU: reduce KVM_REQ_MMU_RELOAD when root page is zapped"). [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2190679eda39..6cbffc775220 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2713,11 +2713,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - /* - * The obsolete pages can not be used on any vcpus. - * See the comments in kvm_mmu_invalidate_zap_all_pages(). - */ - if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + if (!sp->role.invalid) kvm_reload_remote_mmus(kvm); } From 52d5dedc79bdcbac2976159a172069618cf31be5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:26 -0800 Subject: [PATCH 118/147] Revert "KVM: MMU: reclaim the zapped-obsolete page first" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 365c886860c4ba670d245e762b23987c912c129a. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 21 ++++----------------- arch/x86/kvm/x86.c | 1 - 3 files changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aeca3fb1cf63..fbe16a908076 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -851,7 +851,6 @@ struct kvm_arch { * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; - struct list_head zapped_obsolete_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6cbffc775220..255b0212fc5b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5858,6 +5858,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); int batch = 0; restart: @@ -5890,8 +5891,7 @@ restart: goto restart; } - ret = kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages); + ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); batch += ret; if (ret) @@ -5902,7 +5902,7 @@ restart: * Should flush tlb before free page tables since lockless-walking * may use the pages. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -5935,11 +5935,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) -{ - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); -} - static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; @@ -6011,24 +6006,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) + if (!kvm->arch.n_used_mmu_pages) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); -unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03d26ffb29cd..78fb13f190a3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9113,7 +9113,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); From 210f494261e1e84ad1f15877baa1c615afe3b342 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:27 -0800 Subject: [PATCH 119/147] Revert "KVM: MMU: collapse TLB flushes when zap all pages" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit f34d251d66ba263c077ed9d2bbd1874339a4c887. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 255b0212fc5b..e733262027ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2211,14 +2211,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -/* - * NOTE: we should pay more attention on the zapped-obsolete page - * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk - * since it has been deleted from active_mmu_pages but still can be found - * at hast list. - * - * for_each_valid_sp() has skipped that kind of pages. - */ #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ @@ -5881,13 +5873,11 @@ restart: if (sp->role.invalid) continue; - /* - * Need not flush tlb since we only zap the sp with invalid - * generation number. - */ if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + (need_resched() || spin_needbreak(&kvm->mmu_lock))) { batch = 0; + kvm_mmu_commit_zap_page(kvm, &invalid_list); + cond_resched_lock(&kvm->mmu_lock); goto restart; } @@ -5898,10 +5888,6 @@ restart: goto restart; } - /* - * Should flush tlb before free page tables since lockless-walking - * may use the pages. - */ kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -5920,17 +5906,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; - /* - * Notify all vcpus to reload its shadow page table - * and flush TLB. Then all vcpus will switch to new - * shadow page table with the new mmu_valid_gen. - * - * Note: we should do this under the protection of - * mmu-lock, otherwise, vcpu would purge shadow page - * but miss tlb flush. - */ - kvm_reload_remote_mmus(kvm); - kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } From 43d2b14b105fb00b8864c7b0ee7043cc1cc4a969 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:28 -0800 Subject: [PATCH 120/147] Revert "KVM: MMU: zap pages in batch" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit e7d11c7a894986a13817c1c001e1e7668c5c4eb4. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e733262027ed..cb9fd69d2632 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5846,18 +5846,14 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -#define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); - int batch = 0; restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { - int ret; - /* * No obsolete page exists before new created page since * active_mmu_pages is the FIFO list. @@ -5866,6 +5862,28 @@ restart: break; /* + * Do not repeatedly zap a root page to avoid unnecessary + * KVM_REQ_MMU_RELOAD, otherwise we may not be able to + * progress: + * vcpu 0 vcpu 1 + * call vcpu_enter_guest(): + * 1): handle KVM_REQ_MMU_RELOAD + * and require mmu-lock to + * load mmu + * repeat: + * 1): zap root page and + * send KVM_REQ_MMU_RELOAD + * + * 2): if (cond_resched_lock(mmu-lock)) + * + * 2): hold mmu-lock and load mmu + * + * 3): see KVM_REQ_MMU_RELOAD bit + * on vcpu->requests is set + * then return 1 to call + * vcpu_enter_guest() again. + * goto repeat; + * * Since we are reversely walking the list and the invalid * list will be moved to the head, skip the invalid page * can help us to avoid the infinity list walking. @@ -5873,18 +5891,13 @@ restart: if (sp->role.invalid) continue; - if (batch >= BATCH_ZAP_PAGES && - (need_resched() || spin_needbreak(&kvm->mmu_lock))) { - batch = 0; + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); cond_resched_lock(&kvm->mmu_lock); goto restart; } - ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - batch += ret; - - if (ret) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } From 42560fb1f3c6c7f730897b7fa7a478bc37e0be50 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:29 -0800 Subject: [PATCH 121/147] Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages" ...as part of removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 35006126f024f68727c67001b9cb703c38f69268. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 1 - arch/x86/kvm/mmutrace.h | 21 --------------------- 2 files changed, 22 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb9fd69d2632..df4025e6f1e1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5916,7 +5916,6 @@ restart: void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) { spin_lock(&kvm->mmu_lock); - trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; kvm_zap_obsolete_pages(kvm); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index c73bf4e4988c..cac88910b310 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -282,27 +282,6 @@ TRACE_EVENT( ) ); -TRACE_EVENT( - kvm_mmu_invalidate_zap_all_pages, - TP_PROTO(struct kvm *kvm), - TP_ARGS(kvm), - - TP_STRUCT__entry( - __field(unsigned long, mmu_valid_gen) - __field(unsigned int, mmu_used_pages) - ), - - TP_fast_assign( - __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; - __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; - ), - - TP_printk("kvm-mmu-valid-gen %lx used_pages %x", - __entry->mmu_valid_gen, __entry->mmu_used_pages - ) -); - - TRACE_EVENT( check_mmio_spte, TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), From b59c4830ca185ba0e9f9e046fb1cd10a4a92627a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:30 -0800 Subject: [PATCH 122/147] Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints" ...as part of removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 2248b023219251908aedda0621251cffc548f258. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmutrace.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index cac88910b310..9f6c855a0043 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -8,18 +8,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(unsigned long, mmu_valid_gen) \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->mmu_valid_gen = sp->mmu_valid_gen; \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -31,9 +29,8 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \ " %snxe %sad root %u %s%c", \ - __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ From 7390de1e99a70895721165d0ccd4a6e16482960a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:31 -0800 Subject: [PATCH 123/147] Revert "KVM: x86: use the fast way to invalidate all pages" Revert to a slow kvm_mmu_zap_all() for kvm_arch_flush_shadow_all(). Flushing all shadow entries is only done during VM teardown, i.e. kvm_arch_flush_shadow_all() is only called when the associated MM struct is being released or when the VM instance is being freed. Although the performance of teardown itself isn't critical, KVM should still voluntarily schedule to play nice with the rest of the kernel; but that can be done without the fast invalidate mechanism in a future patch. This reverts commit 6ca18b6950f8dee29361722f28f69847724b276f. Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 15 +++++++++++++++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index df4025e6f1e1..5cdeb88850f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5846,6 +5846,21 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + + spin_lock(&kvm->mmu_lock); +restart: + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); +} + static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78fb13f190a3..65e4559eef2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9470,7 +9470,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_mmu_zap_all(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, From 8a674adc11cd4cc59e51eaea6f0cc4b3d5710411 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:32 -0800 Subject: [PATCH 124/147] KVM: x86/mmu: skip over invalid root pages when zapping all sptes ...to guarantee forward progress. When zapped, root pages are marked invalid and moved to the head of the active pages list until they are explicitly freed. Theoretically, having unzappable root pages at the head of the list could prevent kvm_mmu_zap_all() from making forward progress were a future patch to add a loop restart after processing a page, e.g. to drop mmu_lock on contention. Although kvm_mmu_prepare_zap_page() can theoretically take action on invalid pages, e.g. to zap unsync children, functionally it's not necessary (root pages will be re-zapped when freed) and practically speaking the odds of e.g. @unsync or @unsync_children becoming %true while zapping all pages is basically nil. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5cdeb88850f8..c79ad7f31fdb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5853,9 +5853,12 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (sp->role.invalid && sp->root_count) + continue; if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; + } kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); From 5d6317ca4e61a3fa7528f832cd945c42fde8e67f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:33 -0800 Subject: [PATCH 125/147] KVM: x86/mmu: Voluntarily reschedule as needed when zapping all sptes Call cond_resched_lock() when zapping all sptes to reschedule if needed or to release and reacquire mmu_lock in case of contention. There is no need to flush or zap when temporarily dropping mmu_lock as zapping all sptes is done only when the owning userspace VMM has exited or when the VM is being destroyed, i.e. there is no interplay with memslots or MMIO generations to worry about. Be paranoid and restart the walk if mmu_lock is dropped to avoid any potential issues with consuming a stale iterator. The overhead in doing so is negligible as at worst there will be a few root shadow pages at the head of the list, i.e. the iterator is essentially the head of the list already. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c79ad7f31fdb..fa153d771f47 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5856,7 +5856,8 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (sp->role.invalid && sp->root_count) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + cond_resched_lock(&kvm->mmu_lock)) goto restart; } From ea145aacf4ae8485cf179a4d0dc502e9f75044f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:34 -0800 Subject: [PATCH 126/147] Revert "KVM: MMU: fast invalidate all pages" Remove x86 KVM's fast invalidate mechanism, i.e. revert all patches from the original series[1], now that all users of the fast invalidate mechanism are gone. This reverts commit 5304b8d37c2a5ebca48330f5e7868d240eafbed1. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 - arch/x86/kvm/mmu.c | 98 +-------------------------------- arch/x86/kvm/mmu.h | 1 - 3 files changed, 1 insertion(+), 100 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fbe16a908076..9417febf8490 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -334,7 +334,6 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - unsigned long mmu_valid_gen; DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 @@ -845,7 +844,6 @@ struct kvm_arch { unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; - unsigned long mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fa153d771f47..6d602d4c3ca4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2060,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * The active_mmu_pages list is the FIFO list, do not move the - * page until it is zapped. kvm_zap_obsolete_pages depends on - * this feature. See the comments in kvm_zap_obsolete_pages(). - */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; @@ -2214,7 +2208,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + if ((_sp)->role.invalid) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ @@ -2266,11 +2260,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif -static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); -} - static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2495,7 +2484,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); @@ -4206,14 +4194,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; if (cached_root_available(vcpu, new_cr3, new_role)) { - /* - * It is possible that the cached previous root page is - * obsolete because of a change in the MMU - * generation number. However, that is accompanied by - * KVM_REQ_MMU_RELOAD, which will free the root that we - * have set here and allocate a new one. - */ - kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -5865,82 +5845,6 @@ restart: spin_unlock(&kvm->mmu_lock); } -static void kvm_zap_obsolete_pages(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - -restart: - list_for_each_entry_safe_reverse(sp, node, - &kvm->arch.active_mmu_pages, link) { - /* - * No obsolete page exists before new created page since - * active_mmu_pages is the FIFO list. - */ - if (!is_obsolete_sp(kvm, sp)) - break; - - /* - * Do not repeatedly zap a root page to avoid unnecessary - * KVM_REQ_MMU_RELOAD, otherwise we may not be able to - * progress: - * vcpu 0 vcpu 1 - * call vcpu_enter_guest(): - * 1): handle KVM_REQ_MMU_RELOAD - * and require mmu-lock to - * load mmu - * repeat: - * 1): zap root page and - * send KVM_REQ_MMU_RELOAD - * - * 2): if (cond_resched_lock(mmu-lock)) - * - * 2): hold mmu-lock and load mmu - * - * 3): see KVM_REQ_MMU_RELOAD bit - * on vcpu->requests is set - * then return 1 to call - * vcpu_enter_guest() again. - * goto repeat; - * - * Since we are reversely walking the list and the invalid - * list will be moved to the head, skip the invalid page - * can help us to avoid the infinity list walking. - */ - if (sp->role.invalid) - continue; - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); - cond_resched_lock(&kvm->mmu_lock); - goto restart; - } - - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; - } - - kvm_mmu_commit_zap_page(kvm, &invalid_list); -} - -/* - * Fast invalidate all shadow pages and use lock-break technique - * to zap obsolete pages. - * - * It's required when memslot is being deleted or VM is being - * destroyed, in these cases, we should ensure that KVM MMU does - * not use any resource of the being-deleted slot or all slots - * after calling the function. - */ -void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) -{ - spin_lock(&kvm->mmu_lock); - kvm->arch.mmu_valid_gen++; - - kvm_zap_obsolete_pages(kvm); - spin_unlock(&kvm->mmu_lock); -} - static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c7b333147c4a..bbdc60f2fae8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); From 83cdb56864bcb1466b454f17fff47348ca7925a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:35 -0800 Subject: [PATCH 127/147] KVM: x86/mmu: Differentiate between nr zapped and list unstable The return value of kvm_mmu_prepare_zap_page() has evolved to become overloaded to convey two separate pieces of information. 1) was at least one page zapped and 2) has the list of MMU pages become unstable. In it's original incarnation (as kvm_mmu_zap_page()), there was no return value at all. Commit 0738541396be ("KVM: MMU: awareness of new kvm_mmu_zap_page behaviour") added a return value in preparation for commit 4731d4c7a077 ("KVM: MMU: out of sync shadow core"). Although the return value was of type 'int', it was actually used as a boolean to indicate whether or not active_mmu_pages may have become unstable due to zapping children. Walking a list with list_for_each_entry_safe() only protects against deleting/moving the current entry, i.e. zapping a child page would break iteration due to modifying any number of entries. Later, commit 60c8aec6e2c9 ("KVM: MMU: use page array in unsync walk") modified mmu_zap_unsync_children() to return an approximation of the number of children zapped. This was not intentional, it was simply a side effect of how the code was written. The unintented side affect was then morphed into an actual feature by commit 77662e0028c7 ("KVM: MMU: fix kvm_mmu_zap_page() and its calling path"), which modified kvm_mmu_change_mmu_pages() to use the number of zapped pages when determining the number of MMU pages in use by the VM. Finally, commit 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed") added the initial page to the return value to make its behavior more consistent with what most users would expect. Incorporating the initial parent page in the return value of kvm_mmu_zap_page() breaks the original usage of restarting a list walk on a non-zero return value to handle a potentially unstable list, i.e. walks will unnecessarily restart when any page is zapped. Fix this by restoring the original behavior of kvm_mmu_zap_page(), i.e. return a boolean to indicate that the list may be unstable and move the number of zapped children to a dedicated parameter. Since the majority of callers to kvm_mmu_prepare_zap_page() don't care about either return value, preserve the current definition of kvm_mmu_prepare_zap_page() by making it a wrapper of a new helper, __kvm_mmu_prepare_zap_page(). This avoids having to update every call site and also provides cleaner code for functions that only care about the number of pages zapped. Fixes: 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6d602d4c3ca4..4b93fcdf0839 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2200,8 +2200,8 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list); +static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); @@ -2669,17 +2669,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list, + int *nr_zapped) { - int ret; + bool list_unstable; trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp, invalid_list); + *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + /* Zapping children means active_mmu_pages has become unstable. */ + list_unstable = *nr_zapped; + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp); @@ -2687,7 +2692,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { /* Count self */ - ret++; + (*nr_zapped)++; list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { @@ -2698,7 +2703,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, } sp->role.invalid = 1; - return ret; + return list_unstable; +} + +static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) +{ + int nr_zapped; + + __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); + return nr_zapped; } static void kvm_mmu_commit_zap_page(struct kvm *kvm, @@ -5830,13 +5844,14 @@ void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); + int ign; spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (sp->role.invalid && sp->root_count) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || cond_resched_lock(&kvm->mmu_lock)) goto restart; } @@ -5849,13 +5864,14 @@ static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); + int ign; spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || cond_resched_lock(&kvm->mmu_lock)) goto restart; } From 24efe61f696c7b44a66c7d6a41c181b31aa338fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:36 -0800 Subject: [PATCH 128/147] KVM: x86/mmu: WARN if zapping a MMIO spte results in zapping children Paolo expressed a concern that kvm_mmu_zap_mmio_sptes() could have a quadratic runtime[1], i.e. restarting the spte walk while zapping only MMIO sptes could result in re-walking large portions of the list over and over due to the non-MMIO sptes encountered before the restart not being removed. At the time, the concern was legitimate as the walk was restarted when any spte was zapped. But that is no longer the case as the walk is now restarted iff one or more children have been zapped, which is necessary because zapping children makes the active_mmu_pages list unstable. Furthermore, it should be impossible for an MMIO spte to have children, i.e. zapping an MMIO spte should never result in zapping children. In other words, kvm_mmu_zap_mmio_sptes() should never restart its walk, and so should always execute in linear time. WARN if this assertion fails. Although it should never be needed, leave the restart logic in place. In normal operation, the cost is at worst an extra CMP+Jcc, and if for some reason the list does become unstable, not restarting would likely crash KVM, or worse, the kernel. [1] https://patchwork.kernel.org/patch/10756589/#22452085 Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4b93fcdf0839..81618070367b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5871,8 +5871,11 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || - cond_resched_lock(&kvm->mmu_lock)) + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { + WARN_ON_ONCE(1); + goto restart; + } + if (cond_resched_lock(&kvm->mmu_lock)) goto restart; } From 8ab3c471eef20925bf64c6d4fa46e88cdb4e86d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:37 -0800 Subject: [PATCH 129/147] KVM: x86/mmu: Consolidate kvm_mmu_zap_all() and kvm_mmu_zap_mmio_sptes() ...via a new helper, __kvm_mmu_zap_all(). An alternative to passing a 'bool mmio_only' would be to pass a callback function to filter the shadow page, i.e. to make __kvm_mmu_zap_all() generic and reusable, but zapping all shadow pages is a last resort, i.e. making the helper less extensible is a feature of sorts. And the explicit MMIO parameter makes it easy to preserve the WARN_ON_ONCE() if a restart is triggered when zapping MMIO sptes. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 81618070367b..8d43b7c0f56f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5840,7 +5840,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -void kvm_mmu_zap_all(struct kvm *kvm) +static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -5849,30 +5849,12 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (mmio_only && !sp->mmio_cached) + continue; if (sp->role.invalid && sp->root_count) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || - cond_resched_lock(&kvm->mmu_lock)) - goto restart; - } - - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); -} - -static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - int ign; - - spin_lock(&kvm->mmu_lock); -restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (!sp->mmio_cached) - continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(mmio_only); goto restart; } if (cond_resched_lock(&kvm->mmu_lock)) @@ -5883,6 +5865,11 @@ restart: spin_unlock(&kvm->mmu_lock); } +void kvm_mmu_zap_all(struct kvm *kvm) +{ + return __kvm_mmu_zap_all(kvm, false); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -5904,7 +5891,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - kvm_mmu_zap_mmio_sptes(kvm); + __kvm_mmu_zap_all(kvm, true); } } From 7fa08e71b4a0591a518814fa78b32e124f90d587 Mon Sep 17 00:00:00 2001 From: Nir Weiner Date: Sun, 27 Jan 2019 12:17:14 +0200 Subject: [PATCH 130/147] KVM: grow_halt_poll_ns() should never shrink vCPU halt_poll_ns grow_halt_poll_ns() have a strange behavior in case (halt_poll_ns_grow == 0) && (vcpu->halt_poll_ns != 0). In this case, vcpu->halt_pol_ns will be set to zero. That results in shrinking instead of growing. Fix issue by changing grow_halt_poll_ns() to not modify vcpu->halt_poll_ns in case halt_poll_ns_grow is zero Reviewed-by: Boris Ostrovsky Reviewed-by: Liran Alon Signed-off-by: Nir Weiner Suggested-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 5 ++++- virt/kvm/kvm_main.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5a066fc299e1..e316a2ddb70b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3631,8 +3631,11 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, static void grow_halt_poll_ns(struct kvmppc_vcore *vc) { + if (!halt_poll_ns_grow) + return; + /* 10us base */ - if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) + if (vc->halt_poll_ns == 0) vc->halt_poll_ns = 10000; else vc->halt_poll_ns *= halt_poll_ns_grow; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c9d0bc01f8cb..9c8a8bf6e686 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2188,8 +2188,11 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) old = val = vcpu->halt_poll_ns; grow = READ_ONCE(halt_poll_ns_grow); + if (!grow) + goto out; + /* 10us base */ - if (val == 0 && grow) + if (val == 0) val = 10000; else val *= grow; @@ -2198,6 +2201,7 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) val = halt_poll_ns; vcpu->halt_poll_ns = val; +out: trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); } From 49113d360bdeb4dd916fb6bffbcc3e157422b6fd Mon Sep 17 00:00:00 2001 From: Nir Weiner Date: Sun, 27 Jan 2019 12:17:15 +0200 Subject: [PATCH 131/147] KVM: Expose the initial start value in grow_halt_poll_ns() as a module parameter The hard-coded value 10000 in grow_halt_poll_ns() stands for the initial start value when raising up vcpu->halt_poll_ns. It actually sets the first timeout to the first polling session. This value has significant effect on how tolerant we are to outliers. On the standard case, higher value is better - we will spend more time in the polling busyloop, handle events/interrupts faster and result in better performance. But on outliers it puts us in a busy loop that does nothing. Even if the shrink factor is zero, we will still waste time on the first iteration. The optimal value changes between different workloads. It depends on outliers rate and polling sessions length. As this value has significant effect on the dynamic halt-polling algorithm, it should be configurable and exposed. Reviewed-by: Boris Ostrovsky Reviewed-by: Liran Alon Signed-off-by: Nir Weiner Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/halt-polling.txt | 37 ++++++++++++++-------- arch/powerpc/kvm/book3s_hv.c | 3 +- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 8 +++-- 4 files changed, 31 insertions(+), 18 deletions(-) diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virtual/kvm/halt-polling.txt index 4a8418318769..4f791b128dd2 100644 --- a/Documentation/virtual/kvm/halt-polling.txt +++ b/Documentation/virtual/kvm/halt-polling.txt @@ -53,7 +53,8 @@ the global max polling interval then the polling interval can be increased in the hope that next time during the longer polling interval the wake up source will be received while the host is polling and the latency benefits will be received. The polling interval is grown in the function grow_halt_poll_ns() and -is multiplied by the module parameter halt_poll_ns_grow. +is multiplied by the module parameters halt_poll_ns_grow and +halt_poll_ns_grow_start. In the event that the total block time was greater than the global max polling interval then the host will never poll for long enough (limited by the global @@ -80,22 +81,30 @@ shrunk. These variables are defined in include/linux/kvm_host.h and as module parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the powerpc kvm-hv case. -Module Parameter | Description | Default Value +Module Parameter | Description | Default Value -------------------------------------------------------------------------------- -halt_poll_ns | The global max polling interval | KVM_HALT_POLL_NS_DEFAULT - | which defines the ceiling value | - | of the polling interval for | (per arch value) - | each vcpu. | +halt_poll_ns | The global max polling | KVM_HALT_POLL_NS_DEFAULT + | interval which defines | + | the ceiling value of the | + | polling interval for | (per arch value) + | each vcpu. | -------------------------------------------------------------------------------- -halt_poll_ns_grow | The value by which the halt | 2 - | polling interval is multiplied | - | in the grow_halt_poll_ns() | - | function. | +halt_poll_ns_grow | The value by which the | 2 + | halt polling interval is | + | multiplied in the | + | grow_halt_poll_ns() | + | function. | -------------------------------------------------------------------------------- -halt_poll_ns_shrink | The value by which the halt | 0 - | polling interval is divided in | - | the shrink_halt_poll_ns() | - | function. | +halt_poll_ns_grow_start | The initial value to grow | 10000 + | to from zero in the | + | grow_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- +halt_poll_ns_shrink | The value by which the | 0 + | halt polling interval is | + | divided in the | + | shrink_halt_poll_ns() | + | function. | -------------------------------------------------------------------------------- These module parameters can be set from the debugfs files in: diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e316a2ddb70b..29ffc99bd79b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3634,9 +3634,8 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc) if (!halt_poll_ns_grow) return; - /* 10us base */ if (vc->halt_poll_ns == 0) - vc->halt_poll_ns = 10000; + vc->halt_poll_ns = halt_poll_ns_grow_start; else vc->halt_poll_ns *= halt_poll_ns_grow; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 85c0c00d5159..9d55c63db09b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1203,6 +1203,7 @@ extern bool kvm_rebooting; extern unsigned int halt_poll_ns; extern unsigned int halt_poll_ns_grow; +extern unsigned int halt_poll_ns_grow_start; extern unsigned int halt_poll_ns_shrink; struct kvm_device { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9c8a8bf6e686..ae818d27a1a4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -81,6 +81,11 @@ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow); +/* The start value to grow halt_poll_ns from */ +unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ +module_param(halt_poll_ns_grow_start, uint, 0644); +EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); + /* Default resets per-vcpu halt_poll_ns . */ unsigned int halt_poll_ns_shrink; module_param(halt_poll_ns_shrink, uint, 0644); @@ -2191,9 +2196,8 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) if (!grow) goto out; - /* 10us base */ if (val == 0) - val = 10000; + val = halt_poll_ns_grow_start; else val *= grow; From dee339b5c1da3e6fa139b97f74f99dc9f0b03ff6 Mon Sep 17 00:00:00 2001 From: Nir Weiner Date: Sun, 27 Jan 2019 12:17:16 +0200 Subject: [PATCH 132/147] KVM: Never start grow vCPU halt_poll_ns from value below halt_poll_ns_grow_start grow_halt_poll_ns() have a strange behaviour in case (vcpu->halt_poll_ns != 0) && (vcpu->halt_poll_ns < halt_poll_ns_grow_start). In this case, vcpu->halt_poll_ns will be multiplied by grow factor (halt_poll_ns_grow) which will require several grow iteration in order to reach a value bigger than halt_poll_ns_grow_start. This means that growing vcpu->halt_poll_ns from value of 0 is slower than growing it from a positive value less than halt_poll_ns_grow_start. Which is misleading and inaccurate. Fix issue by changing grow_halt_poll_ns() to set vcpu->halt_poll_ns to halt_poll_ns_grow_start in any case that (vcpu->halt_poll_ns < halt_poll_ns_grow_start). Regardless if vcpu->halt_poll_ns is 0. use READ_ONCE to get a consistent number for all cases. Reviewed-by: Boris Ostrovsky Reviewed-by: Liran Alon Signed-off-by: Nir Weiner Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 5 ++--- virt/kvm/kvm_main.c | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 29ffc99bd79b..062f3c92c871 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3634,10 +3634,9 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc) if (!halt_poll_ns_grow) return; - if (vc->halt_poll_ns == 0) + vc->halt_poll_ns *= halt_poll_ns_grow; + if (vc->halt_poll_ns < halt_poll_ns_grow_start) vc->halt_poll_ns = halt_poll_ns_grow_start; - else - vc->halt_poll_ns *= halt_poll_ns_grow; } static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ae818d27a1a4..5087cf703ed1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2189,17 +2189,17 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { - unsigned int old, val, grow; + unsigned int old, val, grow, grow_start; old = val = vcpu->halt_poll_ns; + grow_start = READ_ONCE(halt_poll_ns_grow_start); grow = READ_ONCE(halt_poll_ns_grow); if (!grow) goto out; - if (val == 0) - val = halt_poll_ns_grow_start; - else - val *= grow; + val *= grow; + if (val < grow_start) + val = grow_start; if (val > halt_poll_ns) val = halt_poll_ns; From 7539b174aef405d9d57db48c58390ba360c91312 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 4 Jan 2019 15:54:14 -0200 Subject: [PATCH 133/147] x86: kvmguest: use TSC clocksource if invariant TSC is exposed The invariant TSC bit has the following meaning: "The time stamp counter in newer processors may support an enhancement, referred to as invariant TSC. Processor's support for invariant TSC is indicated by CPUID.80000007H:EDX[8]. The invariant TSC will run at a constant rate in all ACPI P-, C-. and T-states. This is the architectural behavior moving forward. On processors with invariant TSC support, the OS may use the TSC for wall clock timer services (instead of ACPI or HPET timers). TSC reads are much more efficient and do not incur the overhead associated with a ring transition or access to a platform resource." IOW, TSC does not change frequency. In such case, and with TSC scaling hardware available to handle migration, it is possible to use the TSC clocksource directly, whose system calls are faster. Reduce the rating of kvmclock clocksource to allow TSC clocksource to be the default if invariant TSC is exposed. Signed-off-by: Marcelo Tosatti v2: Use feature bits and tsc_unstable() check (Sean Christopherson) Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d908a37bf3f3..904494b924c1 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -351,6 +351,20 @@ void __init kvmclock_init(void) machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); + + /* + * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate + * with P/T states and does not stop in deep C-states. + * + * Invariant TSC exposed by host means kvmclock is not necessary: + * can use TSC as clocksource. + * + */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + !check_tsc_unstable()) + kvm_clock.rating = 299; + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } From a67794cafbc4594debf53dbe4e2a7708426f492e Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sat, 2 Feb 2019 17:20:27 +0800 Subject: [PATCH 134/147] Revert "KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()" The value of "dirty_bitmap[i]" is already check before setting its value to mask. The following check of "mask" is redundant. The check of "mask" was introduced by commit 58d2930f4ee3 ("KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()"), revert it. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5087cf703ed1..276af92ace6c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1205,11 +1205,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; - if (mask) { - offset = i * BITS_PER_LONG; - kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, - offset, mask); - } + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); } spin_unlock(&kvm->mmu_lock); } From e40542aff909ac34d2c24712c5c0769c8f77f895 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Thu, 21 Feb 2019 14:28:48 +1100 Subject: [PATCH 135/147] KVM: PPC: Book3S HV: Fix build failure without IOMMU support Currently trying to build without IOMMU support will fail: (.text+0x1380): undefined reference to `kvmppc_h_get_tce' (.text+0x1384): undefined reference to `kvmppc_rm_h_put_tce' (.text+0x149c): undefined reference to `kvmppc_rm_h_stuff_tce' (.text+0x14a0): undefined reference to `kvmppc_rm_h_put_tce_indirect' This happens because turning off IOMMU support will prevent book3s_64_vio_hv.c from being built because it is only built when SPAPR_TCE_IOMMU is set, which depends on IOMMU support. Fix it using ifdefs for the undefined references. Fixes: 76d837a4c0f9 ("KVM: PPC: Book3S PR: Don't include SPAPR TCE code on non-pseries platforms") Signed-off-by: Jordan Niethe Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 2 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1860c0bc1395..ba8db3881ef9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -937,6 +937,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5)); break; +#ifdef CONFIG_SPAPR_TCE_IOMMU case H_GET_TCE: ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5)); @@ -966,6 +967,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (ret == H_TOO_HARD) return RESUME_HOST; break; +#endif case H_RANDOM: if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) ret = H_HARDWARE; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9b8d50a7cbaf..541b121477e4 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2264,8 +2264,13 @@ hcall_real_table: .long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table .long DOTSYM(kvmppc_h_protect) - hcall_real_table +#ifdef CONFIG_SPAPR_TCE_IOMMU .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table .long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table +#else + .long 0 /* 0x1c */ + .long 0 /* 0x20 */ +#endif .long 0 /* 0x24 - H_SET_SPRG0 */ .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table .long 0 /* 0x2c */ @@ -2343,8 +2348,13 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table +#ifdef CONFIG_SPAPR_TCE_IOMMU .long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table +#else + .long 0 /* 0x138 */ + .long 0 /* 0x13c */ +#endif .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ From 716cb1160819721c39f807e103d9c307dbca2cf4 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Feb 2019 14:44:14 +1100 Subject: [PATCH 136/147] KVM: PPC: Book3S: Improve KVM reference counting The anon fd's ops releases the KVM reference in the release hook. However we reference the KVM object after we create the fd so there is small window when the release function can be called and dereferenced the KVM object which potentially may free it. It is not a problem at the moment as the file is created and KVM is referenced under the KVM lock and the release function obtains the same lock before dereferencing the KVM (although the lock is not held when calling kvm_put_kvm()) but it is potentially fragile against future changes. This references the KVM object before creating a file. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 6630dde56668..f02b04973710 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -337,14 +337,15 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, } } + kvm_get_kvm(kvm); if (!ret) ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, stt, O_RDWR | O_CLOEXEC); - if (ret >= 0) { + if (ret >= 0) list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); - kvm_get_kvm(kvm); - } + else + kvm_put_kvm(kvm); mutex_unlock(&kvm->lock); From 7f5d9c1bc0e6c13ba01546e9c6fda083764a4e9c Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Fri, 22 Feb 2019 14:34:29 +0800 Subject: [PATCH 137/147] KVM: arm/arm64: Remove unused timer variable The 'timer' local variable became unused after commit bee038a67487 ("KVM: arm/arm64: Rework the timer code to use a timer_map"). Remove it to avoid [-Wunused-but-set-variable] warning. Cc: Christoffer Dall Cc: James Morse Cc: Suzuki K Pouloze Reviewed-by: Julien Thierry Signed-off-by: Shaokun Zhang Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index af8f2f1d01cc..3417f2dbc366 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -236,14 +236,12 @@ static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { - struct arch_timer_cpu *timer; enum kvm_arch_timers index; u64 cval, now; if (!timer_ctx) return false; - timer = vcpu_timer(timer_ctx->vcpu); index = arch_timer_ctx_index(timer_ctx); if (timer_ctx->loaded) { From c88b093693ccbe41991ef2e9b1d251945e6e54ed Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Thu, 21 Feb 2019 11:42:32 +0000 Subject: [PATCH 138/147] arm64: KVM: Fix architecturally invalid reset value for FPEXC32_EL2 Due to what looks like a typo dating back to the original addition of FPEXC32_EL2 handling, KVM currently initialises this register to an architecturally invalid value. As a result, the VECITR field (RES1) in bits [10:8] is initialised with 0, and the two reserved (RES0) bits [6:5] are initialised with 1. (In the Common VFP Subarchitecture as specified by ARMv7-A, these two bits were IMP DEF. ARMv8-A removes them.) This patch changes the reset value from 0x70 to 0x700, which reflects the architectural constraints and is presumably what was originally intended. Cc: # 4.12.x- Cc: Christoffer Dall Fixes: 62a89c44954f ("arm64: KVM: 32bit handling of coprocessor traps") Signed-off-by: Dave Martin Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 81a342679e60..a398d04274b7 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1523,7 +1523,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, - { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x70 }, + { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, }; static bool trap_dbgidr(struct kvm_vcpu *vcpu, From 346fa2f891c71a9b98014f8f62c15f4c7dd95ec1 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 18 Feb 2019 07:48:25 -0500 Subject: [PATCH 139/147] KVM: s390: implement subfunction processor calls While we will not implement interception for query functions yet, we can and should disable functions that have a control bit based on the given CPU model. Let us start with enabling the subfunction interface. Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Janosch Frank Reviewed-by: Cornelia Huck --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/kvm-s390.c | 35 ++++++++++++++++++++------------ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index c5f51566ecd6..3369677a7df7 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -711,6 +711,7 @@ struct s390_io_adapter { struct kvm_s390_cpu_model { /* facility mask supported by kvm & hosting machine */ __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; + struct kvm_s390_vm_cpu_subfunc subfuncs; /* facility list requested by guest (in dma page) */ __u64 *fac_list; u64 cpuid; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2e47c724679e..82a95afa6629 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1266,11 +1266,20 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, static int kvm_s390_set_processor_subfunc(struct kvm *kvm, struct kvm_device_attr *attr) { - /* - * Once supported by kernel + hw, we have to store the subfunctions - * in kvm->arch and remember that user space configured them. - */ - return -ENXIO; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + if (copy_from_user(&kvm->arch.model.subfuncs, (void __user *)attr->addr, + sizeof(struct kvm_s390_vm_cpu_subfunc))) { + mutex_unlock(&kvm->lock); + return -EFAULT; + } + mutex_unlock(&kvm->lock); + + return 0; } static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) @@ -1389,12 +1398,11 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm, static int kvm_s390_get_processor_subfunc(struct kvm *kvm, struct kvm_device_attr *attr) { - /* - * Once we can actually configure subfunctions (kernel + hw support), - * we have to check if they were already set by user space, if so copy - * them from kvm->arch. - */ - return -ENXIO; + if (copy_to_user((void __user *)attr->addr, &kvm->arch.model.subfuncs, + sizeof(struct kvm_s390_vm_cpu_subfunc))) + return -EFAULT; + + return 0; } static int kvm_s390_get_machine_subfunc(struct kvm *kvm, @@ -1405,6 +1413,7 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, return -EFAULT; return 0; } + static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1522,10 +1531,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR_FEAT: case KVM_S390_VM_CPU_MACHINE_FEAT: case KVM_S390_VM_CPU_MACHINE_SUBFUNC: + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: ret = 0; break; - /* configuring subfunctions is not supported yet */ - case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: default: ret = -ENXIO; break; @@ -2227,6 +2235,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] & kvm_s390_fac_base[i]; } + kvm->arch.model.subfuncs = kvm_s390_available_subfunc; /* we are always in czam mode - even on pre z14 machines */ set_kvm_facility(kvm->arch.model.fac_mask, 138); From 11ba5961a2156a4f210627ed8421387e2531b100 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 20 Feb 2019 11:38:42 -0500 Subject: [PATCH 140/147] KVM: s390: add debug logging for cpu model subfunctions As userspace can now get/set the subfunctions we want to trace those. This will allow to also check QEMUs cpu model vs. what the real hardware provides. Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Reviewed-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 136 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 82a95afa6629..4638303ba6a8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1279,6 +1279,51 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm, } mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "SET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]); + VM_EVENT(kvm, 3, "SET: guest PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]); + VM_EVENT(kvm, 3, "SET: guest KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]); + VM_EVENT(kvm, 3, "SET: guest KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]); + VM_EVENT(kvm, 3, "SET: guest KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.km)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]); + VM_EVENT(kvm, 3, "SET: guest KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]); + VM_EVENT(kvm, 3, "SET: guest KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]); + VM_EVENT(kvm, 3, "SET: guest PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]); + VM_EVENT(kvm, 3, "SET: guest KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]); + VM_EVENT(kvm, 3, "SET: guest KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]); + VM_EVENT(kvm, 3, "SET: guest KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]); + VM_EVENT(kvm, 3, "SET: guest PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]); + VM_EVENT(kvm, 3, "SET: guest PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]); + VM_EVENT(kvm, 3, "SET: guest KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + return 0; } @@ -1402,6 +1447,51 @@ static int kvm_s390_get_processor_subfunc(struct kvm *kvm, sizeof(struct kvm_s390_vm_cpu_subfunc))) return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]); + VM_EVENT(kvm, 3, "GET: guest PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]); + VM_EVENT(kvm, 3, "GET: guest KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]); + VM_EVENT(kvm, 3, "GET: guest KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]); + VM_EVENT(kvm, 3, "GET: guest KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.km)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]); + VM_EVENT(kvm, 3, "GET: guest KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]); + VM_EVENT(kvm, 3, "GET: guest KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]); + VM_EVENT(kvm, 3, "GET: guest PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]); + VM_EVENT(kvm, 3, "GET: guest KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]); + VM_EVENT(kvm, 3, "GET: guest KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]); + VM_EVENT(kvm, 3, "GET: guest KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]); + VM_EVENT(kvm, 3, "GET: guest PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]); + VM_EVENT(kvm, 3, "GET: guest PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]); + VM_EVENT(kvm, 3, "GET: guest KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + return 0; } @@ -1411,6 +1501,52 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc, sizeof(struct kvm_s390_vm_cpu_subfunc))) return -EFAULT; + + VM_EVENT(kvm, 3, "GET: host PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.plo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[1], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[2], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[3]); + VM_EVENT(kvm, 3, "GET: host PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.ptff)[0], + ((unsigned long *) &kvm_s390_available_subfunc.ptff)[1]); + VM_EVENT(kvm, 3, "GET: host KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmac)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmac)[1]); + VM_EVENT(kvm, 3, "GET: host KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmc)[1]); + VM_EVENT(kvm, 3, "GET: host KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.km)[0], + ((unsigned long *) &kvm_s390_available_subfunc.km)[1]); + VM_EVENT(kvm, 3, "GET: host KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kimd)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kimd)[1]); + VM_EVENT(kvm, 3, "GET: host KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.klmd)[0], + ((unsigned long *) &kvm_s390_available_subfunc.klmd)[1]); + VM_EVENT(kvm, 3, "GET: host PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[1]); + VM_EVENT(kvm, 3, "GET: host KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[1]); + VM_EVENT(kvm, 3, "GET: host KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmf)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmf)[1]); + VM_EVENT(kvm, 3, "GET: host KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmo)[1]); + VM_EVENT(kvm, 3, "GET: host PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pcc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pcc)[1]); + VM_EVENT(kvm, 3, "GET: host PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.ppno)[0], + ((unsigned long *) &kvm_s390_available_subfunc.ppno)[1]); + VM_EVENT(kvm, 3, "GET: host KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kma)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kma)[1]); + return 0; } From a242010776f880ec0d8917aae0406e4a70cdc36c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 22 Feb 2019 16:10:09 +0800 Subject: [PATCH 141/147] KVM: Minor cleanups for kvm_main.c This patch contains two minor cleanups: firstly it puts exported symbol for kvm_io_bus_write() by following the function definition; secondly it removes a redundant blank line. Signed-off-by: Leo Yan Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 276af92ace6c..0fb0e9aa0935 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3631,6 +3631,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } +EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, @@ -3681,7 +3682,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, @@ -3703,7 +3703,6 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, return r < 0 ? r : 0; } - /* Caller must hold slots_lock. */ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) From e74d53e30e2927fa5b223296ac7922baf15ea89a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 25 Feb 2019 14:35:06 +1100 Subject: [PATCH 142/147] KVM: PPC: Fix compilation when KVM is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with CONFIG_PPC_POWERNV=y and KVM disabled currently gives an error like this: CC arch/powerpc/kernel/dbell.o In file included from arch/powerpc/kernel/dbell.c:20:0: arch/powerpc/include/asm/kvm_ppc.h: In function ‘xics_on_xive’: arch/powerpc/include/asm/kvm_ppc.h:625:9: error: implicit declaration of function ‘xive_enabled’ [-Werror=implicit-function-declaration] return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE); ^ cc1: all warnings being treated as errors scripts/Makefile.build:276: recipe for target 'arch/powerpc/kernel/dbell.o' failed make[3]: *** [arch/powerpc/kernel/dbell.o] Error 1 Fix this by making the xics_on_xive() definition conditional on the same symbol (CONFIG_KVM_BOOK3S_64_HANDLER) that determines whether we include or not, since that's the header that defines xive_enabled(). Fixes: 03f953329bd8 ("KVM: PPC: Book3S: Allow XICS emulation to work in nested hosts using XIVE") Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d283d3179fbc..ac22b28ae78d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -619,7 +619,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ -#ifdef CONFIG_PPC_POWERNV +#if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER) static inline bool xics_on_xive(void) { return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE); From 2b57ecd0208f7ac0b20b1b171698f027481a39f6 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 1 Mar 2019 14:25:16 +1100 Subject: [PATCH 143/147] KVM: PPC: Book3S: Add count cache flush parameters to kvmppc_get_cpu_char() Add KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST & KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE to the characteristics returned from the H_GET_CPU_CHARACTERISTICS H-CALL, as queried from either the hypervisor or the device tree. Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras --- arch/powerpc/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/kvm/powerpc.c | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 8c876c166ef2..26ca425f4c2c 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -463,10 +463,12 @@ struct kvm_ppc_cpu_char { #define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED (1ULL << 58) #define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF (1ULL << 57) #define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS (1ULL << 56) +#define KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST (1ull << 54) #define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY (1ULL << 63) #define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR (1ULL << 62) #define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ULL << 61) +#define KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE (1ull << 58) /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8c69af10f91d..8885377ec3e0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2189,10 +2189,12 @@ static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp) KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | KVM_PPC_CPU_CHAR_BR_HINT_HONOURED | KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF | - KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS | + KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | - KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR | + KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; } return 0; } @@ -2251,12 +2253,16 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) if (have_fw_feat(fw_features, "enabled", "fw-count-cache-disabled")) cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + if (have_fw_feat(fw_features, "enabled", + "fw-count-cache-flush-bcctr2,0,0")) + cp->character |= KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 | KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED | KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 | KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 | KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | - KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS | + KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; if (have_fw_feat(fw_features, "enabled", "speculation-policy-favor-security")) @@ -2267,9 +2273,13 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) if (!have_fw_feat(fw_features, "disabled", "needs-spec-barrier-for-bound-checks")) cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + if (have_fw_feat(fw_features, "enabled", + "needs-count-cache-flush-on-context-switch")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | - KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR | + KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; of_node_put(fw_features); } From 92da008fa21034c369cdb8ca2b629fe5c196826b Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 12 Mar 2019 11:45:58 -0700 Subject: [PATCH 144/147] Revert "KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range()" This reverts commit 71883a62fcd6c70639fa12cda733378b4d997409. The above commit contains an optimization to kvm_zap_gfn_range which uses gfn-limited TLB flushes, if enabled. If using these limited flushes, kvm_zap_gfn_range passes lock_flush_tlb=false to slot_handle_level_range which creates a race when the function unlocks to call cond_resched. See an example of this race below: CPU 0 CPU 1 CPU 3 // zap_direct_gfn_range mmu_lock() // *ptep == pte_1 *ptep = 0 if (lock_flush_tlb) flush_tlbs() mmu_unlock() // In invalidate range // MMU notifier mmu_lock() if (pte != 0) *ptep = 0 flush = true if (flush) flush_remote_tlbs() mmu_unlock() return // Host MM reallocates // page previously // backing guest memory. // Guest accesses // invalid page // through pte_1 // in its TLB!! Tested: Ran all kvm-unit-tests on a Intel Haswell machine with and without this patch. The patch introduced no new failures. Signed-off-by: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8d43b7c0f56f..4cda5ee48845 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5660,13 +5660,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - bool flush_tlb = true; - bool flush = false; int i; - if (kvm_available_flush_tlb_with_range()) - flush_tlb = false; - spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); @@ -5678,17 +5673,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush |= slot_handle_level_range(kvm, memslot, - kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, start, - end - 1, flush_tlb); + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); } } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start + 1); - spin_unlock(&kvm->mmu_lock); } From 46333236485c8647e40ac2922579e29c5e49ed16 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 13 Mar 2019 12:55:55 -0700 Subject: [PATCH 145/147] MAINTAINERS: Add KVM selftests to existing KVM entry It's safe to assume Paolo and Radim are maintaining the KVM selftests given that the vast majority of commits have their SOBs. Play nice with get_maintainers and make it official. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8347269aa386..b95f5b69e5be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8295,6 +8295,7 @@ F: include/linux/kvm* F: include/kvm/iodev.h F: virt/kvm/* F: tools/kvm/ +F: tools/testing/selftests/kvm/ KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd) M: Joerg Roedel From eca6be566d47029f945a5f8e1c94d374e31df2ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Feb 2019 12:48:40 -0800 Subject: [PATCH 146/147] KVM: doc: Document the life cycle of a VM and its resources The series to add memcg accounting to KVM allocations[1] states: There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. While it is correct to account KVM kernel allocations to the cgroup of the process that created the VM, it's technically incorrect to state that the KVM kernel memory allocations are tied to the life of the VM process. This is because the VM itself, i.e. struct kvm, is not tied to the life of the process which created it, rather it is tied to the life of its associated file descriptor. In other words, kvm_destroy_vm() is not invoked until fput() decrements its associated file's refcount to zero. A simple example is to fork() in Qemu and have the child sleep indefinitely; kvm_destroy_vm() isn't called until Qemu closes its file descriptor *and* the rogue child is killed. The allocations are guaranteed to be *accounted* to the process which created the VM, but only because KVM's per-{VM,vCPU} ioctls reject the ioctl() with -EIO if kvm->mm != current->mm. I.e. the child can keep the VM "alive" but can't do anything useful with its reference. Note that because 'struct kvm' also holds a reference to the mm_struct of its owner, the above behavior also applies to userspace allocations. Given that mucking with a VM's file descriptor can lead to subtle and undesirable behavior, e.g. memcg charges persisting after a VM is shut down, explicitly document a VM's lifecycle and its impact on the VM's resources. Alternatively, KVM could aggressively free resources when the creating process exits, e.g. via mmu_notifier->release(). However, mmu_notifier isn't guaranteed to be available, and freeing resources when the creator exits is likely to be error prone and fragile as KVM would need to ensure that it only freed resources that are truly out of reach. In practice, the existing behavior shouldn't be problematic as a properly configured system will prevent a child process from being moved out of the appropriate cgroup hierarchy, i.e. prevent hiding the process from the OOM killer, and will prevent an unprivileged user from being able to to hold a reference to struct kvm via another method, e.g. debugfs. [1]https://patchwork.kernel.org/patch/10806707/ Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 356156f5c52d..7de9eee73fcd 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -45,6 +45,23 @@ the API. The only supported use is one virtual machine per process, and one vcpu per thread. +It is important to note that althought VM ioctls may only be issued from +the process that created the VM, a VM's lifecycle is associated with its +file descriptor, not its creator (process). In other words, the VM and +its resources, *including the associated address space*, are not freed +until the last reference to the VM's file descriptor has been released. +For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will +not be freed until both the parent (original) process and its child have +put their references to the VM's file descriptor. + +Because a VM's resources are not freed until the last reference to its +file descriptor is released, creating additional references to a VM via +via fork(), dup(), etc... without careful consideration is strongly +discouraged and may have unwanted side effects, e.g. memory allocated +by and on behalf of the VM's process may not be freed/unaccounted when +the VM is shut down. + + 3. Extensions ------------- From 4a605bc08e98381d8df61c30a4acb2eac15eb7da Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 15 Mar 2019 19:23:45 +0100 Subject: [PATCH 147/147] kvm: vmx: fix formatting of a comment Eliminate a gratuitous conflict with 5.0. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4950bb20e06a..886930db77c5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1213,13 +1213,13 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control) != old.control); /* - * Clear SN before reading the bitmap; this ensures that any - * interrupt that comes after the bitmap is read sets ON. The - * VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 - * in the spec), so it doesn't really have a memory barrier that - * pairs with this. However, we cannot do that and we need one. + * Clear SN before reading the bitmap. The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. */ smp_mb__after_atomic(); + if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) pi_set_on(pi_desc); }