From 0230cae75df62de5813a4ca39a425ba439d036da Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Wed, 4 Jul 2018 15:50:01 +0100 Subject: [PATCH 01/85] KVM: s390: Replace clear_user with kvm_clear_guest kvm_clear_guest also does the dirty tracking for us, which we want to have. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Acked-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index eb0eb60c7be6..c4da375553e8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -956,7 +956,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { - if (clear_user((void __user *)useraddr, PAGE_SIZE)) + if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } From 03133347b4452ef9b1f1456b92f5fafa467c0655 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 30 Apr 2018 18:33:24 +0200 Subject: [PATCH 02/85] KVM: s390: a utility function for migration Introduce a utility function that will be used later on for storage attributes migration, and use it in kvm_main.c to replace existing code that does the same thing. Signed-off-by: Claudio Imbrenda Message-Id: <1525106005-13931-2-git-send-email-imbrenda@linux.vnet.ibm.com> Signed-off-by: Christian Borntraeger --- include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4ee7bc548a83..5f138b40e433 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,6 +309,13 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl return ALIGN(memslot->npages, BITS_PER_LONG) / 8; } +static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + unsigned long len = kvm_dirty_bitmap_bytes(memslot); + + return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap); +} + struct kvm_s390_adapter_int { u64 ind_addr; u64 summary_addr; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8b47507faab5..f519eb8d06b1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1169,7 +1169,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); - dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); memset(dirty_bitmap_buffer, 0, n); spin_lock(&kvm->mmu_lock); From afdad61615cc37fef91bc70af7ee5b293785fbd6 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 30 Apr 2018 18:33:25 +0200 Subject: [PATCH 03/85] KVM: s390: Fix storage attributes migration with memory slots This is a fix for several issues that were found in the original code for storage attributes migration. Now no bitmap is allocated to keep track of dirty storage attributes; the extra bits of the per-memslot bitmap that are always present anyway are now used for this purpose. The code has also been refactored a little to improve readability. 
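For illustration, a minimal userspace sketch (not kernel code) of the layout these two changes rely on: the per-memslot dirty bitmap is already allocated at twice the size strictly needed, and the second half, returned by the kvm_second_dirty_bitmap() helper introduced in the previous patch, is reused here to track the storage-attribute (CMMA) state. The memslot size and the main() harness are invented for the example; only the offset arithmetic mirrors the kernel helpers.

#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define ALIGN(x, a) ((((x) + (a) - 1) / (a)) * (a))

int main(void)
{
	unsigned long npages = 1000;	/* example memslot size, in guest pages */
	/* same calculation as kvm_dirty_bitmap_bytes() */
	size_t len = ALIGN(npages, BITS_PER_LONG) / 8;
	unsigned long *dirty_bitmap, *second;

	/* KVM allocates the two bitmaps back to back in one buffer */
	dirty_bitmap = calloc(2, len);
	if (!dirty_bitmap)
		return 1;
	/* same offset as kvm_second_dirty_bitmap(): skip the first bitmap */
	second = dirty_bitmap + len / sizeof(*dirty_bitmap);

	/* mark guest page 42 in the second (storage attribute) bitmap only */
	second[42 / BITS_PER_LONG] |= 1UL << (42 % BITS_PER_LONG);
	printf("dirty word: %lx, cmma word: %lx\n",
	       dirty_bitmap[42 / BITS_PER_LONG], second[42 / BITS_PER_LONG]);
	free(dirty_bitmap);
	return 0;
}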
Fixes: 190df4a212a ("KVM: s390: CMMA tracking, ESSA emulation, migration mode") Fixes: 4036e3874a1 ("KVM: s390: ioctls to get and set guest storage attributes") Acked-by: Janosch Frank Signed-off-by: Claudio Imbrenda Message-Id: <1525106005-13931-3-git-send-email-imbrenda@linux.vnet.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 9 +- arch/s390/kvm/kvm-s390.c | 297 ++++++++++++++++++------------- arch/s390/kvm/priv.c | 28 ++- 3 files changed, 197 insertions(+), 137 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index a2188e309bd6..44b09b366782 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -793,12 +793,6 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; -struct kvm_s390_migration_state { - unsigned long bitmap_size; /* in bits (number of guest pages) */ - atomic64_t dirty_pages; /* number of dirty pages */ - unsigned long *pgste_bitmap; -}; - struct kvm_arch{ void *sca; int use_esca; @@ -828,7 +822,8 @@ struct kvm_arch{ struct kvm_s390_vsie vsie; u8 epdx; u64 epoch; - struct kvm_s390_migration_state *migration_state; + int migration_mode; + atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); struct kvm_s390_gisa *gisa; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3b7a5151b6a5..b350ec14d882 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -862,54 +862,37 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) */ static int kvm_s390_vm_start_migration(struct kvm *kvm) { - struct kvm_s390_migration_state *mgs; struct kvm_memory_slot *ms; - /* should be the only one */ struct kvm_memslots *slots; - unsigned long ram_pages; + unsigned long ram_pages = 0; int slotnr; /* migration mode already enabled */ - if (kvm->arch.migration_state) + if (kvm->arch.migration_mode) return 0; - slots = kvm_memslots(kvm); if (!slots || !slots->used_slots) return -EINVAL; - mgs = kzalloc(sizeof(*mgs), GFP_KERNEL); - if (!mgs) - return -ENOMEM; - kvm->arch.migration_state = mgs; - - if (kvm->arch.use_cmma) { - /* - * Get the first slot. They are reverse sorted by base_gfn, so - * the first slot is also the one at the end of the address - * space. We have verified above that at least one slot is - * present. - */ - ms = slots->memslots; - /* round up so we only use full longs */ - ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); - /* allocate enough bytes to store all the bits */ - mgs->pgste_bitmap = vmalloc(ram_pages / 8); - if (!mgs->pgste_bitmap) { - kfree(mgs); - kvm->arch.migration_state = NULL; - return -ENOMEM; - } - - mgs->bitmap_size = ram_pages; - atomic64_set(&mgs->dirty_pages, ram_pages); - /* mark all the pages in active slots as dirty */ - for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { - ms = slots->memslots + slotnr; - bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages); - } - - kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); + if (!kvm->arch.use_cmma) { + kvm->arch.migration_mode = 1; + return 0; } + /* mark all the pages in active slots as dirty */ + for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { + ms = slots->memslots + slotnr; + /* + * The second half of the bitmap is only used on x86, + * and would be wasted otherwise, so we put it to good + * use here to keep track of the state of the storage + * attributes. 
+ */ + memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); + ram_pages += ms->npages; + } + atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); + kvm->arch.migration_mode = 1; + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); return 0; } @@ -919,21 +902,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) */ static int kvm_s390_vm_stop_migration(struct kvm *kvm) { - struct kvm_s390_migration_state *mgs; - /* migration mode already disabled */ - if (!kvm->arch.migration_state) + if (!kvm->arch.migration_mode) return 0; - mgs = kvm->arch.migration_state; - kvm->arch.migration_state = NULL; - - if (kvm->arch.use_cmma) { + kvm->arch.migration_mode = 0; + if (kvm->arch.use_cmma) kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); - /* We have to wait for the essa emulation to finish */ - synchronize_srcu(&kvm->srcu); - vfree(mgs->pgste_bitmap); - } - kfree(mgs); return 0; } @@ -961,7 +935,7 @@ static int kvm_s390_vm_set_migration(struct kvm *kvm, static int kvm_s390_vm_get_migration(struct kvm *kvm, struct kvm_device_attr *attr) { - u64 mig = (kvm->arch.migration_state != NULL); + u64 mig = kvm->arch.migration_mode; if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) return -ENXIO; @@ -1599,6 +1573,134 @@ out: /* for consistency */ #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) +/* + * Similar to gfn_to_memslot, but returns the index of a memslot also when the + * address falls in a hole. In that case the index of one of the memslots + * bordering the hole is returned. + */ +static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) +{ + int start = 0, end = slots->used_slots; + int slot = atomic_read(&slots->lru_slot); + struct kvm_memory_slot *memslots = slots->memslots; + + if (gfn >= memslots[slot].base_gfn && + gfn < memslots[slot].base_gfn + memslots[slot].npages) + return slot; + + while (start < end) { + slot = start + (end - start) / 2; + + if (gfn >= memslots[slot].base_gfn) + end = slot; + else + start = slot + 1; + } + + if (gfn >= memslots[start].base_gfn && + gfn < memslots[start].base_gfn + memslots[start].npages) { + atomic_set(&slots->lru_slot, start); + } + + return start; +} + +static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, + u8 *res, unsigned long bufsize) +{ + unsigned long pgstev, hva, cur_gfn = args->start_gfn; + + args->count = 0; + while (args->count < bufsize) { + hva = gfn_to_hva(kvm, cur_gfn); + /* + * We return an error if the first value was invalid, but we + * return successfully if at least one value was copied. + */ + if (kvm_is_error_hva(hva)) + return args->count ? 
0 : -EFAULT; + if (get_pgste(kvm->mm, hva, &pgstev) < 0) + pgstev = 0; + res[args->count++] = (pgstev >> 24) & 0x43; + cur_gfn++; + } + + return 0; +} + +static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, + unsigned long cur_gfn) +{ + int slotidx = gfn_to_memslot_approx(slots, cur_gfn); + struct kvm_memory_slot *ms = slots->memslots + slotidx; + unsigned long ofs = cur_gfn - ms->base_gfn; + + if (ms->base_gfn + ms->npages <= cur_gfn) { + slotidx--; + /* If we are above the highest slot, wrap around */ + if (slotidx < 0) + slotidx = slots->used_slots - 1; + + ms = slots->memslots + slotidx; + ofs = 0; + } + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); + while ((slotidx > 0) && (ofs >= ms->npages)) { + slotidx--; + ms = slots->memslots + slotidx; + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); + } + return ms->base_gfn + ofs; +} + +static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, + u8 *res, unsigned long bufsize) +{ + unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *ms; + + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); + ms = gfn_to_memslot(kvm, cur_gfn); + args->count = 0; + args->start_gfn = cur_gfn; + if (!ms) + return 0; + next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); + mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages; + + while (args->count < bufsize) { + hva = gfn_to_hva(kvm, cur_gfn); + if (kvm_is_error_hva(hva)) + return 0; + /* Decrement only if we actually flipped the bit to 0 */ + if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) + atomic64_dec(&kvm->arch.cmma_dirty_pages); + if (get_pgste(kvm->mm, hva, &pgstev) < 0) + pgstev = 0; + /* Save the value */ + res[args->count++] = (pgstev >> 24) & 0x43; + /* If the next bit is too far away, stop. */ + if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) + return 0; + /* If we reached the previous "next", find the next one */ + if (cur_gfn == next_gfn) + next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); + /* Reached the end of memory or of the buffer, stop */ + if ((next_gfn >= mem_end) || + (next_gfn - args->start_gfn >= bufsize)) + return 0; + cur_gfn++; + /* Reached the end of the current memslot, take the next one. 
*/ + if (cur_gfn - ms->base_gfn >= ms->npages) { + ms = gfn_to_memslot(kvm, cur_gfn); + if (!ms) + return 0; + } + } + return 0; +} + /* * This function searches for the next page with dirty CMMA attributes, and * saves the attributes in the buffer up to either the end of the buffer or @@ -1610,22 +1712,18 @@ out: static int kvm_s390_get_cmma_bits(struct kvm *kvm, struct kvm_s390_cmma_log *args) { - struct kvm_s390_migration_state *s = kvm->arch.migration_state; - unsigned long bufsize, hva, pgstev, i, next, cur; - int srcu_idx, peek, r = 0, rr; - u8 *res; + unsigned long bufsize; + int srcu_idx, peek, ret; + u8 *values; - cur = args->start_gfn; - i = next = pgstev = 0; - - if (unlikely(!kvm->arch.use_cmma)) + if (!kvm->arch.use_cmma) return -ENXIO; /* Invalid/unsupported flags were specified */ if (args->flags & ~KVM_S390_CMMA_PEEK) return -EINVAL; /* Migration mode query, and we are not doing a migration */ peek = !!(args->flags & KVM_S390_CMMA_PEEK); - if (!peek && !s) + if (!peek && !kvm->arch.migration_mode) return -EINVAL; /* CMMA is disabled or was not used, or the buffer has length zero */ bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); @@ -1633,74 +1731,35 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, memset(args, 0, sizeof(*args)); return 0; } - - if (!peek) { - /* We are not peeking, and there are no dirty pages */ - if (!atomic64_read(&s->dirty_pages)) { - memset(args, 0, sizeof(*args)); - return 0; - } - cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, - args->start_gfn); - if (cur >= s->bitmap_size) /* nothing found, loop back */ - cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0); - if (cur >= s->bitmap_size) { /* again! (very unlikely) */ - memset(args, 0, sizeof(*args)); - return 0; - } - next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1); + /* We are not peeking, and there are no dirty pages */ + if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) { + memset(args, 0, sizeof(*args)); + return 0; } - res = vmalloc(bufsize); - if (!res) + values = vmalloc(bufsize); + if (!values) return -ENOMEM; - args->start_gfn = cur; - down_read(&kvm->mm->mmap_sem); srcu_idx = srcu_read_lock(&kvm->srcu); - while (i < bufsize) { - hva = gfn_to_hva(kvm, cur); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; - } - /* decrement only if we actually flipped the bit to 0 */ - if (!peek && test_and_clear_bit(cur, s->pgste_bitmap)) - atomic64_dec(&s->dirty_pages); - r = get_pgste(kvm->mm, hva, &pgstev); - if (r < 0) - pgstev = 0; - /* save the value */ - res[i++] = (pgstev >> 24) & 0x43; - /* - * if the next bit is too far away, stop. - * if we reached the previous "next", find the next one - */ - if (!peek) { - if (next > cur + KVM_S390_MAX_BIT_DISTANCE) - break; - if (cur == next) - next = find_next_bit(s->pgste_bitmap, - s->bitmap_size, cur + 1); - /* reached the end of the bitmap or of the buffer, stop */ - if ((next >= s->bitmap_size) || - (next >= args->start_gfn + bufsize)) - break; - } - cur++; - } + if (peek) + ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); + else + ret = kvm_s390_get_cmma(kvm, args, values, bufsize); srcu_read_unlock(&kvm->srcu, srcu_idx); up_read(&kvm->mm->mmap_sem); - args->count = i; - args->remaining = s ? 
atomic64_read(&s->dirty_pages) : 0; - rr = copy_to_user((void __user *)args->values, res, args->count); - if (rr) - r = -EFAULT; + if (kvm->arch.migration_mode) + args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); + else + args->remaining = 0; - vfree(res); - return r; + if (copy_to_user((void __user *)args->values, values, args->count)) + ret = -EFAULT; + + vfree(values); + return ret; } /* @@ -2139,10 +2198,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); - if (kvm->arch.migration_state) { - vfree(kvm->arch.migration_state->pgste_bitmap); - kfree(kvm->arch.migration_state); - } KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c4da375553e8..c623b6f1dd4e 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -987,9 +987,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return 0; } -static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) +/* + * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu) + */ +static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) { - struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state; int r1, r2, nappended, entries; unsigned long gfn, hva, res, pgstev, ptev; unsigned long *cbrlo; @@ -1039,10 +1041,12 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) cbrlo[entries] = gfn << PAGE_SHIFT; } - if (orc && gfn < ms->bitmap_size) { - /* increment only if we are really flipping the bit to 1 */ - if (!test_and_set_bit(gfn, ms->pgste_bitmap)) - atomic64_inc(&ms->dirty_pages); + if (orc) { + struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn); + + /* Increment only if we are really flipping the bit */ + if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) + atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); } return nappended; @@ -1071,7 +1075,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) : ESSA_SET_STABLE_IF_RESIDENT)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (likely(!vcpu->kvm->arch.migration_state)) { + if (!vcpu->kvm->arch.migration_mode) { /* * CMMA is enabled in the KVM settings, but is disabled in * the SIE block and in the mm_context, and we are not doing @@ -1099,10 +1103,16 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* Retry the ESSA instruction */ kvm_s390_retry_instr(vcpu); } else { - /* Account for the possible extra cbrl entry */ - i = do_essa(vcpu, orc); + int srcu_idx; + + down_read(&vcpu->kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + i = __do_essa(vcpu, orc); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + up_read(&vcpu->kvm->mm->mmap_sem); if (i < 0) return i; + /* Account for the possible extra cbrl entry */ entries += i; } vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ From 63747bf73cf329d4bce9087067042241c5330a4a Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 16 Jul 2018 10:34:28 +0200 Subject: [PATCH 04/85] KVM: s390/vsie: avoid sparse warning This is a non-functional change that avoids arch/s390/kvm/vsie.c:839:25: warning: context imbalance in 'do_vsie_run' - unexpected unlock Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 84c89cb9636f..5539df037f91 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c 
@@ -818,6 +818,8 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * - < 0 if an error occurred */ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) + __releases(vcpu->kvm->srcu) + __acquires(vcpu->kvm->srcu) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; From 4eeb85568e5653c71d901f7593d3f3e7e2a5414f Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Mon, 28 May 2018 09:48:26 +0800 Subject: [PATCH 05/85] KVM: PPC: Remove mmio_vsx_tx_sx_enabled in KVM MMIO emulation Originally PPC KVM MMIO emulation uses only 0~31#(5 bits) for VSR reg number, and use mmio_vsx_tx_sx_enabled field together for 0~63# VSR regs. Currently PPC KVM MMIO emulation is reimplemented with analyse_instr() assistance. analyse_instr() returns 0~63 for VSR register number, so it is not necessary to use additional mmio_vsx_tx_sx_enabled field any more. This patch extends related reg bits (expand io_gpr to u16 from u8 and use 6 bits for VSR reg#), so that mmio_vsx_tx_sx_enabled can be removed. Signed-off-by: Simon Guo Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 17 ++++++++-------- arch/powerpc/kvm/emulate_loadstore.c | 7 +++---- arch/powerpc/kvm/powerpc.c | 30 ++++++++++++++-------------- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index fa4efa7e88f7..5b9e6608c5bf 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -672,7 +672,7 @@ struct kvm_vcpu_arch { gva_t vaddr_accessed; pgd_t *pgdir; - u8 io_gpr; /* GPR used as IO source/target */ + u16 io_gpr; /* GPR used as IO source/target */ u8 mmio_host_swabbed; u8 mmio_sign_extend; /* conversion between single and double precision */ @@ -688,7 +688,6 @@ struct kvm_vcpu_arch { */ u8 mmio_vsx_copy_nums; u8 mmio_vsx_offset; - u8 mmio_vsx_tx_sx_enabled; u8 mmio_vmx_copy_nums; u8 mmio_vmx_offset; u8 mmio_copy_type; @@ -801,14 +800,14 @@ struct kvm_vcpu_arch { #define KVMPPC_VCPU_BUSY_IN_HOST 2 /* Values for vcpu->arch.io_gpr */ -#define KVM_MMIO_REG_MASK 0x001f -#define KVM_MMIO_REG_EXT_MASK 0xffe0 +#define KVM_MMIO_REG_MASK 0x003f +#define KVM_MMIO_REG_EXT_MASK 0xffc0 #define KVM_MMIO_REG_GPR 0x0000 -#define KVM_MMIO_REG_FPR 0x0020 -#define KVM_MMIO_REG_QPR 0x0040 -#define KVM_MMIO_REG_FQPR 0x0060 -#define KVM_MMIO_REG_VSX 0x0080 -#define KVM_MMIO_REG_VMX 0x00c0 +#define KVM_MMIO_REG_FPR 0x0040 +#define KVM_MMIO_REG_QPR 0x0080 +#define KVM_MMIO_REG_FQPR 0x00c0 +#define KVM_MMIO_REG_VSX 0x0100 +#define KVM_MMIO_REG_VMX 0x0180 #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index afde788be141..75dce1ef3bc8 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -106,7 +106,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) * if mmio_vsx_tx_sx_enabled == 1, copy data between * VSR[32..63] and memory */ - vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst); vcpu->arch.mmio_vsx_copy_nums = 0; vcpu->arch.mmio_vsx_offset = 0; vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; @@ -242,8 +241,8 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) } emulated = kvmppc_handle_vsx_load(run, vcpu, - KVM_MMIO_REG_VSX | (op.reg & 0x1f), - io_size_each, 1, op.type & SIGNEXT); + KVM_MMIO_REG_VSX|op.reg, io_size_each, + 1, op.type & SIGNEXT); break; } #endif @@ -363,7 +362,7 @@ int 
kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) } emulated = kvmppc_handle_vsx_store(run, vcpu, - op.reg & 0x1f, io_size_each, 1); + op.reg, io_size_each, 1); break; } #endif diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0e8c20c5eaac..b6fd7c5b8d1a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -880,10 +880,10 @@ static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu, if (offset == -1) return; - if (vcpu->arch.mmio_vsx_tx_sx_enabled) { - val.vval = VCPU_VSX_VR(vcpu, index); + if (index >= 32) { + val.vval = VCPU_VSX_VR(vcpu, index - 32); val.vsxval[offset] = gpr; - VCPU_VSX_VR(vcpu, index) = val.vval; + VCPU_VSX_VR(vcpu, index - 32) = val.vval; } else { VCPU_VSX_FPR(vcpu, index, offset) = gpr; } @@ -895,11 +895,11 @@ static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu, union kvmppc_one_reg val; int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; - if (vcpu->arch.mmio_vsx_tx_sx_enabled) { - val.vval = VCPU_VSX_VR(vcpu, index); + if (index >= 32) { + val.vval = VCPU_VSX_VR(vcpu, index - 32); val.vsxval[0] = gpr; val.vsxval[1] = gpr; - VCPU_VSX_VR(vcpu, index) = val.vval; + VCPU_VSX_VR(vcpu, index - 32) = val.vval; } else { VCPU_VSX_FPR(vcpu, index, 0) = gpr; VCPU_VSX_FPR(vcpu, index, 1) = gpr; @@ -912,12 +912,12 @@ static inline void kvmppc_set_vsr_word_dump(struct kvm_vcpu *vcpu, union kvmppc_one_reg val; int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; - if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + if (index >= 32) { val.vsx32val[0] = gpr; val.vsx32val[1] = gpr; val.vsx32val[2] = gpr; val.vsx32val[3] = gpr; - VCPU_VSX_VR(vcpu, index) = val.vval; + VCPU_VSX_VR(vcpu, index - 32) = val.vval; } else { val.vsx32val[0] = gpr; val.vsx32val[1] = gpr; @@ -937,10 +937,10 @@ static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu, if (offset == -1) return; - if (vcpu->arch.mmio_vsx_tx_sx_enabled) { - val.vval = VCPU_VSX_VR(vcpu, index); + if (index >= 32) { + val.vval = VCPU_VSX_VR(vcpu, index - 32); val.vsx32val[offset] = gpr32; - VCPU_VSX_VR(vcpu, index) = val.vval; + VCPU_VSX_VR(vcpu, index - 32) = val.vval; } else { dword_offset = offset / 2; word_offset = offset % 2; @@ -1361,10 +1361,10 @@ static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val) break; } - if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { + if (rs < 32) { *val = VCPU_VSX_FPR(vcpu, rs, vsx_offset); } else { - reg.vval = VCPU_VSX_VR(vcpu, rs); + reg.vval = VCPU_VSX_VR(vcpu, rs - 32); *val = reg.vsxval[vsx_offset]; } break; @@ -1378,13 +1378,13 @@ static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val) break; } - if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { + if (rs < 32) { dword_offset = vsx_offset / 2; word_offset = vsx_offset % 2; reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset); *val = reg.vsx32val[word_offset]; } else { - reg.vval = VCPU_VSX_VR(vcpu, rs); + reg.vval = VCPU_VSX_VR(vcpu, rs - 32); *val = reg.vsx32val[vsx_offset]; } break; From 76346cd93a5eca33700f82685d56172dd65d4c0a Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 20 Jun 2018 18:42:58 +1000 Subject: [PATCH 06/85] KVM: PPC: Book3S: Fix matching of hardware and emulated TCE tables When attaching a hardware table to LIOBN in KVM, we match table parameters such as page size, table offset and table size. 
However the tables are created via very different paths - VFIO and KVM - and the VFIO path goes through the platform code which has a minimum TCE page size requirement (which is 4K but since we allocate memory by pages and cannot avoid alignment anyway, we align to 64k pages for powernv_defconfig). So when we match the tables, one might be bigger than the other which means the hardware table cannot get attached to LIOBN and DMA mapping fails. This removes the table size alignment from the guest visible table. This does not affect the memory allocation which is still aligned - kvmppc_tce_pages() takes care of this. This relaxes the check we do when attaching tables to allow the hardware table to be bigger than the guest visible table. Ideally we want the KVM table to cover the same space as the hardware table does but since the hardware table may use multiple levels, and all levels must use the same table size (IODA2 design), the area it can actually cover might get very different from the window size which the guest requested, even though the guest won't map it all. Fixes: ca1fc489cf "KVM: PPC: Book3S: Allow backing bigger guest IOMMU pages with smaller physical pages" Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index d066e37551ec..85396e93747f 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -180,7 +180,7 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, if ((tbltmp->it_page_shift <= stt->page_shift) && (tbltmp->it_offset << tbltmp->it_page_shift == stt->offset << stt->page_shift) && - (tbltmp->it_size << tbltmp->it_page_shift == + (tbltmp->it_size << tbltmp->it_page_shift >= stt->size << stt->page_shift)) { /* * Reference the table to avoid races with @@ -296,7 +296,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, { struct kvmppc_spapr_tce_table *stt = NULL; struct kvmppc_spapr_tce_table *siter; - unsigned long npages, size; + unsigned long npages, size = args->size; int ret = -ENOMEM; int i; @@ -304,7 +304,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) return -EINVAL; - size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); if (ret) From 51eaa08f029c7343df846325d7cf047be8b96e81 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 7 Jul 2018 08:53:07 +0200 Subject: [PATCH 07/85] KVM: PPC: Book3S HV: Add of_node_put() in success path The call to of_find_compatible_node() is returning a pointer with incremented refcount so it must be explicitly decremented after the last use. As the node is only being used here to check for its presence, and the result is not actually used in the success path, the reference can be dropped immediately.
Signed-off-by: Nicholas Mc Guire Fixes: commit f725758b899f ("KVM: PPC: Book3S HV: Use OPAL XICS emulation on POWER9") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index de686b340f4a..fba21c91b2ff 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4561,6 +4561,8 @@ static int kvmppc_book3s_init_hv(void) pr_err("KVM-HV: Cannot determine method for accessing XICS\n"); return -ENODEV; } + /* presence of intc confirmed - node can be dropped again */ + of_node_put(np); } #endif From 0abb75b7a16d21e2fb0d98634df44c37c184f186 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 7 Jul 2018 11:07:25 +0200 Subject: [PATCH 08/85] KVM: PPC: Book3S HV: Fix constant size warning The constants are 64bit but not explicitly declared UL resulting in sparse warnings. Fix this by declaring the constants UL. Signed-off-by: Nicholas Mc Guire Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/reg.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 562568414cf4..858aa7984ab0 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -161,7 +161,7 @@ #define PSSCR_ESL 0x00200000 /* Enable State Loss */ #define PSSCR_SD 0x00400000 /* Status Disable */ #define PSSCR_PLS 0xf000000000000000 /* Power-saving Level Status */ -#define PSSCR_GUEST_VIS 0xf0000000000003ff /* Guest-visible PSSCR fields */ +#define PSSCR_GUEST_VIS 0xf0000000000003ffUL /* Guest-visible PSSCR fields */ #define PSSCR_FAKE_SUSPEND 0x00000400 /* Fake-suspend bit (P9 DD2.2) */ #define PSSCR_FAKE_SUSPEND_LG 10 /* Fake-suspend bit position */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fba21c91b2ff..d73b29b6aaa1 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -128,14 +128,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); * and SPURR count and should be set according to the number of * online threads in the vcore being run. */ -#define RWMR_RPA_P8_1THREAD 0x164520C62609AECA -#define RWMR_RPA_P8_2THREAD 0x7FFF2908450D8DA9 -#define RWMR_RPA_P8_3THREAD 0x164520C62609AECA -#define RWMR_RPA_P8_4THREAD 0x199A421245058DA9 -#define RWMR_RPA_P8_5THREAD 0x164520C62609AECA -#define RWMR_RPA_P8_6THREAD 0x164520C62609AECA -#define RWMR_RPA_P8_7THREAD 0x164520C62609AECA -#define RWMR_RPA_P8_8THREAD 0x164520C62609AECA +#define RWMR_RPA_P8_1THREAD 0x164520C62609AECAUL +#define RWMR_RPA_P8_2THREAD 0x7FFF2908450D8DA9UL +#define RWMR_RPA_P8_3THREAD 0x164520C62609AECAUL +#define RWMR_RPA_P8_4THREAD 0x199A421245058DA9UL +#define RWMR_RPA_P8_5THREAD 0x164520C62609AECAUL +#define RWMR_RPA_P8_6THREAD 0x164520C62609AECAUL +#define RWMR_RPA_P8_7THREAD 0x164520C62609AECAUL +#define RWMR_RPA_P8_8THREAD 0x164520C62609AECAUL static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = { RWMR_RPA_P8_1THREAD, From a3da7b4a3be51f37f434f14e11e60491f098b6ea Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 8 Mar 2018 16:08:49 +0000 Subject: [PATCH 09/85] KVM: s390: add etoken support for guests We want to provide facility 156 (etoken facility) to our guests. This includes migration support (via sync regs) and VSIE changes. The tokens are being reset on clear reset. This has to be implemented by userspace (via sync regs). 
Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Acked-by: Cornelia Huck --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/include/uapi/asm/kvm.h | 5 ++++- arch/s390/kvm/kvm-s390.c | 8 ++++++-- arch/s390/kvm/vsie.c | 9 +++++++-- arch/s390/tools/gen_facilities.c | 3 ++- 5 files changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 44b09b366782..1a7b6834797e 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -269,6 +269,7 @@ struct kvm_s390_sie_block { __u8 reserved1c0[8]; /* 0x01c0 */ #define ECD_HOSTREGMGMT 0x20000000 #define ECD_MEF 0x08000000 +#define ECD_ETOKENF 0x02000000 __u32 ecd; /* 0x01c8 */ __u8 reserved1cc[18]; /* 0x01cc */ __u64 pp; /* 0x01de */ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 4cdaa55fabfe..9a50f02b9894 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -4,7 +4,7 @@ /* * KVM s390 specific structures and definitions * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008, 2018 * * Author(s): Carsten Otte * Christian Borntraeger @@ -225,6 +225,7 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_FPRS (1UL << 8) #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) +#define KVM_SYNC_ETOKEN (1UL << 11) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 #define SDNXL (1UL << SDNXC) @@ -258,6 +259,8 @@ struct kvm_sync_regs { struct { __u64 reserved1[2]; __u64 gscb[4]; + __u64 etoken; + __u64 etoken_extension; }; }; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b350ec14d882..0cc8a48fc732 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2355,6 +2355,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; if (test_kvm_facility(vcpu->kvm, 133)) vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; /* fprs can be synchronized via vrs, even if the guest has no vx. With * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. */ @@ -2604,7 +2606,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } if (test_kvm_facility(vcpu->kvm, 139)) vcpu->arch.sie_block->ecd |= ECD_MEF; - + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->arch.sie_block->ecd |= ECD_ETOKENF; if (vcpu->arch.sie_block->gd) { vcpu->arch.sie_block->eca |= ECA_AIV; VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", @@ -3522,6 +3525,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } preempt_enable(); } + /* SIE will load etoken directly from SDNX and therefore kvm_run */ kvm_run->kvm_dirty_regs = 0; } @@ -3561,7 +3565,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) __ctl_clear_bit(2, 4); vcpu->arch.host_gscb = NULL; } - + /* SIE will save etoken directly into SDNX and therefore kvm_run */ } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 5539df037f91..63844b95c22c 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -2,7 +2,7 @@ /* * kvm nested virtualization support for s390x * - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 
2016, 2018 * * Author(s): David Hildenbrand */ @@ -378,6 +378,10 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (test_kvm_facility(vcpu->kvm, 139)) scb_s->ecd |= scb_o->ecd & ECD_MEF; + /* etoken */ + if (test_kvm_facility(vcpu->kvm, 156)) + scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; + prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); out: @@ -627,7 +631,8 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) vsie_page->riccbd_gpa = gpa; scb_s->riccbd = hpa; } - if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { + if (((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) || + (scb_s->ecd & ECD_ETOKENF)) { unsigned long sdnxc; gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL; diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 90a8c9e84ca6..0c85aedcf9b3 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -4,7 +4,7 @@ * numbering scheme from the Princples of Operations: most significant bit * has bit number 0. * - * Copyright IBM Corp. 2015 + * Copyright IBM Corp. 2015, 2018 * */ @@ -106,6 +106,7 @@ static struct facility_def facility_defs[] = { .name = "FACILITIES_KVM_CPUMODEL", .bits = (int[]){ + 156, /* etoken facility */ -1 /* END */ } }, From 1e175d2e07c71d9574f5b1c74523abca54e2654f Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 25 Jul 2018 16:12:02 +1000 Subject: [PATCH 10/85] KVM: PPC: Book3S HV: Pack VCORE IDs to access full VCPU ID space It is not currently possible to create the full number of possible VCPUs (KVM_MAX_VCPUS) on Power9 with KVM-HV when the guest uses fewer threads per core than its core stride (or "VSMT mode"). This is because the VCORE ID and XIVE offsets grow beyond KVM_MAX_VCPUS even though the VCPU ID is less than KVM_MAX_VCPU_ID. To address this, "pack" the VCORE ID and XIVE offsets by using knowledge of the way the VCPU IDs will be used when there are fewer guest threads per core than the core stride. The primary thread of each core will always be used first. Then, if the guest uses more than one thread per core, these secondary threads will sequentially follow the primary in each core. So, the only way an ID above KVM_MAX_VCPUS can be seen, is if the VCPUs are being spaced apart, so at least half of each core is empty, and IDs between KVM_MAX_VCPUS and (KVM_MAX_VCPUS * 2) can be mapped into the second half of each core (4..7, in an 8-thread core). Similarly, if IDs above KVM_MAX_VCPUS * 2 are seen, at least 3/4 of each core is being left empty, and we can map down into the second and third quarters of each core (2, 3 and 5, 6 in an 8-thread core). Lastly, if IDs above KVM_MAX_VCPUS * 4 are seen, only the primary threads are being used and 7/8 of the core is empty, allowing use of the 1, 5, 3 and 7 thread slots. (Strides less than 8 are handled similarly.) This allows the VCORE ID or offset to be calculated quickly from the VCPU ID or XIVE server numbers, without access to the VCPU structure. [paulus@ozlabs.org - tidied up comment a little, changed some WARN_ONCE to pr_devel, wrapped line, fixed id check.] 
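To make the mapping concrete, the following standalone sketch reproduces the packing arithmetic described above (the same logic as the kvmppc_pack_vcpu_id() helper added by this patch, minus the WARN_ONCE checks). KVM_MAX_VCPUS is set to an arbitrary 2048 here purely so the worked values in the comments are concrete; it is not the real configuration value.

#include <stdio.h>

#define MAX_SMT_THREADS 8
#define KVM_MAX_VCPUS 2048	/* assumed value, for illustration only */

/* same mapping as kvmppc_pack_vcpu_id(), without the sanity checks */
static unsigned int pack_vcpu_id(unsigned int id, int emul_smt_mode)
{
	const int block_offsets[MAX_SMT_THREADS] = { 0, 4, 2, 6, 1, 5, 3, 7 };
	int block = (id / KVM_MAX_VCPUS) * (MAX_SMT_THREADS / emul_smt_mode);

	return (id % KVM_MAX_VCPUS) + block_offsets[block];
}

int main(void)
{
	/* guest stride 8, one thread per emulated core: IDs 0, 8, 16, ... */
	printf("%u\n", pack_vcpu_id(0, 8));			/* block 0 -> 0  */
	printf("%u\n", pack_vcpu_id(8, 8));			/* block 0 -> 8  */
	printf("%u\n", pack_vcpu_id(KVM_MAX_VCPUS, 8));		/* block 1 -> 4  */
	printf("%u\n", pack_vcpu_id(KVM_MAX_VCPUS + 8, 8));	/* block 1 -> 12 */
	printf("%u\n", pack_vcpu_id(2 * KVM_MAX_VCPUS, 8));	/* block 2 -> 2  */
	return 0;
}

Primary-thread IDs from block 0 keep their values (0, 8, 16, ...), while block 1 IDs land in the otherwise unused second half of each core (4, 12, 20, ...), so no two in-use IDs collide.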
Signed-off-by: Sam Bobroff Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s.h | 47 +++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 27 +++++++++++---- arch/powerpc/kvm/book3s_xive.c | 19 +++++++---- 3 files changed, 80 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 1f345a0b6ba2..83a9aa3cf689 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -390,4 +390,51 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); #define SPLIT_HACK_MASK 0xff000000 #define SPLIT_HACK_OFFS 0xfb000000 +/* + * This packs a VCPU ID from the [0..KVM_MAX_VCPU_ID) space down to the + * [0..KVM_MAX_VCPUS) space, using knowledge of the guest's core stride + * (but not its actual threading mode, which is not available) to avoid + * collisions. + * + * The implementation leaves VCPU IDs from the range [0..KVM_MAX_VCPUS) (block + * 0) unchanged: if the guest is filling each VCORE completely then it will be + * using consecutive IDs and it will fill the space without any packing. + * + * For higher VCPU IDs, the packed ID is based on the VCPU ID modulo + * KVM_MAX_VCPUS (effectively masking off the top bits) and then an offset is + * added to avoid collisions. + * + * VCPU IDs in the range [KVM_MAX_VCPUS..(KVM_MAX_VCPUS*2)) (block 1) are only + * possible if the guest is leaving at least 1/2 of each VCORE empty, so IDs + * can be safely packed into the second half of each VCORE by adding an offset + * of (stride / 2). + * + * Similarly, if VCPU IDs in the range [(KVM_MAX_VCPUS*2)..(KVM_MAX_VCPUS*4)) + * (blocks 2 and 3) are seen, the guest must be leaving at least 3/4 of each + * VCORE empty so packed IDs can be offset by (stride / 4) and (stride * 3 / 4). + * + * Finally, VCPU IDs from blocks 5..7 will only be seen if the guest is using a + * stride of 8 and 1 thread per core so the remaining offsets of 1, 5, 3 and 7 + * must be free to use. + * + * (The offsets for each block are stored in block_offsets[], indexed by the + * block number if the stride is 8. For cases where the guest's stride is less + * than 8, we can re-use the block_offsets array by multiplying the block + * number by (MAX_SMT_THREADS / stride) to reach the correct entry.) 
+ */ +static inline u32 kvmppc_pack_vcpu_id(struct kvm *kvm, u32 id) +{ + const int block_offsets[MAX_SMT_THREADS] = {0, 4, 2, 6, 1, 5, 3, 7}; + int stride = kvm->arch.emul_smt_mode; + int block = (id / KVM_MAX_VCPUS) * (MAX_SMT_THREADS / stride); + u32 packed_id; + + if (WARN_ONCE(block >= MAX_SMT_THREADS, "VCPU ID too large to pack")) + return 0; + packed_id = (id % KVM_MAX_VCPUS) + block_offsets[block]; + if (WARN_ONCE(packed_id >= KVM_MAX_VCPUS, "VCPU ID packing failed")) + return 0; + return packed_id; +} + #endif /* __ASM_KVM_BOOK3S_H__ */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d73b29b6aaa1..785245e09f32 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1816,7 +1816,7 @@ static int threads_per_vcore(struct kvm *kvm) return threads_per_subcore; } -static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) +static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id) { struct kvmppc_vcore *vcore; @@ -1830,7 +1830,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) init_swait_queue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; - vcore->first_vcpuid = core * kvm->arch.smt_mode; + vcore->first_vcpuid = id; vcore->kvm = kvm; INIT_LIST_HEAD(&vcore->preempt_list); @@ -1989,10 +1989,16 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; - int err; + int err = -EINVAL; int core; struct kvmppc_vcore *vcore; + if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode) && + cpu_has_feature(CPU_FTR_ARCH_300)) { + pr_devel("DNCI: VCPU ID too high\n"); + goto out; + } + err = -ENOMEM; vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) @@ -2048,12 +2054,21 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, mutex_lock(&kvm->lock); vcore = NULL; err = -EINVAL; - core = id / kvm->arch.smt_mode; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + BUG_ON(kvm->arch.smt_mode != 1); + core = kvmppc_pack_vcpu_id(kvm, id); + } else { + core = id / kvm->arch.smt_mode; + } if (core < KVM_MAX_VCORES) { vcore = kvm->arch.vcores[core]; - if (!vcore) { + if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) { + pr_devel("KVM: collision on id %u", id); + vcore = NULL; + } else if (!vcore) { err = -ENOMEM; - vcore = kvmppc_vcore_create(kvm, core); + vcore = kvmppc_vcore_create(kvm, + id & ~(kvm->arch.smt_mode - 1)); kvm->arch.vcores[core] = vcore; kvm->arch.online_vcores++; } diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index f9818d7d3381..126f02b3ffb8 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -317,6 +317,11 @@ static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio) return -EBUSY; } +static u32 xive_vp(struct kvmppc_xive *xive, u32 server) +{ + return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server); +} + static u8 xive_lock_and_mask(struct kvmppc_xive *xive, struct kvmppc_xive_src_block *sb, struct kvmppc_xive_irq_state *state) @@ -362,7 +367,7 @@ static u8 xive_lock_and_mask(struct kvmppc_xive *xive, */ if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { xive_native_configure_irq(hw_num, - xive->vp_base + state->act_server, + xive_vp(xive, state->act_server), MASKED, state->number); /* set old_p so we can track if an H_EOI was done */ state->old_p = true; @@ -418,7 +423,7 @@ static void xive_finish_unmask(struct kvmppc_xive *xive, */ if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { 
xive_native_configure_irq(hw_num, - xive->vp_base + state->act_server, + xive_vp(xive, state->act_server), state->act_priority, state->number); /* If an EOI is needed, do it here */ if (!state->old_p) @@ -495,7 +500,7 @@ static int xive_target_interrupt(struct kvm *kvm, kvmppc_xive_select_irq(state, &hw_num, NULL); return xive_native_configure_irq(hw_num, - xive->vp_base + server, + xive_vp(xive, server), prio, state->number); } @@ -883,7 +888,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, * which is fine for a never started interrupt. */ xive_native_configure_irq(hw_irq, - xive->vp_base + state->act_server, + xive_vp(xive, state->act_server), state->act_priority, state->number); /* @@ -959,7 +964,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, /* Reconfigure the IPI */ xive_native_configure_irq(state->ipi_number, - xive->vp_base + state->act_server, + xive_vp(xive, state->act_server), state->act_priority, state->number); /* @@ -1084,7 +1089,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, pr_devel("Duplicate !\n"); return -EEXIST; } - if (cpu >= KVM_MAX_VCPUS) { + if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { pr_devel("Out of bounds !\n"); return -EINVAL; } @@ -1098,7 +1103,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, xc->xive = xive; xc->vcpu = vcpu; xc->server_num = cpu; - xc->vp_id = xive->vp_base + cpu; + xc->vp_id = xive_vp(xive, cpu); xc->mfrr = 0xff; xc->valid = true; From 1ebe6b81ebdba8faf377d1d7d84ad9368e7a0bae Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Jul 2018 14:53:54 +1000 Subject: [PATCH 11/85] KVM: PPC: Book3S HV: Allow creating max number of VCPUs on POWER9 Commit 1e175d2 ("KVM: PPC: Book3S HV: Pack VCORE IDs to access full VCPU ID space", 2018-07-25) allowed use of VCPU IDs up to KVM_MAX_VCPU_ID on POWER9 in all guest SMT modes and guest emulated hardware SMT modes. However, with the current definition of KVM_MAX_VCPU_ID, a guest SMT mode of 1 and an emulated SMT mode of 8, it is only possible to create KVM_MAX_VCPUS / 2 VCPUS, because threads_per_subcore is 4 on POWER9 CPUs. (Using an emulated SMT mode of 8 is useful when migrating VMs to or from POWER8 hosts.) This increases KVM_MAX_VCPU_ID to 8 * KVM_MAX_VCPUS when HV KVM is configured in, so that a full complement of KVM_MAX_VCPUS VCPUs can be created on POWER9 in all guest SMT modes and emulated hardware SMT modes. 
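To make the arithmetic concrete: with a guest SMT mode of 1 and an emulated SMT mode of 8, userspace assigns VCPU IDs 0, 8, 16, ..., so creating N VCPUs needs IDs up to 8 * (N - 1). The old limit of threads_per_subcore * KVM_MAX_VCORES is 4 * KVM_MAX_VCPUS on POWER9 (KVM_MAX_VCORES and KVM_MAX_VCPUS are both NR_CPUS on this architecture), which caps N at KVM_MAX_VCPUS / 2; the new limit of MAX_SMT_THREADS * KVM_MAX_VCORES = 8 * KVM_MAX_VCPUS admits the full KVM_MAX_VCPUS.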
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 5b9e6608c5bf..906bcbdfd2a1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -42,7 +42,14 @@ #define KVM_USER_MEM_SLOTS 512 #include -#define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES) + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +#include /* for MAX_SMT_THREADS */ +#define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES) + +#else +#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #define __KVM_HAVE_ARCH_INTC_INITIALIZED From b5c6f7607b908b1445f2556c8d2f3b1ec5fc5aa8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Jul 2018 15:38:41 +1000 Subject: [PATCH 12/85] KVM: PPC: Book3S HV: Read kvm->arch.emul_smt_mode under kvm->lock Commit 1e175d2 ("KVM: PPC: Book3S HV: Pack VCORE IDs to access full VCPU ID space", 2018-07-25) added code that uses kvm->arch.emul_smt_mode before any VCPUs are created. However, userspace can change kvm->arch.emul_smt_mode at any time up until the first VCPU is created. Hence it is (theoretically) possible for the check in kvmppc_core_vcpu_create_hv() to race with another userspace thread changing kvm->arch.emul_smt_mode. This fixes it by moving the test that uses kvm->arch.emul_smt_mode into the block where kvm->lock is held. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 785245e09f32..113f81577668 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1989,16 +1989,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; - int err = -EINVAL; + int err; int core; struct kvmppc_vcore *vcore; - if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode) && - cpu_has_feature(CPU_FTR_ARCH_300)) { - pr_devel("DNCI: VCPU ID too high\n"); - goto out; - } - err = -ENOMEM; vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) @@ -2055,8 +2049,13 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcore = NULL; err = -EINVAL; if (cpu_has_feature(CPU_FTR_ARCH_300)) { - BUG_ON(kvm->arch.smt_mode != 1); - core = kvmppc_pack_vcpu_id(kvm, id); + if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) { + pr_devel("KVM: VCPU ID too high\n"); + core = KVM_MAX_VCORES; + } else { + BUG_ON(kvm->arch.smt_mode != 1); + core = kvmppc_pack_vcpu_id(kvm, id); + } } else { core = id / kvm->arch.smt_mode; } From 57cb198cfdd2e666a8dc2b7e5df6d2708191eddb Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 20 Jul 2018 13:51:21 +0100 Subject: [PATCH 13/85] KVM: s390: Beautify skey enable check Let's introduce an explicit check if skeys have already been enabled for the vcpu, so we don't have to check the mm context if we don't have the storage key facility. This lets us check for enablement without having to take the mm semaphore and thus speedup skey emulation. 
Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Acked-by: Farhan Ali Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/priv.c | 10 ++++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 1a7b6834797e..29c940bf8506 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -656,6 +656,7 @@ struct kvm_vcpu_arch { seqcount_t cputm_seqcount; __u64 cputm_start; bool gs_enabled; + bool skey_enabled; }; struct kvm_vm_stat { diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c623b6f1dd4e..63285b14ff31 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -205,13 +205,10 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) { int rc; - struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block; trace_kvm_s390_skey_related_inst(vcpu); /* Already enabled? */ - if (vcpu->kvm->arch.use_skf && - !(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) && - !kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS)) + if (vcpu->arch.skey_enabled) return 0; rc = s390_enable_skey(); @@ -222,9 +219,10 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) if (kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS)) kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS); if (!vcpu->kvm->arch.use_skf) - sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; else - sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); + vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); + vcpu->arch.skey_enabled = true; return 0; } From ca359066889f753ca6d54bb333b783e7a784c0d1 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Mon, 25 Jun 2018 23:41:57 -0400 Subject: [PATCH 14/85] kvm: selftests: add cr4_cpuid_sync_test KVM is supposed to update some guest VM's CPUID bits (e.g. OSXSAVE) when CR4 is changed. A bug was found in KVM recently and it was fixed by Commit c4d2188206ba ("KVM: x86: Update cpuid properly when CR4.OSXAVE or CR4.PKE is changed"). This patch adds a test to verify the synchronization between guest VM's CR4 and CPUID bits. 
Signed-off-by: Wei Huang Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/cr4_cpuid_sync_test.c | 129 ++++++++++++++++++ 3 files changed, 131 insertions(+) create mode 100644 tools/testing/selftests/kvm/cr4_cpuid_sync_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 63fc1ab9248f..8fb7903f6643 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,3 +1,4 @@ +cr4_cpuid_sync_test set_sregs_test sync_regs_test vmx_tsc_adjust_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index d9d00319b07c..65bda4fb0ad6 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -9,6 +9,7 @@ LIBKVM_x86_64 = lib/x86.c lib/vmx.c TEST_GEN_PROGS_x86_64 = set_sregs_test TEST_GEN_PROGS_x86_64 += sync_regs_test TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c new file mode 100644 index 000000000000..8346b33c2073 --- /dev/null +++ b/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CR4 and CPUID sync test + * + * Copyright 2018, Red Hat, Inc. and/or its affiliates. + * + * Author: + * Wei Huang + */ + +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "x86.h" + +#define X86_FEATURE_XSAVE (1<<26) +#define X86_FEATURE_OSXSAVE (1<<27) +#define VCPU_ID 1 + +enum { + GUEST_UPDATE_CR4 = 0x1000, + GUEST_FAILED, + GUEST_DONE, +}; + +static void exit_to_hv(uint16_t port) +{ + __asm__ __volatile__("in %[port], %%al" + : + : [port]"d"(port) + : "rax"); +} + +static inline bool cr4_cpuid_is_sync(void) +{ + int func, subfunc; + uint32_t eax, ebx, ecx, edx; + uint64_t cr4; + + func = 0x1; + subfunc = 0x0; + __asm__ __volatile__("cpuid" + : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) + : "a"(func), "c"(subfunc)); + + cr4 = get_cr4(); + + return (!!(ecx & X86_FEATURE_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE)); +} + +static void guest_code(void) +{ + uint64_t cr4; + + /* turn on CR4.OSXSAVE */ + cr4 = get_cr4(); + cr4 |= X86_CR4_OSXSAVE; + set_cr4(cr4); + + /* verify CR4.OSXSAVE == CPUID.OSXSAVE */ + if (!cr4_cpuid_is_sync()) + exit_to_hv(GUEST_FAILED); + + /* notify hypervisor to change CR4 */ + exit_to_hv(GUEST_UPDATE_CR4); + + /* check again */ + if (!cr4_cpuid_is_sync()) + exit_to_hv(GUEST_FAILED); + + exit_to_hv(GUEST_DONE); +} + +int main(int argc, char *argv[]) +{ + struct kvm_run *run; + struct kvm_vm *vm; + struct kvm_sregs sregs; + struct kvm_cpuid_entry2 *entry; + int rc; + + entry = kvm_get_supported_cpuid_entry(1); + if (!(entry->ecx & X86_FEATURE_XSAVE)) { + printf("XSAVE feature not supported, skipping test\n"); + return 0; + } + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + while (1) { + rc = _vcpu_run(vm, VCPU_ID); + + if (run->exit_reason == KVM_EXIT_IO) { + switch (run->io.port) { + case GUEST_UPDATE_CR4: + /* emulate hypervisor clearing CR4.OSXSAVE */ + vcpu_sregs_get(vm, VCPU_ID, &sregs); + sregs.cr4 &= 
~X86_CR4_OSXSAVE; + vcpu_sregs_set(vm, VCPU_ID, &sregs); + break; + case GUEST_FAILED: + TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); + break; + case GUEST_DONE: + goto done; + default: + TEST_ASSERT(false, "Unknown port 0x%x.", + run->io.port); + } + } + } + + kvm_vm_free(vm); + +done: + return 0; +} From 86dafed50e2b9b18198cc663775ece819ff113ba Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed Date: Tue, 10 Jul 2018 11:27:19 +0200 Subject: [PATCH 15/85] KVM: Switch 'requests' to be 64-bit (explicitly) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch 'requests' to be explicitly 64-bit and update BUILD_BUG_ON check to use the size of "requests" instead of the hard-coded '32'. That gives us a bit more room again for arch-specific requests as we already ran out of space for x86 due to the hard-coded check. The only exception here is ARM32 as it is still 32-bits. Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: KarimAllah Ahmed Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5f138b40e433..2840ac74a3d7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -130,7 +130,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ - BUILD_BUG_ON((unsigned)(nr) >= 32 - KVM_REQUEST_ARCH_BASE); \ + BUILD_BUG_ON((unsigned)(nr) >= (FIELD_SIZEOF(struct kvm_vcpu, requests) * 8) - KVM_REQUEST_ARCH_BASE); \ (unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \ }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) @@ -224,7 +224,7 @@ struct kvm_vcpu { int vcpu_id; int srcu_idx; int mode; - unsigned long requests; + u64 requests; unsigned long guest_debug; int pre_pcpu; @@ -1131,7 +1131,7 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) * caller. Paired with the smp_mb__after_atomic in kvm_check_request. */ smp_wmb(); - set_bit(req & KVM_REQUEST_MASK, &vcpu->requests); + set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) @@ -1141,12 +1141,12 @@ static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) { - return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests); + return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) { - clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests); + clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) From cf81a7e580ac0d598d3e7e9c06864168b2b4073d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jul 2018 09:54:30 -0700 Subject: [PATCH 16/85] KVM: vmx: remove save/restore of host BNDCGFS MSR Linux does not support Memory Protection Extensions (MPX) in the kernel itself, thus the BNDCFGS (Bound Config Supervisor) MSR will always be zero in the KVM host, i.e. RDMSR in vmx_save_host_state() is superfluous. KVM unconditionally sets VM_EXIT_CLEAR_BNDCFGS, i.e. BNDCFGS will always be zero after VMEXIT, thus manually loading BNDCFGS is also superfluous. 
And in the event the MPX kernel support is added (unlikely given that MPX for userspace is in its death throes[1]), BNDCFGS will likely be common across all CPUs[2], and at the least shouldn't change on a regular basis, i.e. saving the MSR on every VMENTRY is completely unnecessary. WARN_ONCE in hardware_setup() if the host's BNDCFGS is non-zero to document that KVM does not preserve BNDCFGS and to serve as a hint as to how BNDCFGS likely should be handled if MPX is used in the kernel, e.g. BNDCFGS should be saved once during KVM setup. [1] https://lkml.org/lkml/2018/4/27/1046 [2] http://www.openwall.com/lists/kernel-hardening/2017/07/24/28 Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e30da9a2430c..6318167b1464 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -802,7 +802,6 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; - u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -2630,8 +2629,6 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif - if (boot_cpu_has(X86_FEATURE_MPX)) - rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -2669,8 +2666,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (vmx->host_state.msr_host_bndcfgs) - wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); load_fixmap_gdt(raw_smp_processor_id()); } @@ -7502,6 +7497,7 @@ static void vmx_enable_tdp(void) static __init int hardware_setup(void) { + unsigned long host_bndcfgs; int r = -ENOMEM, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7526,6 +7522,11 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; From 44883f01fe6ae436a8604c47d8435276fef369b0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Jul 2018 13:01:52 +0200 Subject: [PATCH 17/85] KVM: x86: ensure all MSRs can always be KVM_GET/SET_MSR'd Some of the MSRs returned by GET_MSR_INDEX_LIST currently cannot be sent back to KVM_GET_MSR and/or KVM_SET_MSR; either they can never be sent back, or you they are only accepted under special conditions. This makes the API a pain to use. To avoid this pain, this patch makes it so that the result of the get-list ioctl can always be used for host-initiated get and set. Since we don't have a separate way to check for read-only MSRs, this means some Hyper-V MSRs are ignored when written. Arguably they should not even be in the result of GET_MSR_INDEX_LIST, but I am leaving there in case userspace is using the outcome of GET_MSR_INDEX_LIST to derive the support for the corresponding Hyper-V feature. 
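As an illustration of the guarantee this change provides, the snippet below is a minimal userspace sketch (it is not part of the patch; the helper name and the kvm_fd/vcpu_fd descriptors are placeholders, and error handling is omitted). It feeds every index reported by KVM_GET_MSR_INDEX_LIST straight into host-initiated KVM_GET_MSRS and writes the same buffer back with KVM_SET_MSRS, which is the save/restore pattern the selftests adopt later in this series:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: kvm_fd is the /dev/kvm descriptor, vcpu_fd a vCPU fd. */
static void save_restore_all_msrs(int kvm_fd, int vcpu_fd)
{
	struct kvm_msr_list probe = { .nmsrs = 0 }, *list;
	struct kvm_msrs *msrs;
	int i;

	/* The first call fails with E2BIG but fills in the required count. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(list->indices[0]));
	list->nmsrs = probe.nmsrs;
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);

	msrs = calloc(1, sizeof(*msrs) + list->nmsrs * sizeof(msrs->entries[0]));
	msrs->nmsrs = list->nmsrs;
	for (i = 0; i < list->nmsrs; i++)
		msrs->entries[i].index = list->indices[i];

	/* After this patch, both vcpu ioctls accept the complete list. */
	ioctl(vcpu_fd, KVM_GET_MSRS, msrs);	/* host-initiated read */
	ioctl(vcpu_fd, KVM_SET_MSRS, msrs);	/* host-initiated write-back */

	free(list);
	free(msrs);
}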
Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 27 ++++++++++++++++++++------- arch/x86/kvm/hyperv.h | 2 +- arch/x86/kvm/x86.c | 15 +++++++++------ 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index af8caf965baa..01d209ab5481 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -235,7 +235,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, struct kvm_vcpu *vcpu = synic_to_vcpu(synic); int ret; - if (!synic->active) + if (!synic->active && !host) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -295,11 +295,12 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } -static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) +static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, + bool host) { int ret; - if (!synic->active) + if (!synic->active && !host) return 1; ret = 0; @@ -1014,6 +1015,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_TSC_EMULATION_STATUS: hv->hv_tsc_emulation_status = data; break; + case HV_X64_MSR_TIME_REF_COUNT: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1101,6 +1107,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), data, host); } + case HV_X64_MSR_TSC_FREQUENCY: + case HV_X64_MSR_APIC_FREQUENCY: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1156,7 +1168,8 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -1183,7 +1196,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); + return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: @@ -1229,7 +1242,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return kvm_hv_set_msr(vcpu, msr, data, host); } -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1239,7 +1252,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else - return kvm_hv_get_msr(vcpu, msr, pdata); + return kvm_hv_get_msr(vcpu, msr, pdata, host); } static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 837465d69c6d..d6aa969e20f1 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -48,7 +48,7 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); bool kvm_hv_hypercall_enabled(struct kvm *kvm); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b812b3c5088..6f0fabdb2109 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2160,10 +2160,11 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_status = data; break; case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) + if (!(mcg_cap & MCG_CTL_P) && + (data || !msr_info->host_initiated)) return 1; if (data != 0 && data != ~(u64)0) - return -1; + return 1; vcpu->arch.mcg_ctl = data; break; default: @@ -2551,7 +2552,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } EXPORT_SYMBOL_GPL(kvm_get_msr); -static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; @@ -2566,7 +2567,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.mcg_cap; break; case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) + if (!(mcg_cap & MCG_CTL_P) && !host) return 1; data = vcpu->arch.mcg_ctl; break; @@ -2699,7 +2700,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr_info->index, &msr_info->data); + return get_msr_mce(vcpu, msr_info->index, &msr_info->data, + msr_info->host_initiated); case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. 
All other @@ -2720,7 +2722,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return kvm_hv_get_msr_common(vcpu, - msr_info->index, &msr_info->data); + msr_info->index, &msr_info->data, + msr_info->host_initiated); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current From 2305339ee7129d9d56af3bdd4d59aff4d29ed390 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 28 Jul 2018 18:09:44 +0200 Subject: [PATCH 18/85] kvm: selftests: create a GDT and TSS The GDT and the TSS base were left to zero, and this has interesting effects when the TSS descriptor is later read to set up a VMCS's TR_BASE. Basically it worked by chance, and this patch fixes it by setting up all the protected mode data structures properly. Because the GDT and TSS addresses are virtual, the page tables now always exist at the time of vcpu setup. Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util.h | 2 +- tools/testing/selftests/kvm/include/x86.h | 4 +- tools/testing/selftests/kvm/lib/kvm_util.c | 4 +- .../selftests/kvm/lib/kvm_util_internal.h | 5 +- tools/testing/selftests/kvm/lib/x86.c | 111 +++++++++++------- 5 files changed, 79 insertions(+), 47 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 637b7017b6ee..87e05664c7f9 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -75,7 +75,7 @@ void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, void *arg); void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); +void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, uint32_t data_memslot, uint32_t pgd_memslot); void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); diff --git a/tools/testing/selftests/kvm/include/x86.h b/tools/testing/selftests/kvm/include/x86.h index 4a5b2c4c1a0f..560304e59512 100644 --- a/tools/testing/selftests/kvm/include/x86.h +++ b/tools/testing/selftests/kvm/include/x86.h @@ -59,8 +59,8 @@ enum x86_register { struct desc64 { uint16_t limit0; uint16_t base0; - unsigned base1:8, type:5, dpl:2, p:1; - unsigned limit1:4, zero0:3, g:1, base2:8; + unsigned base1:8, s:1, type:4, dpl:2, p:1; + unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8; uint32_t base3; uint32_t zero1; } __attribute__((packed)); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 37e2a787d2fc..610d1326f03d 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -701,7 +701,7 @@ static int vcpu_mmap_sz(void) * Creates and adds to the VM specified by vm and virtual CPU with * the ID given by vcpuid. 
*/ -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) +void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot) { struct vcpu *vcpu; @@ -736,7 +736,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) vcpu->next = vm->vcpu_head; vm->vcpu_head = vcpu; - vcpu_setup(vm, vcpuid); + vcpu_setup(vm, vcpuid, pgd_memslot, gdt_memslot); } /* VM Virtual Address Unused Gap diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index a0bd1980c81c..cbb40288890a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -51,13 +51,16 @@ struct kvm_vm { struct userspace_mem_region *userspace_mem_region_head; struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; + bool pgd_created; vm_paddr_t pgd; + vm_vaddr_t gdt; + vm_vaddr_t tss; }; struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_setup(struct kvm_vm *vm, int vcpuid); +void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot); void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); diff --git a/tools/testing/selftests/kvm/lib/x86.c b/tools/testing/selftests/kvm/lib/x86.c index 2f17675f4275..024e95f1b470 100644 --- a/tools/testing/selftests/kvm/lib/x86.c +++ b/tools/testing/selftests/kvm/lib/x86.c @@ -239,25 +239,6 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) vm_paddr_t paddr = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); vm->pgd = paddr; - - /* Set pointer to pgd tables in all the VCPUs that - * have already been created. Future VCPUs will have - * the value set as each one is created. - */ - for (struct vcpu *vcpu = vm->vcpu_head; vcpu; - vcpu = vcpu->next) { - struct kvm_sregs sregs; - - /* Obtain the current system register settings */ - vcpu_sregs_get(vm, vcpu->id, &sregs); - - /* Set and store the pointer to the start of the - * pgd tables. - */ - sregs.cr3 = vm->pgd; - vcpu_sregs_set(vm, vcpu->id, &sregs); - } - vm->pgd_created = true; } } @@ -460,9 +441,32 @@ static void kvm_seg_set_unusable(struct kvm_segment *segp) segp->unusable = true; } +static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) +{ + void *gdt = addr_gva2hva(vm, vm->gdt); + struct desc64 *desc = gdt + (segp->selector >> 3) * 8; + + desc->limit0 = segp->limit & 0xFFFF; + desc->base0 = segp->base & 0xFFFF; + desc->base1 = segp->base >> 16; + desc->s = segp->s; + desc->type = segp->type; + desc->dpl = segp->dpl; + desc->p = segp->present; + desc->limit1 = segp->limit >> 16; + desc->l = segp->l; + desc->db = segp->db; + desc->g = segp->g; + desc->base2 = segp->base >> 24; + if (!segp->s) + desc->base3 = segp->base >> 32; +} + + /* Set Long Mode Flat Kernel Code Segment * * Input Args: + * vm - VM whose GDT is being filled, or NULL to only write segp * selector - selector value * * Output Args: @@ -473,7 +477,7 @@ static void kvm_seg_set_unusable(struct kvm_segment *segp) * Sets up the KVM segment pointed to by segp, to be a code segment * with the selector value given by selector. 
*/ -static void kvm_seg_set_kernel_code_64bit(uint16_t selector, +static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); @@ -486,11 +490,14 @@ static void kvm_seg_set_kernel_code_64bit(uint16_t selector, segp->g = true; segp->l = true; segp->present = 1; + if (vm) + kvm_seg_fill_gdt_64bit(vm, segp); } /* Set Long Mode Flat Kernel Data Segment * * Input Args: + * vm - VM whose GDT is being filled, or NULL to only write segp * selector - selector value * * Output Args: @@ -501,7 +508,7 @@ static void kvm_seg_set_kernel_code_64bit(uint16_t selector, * Sets up the KVM segment pointed to by segp, to be a data segment * with the selector value given by selector. */ -static void kvm_seg_set_kernel_data_64bit(uint16_t selector, +static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); @@ -513,6 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(uint16_t selector, */ segp->g = true; segp->present = true; + if (vm) + kvm_seg_fill_gdt_64bit(vm, segp); } /* Address Guest Virtual to Guest Physical @@ -575,13 +584,45 @@ unmapped_gva: "gva: 0x%lx", gva); } -void vcpu_setup(struct kvm_vm *vm, int vcpuid) +static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot, + int pgd_memslot) +{ + if (!vm->gdt) + vm->gdt = vm_vaddr_alloc(vm, getpagesize(), + KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + + dt->base = vm->gdt; + dt->limit = getpagesize(); +} + +static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, + int selector, int gdt_memslot, + int pgd_memslot) +{ + if (!vm->tss) + vm->tss = vm_vaddr_alloc(vm, getpagesize(), + KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + + memset(segp, 0, sizeof(*segp)); + segp->base = vm->tss; + segp->limit = 0x67; + segp->selector = selector; + segp->type = 0xb; + segp->present = 1; + kvm_seg_fill_gdt_64bit(vm, segp); +} + +void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) { struct kvm_sregs sregs; /* Set mode specific system register values. */ vcpu_sregs_get(vm, vcpuid, &sregs); + sregs.idt.limit = 0; + + kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); + switch (vm->mode) { case VM_MODE_FLAT48PG: sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; @@ -589,30 +630,18 @@ void vcpu_setup(struct kvm_vm *vm, int vcpuid) sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(0x8, &sregs.cs); - kvm_seg_set_kernel_data_64bit(0x10, &sregs.ds); - kvm_seg_set_kernel_data_64bit(0x10, &sregs.es); + kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs); + kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds); + kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es); + kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); break; default: TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode); } + + sregs.cr3 = vm->pgd; vcpu_sregs_set(vm, vcpuid, &sregs); - - /* If virtual translation table have been setup, set system register - * to point to the tables. It's okay if they haven't been setup yet, - * in that the code that sets up the virtual translation tables, will - * go back through any VCPUs that have already been created and set - * their values. 
- */ - if (vm->pgd_created) { - struct kvm_sregs sregs; - - vcpu_sregs_get(vm, vcpuid, &sregs); - - sregs.cr3 = vm->pgd; - vcpu_sregs_set(vm, vcpuid, &sregs); - } } /* Adds a vCPU with reasonable defaults (i.e., a stack) * @@ -629,7 +658,7 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); /* Create VCPU */ - vm_vcpu_add(vm, vcpuid); + vm_vcpu_add(vm, vcpuid, 0, 0); /* Setup guest general purpose registers */ vcpu_regs_get(vm, vcpuid, ®s); From 87ccb7dbb25b28c0003fe4be88ed02289b651f96 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 28 Jul 2018 18:45:38 +0200 Subject: [PATCH 19/85] kvm: selftests: actually use all of lib/vmx.c The allocation of the VMXON and VMCS is currently done twice, in lib/vmx.c and in vmx_tsc_adjust_test.c. Reorganize the code to provide a cleaner and easier to use API to the tests. lib/vmx.c now does the complete setup of the VMX data structures, but does not create the VM or set CPUID. This has to be done by the caller. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/vmx.h | 21 ++++-- tools/testing/selftests/kvm/lib/vmx.c | 71 +++++++++++-------- .../selftests/kvm/vmx_tsc_adjust_test.c | 71 ++++--------------- 3 files changed, 73 insertions(+), 90 deletions(-) diff --git a/tools/testing/selftests/kvm/include/vmx.h b/tools/testing/selftests/kvm/include/vmx.h index 6ed8499807fd..9caaf56696a4 100644 --- a/tools/testing/selftests/kvm/include/vmx.h +++ b/tools/testing/selftests/kvm/include/vmx.h @@ -486,9 +486,22 @@ static inline uint32_t vmcs_revision(void) return rdmsr(MSR_IA32_VMX_BASIC); } -void prepare_for_vmx_operation(void); -void prepare_vmcs(void *guest_rip, void *guest_rsp); -struct kvm_vm *vm_create_default_vmx(uint32_t vcpuid, - vmx_guest_code_t guest_code); +struct vmx_pages { + void *vmxon_hva; + uint64_t vmxon_gpa; + void *vmxon; + + void *vmcs_hva; + uint64_t vmcs_gpa; + void *vmcs; + + void *msr_hva; + uint64_t msr_gpa; + void *msr; +}; + +struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); +bool prepare_for_vmx_operation(struct vmx_pages *vmx); +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); #endif /* !SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/vmx.c b/tools/testing/selftests/kvm/lib/vmx.c index 0231bc0aae7b..5701e52a33ed 100644 --- a/tools/testing/selftests/kvm/lib/vmx.c +++ b/tools/testing/selftests/kvm/lib/vmx.c @@ -13,47 +13,43 @@ #include "x86.h" #include "vmx.h" -/* Create a default VM for VMX tests. +/* Allocate memory regions for nested VMX tests. * * Input Args: - * vcpuid - The id of the single VCPU to add to the VM. - * guest_code - The vCPU's entry point + * vm - The VM to allocate guest-virtual addresses in. * - * Output Args: None + * Output Args: + * p_vmx_gva - The guest virtual address for the struct vmx_pages. * * Return: - * Pointer to opaque structure that describes the created VM. + * Pointer to structure with the addresses of the VMX areas. 
*/ -struct kvm_vm * -vm_create_default_vmx(uint32_t vcpuid, vmx_guest_code_t guest_code) +struct vmx_pages * +vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) { - struct kvm_cpuid2 *cpuid; - struct kvm_vm *vm; - vm_vaddr_t vmxon_vaddr; - vm_paddr_t vmxon_paddr; - vm_vaddr_t vmcs_vaddr; - vm_paddr_t vmcs_paddr; - - vm = vm_create_default(vcpuid, (void *) guest_code); - - /* Enable nesting in CPUID */ - vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); + vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); /* Setup of a region of guest memory for the vmxon region. */ - vmxon_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0); - vmxon_paddr = addr_gva2gpa(vm, vmxon_vaddr); + vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); + vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); /* Setup of a region of guest memory for a vmcs. */ - vmcs_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0); - vmcs_paddr = addr_gva2gpa(vm, vmcs_vaddr); + vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); + vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); - vcpu_args_set(vm, vcpuid, 4, vmxon_vaddr, vmxon_paddr, vmcs_vaddr, - vmcs_paddr); + /* Setup of a region of guest memory for the MSR bitmap. */ + vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); + vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); - return vm; + *p_vmx_gva = vmx_gva; + return vmx; } -void prepare_for_vmx_operation(void) +bool prepare_for_vmx_operation(struct vmx_pages *vmx) { uint64_t feature_control; uint64_t required; @@ -88,12 +84,27 @@ void prepare_for_vmx_operation(void) feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & required) != required) wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | required); + + /* Enter VMX root operation. */ + *(uint32_t *)(vmx->vmxon) = vmcs_revision(); + if (vmxon(vmx->vmxon_gpa)) + return false; + + /* Load a VMCS. */ + *(uint32_t *)(vmx->vmcs) = vmcs_revision(); + if (vmclear(vmx->vmcs_gpa)) + return false; + + if (vmptrld(vmx->vmcs_gpa)) + return false; + + return true; } /* * Initialize the control fields to the most basic settings possible. 
*/ -static inline void init_vmcs_control_fields(void) +static inline void init_vmcs_control_fields(struct vmx_pages *vmx) { vmwrite(VIRTUAL_PROCESSOR_ID, 0); vmwrite(POSTED_INTR_NV, 0); @@ -119,6 +130,8 @@ static inline void init_vmcs_control_fields(void) vmwrite(CR4_GUEST_HOST_MASK, 0); vmwrite(CR0_READ_SHADOW, get_cr0()); vmwrite(CR4_READ_SHADOW, get_cr4()); + + vmwrite(MSR_BITMAP, vmx->msr_gpa); } /* @@ -235,9 +248,9 @@ static inline void init_vmcs_guest_state(void *rip, void *rsp) vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP)); } -void prepare_vmcs(void *guest_rip, void *guest_rsp) +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) { - init_vmcs_control_fields(); + init_vmcs_control_fields(vmx); init_vmcs_host_state(); init_vmcs_guest_state(guest_rip, guest_rsp); } diff --git a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c index d7cb7944a42e..fc414c284368 100644 --- a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c @@ -46,11 +46,6 @@ enum { PORT_DONE, }; -struct vmx_page { - vm_vaddr_t virt; - vm_paddr_t phys; -}; - enum { VMXON_PAGE = 0, VMCS_PAGE, @@ -67,9 +62,6 @@ struct kvm_single_msr { /* The virtual machine object. */ static struct kvm_vm *vm; -/* Array of vmx_page descriptors that is shared with the guest. */ -struct vmx_page *vmx_pages; - #define exit_to_l0(_port, _arg) do_exit_to_l0(_port, (unsigned long) (_arg)) static void do_exit_to_l0(uint16_t port, unsigned long arg) { @@ -105,7 +97,7 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_page *vmx_pages) +static void l1_guest_code(struct vmx_pages *vmx_pages) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; @@ -116,23 +108,14 @@ static void l1_guest_code(struct vmx_page *vmx_pages) wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - prepare_for_vmx_operation(); - - /* Enter VMX root operation. */ - *(uint32_t *)vmx_pages[VMXON_PAGE].virt = vmcs_revision(); - GUEST_ASSERT(!vmxon(vmx_pages[VMXON_PAGE].phys)); - - /* Load a VMCS. */ - *(uint32_t *)vmx_pages[VMCS_PAGE].virt = vmcs_revision(); - GUEST_ASSERT(!vmclear(vmx_pages[VMCS_PAGE].phys)); - GUEST_ASSERT(!vmptrld(vmx_pages[VMCS_PAGE].phys)); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETING; vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); - vmwrite(MSR_BITMAP, vmx_pages[MSR_BITMAP_PAGE].phys); vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); /* Jump into L2. First, test failure to load guest CR3. 
*/ @@ -152,33 +135,6 @@ static void l1_guest_code(struct vmx_page *vmx_pages) exit_to_l0(PORT_DONE, 0); } -static void allocate_vmx_page(struct vmx_page *page) -{ - vm_vaddr_t virt; - - virt = vm_vaddr_alloc(vm, PAGE_SIZE, 0, 0, 0); - memset(addr_gva2hva(vm, virt), 0, PAGE_SIZE); - - page->virt = virt; - page->phys = addr_gva2gpa(vm, virt); -} - -static vm_vaddr_t allocate_vmx_pages(void) -{ - vm_vaddr_t vmx_pages_vaddr; - int i; - - vmx_pages_vaddr = vm_vaddr_alloc( - vm, sizeof(struct vmx_page) * NUM_VMX_PAGES, 0, 0, 0); - - vmx_pages = (void *) addr_gva2hva(vm, vmx_pages_vaddr); - - for (i = 0; i < NUM_VMX_PAGES; i++) - allocate_vmx_page(&vmx_pages[i]); - - return vmx_pages_vaddr; -} - void report(int64_t val) { printf("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n", @@ -187,7 +143,8 @@ void report(int64_t val) int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_vaddr; + struct vmx_pages *vmx_pages; + vm_vaddr_t vmx_pages_gva; struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); if (!(entry->ecx & CPUID_VMX)) { @@ -195,23 +152,23 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vm = vm_create_default_vmx(VCPU_ID, (void *) l1_guest_code); + vm = vm_create_default(VCPU_ID, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vmx_pages_vaddr = allocate_vmx_pages(); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_vaddr); + vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); for (;;) { volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct kvm_regs regs; vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Got exit_reason other than KVM_EXIT_IO: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); - vcpu_regs_get(vm, VCPU_ID, ®s); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s), rip=%lx\n", + run->exit_reason, + exit_reason_str(run->exit_reason), regs.rip); switch (run->io.port) { case PORT_ABORT: From 0a505fe6f272c5c1ceebbd266535ad79d9ca6920 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Jul 2018 13:02:24 +0200 Subject: [PATCH 20/85] kvm: selftests: ensure vcpu file is released The selftests were not munmap-ing the kvm_run area from the vcpu file descriptor. The result was that kvm_vcpu_release was not called and a reference was left in the parent "struct kvm". Ultimately this was visible in the upcoming state save/restore test as an error when KVM attempted to create a duplicate debugfs entry. 
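The essence of the fix, condensed from the hunk below (asserts omitted): the mmap() of the kvm_run area pins a reference on the vcpu file, so the mapping has to be removed in addition to closing the fd, otherwise kvm_vcpu_release() never runs and the vcpu keeps its reference on the parent struct kvm.

	munmap(vcpu->state, sizeof(*vcpu->state));	/* drop the kvm_run mapping */
	close(vcpu->fd);				/* then drop the fd */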
Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 610d1326f03d..163482873363 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -238,8 +238,12 @@ struct vcpu *vcpu_find(struct kvm_vm *vm, static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; - int ret = close(vcpu->fd); + ret = munmap(vcpu->state, sizeof(*vcpu->state)); + TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " + "errno: %i", ret, errno); + close(vcpu->fd); TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " "errno: %i", ret, errno); @@ -295,6 +299,10 @@ void kvm_vm_free(struct kvm_vm *vmp) TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); + close(vmp->kvm_fd); + TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" + " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); + /* Free the structure describing the VM. */ free(vmp); } From fa3899add1056f209c5b61d0c60cec34775a6781 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Jul 2018 13:19:23 +0200 Subject: [PATCH 21/85] kvm: selftests: add basic test for state save and restore The test calls KVM_RUN repeatedly, and creates an entirely new VM with the old memory and vCPU state on every exit to userspace. The kvm_util API is expanded with two functions that manage the lifetime of a kvm_vm struct: the first closes the file descriptors and leaves the memory allocated, and the second opens the file descriptors and reuses the memory from the previous incarnation of the kvm_vm struct. For now the test is very basic, as it does not test for example XSAVE or vCPU events. However, it will test nested virtualization state starting with the next patch. 
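In outline, the new lifecycle looks like this (condensed from the loop body of state_test.c added below; error checks omitted): save the vCPU with the KVM_GET_* ioctls, tear down only the file descriptors, then rebuild them around the same guest memory and feed the saved state back in with the KVM_SET_* ioctls.

	state = vcpu_save_state(vm, VCPU_ID);
	kvm_vm_release(vm);			/* close vcpu and vm fds, keep memory */

	kvm_vm_restart(vm, O_RDWR);		/* reopen /dev/kvm, re-add memslots */
	vm_vcpu_add(vm, VCPU_ID, 0, 0);
	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
	vcpu_load_state(vm, VCPU_ID, state);
	free(state);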
Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../testing/selftests/kvm/include/kvm_util.h | 2 + tools/testing/selftests/kvm/include/x86.h | 4 + tools/testing/selftests/kvm/lib/kvm_util.c | 88 +++++++++---- .../selftests/kvm/lib/kvm_util_internal.h | 2 + tools/testing/selftests/kvm/lib/x86.c | 116 +++++++++++++++++ tools/testing/selftests/kvm/state_test.c | 119 ++++++++++++++++++ 8 files changed, 311 insertions(+), 22 deletions(-) create mode 100644 tools/testing/selftests/kvm/state_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 8fb7903f6643..4202139d81d9 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -2,3 +2,4 @@ cr4_cpuid_sync_test set_sregs_test sync_regs_test vmx_tsc_adjust_test +state_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 65bda4fb0ad6..dd0e5163f01f 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -10,6 +10,7 @@ TEST_GEN_PROGS_x86_64 = set_sregs_test TEST_GEN_PROGS_x86_64 += sync_regs_test TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += state_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 87e05664c7f9..d32632f71ab8 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -53,6 +53,8 @@ int kvm_check_cap(long cap); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); +void kvm_vm_restart(struct kvm_vm *vmp, int perm); +void kvm_vm_release(struct kvm_vm *vmp); int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); diff --git a/tools/testing/selftests/kvm/include/x86.h b/tools/testing/selftests/kvm/include/x86.h index 560304e59512..42c3596815b8 100644 --- a/tools/testing/selftests/kvm/include/x86.h +++ b/tools/testing/selftests/kvm/include/x86.h @@ -303,6 +303,10 @@ static inline unsigned long get_xmm(int n) return 0; } +struct kvm_x86_state; +struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 163482873363..643309d6de74 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -62,6 +62,18 @@ int kvm_check_cap(long cap) return ret; } +static void vm_open(struct kvm_vm *vm, int perm) +{ + vm->kvm_fd = open(KVM_DEV_PATH, perm); + if (vm->kvm_fd < 0) + exit(KSFT_SKIP); + + /* Create VM. */ + vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, NULL); + TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " + "rc: %i errno: %i", vm->fd, errno); +} + /* VM Create * * Input Args: @@ -90,16 +102,7 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_ASSERT(vm != NULL, "Insufficent Memory"); vm->mode = mode; - kvm_fd = open(KVM_DEV_PATH, perm); - if (kvm_fd < 0) - exit(KSFT_SKIP); - - /* Create VM. 
*/ - vm->fd = ioctl(kvm_fd, KVM_CREATE_VM, NULL); - TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " - "rc: %i errno: %i", vm->fd, errno); - - close(kvm_fd); + vm_open(vm, perm); /* Setup mode specific traits. */ switch (vm->mode) { @@ -132,6 +135,39 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) return vm; } +/* VM Restart + * + * Input Args: + * vm - VM that has been released before + * perm - permission + * + * Output Args: None + * + * Reopens the file descriptors associated to the VM and reinstates the + * global state, such as the irqchip and the memory regions that are mapped + * into the guest. + */ +void kvm_vm_restart(struct kvm_vm *vmp, int perm) +{ + struct userspace_mem_region *region; + + vm_open(vmp, perm); + if (vmp->has_irqchip) + vm_create_irqchip(vmp); + + for (region = vmp->userspace_mem_region_head; region; + region = region->next) { + int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + " rc: %i errno: %i\n" + " slot: %u flags: 0x%x\n" + " guest_phys_addr: 0x%lx size: 0x%lx", + ret, errno, region->region.slot, region->region.flags, + region->region.guest_phys_addr, + region->region.memory_size); + } +} + /* Userspace Memory Region Find * * Input Args: @@ -256,6 +292,23 @@ static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) free(vcpu); } +void kvm_vm_release(struct kvm_vm *vmp) +{ + int ret; + + /* Free VCPUs. */ + while (vmp->vcpu_head) + vm_vcpu_rm(vmp, vmp->vcpu_head->id); + + /* Close file descriptor for the VM. */ + ret = close(vmp->fd); + TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" + " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); + + close(vmp->kvm_fd); + TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" + " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); +} /* Destroys and frees the VM pointed to by vmp. */ @@ -286,22 +339,11 @@ void kvm_vm_free(struct kvm_vm *vmp) free(region); } - /* Free VCPUs. */ - while (vmp->vcpu_head) - vm_vcpu_rm(vmp, vmp->vcpu_head->id); - /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); sparsebit_free(&vmp->vpages_mapped); - /* Close file descriptor for the VM. */ - ret = close(vmp->fd); - TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" - " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); - - close(vmp->kvm_fd); - TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" - " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); + kvm_vm_release(vmp); /* Free the structure describing the VM. 
*/ free(vmp); @@ -965,6 +1007,8 @@ void vm_create_irqchip(struct kvm_vm *vm) ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0); TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, " "rc: %i errno: %i", ret, errno); + + vm->has_irqchip = true; } /* VM VCPU State diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index cbb40288890a..542ed606b338 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -43,6 +43,7 @@ struct vcpu { struct kvm_vm { int mode; + int kvm_fd; int fd; unsigned int page_size; unsigned int page_shift; @@ -52,6 +53,7 @@ struct kvm_vm { struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; + bool has_irqchip; bool pgd_created; vm_paddr_t pgd; vm_vaddr_t gdt; diff --git a/tools/testing/selftests/kvm/lib/x86.c b/tools/testing/selftests/kvm/lib/x86.c index 024e95f1b470..2013efb60991 100644 --- a/tools/testing/selftests/kvm/lib/x86.c +++ b/tools/testing/selftests/kvm/lib/x86.c @@ -727,3 +727,119 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code) return vm; } + +struct kvm_x86_state { + struct kvm_vcpu_events events; + struct kvm_mp_state mp_state; + struct kvm_regs regs; + struct kvm_xsave xsave; + struct kvm_xcrs xcrs; + struct kvm_sregs sregs; + struct kvm_debugregs debugregs; + struct kvm_msrs msrs; +}; + +static int kvm_get_num_msrs(struct kvm_vm *vm) +{ + struct kvm_msr_list nmsrs; + int r; + + nmsrs.nmsrs = 0; + r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", + r); + + return nmsrs.nmsrs; +} + +struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct kvm_msr_list *list; + struct kvm_x86_state *state; + int nmsrs, r, i; + + nmsrs = kvm_get_num_msrs(vm); + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", + r); + + state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); + r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", + r); + + state->msrs.nmsrs = nmsrs; + for (i = 0; i < nmsrs; i++) + state->msrs.entries[i].index = list->indices[i]; + r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed at %x)", + r, r == nmsrs ? 
-1 : list->indices[r]); + + r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", + r); + + free(list); + return state; +} + +void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int r; + + r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); + TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", + r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index); + + r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", + r); +} diff --git a/tools/testing/selftests/kvm/state_test.c b/tools/testing/selftests/kvm/state_test.c new file mode 100644 index 000000000000..e47b27b5406e --- /dev/null +++ b/tools/testing/selftests/kvm/state_test.c @@ -0,0 +1,119 @@ +/* + * KVM_GET/SET_* tests + * + * Copyright (C) 2018, Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Tests for vCPU state save/restore, including nested guest state. 
+ */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "x86.h" + +#define VCPU_ID 5 +#define PORT_SYNC 0x1000 +#define PORT_ABORT 0x1001 +#define PORT_DONE 0x1002 + +static inline void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1) +{ + __asm__ __volatile__("in %[port], %%al" + : + : [port]"d"(port), "D"(arg0), "S"(arg1) + : "rax"); +} + +#define exit_to_l0(_port, _arg0, _arg1) \ + __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1)) + +#define GUEST_ASSERT(_condition) do { \ + if (!(_condition)) \ + exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition, __LINE__);\ +} while (0) + +#define GUEST_SYNC(stage) \ + exit_to_l0(PORT_SYNC, "hello", stage); + +static bool have_nested_state; + +void guest_code(void) +{ + GUEST_SYNC(1); + GUEST_SYNC(2); + + exit_to_l0(PORT_DONE, 0, 0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_regs regs1, regs2; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + int stage; + + struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + vcpu_regs_get(vm, VCPU_ID, ®s1); + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + switch (run->io.port) { + case PORT_ABORT: + TEST_ASSERT(false, "%s at %s:%d", (const char *) regs1.rdi, + __FILE__, regs1.rsi); + /* NOT REACHED */ + case PORT_SYNC: + break; + case PORT_DONE: + goto done; + default: + TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port); + } + + /* PORT_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)regs1.rdi, "hello") && + regs1.rsi == stage, "Unexpected register values vmexit #%lx, got %lx", + stage, (ulong) regs1.rsi); + + state = vcpu_save_state(vm, VCPU_ID); + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID, 0, 0); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +done: + kvm_vm_free(vm); +} From 7f7f1ba33cf2c21d001821313088c231db42ff40 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Jul 2018 18:49:01 +0200 Subject: [PATCH 22/85] KVM: x86: do not load vmcs12 pages while still in SMM If the vCPU enters system management mode while running a nested guest, RSM starts processing the vmentry while still in SMM. In that case, however, the pages pointed to by the vmcs12 might be incorrectly loaded from SMRAM. To avoid this, delay the handling of the pages until just before the next vmentry. This is done with a new request and a new entry in kvm_x86_ops, which we will be able to reuse for nested VMX state migration. Extracted from a patch by Jim Mattson and KarimAllah Ahmed. 
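In outline, the deferral works as follows (condensed from the diff below; the MSR-load handling on the vmentry path is left out here):

	/* enter_vmx_non_root_mode(): on the vmlaunch/vmresume path the vmcs12
	 * pages can be mapped immediately; on the RSM path the MMU cannot yet
	 * do the gpa->hpa translation, so the work is queued instead. */
	if (from_vmentry)
		nested_get_vmcs12_pages(vcpu);
	else
		kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	/* vcpu_enter_guest(): the request is serviced just before the next
	 * vmentry, once the vCPU state has been fully restored. */
	if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
		kvm_x86_ops->get_vmcs12_pages(vcpu);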
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/vmx.c | 54 ++++++++++++++++++++++----------- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c13cd28d9d1b..da957725992d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -75,6 +75,7 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) +#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -1085,6 +1086,8 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6318167b1464..fee44e4c5c79 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10660,9 +10660,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct page *page; u64 hpa; @@ -11774,12 +11774,17 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) +/* + * If exit_qual is NULL, this is being called from RSM. + * Otherwise it's called from vmlaunch/vmresume. + */ +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 exit_qual; - int r; + bool from_vmentry = !!exit_qual; + u32 dummy_exit_qual; + int r = 0; enter_guest_mode(vcpu); @@ -11793,17 +11798,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) vcpu->arch.tsc_offset += vmcs12->tsc_offset; r = EXIT_REASON_INVALID_STATE; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual)) goto fail; - nested_get_vmcs12_pages(vcpu, vmcs12); + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); - r = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) - goto fail; + r = EXIT_REASON_MSR_LOAD_FAIL; + *exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (*exit_qual) + goto fail; + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. + */ + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. 
At this point @@ -11818,8 +11834,7 @@ fail: vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); - return 1; + return r; } /* @@ -11896,10 +11911,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ vmx->nested.nested_run_pending = 1; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, &exit_qual); if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); vmx->nested.nested_run_pending = 0; - return ret; + return 1; } /* @@ -12985,7 +13001,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) if (vmx->nested.smm.guest_mode) { vcpu->arch.hflags &= ~HF_SMM_MASK; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, NULL); vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; @@ -13134,6 +13150,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f0fabdb2109..fbd59ad047b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7260,6 +7260,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) + kvm_x86_ops->get_vmcs12_pages(vcpu); if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) From 8fcc4b5923af5de58b80b53a069453b135693304 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 10 Jul 2018 11:27:20 +0200 Subject: [PATCH 23/85] kvm: nVMX: Introduce KVM_CAP_NESTED_STATE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For nested virtualization L0 KVM is managing a bit of state for L2 guests, this state can not be captured through the currently available IOCTLs. In fact the state captured through all of these IOCTLs is usually a mix of L1 and L2 state. It is also dependent on whether the L2 guest was running at the moment when the process was interrupted to save its state. With this capability, there are two new vcpu ioctls: KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE. These can be used for saving and restoring a VM that is in VMX operation. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jim Mattson [karahmed@ - rename structs and functions and make them ready for AMD and address previous comments. - handle nested.smm state. - rebase & a bit of refactoring. - Merge 7/8 and 8/8 into one patch. 
] Signed-off-by: KarimAllah Ahmed Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 56 ++++++++++ arch/x86/include/asm/kvm_host.h | 6 + arch/x86/include/uapi/asm/kvm.h | 37 +++++++ arch/x86/kvm/vmx.c | 175 +++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 54 +++++++++ include/uapi/linux/kvm.h | 4 + 6 files changed, 330 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index cb8db4f9d097..7b83b176c662 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3561,6 +3561,62 @@ Returns: 0 on success, -ENOENT on deassign if the conn_id isn't registered -EEXIST on assign if the conn_id is already registered +4.114 KVM_GET_NESTED_STATE + +Capability: KVM_CAP_NESTED_STATE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_nested_state (in/out) +Returns: 0 on success, -1 on error +Errors: + E2BIG: the total state size (including the fixed-size part of struct + kvm_nested_state) exceeds the value of 'size' specified by + the user; the size required will be written into size. + +struct kvm_nested_state { + __u16 flags; + __u16 format; + __u32 size; + union { + struct kvm_vmx_nested_state vmx; + struct kvm_svm_nested_state svm; + __u8 pad[120]; + }; + __u8 data[0]; +}; + +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +This ioctl copies the vcpu's nested virtualization state from the kernel to +userspace. + +The maximum size of the state, including the fixed-size part of struct +kvm_nested_state, can be retrieved by passing KVM_CAP_NESTED_STATE to +the KVM_CHECK_EXTENSION ioctl(). + +4.115 KVM_SET_NESTED_STATE + +Capability: KVM_CAP_NESTED_STATE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_nested_state (in) +Returns: 0 on success, -1 on error + +This copies the vcpu's kvm_nested_state struct from userspace to the kernel. For +the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. 5. 
The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da957725992d..bd287b348751 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1086,6 +1086,12 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + int (*get_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); int (*smi_allowed)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c535c2fdea13..86299efa804a 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -378,4 +378,41 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +/* for KVM_CAP_NESTED_STATE */ +struct kvm_nested_state { + /* KVM_STATE_* flags */ + __u16 flags; + + /* 0 for VMX, 1 for SVM. */ + __u16 format; + + /* 128 for SVM, 128 + VMCS size for VMX. */ + __u32 size; + + union { + /* VMXON, VMCS */ + struct kvm_vmx_nested_state vmx; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + }; + + __u8 data[0]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fee44e4c5c79..4be6486173b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7589,6 +7589,11 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); + if (!nested) { + kvm_x86_ops->get_nested_state = NULL; + kvm_x86_ops->set_nested_state = NULL; + } + /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -11775,8 +11780,8 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * If exit_qual is NULL, this is being called from RSM. - * Otherwise it's called from vmlaunch/vmresume. + * If exit_qual is NULL, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
*/ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { @@ -13016,6 +13021,170 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = 0, + .size = sizeof(kvm_state), + .vmx.vmxon_pa = -1ull, + .vmx.vmcs_pa = -1ull, + }; + + if (!vcpu) + return kvm_state.size + 2 * VMCS12_SIZE; + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + + if (vmx->nested.current_vmptr != -1ull) + kvm_state.size += VMCS12_SIZE; + + if (vmx->nested.smm.vmxon) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (vmx->nested.current_vmptr == -1ull) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow vmcs linked to vmcs01, unless + * sync_shadow_vmcs is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) + sync_vmcs12(vcpu, vmcs12); + else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + return -EFAULT; + +out: + return kvm_state.size; +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 exit_qual; + int ret; + + if (kvm_state->format != 0) + return -EINVAL; + + if (!nested_vmx_allowed(vcpu)) + return kvm_state->vmx.vmxon_pa == -1ull ? 
0 : -EINVAL; + + if (kvm_state->vmx.vmxon_pa == -1ull) { + if (kvm_state->vmx.smm.flags) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa != -1ull) + return -EINVAL; + + vmx_leave_nested(vcpu); + return 0; + } + + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) + return -EINVAL; + + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + vmx_leave_nested(vcpu); + if (kvm_state->vmx.vmxon_pa == -1ull) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + if (check_vmentry_prereqs(vcpu, vmcs12) || + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + return -EINVAL; + + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vmx->nested.nested_run_pending = 1; + + vmx->nested.dirty_vmcs12 = true; + ret = enter_vmx_non_root_mode(vcpu, NULL); + if (ret) + return -EINVAL; + + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -13150,6 +13319,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_nested_state = vmx_get_nested_state, + .set_nested_state = vmx_set_nested_state, .get_vmcs12_pages = nested_get_vmcs12_pages, .smi_allowed = vmx_smi_allowed, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fbd59ad047b0..1b14c4a654c3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2947,6 +2947,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; break; + case KVM_CAP_NESTED_STATE: + r = kvm_x86_ops->get_nested_state ? 
+ kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + break; default: break; } @@ -3963,6 +3967,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + case KVM_GET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + u32 user_data_size; + + r = -EINVAL; + if (!kvm_x86_ops->get_nested_state) + break; + + BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + if (get_user(user_data_size, &user_kvm_nested_state->size)) + return -EFAULT; + + r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, + user_data_size); + if (r < 0) + return r; + + if (r > user_data_size) { + if (put_user(r, &user_kvm_nested_state->size)) + return -EFAULT; + return -E2BIG; + } + r = 0; + break; + } + case KVM_SET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + struct kvm_nested_state kvm_state; + + r = -EINVAL; + if (!kvm_x86_ops->set_nested_state) + break; + + if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) + return -EFAULT; + + if (kvm_state.size < sizeof(kvm_state)) + return -EINVAL; + + if (kvm_state.flags & + ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + /* nested_run_pending implies guest_mode. */ + if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) + return -EINVAL; + + r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + break; + } default: r = -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b955b986b341..3cf632839337 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -950,6 +950,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_EVENTFD 154 #define KVM_CAP_HYPERV_TLBFLUSH 155 #define KVM_CAP_S390_HPAGE_1M 156 +#define KVM_CAP_NESTED_STATE 157 #ifdef KVM_CAP_IRQ_ROUTING @@ -1392,6 +1393,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_HYPERV_EVENTFD */ #define KVM_HYPERV_EVENTFD _IOW(KVMIO, 0xbd, struct kvm_hyperv_eventfd) +/* Available with KVM_CAP_NESTED_STATE */ +#define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) +#define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) /* Secure Encrypted Virtualization command */ enum sev_cmd_id { From cb5476379f0718046f3c6b3195d18838c5b25ea2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 28 Jul 2018 21:56:09 +0200 Subject: [PATCH 24/85] kvm: selftests: add test for nested state save/restore Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/vmx.h | 33 +++++++++++ tools/testing/selftests/kvm/lib/x86.c | 29 +++++++++ tools/testing/selftests/kvm/state_test.c | 71 ++++++++++++++++++++++- 3 files changed, 132 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/include/vmx.h b/tools/testing/selftests/kvm/include/vmx.h index 9caaf56696a4..54e6020d0638 100644 --- a/tools/testing/selftests/kvm/include/vmx.h +++ b/tools/testing/selftests/kvm/include/vmx.h @@ -380,6 +380,30 @@ static inline int vmptrld(uint64_t vmcs_pa) return ret; } +static inline int vmptrst(uint64_t *value) +{ + uint64_t tmp; + uint8_t ret; + + __asm__ __volatile__("vmptrst %[value]; setna %[ret]" + : [value]"=m"(tmp), [ret]"=rm"(ret) + : : "cc", "memory"); + + *value = tmp; + return ret; +} + +/* + * A wrapper around vmptrst that ignores errors and returns zero if the + * vmptrst instruction fails. + */ +static inline uint64_t vmptrstz(void) +{ + uint64_t value = 0; + vmptrst(&value); + return value; +} + /* * No guest state (e.g. 
GPRs) is established by this vmlaunch. */ @@ -444,6 +468,15 @@ static inline int vmresume(void) return ret; } +static inline void vmcall(void) +{ + /* Currently, L1 destroys our GPRs during vmexits. */ + __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : : + "rax", "rbx", "rcx", "rdx", + "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15"); +} + static inline int vmread(uint64_t encoding, uint64_t *value) { uint64_t tmp; diff --git a/tools/testing/selftests/kvm/lib/x86.c b/tools/testing/selftests/kvm/lib/x86.c index 2013efb60991..e38345252df5 100644 --- a/tools/testing/selftests/kvm/lib/x86.c +++ b/tools/testing/selftests/kvm/lib/x86.c @@ -736,6 +736,10 @@ struct kvm_x86_state { struct kvm_xcrs xcrs; struct kvm_sregs sregs; struct kvm_debugregs debugregs; + union { + struct kvm_nested_state nested; + char nested_[16384]; + }; struct kvm_msrs msrs; }; @@ -758,6 +762,14 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) struct kvm_msr_list *list; struct kvm_x86_state *state; int nmsrs, r, i; + static int nested_size = -1; + + if (nested_size == -1) { + nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE); + TEST_ASSERT(nested_size <= sizeof(state->nested_), + "Nested state size too big, %i > %zi", + nested_size, sizeof(state->nested_)); + } nmsrs = kvm_get_num_msrs(vm); list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); @@ -791,6 +803,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", r); + if (nested_size) { + state->nested.size = sizeof(state->nested_); + r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", + r); + TEST_ASSERT(state->nested.size <= nested_size, + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", + state->nested.size, nested_size); + } else + state->nested.size = 0; + state->msrs.nmsrs = nmsrs; for (i = 0; i < nmsrs; i++) state->msrs.entries[i].index = list->indices[i]; @@ -811,6 +834,12 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s struct vcpu *vcpu = vcpu_find(vm, vcpuid); int r; + if (state->nested.size) { + r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", + r); + } + r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", r); diff --git a/tools/testing/selftests/kvm/state_test.c b/tools/testing/selftests/kvm/state_test.c index e47b27b5406e..8554a34f826e 100644 --- a/tools/testing/selftests/kvm/state_test.c +++ b/tools/testing/selftests/kvm/state_test.c @@ -18,6 +18,7 @@ #include "kvm_util.h" #include "x86.h" +#include "vmx.h" #define VCPU_ID 5 #define PORT_SYNC 0x1000 @@ -45,16 +46,75 @@ static inline void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1) static bool have_nested_state; -void guest_code(void) +void l2_guest_code(void) +{ + GUEST_SYNC(5); + + /* Exit to L1 */ + vmcall(); + + GUEST_SYNC(7); + + /* Done, exit to L1 and never come back. 
*/ + vmcall(); +} + +void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + GUEST_SYNC(3); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(4); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* Check that the launched state is preserved. */ + GUEST_ASSERT(vmlaunch()); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_SYNC(6); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_SYNC(8); +} + +void guest_code(struct vmx_pages *vmx_pages) { GUEST_SYNC(1); GUEST_SYNC(2); + if (vmx_pages) + l1_guest_code(vmx_pages); + exit_to_l0(PORT_DONE, 0, 0); } int main(int argc, char *argv[]) { + struct vmx_pages *vmx_pages = NULL; + vm_vaddr_t vmx_pages_gva = 0; + struct kvm_regs regs1, regs2; struct kvm_vm *vm; struct kvm_run *run; @@ -69,6 +129,15 @@ int main(int argc, char *argv[]) run = vcpu_state(vm, VCPU_ID); vcpu_regs_get(vm, VCPU_ID, ®s1); + + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { + vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + } else { + printf("will skip nested state checks\n"); + vcpu_args_set(vm, VCPU_ID, 1, 0); + } + for (stage = 1;; stage++) { _vcpu_run(vm, VCPU_ID); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, From 392b2f25aa415c0222b348f95875409be49a1201 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:01 +0300 Subject: [PATCH 25/85] KVM: VMX: Create struct for VMCS header No functionality change. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4be6486173b7..37944b79da45 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -198,8 +198,13 @@ struct kvm_vmx { #define NR_AUTOLOAD_MSRS 8 +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + struct vmcs { - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; char data[0]; }; @@ -253,7 +258,7 @@ struct __packed vmcs12 { /* According to the Intel spec, a VMCS region must start with the * following two fields. Then follow implementation-specific data. 
*/ - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ @@ -421,7 +426,7 @@ struct __packed vmcs12 { "Offset of " #field " in struct vmcs12 has changed.") static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(revision_id, 0); + CHECK_OFFSET(hdr, 0); CHECK_OFFSET(abort, 4); CHECK_OFFSET(launch_state, 8); CHECK_OFFSET(io_bitmap_a, 40); @@ -4399,9 +4404,9 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) /* KVM supports Enlightened VMCS v1 only */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = KVM_EVMCS_VERSION; + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; return vmcs; } @@ -4581,7 +4586,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; per_cpu(vmxarea, cpu) = vmcs; } @@ -7889,7 +7894,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!shadow_vmcs) goto out_shadow_vmcs; /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); + shadow_vmcs->hdr.shadow_vmcs = 1; /* init shadow vmcs */ vmcs_clear(shadow_vmcs); vmx->vmcs01.shadow_vmcs = shadow_vmcs; @@ -8459,7 +8464,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); - if (new_vmcs12->revision_id != VMCS12_REVISION) { + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || + new_vmcs12->hdr.shadow_vmcs) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, @@ -13161,7 +13167,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) return -EFAULT; - if (vmcs12->revision_id != VMCS12_REVISION) + if (vmcs12->hdr.revision_id != VMCS12_REVISION) return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) From e253674227328575084cf7454d22749b9be11d52 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:02 +0300 Subject: [PATCH 26/85] KVM: VMX: Change vmcs12_{read,write}_any() to receive vmcs12 as parameter No functionality change. This is done as a preparation for VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 37944b79da45..82b01b34a3a2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8159,7 +8159,7 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). 
*/ -static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); @@ -8168,7 +8168,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - p = ((char *)(get_vmcs12(vcpu))) + offset; + p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: @@ -8190,10 +8190,10 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, } -static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); - char *p = ((char *) get_vmcs12(vcpu)) + offset; + char *p = (char *)vmcs12 + offset; if (offset < 0) return offset; @@ -8246,7 +8246,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; field_value = __vmcs_readl(field); - vmcs12_write_any(&vmx->vcpu, field, field_value); + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); } /* * Skip the VM-exit information fields if they are read-only. @@ -8281,7 +8281,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - vmcs12_read_any(&vmx->vcpu, field, &field_value); + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); __vmcs_writel(field, field_value); } } @@ -8321,7 +8321,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vcpu, field, &field_value) < 0) { + if (vmcs12_read_any(get_vmcs12(vcpu), field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8397,7 +8397,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(vcpu, field, field_value) < 0) { + if (vmcs12_write_any(get_vmcs12(vcpu), field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -10971,11 +10971,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, unsigned long addr_field) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int maxphyaddr; u64 count, addr; - if (vmcs12_read_any(vcpu, count_field, &count) || - vmcs12_read_any(vcpu, addr_field, &addr)) { + if (vmcs12_read_any(vmcs12, count_field, &count) || + vmcs12_read_any(vmcs12, addr_field, &addr)) { WARN_ON(1); return -EINVAL; } From fa97d7dba7538adf174c3984e726c0537ab553a4 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Wed, 18 Jul 2018 14:07:59 +0200 Subject: [PATCH 27/85] KVM: nVMX: Allow VMPTRLD for shadow VMCS if vCPU supports VMCS shadowing Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 82b01b34a3a2..44e2b82f6519 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1720,6 +1720,12 @@ static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) CPU_BASED_MONITOR_TRAP_FLAG; } +static inline bool 
nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -8465,7 +8471,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } new_vmcs12 = kmap(page); if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - new_vmcs12->hdr.shadow_vmcs) { + (new_vmcs12->hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, From a6192d40d52f6b86997a71449e2ebc3d7f5ca103 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:04 +0300 Subject: [PATCH 28/85] KVM: nVMX: Fail VMLAUNCH and VMRESUME on shadow VMCS Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 44e2b82f6519..f6ec898ac539 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11876,6 +11876,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmcs12 = get_vmcs12(vcpu); + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (vmcs12->hdr.shadow_vmcs) { + nested_vmx_failInvalid(vcpu); + goto out; + } + if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); From f792d2743ed4db9e96eff43bdc1e15dd441f19bf Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:05 +0300 Subject: [PATCH 29/85] KVM: nVMX: Introduce nested_cpu_has_shadow_vmcs() Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f6ec898ac539..64b11c57b5f4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1806,6 +1806,11 @@ static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) VMX_VMFUNC_EPTP_SWITCHING); } +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -11745,7 +11750,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && + if (!nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; From a8a7c02bf7b70cda6face6321a45de56519c24bf Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:06 +0300 Subject: [PATCH 30/85] KVM: nVMX: Verify VMCS shadowing controls Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64b11c57b5f4..e762222476a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11038,6 +11038,19 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if 
(!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -11639,6 +11652,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.msrs.procbased_ctls_low, vmx->nested.msrs.procbased_ctls_high) || From f145d90d97bab0e11b78da1739e5db742575037c Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:07 +0300 Subject: [PATCH 31/85] KVM: nVMX: Verify VMCS shadowing VMCS link pointer Intel SDM considers these checks to be part of "Checks on Guest Non-Register State". Note that it is legal for vmcs->vmcs_link_pointer to be -1ull when VMCS shadowing is enabled. In this case, any VMREAD/VMWRITE to shadowed-field sets the ALU flags for VMfailInvalid (i.e. CF=1). Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e762222476a9..f8ad42bcfc2d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11755,6 +11755,33 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return 0; } +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int r; + struct page *page; + struct vmcs12 *shadow; + + if (vmcs12->vmcs_link_pointer == -1ull) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + return -EINVAL; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + if (is_error_page(page)) + return -EINVAL; + + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *exit_qual) { @@ -11766,8 +11793,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } From 61ada7488ffdef0c0f83da14a12629870abb9e97 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:08 +0300 Subject: [PATCH 32/85] KVM: nVMX: Cache shadow vmcs12 on VMEntry and flush to memory on VMExit This is done is done as a preparation to VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8ad42bcfc2d..2a005f519032 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -644,6 +644,12 @@ struct nested_vmx { * memory during VMCLEAR and VMPTRLD. */ struct vmcs12 *cached_vmcs12; + /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. 
Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; /* * Indicates if the shadow vmcs must be updated with the * data hold by vmcs12 @@ -1076,6 +1082,11 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); @@ -7900,6 +7911,10 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) @@ -7919,6 +7934,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) return 0; out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: @@ -8085,6 +8103,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -10926,6 +10945,38 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -11995,6 +12046,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + /* + * Must happen outside of enter_vmx_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * trasferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken * by event injection, halt vcpu. @@ -12504,6 +12567,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + /* + * Must happen outside of sync_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). 
+ * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); From fa58a9fa74979f845fc6c94a58501e67f3abb6de Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Jul 2018 19:45:51 +0200 Subject: [PATCH 33/85] KVM: nVMX: include shadow vmcs12 in nested state The shadow vmcs12 cannot be flushed on KVM_GET_NESTED_STATE, because at that point guest memory is assumed by userspace to be immutable. Capture the cache in vmx_get_nested_state, adding another page at the end if there is an active shadow vmcs12. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a005f519032..c7d08a54ab8d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -13191,9 +13191,15 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; - if (vmx->nested.current_vmptr != -1ull) + if (vmx->nested.current_vmptr != -1ull) { kvm_state.size += VMCS12_SIZE; + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) + kvm_state.size += VMCS12_SIZE; + } + if (vmx->nested.smm.vmxon) kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; @@ -13232,6 +13238,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) return -EFAULT; + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + return -EFAULT; + } + out: return kvm_state.size; } @@ -13316,6 +13329,22 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + return -EINVAL; + + if (copy_from_user(shadow_vmcs12, + user_kvm_nested_state->data + VMCS12_SIZE, + sizeof(*vmcs12))) + return -EFAULT; + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + return -EINVAL; + } + if (check_vmentry_prereqs(vcpu, vmcs12) || check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; From 9a78bdf31da79a2a58585474598afcf44b14b245 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 29 Jul 2018 00:14:11 +0200 Subject: [PATCH 34/85] KVM: selftests: add tests for shadow VMCS save/restore This includes setting up the shadow VMCS and the secondary execution controls in lib/vmx.c. 
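Roughly, the guest-side setup added to lib/vmx.c boils down to the following (an illustrative sketch only; the helper name is not from the patch):

	static bool setup_shadow_vmcs_region(struct vmx_pages *vmx)
	{
		/* Revision id with bit 31 set marks the region as a shadow VMCS. */
		*(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul;
		/* It must be in the VMCLEAR state before L1 links it via VMCS_LINK_POINTER. */
		return vmclear(vmx->shadow_vmcs_gpa) == 0;
	}
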
Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/vmx.h | 12 ++++++++ tools/testing/selftests/kvm/lib/vmx.c | 33 ++++++++++++++++++++-- tools/testing/selftests/kvm/state_test.c | 34 +++++++++++++++++++++-- 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/vmx.h b/tools/testing/selftests/kvm/include/vmx.h index 54e6020d0638..b9ffe1024d3a 100644 --- a/tools/testing/selftests/kvm/include/vmx.h +++ b/tools/testing/selftests/kvm/include/vmx.h @@ -531,6 +531,18 @@ struct vmx_pages { void *msr_hva; uint64_t msr_gpa; void *msr; + + void *shadow_vmcs_hva; + uint64_t shadow_vmcs_gpa; + void *shadow_vmcs; + + void *vmread_hva; + uint64_t vmread_gpa; + void *vmread; + + void *vmwrite_hva; + uint64_t vmwrite_gpa; + void *vmwrite; }; struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); diff --git a/tools/testing/selftests/kvm/lib/vmx.c b/tools/testing/selftests/kvm/lib/vmx.c index 5701e52a33ed..b987c3c970eb 100644 --- a/tools/testing/selftests/kvm/lib/vmx.c +++ b/tools/testing/selftests/kvm/lib/vmx.c @@ -44,6 +44,23 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); + memset(vmx->msr_hva, 0, getpagesize()); + + /* Setup of a region of guest memory for the shadow VMCS. */ + vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); + vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); + + /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ + vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); + vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); + memset(vmx->vmread_hva, 0, getpagesize()); + + vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); + vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); + memset(vmx->vmwrite_hva, 0, getpagesize()); *p_vmx_gva = vmx_gva; return vmx; @@ -98,6 +115,11 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx) if (vmptrld(vmx->vmcs_gpa)) return false; + /* Setup shadow VMCS, do not load it yet. 
*/ + *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul; + if (vmclear(vmx->shadow_vmcs_gpa)) + return false; + return true; } @@ -109,8 +131,12 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) vmwrite(VIRTUAL_PROCESSOR_ID, 0); vmwrite(POSTED_INTR_NV, 0); - vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PINBASED_CTLS)); - vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PROCBASED_CTLS)); + vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); + if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, 0)) + vmwrite(CPU_BASED_VM_EXEC_CONTROL, + rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + else + vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); vmwrite(EXCEPTION_BITMAP, 0); vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ @@ -124,7 +150,6 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0); vmwrite(TPR_THRESHOLD, 0); - vmwrite(SECONDARY_VM_EXEC_CONTROL, 0); vmwrite(CR0_GUEST_HOST_MASK, 0); vmwrite(CR4_GUEST_HOST_MASK, 0); @@ -132,6 +157,8 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) vmwrite(CR4_READ_SHADOW, get_cr4()); vmwrite(MSR_BITMAP, vmx->msr_gpa); + vmwrite(VMREAD_BITMAP, vmx->vmread_gpa); + vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa); } /* diff --git a/tools/testing/selftests/kvm/state_test.c b/tools/testing/selftests/kvm/state_test.c index 8554a34f826e..ecabf25b7077 100644 --- a/tools/testing/selftests/kvm/state_test.c +++ b/tools/testing/selftests/kvm/state_test.c @@ -53,7 +53,15 @@ void l2_guest_code(void) /* Exit to L1 */ vmcall(); - GUEST_SYNC(7); + /* L1 has now set up a shadow VMCS for us. */ + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + GUEST_SYNC(9); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee)); + GUEST_SYNC(10); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee); + GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee)); + GUEST_SYNC(11); /* Done, exit to L1 and never come back. */ vmcall(); @@ -94,9 +102,31 @@ void l1_guest_code(struct vmx_pages *vmx_pages) vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3); + vmwrite(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmwrite(VMCS_LINK_POINTER, vmx_pages->shadow_vmcs_gpa); + + GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); + GUEST_ASSERT(vmlaunch()); + GUEST_SYNC(7); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); + + vmwrite(GUEST_RIP, 0xc0ffee); + GUEST_SYNC(8); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + + GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa)); GUEST_ASSERT(!vmresume()); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_SYNC(8); + + GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); + GUEST_SYNC(12); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); } void guest_code(struct vmx_pages *vmx_pages) From 6d894f498f5d121b57a231315b0b459e185abed1 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:09 +0300 Subject: [PATCH 35/85] KVM: nVMX: vmread/vmwrite: Use shadow vmcs12 if running L2 This is done as a preparation to VMCS shadowing emulation. 
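The core of the change is selecting which vmcs12 instance a VMREAD/VMWRITE operates on; roughly (an illustrative sketch with a made-up helper name, ignoring the VMfailInvalid case for a -1ull vmcs_link_pointer that the handlers below also cover):

	static struct vmcs12 *vmcs12_for_vmread_vmwrite(struct kvm_vcpu *vcpu)
	{
		/* Outside guest mode, L1's VMREAD/VMWRITE act on vmcs12 itself. */
		if (!is_guest_mode(vcpu))
			return get_vmcs12(vcpu);
		/*
		 * In guest mode the access came from L2 and was not reflected
		 * to L1, so it targets the cached shadow vmcs12.
		 */
		return get_shadow_vmcs12(vcpu);
	}
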
Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 61 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7d08a54ab8d..f91a1599738e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8341,6 +8341,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8348,10 +8349,24 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_vmcs12(vcpu)) return kvm_skip_emulated_instruction(vcpu); + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + } + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(get_vmcs12(vcpu), field, &field_value) < 0) { + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8393,6 +8408,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ u64 field_value = 0; struct x86_exception e; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8427,23 +8443,44 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(get_vmcs12(vcpu), field, field_value) < 0) { + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + + } + + if (vmcs12_write_any(vmcs12, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } - switch (field) { + /* + * Do not track vmcs12 dirty-state if in guest-mode + * as we actually dirty shadow vmcs12 instead of vmcs12. + */ + if (!is_guest_mode(vcpu)) { + switch (field) { #define SHADOW_FIELD_RW(x) case x: #include "vmx_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } } nested_vmx_succeed(vcpu); From a7cde481b6e8506ee147cf5031ad1917fcc8bf9b Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:10 +0300 Subject: [PATCH 36/85] KVM: nVMX: Do not forward VMREAD/VMWRITE VMExits to L1 if required so by vmcs12 vmread/vmwrite bitmaps This is done as a preparation for VMCS shadowing emulation. 
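Whether such an exit is reflected to L1 reduces to a bit test in the guest-physical vmread/vmwrite bitmap; roughly (a sketch with an illustrative name; the real helper below additionally forces an exit for out-of-range field numbers):

	static bool vmcs_field_intercepted(struct kvm_vcpu *vcpu, gpa_t bitmap,
					   unsigned long field)
	{
		u8 b;

		/* On a read fault, fail safe and let L1 handle the access. */
		if (kvm_vcpu_read_guest(vcpu, bitmap + field / 8, &b, 1))
			return true;

		return b & (1 << (field & 7));
	}
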
Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f91a1599738e..27fd3e1653af 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9107,6 +9107,30 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + /* * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we * should handle it ourselves in L0 (and then continue L2). Only call this @@ -9191,10 +9215,15 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: - case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* From 32c7acf044873c8782be45a03e46cf0ac579a459 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:11 +0300 Subject: [PATCH 37/85] KVM: nVMX: Expose VMCS shadowing to L1 guest Expose VMCS shadowing to L1 as a VMX capability of the virtual CPU, whether or not VMCS shadowing is supported by the physical CPU. (VMCS shadowing emulation) Shadowed VMREADs and VMWRITEs from L2 are handled by L0, without a VM-exit to L1. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 27fd3e1653af..7cf64cbc3afd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3308,6 +3308,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. 
+ */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -11550,6 +11556,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); From 491a6038458fc07ec307447dd5e6722e34d67c04 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:12 +0300 Subject: [PATCH 38/85] KVM: VMX: Mark vmcs header as shadow in case alloc_vmcs_cpu() allocate shadow vmcs No functionality change. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7cf64cbc3afd..098926a07cf7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4418,7 +4418,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return 0; } -static struct vmcs *alloc_vmcs_cpu(int cpu) +static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) { int node = cpu_to_node(cpu); struct page *pages; @@ -4436,6 +4436,8 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) else vmcs->hdr.revision_id = vmcs_config.revision_id; + if (shadow) + vmcs->hdr.shadow_vmcs = 1; return vmcs; } @@ -4459,14 +4461,14 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } -static struct vmcs *alloc_vmcs(void) +static struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); } static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { - loaded_vmcs->vmcs = alloc_vmcs(); + loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM; @@ -4597,7 +4599,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(cpu); + vmcs = alloc_vmcs_cpu(false, cpu); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -7922,11 +7924,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) goto out_cached_shadow_vmcs12; if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); + shadow_vmcs = alloc_vmcs(true); if (!shadow_vmcs) goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->hdr.shadow_vmcs = 1; /* init shadow vmcs */ vmcs_clear(shadow_vmcs); vmx->vmcs01.shadow_vmcs = shadow_vmcs; From abfc52c612ddb101549846688bc87dfedbbfd4f4 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:13 +0300 Subject: [PATCH 39/85] KVM: nVMX: Separate logic allocating shadow vmcs to a function No functionality change. This is done as a preparation for VMCS shadowing virtualization. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 098926a07cf7..cbe31b9d2b81 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7905,10 +7905,35 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) return 0; } +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. 
+ */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * We should allocate a shadow vmcs for vmcs01 only when L1 + * executes VMXON and free it when L1 executes VMXOFF. + * As it is invalid to execute VMXON twice, we shouldn't reach + * here when vmcs01 already have an allocated shadow vmcs. + */ + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + + if (!loaded_vmcs->shadow_vmcs) { + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + } + return loaded_vmcs->shadow_vmcs; +} + static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; int r; r = alloc_loaded_vmcs(&vmx->nested.vmcs02); @@ -7923,14 +7948,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(true); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); From 42522d08cdba6d8be4247e4f0770f39f4708b71f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 18 Jul 2018 15:57:50 +0800 Subject: [PATCH 40/85] KVM: MMU: drop vcpu param in gpte_access It's never used. Drop it. Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6288e9d7068e..74996fc2fc97 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -181,7 +181,7 @@ no_present: * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK * to signify readability since it isn't used in the EPT case */ -static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +static inline unsigned FNAME(gpte_access)(u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT @@ -394,8 +394,8 @@ retry_walk: accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. 
*/ - walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); - walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); + walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); + walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; @@ -508,7 +508,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); @@ -1002,7 +1002,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte); + pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, From 5ce4786f75d16504223c7a65a42b200c2550fa29 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:04 -0700 Subject: [PATCH 41/85] kvm: x86: Make sync_page() flush remote TLBs once only sync_page() calls set_spte() from a loop across a page table. It would work better if set_spte() left the TLB flushing to its callers, so that sync_page() can aggregate into a single call. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 ++++++++++++---- arch/x86/kvm/paging_tmpl.h | 12 ++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d594690d8b95..75bc73e23df0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2724,6 +2724,10 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) return true; } +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) + static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, @@ -2800,7 +2804,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); - ret = 1; + ret |= SET_SPTE_WRITE_PROTECTED_PT; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } @@ -2816,7 +2820,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: if (mmu_spte_update(sptep, spte)) - kvm_flush_remote_tlbs(vcpu->kvm); + ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; done: return ret; } @@ -2827,6 +2831,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, { int was_rmapped = 0; int rmap_count; + int set_spte_ret; int ret = RET_PF_RETRY; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, @@ -2854,12 +2859,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, was_rmapped = 1; } - if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, - true, host_writable)) { + set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, + speculative, true, host_writable); + if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { if (write_fault) ret = RET_PF_EMULATE; 
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) + kvm_flush_remote_tlbs(vcpu->kvm); if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 74996fc2fc97..208f7646ce0d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -968,6 +968,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; + int set_spte_ret = 0; /* direct kvm_mmu_page can not be unsync. */ BUG_ON(sp->role.direct); @@ -1024,12 +1025,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - set_spte(vcpu, &sp->spt[i], pte_access, - PT_PAGE_TABLE_LEVEL, gfn, - spte_to_pfn(sp->spt[i]), true, false, - host_writable); + set_spte_ret |= set_spte(vcpu, &sp->spt[i], + pte_access, PT_PAGE_TABLE_LEVEL, + gfn, spte_to_pfn(sp->spt[i]), + true, false, host_writable); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) + kvm_flush_remote_tlbs(vcpu->kvm); + return nr_present; } From 578e1c4db22135d91bad6c79585c4d19252b8d81 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:05 -0700 Subject: [PATCH 42/85] kvm: x86: Avoid taking MMU lock in kvm_mmu_sync_roots if no sync is needed kvm_mmu_sync_roots() can locklessly check whether a sync is needed and just bail out if it isn't. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 75 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 75bc73e23df0..4b4452d0022e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2702,6 +2702,45 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_unsync_page(vcpu, sp); } + /* + * We need to ensure that the marking of unsync pages is visible + * before the SPTE is updated to allow writes because + * kvm_mmu_sync_roots() checks the unsync flags without holding + * the MMU lock and so can race with this. If the SPTE was updated + * before the page had been marked as unsync-ed, something like the + * following could happen: + * + * CPU 1 CPU 2 + * --------------------------------------------------------------------- + * 1.2 Host updates SPTE + * to be writable + * 2.1 Guest writes a GPTE for GVA X. + * (GPTE being in the guest page table shadowed + * by the SP from CPU 1.) + * This reads SPTE during the page table walk. + * Since SPTE.W is read as 1, there is no + * fault. + * + * 2.2 Guest issues TLB flush. + * That causes a VM Exit. + * + * 2.3 kvm_mmu_sync_pages() reads sp->unsync. + * Since it is false, so it just returns. + * + * 2.4 Guest accesses GVA X. + * Since the mapping in the SP was not updated, + * so the old mapping for GVA X incorrectly + * gets used. + * 1.1 Host marks SP + * as unsync + * (sp->unsync = true) + * + * The write barrier below ensures that 1.1 happens before 1.2 and thus + * the situation in 2.4 does not arise. The implicit barrier in 2.2 + * pairs with this write barrier. 
+ */ + smp_wmb(); + return false; } @@ -3554,7 +3593,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return mmu_alloc_shadow_roots(vcpu); } -static void mmu_sync_roots(struct kvm_vcpu *vcpu) +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; @@ -3566,14 +3605,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + + /* + * Even if another CPU was marking the SP as unsync-ed + * simultaneously, any guest page table changes are not + * guaranteed to be visible anyway until this VCPU issues a TLB + * flush strictly after those changes are made. We only need to + * ensure that the other CPU sets these flags before any actual + * changes to the page tables are made. The comments in + * mmu_need_write_protect() describe what could go wrong if this + * requirement isn't satisfied. + */ + if (!smp_load_acquire(&sp->unsync) && + !smp_load_acquire(&sp->unsync_children)) + return; + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + mmu_sync_children(vcpu, sp); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + spin_unlock(&vcpu->kvm->mmu_lock); return; } + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -3583,13 +3647,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); -} -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); spin_unlock(&vcpu->kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); From 7c390d350f8b677df3236afef4ced80dba6c3201 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:06 -0700 Subject: [PATCH 43/85] kvm: x86: Add fast CR3 switch code path When using shadow paging, a CR3 switch in the guest results in a VM Exit. In the common case, that VM exit doesn't require much processing by KVM. However, it does acquire the MMU lock, which can start showing signs of contention under some workloads even on a 2 VCPU VM when the guest is using KPTI. Therefore, we add a fast path that avoids acquiring the MMU lock in the most common cases e.g. when switching back and forth between the kernel and user mode CR3s used by KPTI with no guest page table changes in between. For now, this fast path is implemented only for 64-bit guests and hosts to avoid the handling of PDPTEs, but it can be extended later to 32-bit guests and/or hosts as well. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 13 +++++-- arch/x86/kvm/mmu.c | 63 ++++++++++++++++++++++++++++++--- arch/x86/kvm/x86.c | 3 +- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bd287b348751..290b7d05790a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -326,6 +326,14 @@ struct rsvd_bits_validate { u64 bad_mt_xwr; }; +struct kvm_mmu_root_info { + gpa_t cr3; + hpa_t hpa; +}; + +#define KVM_MMU_ROOT_INFO_INVALID \ + ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). 
The kvm_mmu structure abstracts the details of the @@ -354,6 +362,7 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; + struct kvm_mmu_root_info prev_root; /* * Bitmap; bit set = permission fault @@ -1288,7 +1297,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, @@ -1307,7 +1316,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3); void kvm_enable_tdp(void); void kvm_disable_tdp(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4b4452d0022e..7f490298c635 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3405,17 +3405,22 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, *root_hpa = INVALID_PAGE; } -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu) +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root) { int i; LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; - if (!VALID_PAGE(mmu->root_hpa)) + if (!VALID_PAGE(mmu->root_hpa) && + (!VALID_PAGE(mmu->prev_root.hpa) || !free_prev_root)) return; spin_lock(&vcpu->kvm->mmu_lock); + if (free_prev_root) + mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa, + &invalid_list); + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); @@ -4015,13 +4020,56 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = true; context->nx = false; } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) { - kvm_mmu_free_roots(vcpu); + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + /* + * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid + * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs + * later if necessary. + */ + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + mmu->root_level >= PT64_ROOT_4LEVEL) { + gpa_t prev_cr3 = mmu->prev_root.cr3; + + if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) + return false; + + swap(mmu->root_hpa, mmu->prev_root.hpa); + mmu->prev_root.cr3 = kvm_read_cr3(vcpu); + + if (new_cr3 == prev_cr3 && VALID_PAGE(mmu->root_hpa)) { + /* + * It is possible that the cached previous root page is + * obsolete because of a change in the MMU + * generation number. However, that is accompanied by + * KVM_REQ_MMU_RELOAD, which will free the root that we + * have set here and allocate a new one. 
+ */ + + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + __clear_sp_write_flooding_count( + page_header(mmu->root_hpa)); + + mmu->set_cr3(vcpu, mmu->root_hpa); + + return true; + } + } + + return false; +} + +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) +{ + if (!fast_cr3_switch(vcpu, new_cr3)) + kvm_mmu_free_roots(vcpu, false); } static unsigned long get_cr3(struct kvm_vcpu *vcpu) @@ -4499,6 +4547,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->update_pte = paging64_update_pte; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; } @@ -4529,6 +4578,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; } @@ -4552,6 +4602,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; @@ -4634,6 +4685,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->update_pte = ept_update_pte; context->root_level = PT64_ROOT_4LEVEL; context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; context->base_role.ad_disabled = !accessed_dirty; context->base_role.guest_mode = 1; @@ -4736,7 +4788,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - kvm_mmu_free_roots(vcpu); + kvm_mmu_free_roots(vcpu, true); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -5116,6 +5168,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) { vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.mmu.translate_gpa = translate_gpa; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b14c4a654c3..5a1e4f79398f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -867,9 +867,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; + kvm_mmu_new_cr3(vcpu, cr3); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - kvm_mmu_new_cr3(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); From 9fa72119b24db78d665a4034b5a0d349b0289b08 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:07 -0700 Subject: [PATCH 44/85] kvm: x86: Introduce kvm_mmu_calc_root_page_role() These functions factor out the base role calculation from the corresponding kvm_init_*_mmu() functions. The new functions return what would be the role assigned to a root page in the current VCPU state. This can be masked with mmu_base_role_mask to derive the base role. 
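
As a rough standalone illustration of the masking step (a simplified sketch, not the kernel code; the union layout and names below are invented for the example), a role computed from the current VCPU state can be reduced to its base role by AND-ing its word with a constant whose tracked fields are all set to 1, which is the same trick the mmu_base_role_mask constant uses:

    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-in for a page-role union: bit-fields overlaid on one word,
     * so a set of fields becomes a mask by setting each field to 1 and
     * reading the word back. (The real kvm_mmu_page_role layout differs.)
     */
    union toy_role {
            uint32_t word;
            struct {
                    uint32_t level:4;
                    uint32_t cr4_pae:1;
                    uint32_t direct:1;
                    uint32_t access:3;
                    uint32_t nxe:1;
                    uint32_t smm:1;
                    uint32_t ad_disabled:1;
            };
    };

    /* Only these fields participate in the "base" role. */
    static const union toy_role base_mask = {
            .cr4_pae = 1, .nxe = 1, .smm = 1, .ad_disabled = 1,
    };

    int main(void)
    {
            union toy_role role = { .level = 4, .cr4_pae = 1, .access = 7, .nxe = 1 };
            uint32_t base = role.word & base_mask.word;

            printf("role word %#x -> base role word %#x\n",
                   (unsigned)role.word, (unsigned)base);
            return 0;
    }

The point of returning the full root role and masking afterwards is that the same calculation can feed both the base-role assignment and comparisons against an existing root page's role.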
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 112 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7f490298c635..afb865d054c2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -178,6 +178,17 @@ struct kvm_shadow_walk_iterator { unsigned index; }; +static const union kvm_mmu_page_role mmu_base_role_mask = { + .cr0_wp = 1, + .cr4_pae = 1, + .nxe = 1, + .smep_andnot_wp = 1, + .smap_andnot_wp = 1, + .smm = 1, + .guest_mode = 1, + .ad_disabled = 1, +}; + #define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ @@ -222,6 +233,8 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; static void mmu_spte_set(u64 *sptep, u64 spte); +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { @@ -4588,14 +4601,27 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } +static union kvm_mmu_page_role +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role = {0}; + + role.guest_mode = is_guest_mode(vcpu); + role.smm = is_smm(vcpu); + role.ad_disabled = (shadow_accessed_mask == 0); + role.level = kvm_x86_ops->get_tdp_level(vcpu); + role.direct = true; + role.access = ACC_ALL; + + return role; +} + static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; - context->base_role.word = 0; - context->base_role.guest_mode = is_guest_mode(vcpu); - context->base_role.smm = is_smm(vcpu); - context->base_role.ad_disabled = (shadow_accessed_mask == 0); + context->base_role.word = mmu_base_role_mask.word & + kvm_calc_tdp_mmu_root_page_role(vcpu).word; context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -4637,10 +4663,35 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) reset_tdp_shadow_zero_bits_mask(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +static union kvm_mmu_page_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu) { + union kvm_mmu_page_role role = {0}; bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); + + role.nxe = is_nx(vcpu); + role.cr4_pae = !!is_pae(vcpu); + role.cr0_wp = is_write_protection(vcpu); + role.smep_andnot_wp = smep && !is_write_protection(vcpu); + role.smap_andnot_wp = smap && !is_write_protection(vcpu); + role.guest_mode = is_guest_mode(vcpu); + role.smm = is_smm(vcpu); + role.direct = !is_paging(vcpu); + role.access = ACC_ALL; + + if (!is_long_mode(vcpu)) + role.level = PT32E_ROOT_LEVEL; + else if (is_la57_mode(vcpu)) + role.level = PT64_ROOT_5LEVEL; + else + role.level = PT64_ROOT_4LEVEL; + + return role; +} + +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +{ struct kvm_mmu *context = &vcpu->arch.mmu; MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -4654,23 +4705,32 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) else paging32_init_context(vcpu, context); - context->base_role.nxe = is_nx(vcpu); - context->base_role.cr4_pae = !!is_pae(vcpu); - context->base_role.cr0_wp = is_write_protection(vcpu); - context->base_role.smep_andnot_wp - = smep && 
!is_write_protection(vcpu); - context->base_role.smap_andnot_wp - = smap && !is_write_protection(vcpu); - context->base_role.guest_mode = is_guest_mode(vcpu); - context->base_role.smm = is_smm(vcpu); + context->base_role.word = mmu_base_role_mask.word & + kvm_calc_shadow_mmu_root_page_role(vcpu).word; reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +static union kvm_mmu_page_role +kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) +{ + union kvm_mmu_page_role role = vcpu->arch.mmu.base_role; + + role.level = PT64_ROOT_4LEVEL; + role.direct = false; + role.ad_disabled = !accessed_dirty; + role.guest_mode = true; + role.access = ACC_ALL; + + return role; +} + void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty) { struct kvm_mmu *context = &vcpu->arch.mmu; + union kvm_mmu_page_role root_page_role = + kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -4687,8 +4747,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->root_hpa = INVALID_PAGE; context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; - context->base_role.ad_disabled = !accessed_dirty; - context->base_role.guest_mode = 1; + context->base_role.word = root_page_role.word & mmu_base_role_mask.word; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); @@ -4761,6 +4820,15 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu) init_kvm_softmmu(vcpu); } +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) +{ + if (tdp_enabled) + return kvm_calc_tdp_mmu_root_page_role(vcpu); + else + return kvm_calc_shadow_mmu_root_page_role(vcpu); +} + void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); @@ -4941,16 +5009,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush; - union kvm_mmu_page_role mask = { }; - - mask.cr0_wp = 1; - mask.cr4_pae = 1; - mask.nxe = 1; - mask.smep_andnot_wp = 1; - mask.smap_andnot_wp = 1; - mask.smm = 1; - mask.guest_mode = 1; - mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4994,7 +5052,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) - & mask.word) && rmap_can_add(vcpu)) + & mmu_base_role_mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (need_remote_flush(entry, *spte)) remote_flush = true; From 6e42782f516f05c8030f63308f2457681b1c9919 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:08 -0700 Subject: [PATCH 45/85] kvm: x86: Introduce KVM_REQ_LOAD_CR3 The KVM_REQ_LOAD_CR3 request loads the hardware CR3 using the current root_hpa. 
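
To show the general shape of such a deferred request (a simplified userspace model only; toy_vcpu, REQ_LOAD_CR3 and check_request are invented names, not KVM's API), the work is recorded as a bit on the vCPU and serviced on the next entry path:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define REQ_LOAD_CR3  (1u << 0)

    struct toy_vcpu {
            uint32_t requests;   /* pending deferred work, one bit per request */
            uint64_t root_hpa;   /* current root of the shadow/TDP page table */
            uint64_t hw_cr3;     /* what "hardware" CR3 currently points at */
    };

    static bool check_request(struct toy_vcpu *v, uint32_t req)
    {
            if (!(v->requests & req))
                    return false;
            v->requests &= ~req;   /* consume the request */
            return true;
    }

    int main(void)
    {
            struct toy_vcpu v = { .root_hpa = 0x1234000 };

            v.requests |= REQ_LOAD_CR3;   /* e.g. raised by a fast CR3 switch */

            /* Entry path: service pending requests before entering the guest. */
            if (check_request(&v, REQ_LOAD_CR3))
                    v.hw_cr3 = v.root_hpa;

            printf("hw_cr3 = %#llx\n", (unsigned long long)v.hw_cr3);
            return 0;
    }

Deferring the load to the entry path means the request can be raised from contexts where the MMU state is still being updated.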
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 3 +-- arch/x86/kvm/mmu.h | 7 +++++++ arch/x86/kvm/x86.c | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 290b7d05790a..c2b4df8a03cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -54,6 +54,7 @@ #define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) #define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) #define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_LOAD_CR3 KVM_ARCH_REQ(5) #define KVM_REQ_EVENT KVM_ARCH_REQ(6) #define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index afb865d054c2..704f7df11f0b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4847,8 +4847,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (r) goto out; - /* set_cr3() should ensure TLB has been flushed */ - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + kvm_mmu_load_cr3(vcpu); out: return r; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b408c0ad612..16b7178853ac 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -85,6 +85,13 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } +static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) +{ + /* set_cr3() should ensure TLB has been flushed */ + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); +} + /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a1e4f79398f..7748037b17fd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7332,6 +7332,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); + if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) + kvm_mmu_load_cr3(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { From 0aab33e4f9459fc80378bc2a089d5784fe8ccd3b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:09 -0700 Subject: [PATCH 46/85] kvm: x86: Add support for fast CR3 switch across different MMU modes This generalizes the lockless CR3 switch path to be able to work across different MMU modes (e.g. nested vs non-nested) by checking that the expected page role of the new root page matches the page role of the previously stored root page in addition to checking that the new CR3 matches the previous CR3. Furthermore, instead of loading the hardware CR3 in fast_cr3_switch(), it is now done in vcpu_enter_guest(), as by that time the MMU context would be up-to-date with the VCPU mode. 
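
The cache-hit test can be pictured with a minimal self-contained sketch (invented struct and field names, not the kernel's): the cached root is only reused when both the CR3/EPTP value and the expected root page role match, since the same CR3 value can correspond to different shadow roots in different MMU modes.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct cached_root {
            uint64_t cr3;        /* guest CR3/EPTP the root was built for */
            uint32_t role_word;  /* page role the root was built with */
            uint64_t hpa;        /* 0 means "no cached root" in this sketch */
    };

    static bool fast_switch_possible(const struct cached_root *prev,
                                     uint64_t new_cr3, uint32_t new_role_word)
    {
            return prev->hpa != 0 &&
                   prev->cr3 == new_cr3 &&
                   prev->role_word == new_role_word;
    }

    int main(void)
    {
            struct cached_root prev = { .cr3 = 0x5000, .role_word = 0x1b, .hpa = 0xabc000 };

            printf("same cr3, same role: %d\n",
                   fast_switch_possible(&prev, 0x5000, 0x1b));
            printf("same cr3, different role (mode change): %d\n",
                   fast_switch_possible(&prev, 0x5000, 0x2c));
            return 0;
    }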
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 704f7df11f0b..536473411930 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4038,7 +4038,8 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } -static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) { struct kvm_mmu *mmu = &vcpu->arch.mmu; @@ -4057,7 +4058,10 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) swap(mmu->root_hpa, mmu->prev_root.hpa); mmu->prev_root.cr3 = kvm_read_cr3(vcpu); - if (new_cr3 == prev_cr3 && VALID_PAGE(mmu->root_hpa)) { + if (new_cr3 == prev_cr3 && + VALID_PAGE(mmu->root_hpa) && + page_header(mmu->root_hpa) != NULL && + new_role.word == page_header(mmu->root_hpa)->role.word) { /* * It is possible that the cached previous root page is * obsolete because of a change in the MMU @@ -4066,12 +4070,11 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) * have set here and allocate a new one. */ + kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); - mmu->set_cr3(vcpu, mmu->root_hpa); - return true; } } @@ -4079,10 +4082,16 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) return false; } +static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) +{ + if (!fast_cr3_switch(vcpu, new_cr3, new_role)) + kvm_mmu_free_roots(vcpu, false); +} + void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) { - if (!fast_cr3_switch(vcpu, new_cr3)) - kvm_mmu_free_roots(vcpu, false); + __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu)); } static unsigned long get_cr3(struct kvm_vcpu *vcpu) From 1c53da3fa3a333eb15ee5a154700e75d135c21c8 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:10 -0700 Subject: [PATCH 47/85] kvm: x86: Support resetting the MMU context without resetting roots This adds support for re-initializing the MMU context in a different mode while preserving the active root_hpa and the prev_root. 
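
A minimal sketch of the idea (standalone model, invented names; not the kernel implementation): the context is re-derived for the new mode, and the roots are only invalidated when the caller asks for it, so a mode switch that keeps using the same roots does not have to rebuild them.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define INVALID_ROOT 0ULL

    struct toy_mmu {
            uint64_t root_hpa;       /* active root */
            uint64_t prev_root_hpa;  /* cached previous root */
            int mode;                /* paging mode this context was built for */
    };

    static void init_mmu(struct toy_mmu *mmu, int new_mode, bool reset_roots)
    {
            if (reset_roots) {
                    mmu->root_hpa = INVALID_ROOT;
                    mmu->prev_root_hpa = INVALID_ROOT;
            }
            mmu->mode = new_mode;   /* re-derive everything mode-dependent */
    }

    int main(void)
    {
            struct toy_mmu mmu = { .root_hpa = 0x7000, .prev_root_hpa = 0x8000 };

            init_mmu(&mmu, 1, false);   /* mode change, roots preserved */
            printf("root=%#llx prev=%#llx mode=%d\n",
                   (unsigned long long)mmu.root_hpa,
                   (unsigned long long)mmu.prev_root_hpa, mmu.mode);
            return 0;
    }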
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 26 +++++++++----------------- arch/x86/kvm/mmu.h | 1 + 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 536473411930..3d822a97dfa1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4032,8 +4032,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = true; context->nx = false; } @@ -4568,8 +4566,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; } @@ -4599,8 +4595,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; } @@ -4636,8 +4630,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; @@ -4703,8 +4695,6 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); - if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) @@ -4741,8 +4731,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, union kvm_mmu_page_role root_page_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); - context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -4753,8 +4741,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; context->root_level = PT64_ROOT_4LEVEL; - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; context->base_role.word = root_page_role.word & mmu_base_role_mask.word; update_permission_bitmask(vcpu, context, true); @@ -4819,8 +4805,13 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_last_nonleaf_level(vcpu, g_context); } -static void init_kvm_mmu(struct kvm_vcpu *vcpu) +void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) { + if (reset_roots) { + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; + } + if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -4828,6 +4819,7 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu) else init_kvm_softmmu(vcpu); } +EXPORT_SYMBOL_GPL(kvm_init_mmu); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) @@ -4841,7 +4833,7 @@ kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - init_kvm_mmu(vcpu); + kvm_init_mmu(vcpu, true); } 
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -5245,7 +5237,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) { MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - init_kvm_mmu(vcpu); + kvm_init_mmu(vcpu, true); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 16b7178853ac..11ab3d62ad65 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -61,6 +61,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); From 50c28f21d045dde8c52548f8482d456b3f0956f5 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:11 -0700 Subject: [PATCH 48/85] kvm: x86: Use fast CR3 switch for nested VMX Use the fast CR3 switch mechanism to locklessly change the MMU root page when switching between L1 and L2. The switch from L2 to L1 should always go through the fast path, while the switch from L1 to L2 should go through the fast path if L1's CR3/EPTP for L2 hasn't changed since the last time. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 ++++-- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/vmx.c | 16 ++++++++++------ virt/kvm/kvm_main.c | 14 ++++++++++---- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3d822a97dfa1..9b73cfcef917 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4054,7 +4054,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; swap(mmu->root_hpa, mmu->prev_root.hpa); - mmu->prev_root.cr3 = kvm_read_cr3(vcpu); + mmu->prev_root.cr3 = mmu->get_cr3(vcpu); if (new_cr3 == prev_cr3 && VALID_PAGE(mmu->root_hpa) && @@ -4091,6 +4091,7 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) { __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu)); } +EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); static unsigned long get_cr3(struct kvm_vcpu *vcpu) { @@ -4725,12 +4726,13 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty) + bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.mmu; union kvm_mmu_page_role root_page_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 11ab3d62ad65..3177127cee96 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -64,7 +64,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty); + bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cbe31b9d2b81..c6ea892a610e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10741,11 +10741,11 @@ static int nested_ept_init_mmu_context(struct 
kvm_vcpu *vcpu) if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) return 1; - kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu)); + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -11342,12 +11342,16 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 1; } } - - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } - kvm_mmu_reset_context(vcpu); + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f519eb8d06b1..8f461e0ed382 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2127,16 +2127,22 @@ static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) { + int ret = -EINTR; + int idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvm_arch_vcpu_runnable(vcpu)) { kvm_make_request(KVM_REQ_UNHALT, vcpu); - return -EINTR; + goto out; } if (kvm_cpu_has_pending_timer(vcpu)) - return -EINTR; + goto out; if (signal_pending(current)) - return -EINTR; + goto out; - return 0; + ret = 0; +out: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; } /* From afe828d1de4047d26eb0cd0c0154f5ac3722bf63 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:12 -0700 Subject: [PATCH 49/85] kvm: x86: Add ability to skip TLB flush when switching CR3 Remove the implicit flush from the set_cr3 handlers, so that the callers are able to decide whether to flush the TLB or not. 
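
The split of responsibilities can be pictured with a tiny standalone sketch (invented names, not the kernel interfaces): the CR3-setting hook only loads the root, and the caller chooses whether a TLB flush accompanies the switch.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t hw_cr3;
    static unsigned long tlb_flushes;

    /* The "set_cr3" hook only loads the root; it no longer flushes. */
    static void set_root(uint64_t root)
    {
            hw_cr3 = root;
    }

    static void flush_tlb(void)
    {
            tlb_flushes++;
    }

    /* Caller decides: a plain CR3 load flushes, a no-flush load does not. */
    static void switch_root(uint64_t root, bool need_flush)
    {
            set_root(root);
            if (need_flush)
                    flush_tlb();
    }

    int main(void)
    {
            switch_root(0x1000, true);   /* ordinary MOV to CR3 */
            switch_root(0x2000, false);  /* guest asked to keep TLB entries */
            printf("cr3=%#llx flushes=%lu\n",
                   (unsigned long long)hw_cr3, tlb_flushes);
            return 0;
    }

Moving the flush out of the hooks is what makes the later "skip TLB flush" optimizations possible, since only the caller knows whether the guest requested one.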
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 ++ arch/x86/kvm/mmu.h | 1 - arch/x86/kvm/svm.c | 4 ---- arch/x86/kvm/vmx.c | 1 - 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b73cfcef917..d3a04cf6514b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4070,6 +4070,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -4851,6 +4852,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_cr3(vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); out: return r; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3177127cee96..6a2a97d8015b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -88,7 +88,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) { - /* set_cr3() should ensure TLB has been flushed */ if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f059a73f0fd0..be9931cad409 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2884,7 +2884,6 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu, true); } static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -5766,7 +5765,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu, true); } static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -5779,8 +5777,6 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) /* Also sync guest cr3 here in case we live migrate */ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); mark_dirty(svm->vmcb, VMCB_CR); - - svm_flush_tlb(vcpu, true); } static int is_disabled(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6ea892a610e..903c130bb811 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5029,7 +5029,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } From c9470a2e28479e97eb44d926ea7bbb5709ad9d6b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:13 -0700 Subject: [PATCH 50/85] kvm: x86: Propagate guest PCIDs to host PCIDs When using shadow paging mode, propagate the guest's PCID value to the shadow CR3 in the host instead of always using PCID 0. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 6a2a97d8015b..1fab69c0b2f3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -86,10 +86,25 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } +static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) +{ + BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); + + return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) + ? 
cr3 & X86_CR3_PCID_MASK + : 0; +} + +static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) +{ + return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); +} + static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) { if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa | + kvm_get_active_pcid(vcpu)); } /* From eb4b248e152d3ecf189b9d32c04961360dbd938a Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:14 -0700 Subject: [PATCH 51/85] kvm: vmx: Support INVPCID in shadow paging mode Implement support for INVPCID in shadow paging mode as well. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 18 +++++++ arch/x86/kvm/vmx.c | 96 +++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2b4df8a03cd..1c253f15f9cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1317,6 +1317,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3); void kvm_enable_tdp(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3a04cf6514b..2af15db12ccd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5187,6 +5187,24 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) +{ + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + if (pcid == kvm_get_active_pcid(vcpu)) { + mmu->invlpg(vcpu, gva); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + ++vcpu->stat.invlpg; + + /* + * Mappings not reachable via the current cr3 will be synced when + * switching to that cr3, so nothing needs to be done here for them. + */ +} +EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); + void kvm_enable_tdp(void) { tdp_enabled = true; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 903c130bb811..80be024cb979 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3065,7 +3065,7 @@ static bool vmx_rdtscp_supported(void) static bool vmx_invpcid_supported(void) { - return cpu_has_vmx_invpcid() && enable_ept; + return cpu_has_vmx_invpcid(); } /* @@ -6101,8 +6101,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; - /* Enable INVPCID for non-ept guests may cause performance regression. 
*/ - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -8756,6 +8754,97 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } +static int handle_invpcid(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + bool pcid_enabled; + gva_t gva; + struct x86_exception e; + struct { + u64 pcid; + u64 gla; + } operand; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + /* + * If the current cr3 does not use the given PCID, then nothing + * needs to be done here because a resync will happen anyway + * before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + /* fall-through */ + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} + static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -8959,6 +9048,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; From ade61e2824443a208bb3aaafd8b345ce878298cd Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:15 -0700 Subject: [PATCH 52/85] kvm: x86: Skip TLB flush on fast CR3 switch when indicated by guest When PCIDs are enabled, the MSb of the source operand for a MOV-to-CR3 instruction indicates that the TLB doesn't need to be flushed. This change enables this optimization for MOV-to-CR3s in the guest that have been intercepted by KVM for shadow paging and are handled within the fast CR3 switch path. 
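
A small self-contained sketch of the operand decoding (the constants below are named here for illustration and mirror the architectural meaning of CR3 bit 63 and the 12-bit PCID field; they are not KVM identifiers):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CR3_NOFLUSH_BIT  (1ULL << 63)  /* "don't flush TLB" hint when PCIDs are enabled */
    #define CR3_PCID_MASK    0xFFFULL      /* PCID lives in the low 12 bits */

    int main(void)
    {
            uint64_t operand = 0x8000000012345001ULL;  /* example MOV-to-CR3 source */

            bool skip_flush = (operand & CR3_NOFLUSH_BIT) != 0;
            uint64_t pcid   = operand & CR3_PCID_MASK;
            uint64_t cr3    = operand & ~CR3_NOFLUSH_BIT;  /* value actually loaded */

            printf("cr3=%#llx pcid=%#llx skip_flush=%d\n",
                   (unsigned long long)cr3, (unsigned long long)pcid, skip_flush);
            return 0;
    }

The hint only affects entries tagged with the target PCID, which is why the fast switch path can honor it without giving up correctness for other address spaces.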
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 28 +++++++++++++++++++--------- arch/x86/kvm/vmx.c | 12 ++++++++---- arch/x86/kvm/x86.c | 11 ++++++++--- 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1c253f15f9cd..8b4aa5e7ff92 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1318,7 +1318,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); void kvm_enable_tdp(void); void kvm_disable_tdp(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2af15db12ccd..2b7cfc8e41bb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4037,7 +4037,8 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, } static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, - union kvm_mmu_page_role new_role) + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) { struct kvm_mmu *mmu = &vcpu->arch.mmu; @@ -4070,7 +4071,9 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_x86_ops->tlb_flush(vcpu, true); + if (!skip_tlb_flush) + kvm_x86_ops->tlb_flush(vcpu, true); + __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -4082,15 +4085,17 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, } static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, - union kvm_mmu_page_role new_role) + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) { - if (!fast_cr3_switch(vcpu, new_cr3, new_role)) + if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) kvm_mmu_free_roots(vcpu, false); } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) { - __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu)); + __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), + skip_tlb_flush); } EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); @@ -4733,7 +4738,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, union kvm_mmu_page_role root_page_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); - __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role); + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -5196,11 +5201,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (VALID_PAGE(mmu->prev_root.hpa) && + pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + ++vcpu->stat.invlpg; /* - * Mappings not reachable via the current cr3 will be synced when - * switching to that cr3, so nothing needs to be done here for them. + * Mappings not reachable via the current cr3 or the prev_root.cr3 will + * be synced when switching to that cr3, so nothing needs to be done + * here for them. 
*/ } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 80be024cb979..6151418cec32 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8819,10 +8819,14 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) + == operand.pcid) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + /* - * If the current cr3 does not use the given PCID, then nothing - * needs to be done here because a resync will happen anyway - * before switching to any other CR3. + * If neither the current cr3 nor the prev_root.cr3 use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. */ return kvm_skip_emulated_instruction(vcpu); @@ -11434,7 +11438,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne } if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3); + kvm_mmu_new_cr3(vcpu, cr3, false); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7748037b17fd..493afbf12e78 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -847,16 +847,21 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + bool skip_tlb_flush = false; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - if (pcid_enabled) + if (pcid_enabled) { + skip_tlb_flush = cr3 & CR3_PCID_INVD; cr3 &= ~CR3_PCID_INVD; + } #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + if (!skip_tlb_flush) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -867,7 +872,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3); + kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); From 7eb77e9f5fcf652a21b2d12bff1cd509b6b14f21 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:16 -0700 Subject: [PATCH 53/85] kvm: x86: Add a root_hpa parameter to kvm_mmu->invlpg() This allows invlpg() to be called using either the active root_hpa or the prev_root_hpa. 
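
A rough sketch of the refactoring (standalone model with invented names, not the kernel code): once the helper takes the root explicitly, the same code path can be pointed at the active root or at a cached previous root.

    #include <stdint.h>
    #include <stdio.h>

    #define INVALID_ROOT 0ULL

    /* Sync one GVA in the shadow tree rooted at 'root' (stub for the sketch). */
    static void invlpg_one(uint64_t root, uint64_t gva)
    {
            printf("sync gva %#llx under root %#llx\n",
                   (unsigned long long)gva, (unsigned long long)root);
    }

    int main(void)
    {
            uint64_t active_root = 0x1000, prev_root = 0x2000;
            uint64_t gva = 0x7f0000001000ULL;

            /* Passing the root explicitly lets one helper serve both the
             * active root and any cached previous root.
             */
            invlpg_one(active_root, gva);
            if (prev_root != INVALID_ROOT)
                    invlpg_one(prev_root, gva);
            return 0;
    }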
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++++++------- arch/x86/kvm/paging_tmpl.h | 6 +++--- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8b4aa5e7ff92..0b77c233e441 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -354,7 +354,7 @@ struct kvm_mmu { struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2b7cfc8e41bb..6eeca915511e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -189,7 +189,13 @@ static const union kvm_mmu_page_role mmu_base_role_mask = { .ad_disabled = 1, }; -#define for_each_shadow_entry(_vcpu, _addr, _walker) \ +#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ + for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ + (_root), (_addr)); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) @@ -1999,7 +2005,7 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) { } @@ -2405,11 +2411,12 @@ out: return sp; } -static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, - struct kvm_vcpu *vcpu, u64 addr) +static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, hpa_t root, + u64 addr) { iterator->addr = addr; - iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu.shadow_root_level; if (iterator->level == PT64_ROOT_4LEVEL && @@ -2418,6 +2425,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, --iterator->level; if (iterator->level == PT32E_ROOT_LEVEL) { + /* + * prev_root is currently only used for 64-bit hosts. So only + * the active root_hpa is valid here. 
+ */ + BUG_ON(root != vcpu->arch.mmu.root_hpa); + iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; @@ -2427,6 +2440,13 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, } } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, + addr); +} + static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) @@ -5186,7 +5206,9 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - vcpu->arch.mmu.invlpg(vcpu, gva); + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + mmu->invlpg(vcpu, gva, mmu->root_hpa); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } @@ -5197,7 +5219,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) struct kvm_mmu *mmu = &vcpu->arch.mmu; if (pcid == kvm_get_active_pcid(vcpu)) { - mmu->invlpg(vcpu, gva); + mmu->invlpg(vcpu, gva, mmu->root_hpa); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 208f7646ce0d..14ffd973df54 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -856,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -871,13 +871,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) */ mmu_topup_memory_caches(vcpu); - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (!VALID_PAGE(root_hpa)) { WARN_ON(1); return; } spin_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry(vcpu, gva, iterator) { + for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { level = iterator.level; sptep = iterator.sptep; From 08fb59d8a47d5e1f9de08659603a47f117fe60d5 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:17 -0700 Subject: [PATCH 54/85] kvm: x86: Support selectively freeing either current or previous MMU root kvm_mmu_free_roots() now takes a mask specifying which roots to free, so that either one of the roots (active/previous) can be individually freed when needed. 
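
The mask-based interface can be illustrated with a short standalone sketch (flag names and globals invented for the example; not the kernel code): each bit selects one root, and an all-ones mask frees everything.

    #include <stdint.h>
    #include <stdio.h>

    #define ROOT_CURRENT  (1UL << 0)
    #define ROOT_PREVIOUS (1UL << 1)
    #define ROOTS_ALL     (~0UL)

    static uint64_t cur_root = 0x1000, prev_root = 0x2000;

    /* Free only the roots named in the mask; 0 stands for "already freed". */
    static void free_roots(unsigned long roots_to_free)
    {
            if (roots_to_free & ROOT_CURRENT)
                    cur_root = 0;
            if (roots_to_free & ROOT_PREVIOUS)
                    prev_root = 0;
    }

    int main(void)
    {
            free_roots(ROOT_PREVIOUS);   /* drop only the cached previous root */
            printf("cur=%#llx prev=%#llx\n",
                   (unsigned long long)cur_root, (unsigned long long)prev_root);

            free_roots(ROOTS_ALL);       /* e.g. on a full MMU unload */
            printf("cur=%#llx prev=%#llx\n",
                   (unsigned long long)cur_root, (unsigned long long)prev_root);
            return 0;
    }

Encoding the selection as a bitmask also scales naturally once more than one previous root is cached, since each cached entry can get its own bit.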
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 +++++- arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b77c233e441..262b0bc64dfc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1287,6 +1287,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS BIT(1) +#define KVM_MMU_ROOTS_ALL (~0UL) + int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); @@ -1298,7 +1302,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6eeca915511e..0f6965ce016a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3438,14 +3438,18 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, *root_hpa = INVALID_PAGE; } -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root) +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) { int i; LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; + bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS; - if (!VALID_PAGE(mmu->root_hpa) && - (!VALID_PAGE(mmu->prev_root.hpa) || !free_prev_root)) + /* Before acquiring the MMU lock, see if we need to do any real work. 
*/ + if (!(free_active_root && VALID_PAGE(mmu->root_hpa)) && + !(free_prev_root && VALID_PAGE(mmu->prev_root.hpa))) return; spin_lock(&vcpu->kvm->mmu_lock); @@ -3454,15 +3458,19 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root) mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa, &invalid_list); - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); - } else { - for (i = 0; i < 4; ++i) - if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i], - &invalid_list); - mmu->root_hpa = INVALID_PAGE; + if (free_active_root) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { + mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, + &invalid_list); + } else { + for (i = 0; i < 4; ++i) + if (mmu->pae_root[i] != 0) + mmu_free_root_page(vcpu->kvm, + &mmu->pae_root[i], + &invalid_list); + mmu->root_hpa = INVALID_PAGE; + } } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -4109,7 +4117,7 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) { if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) - kvm_mmu_free_roots(vcpu, false); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT); } void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) @@ -4885,7 +4893,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - kvm_mmu_free_roots(vcpu, true); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); From 956bf3531fba53c0501eda4fbc67950b0f7b913f Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:18 -0700 Subject: [PATCH 55/85] kvm: x86: Skip shadow page resync on CR3 switch when indicated by guest When the guest indicates that the TLB doesn't need to be flushed in a CR3 switch, we can also skip resyncing the shadow page tables since an out-of-sync shadow page table is equivalent to an out-of-sync TLB. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++--- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0f6965ce016a..9446a36a4ab7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4098,9 +4098,19 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, */ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - if (!skip_tlb_flush) + if (!skip_tlb_flush) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_x86_ops->tlb_flush(vcpu, true); + } + + /* + * The last MMIO access's GVA and GPA are cached in the + * VCPU. When switching to a new CR3, that GVA->GPA + * mapping may no longer be valid. So clear any cached + * MMIO info even when we don't need to sync the shadow + * page tables. + */ + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -5217,6 +5227,21 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) struct kvm_mmu *mmu = &vcpu->arch.mmu; mmu->invlpg(vcpu, gva, mmu->root_hpa); + + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. 
Since it would take us roughly similar amount + * of work to determine whether the prev_root mapping of the VA is + * marked global, or to just sync it blindly, so we might as well just + * always sync it. + * + * Mappings not reachable via the current cr3 or the prev_root.cr3 will + * be synced when switching to that cr3, so nothing needs to be done + * here for them. + */ + if (VALID_PAGE(mmu->prev_root.hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } @@ -5232,8 +5257,10 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (VALID_PAGE(mmu->prev_root.hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) + pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { + mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } ++vcpu->stat.invlpg; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6151418cec32..b81210051133 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8821,7 +8821,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) == operand.pcid) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_PREVIOUS); /* * If neither the current cr3 nor the prev_root.cr3 use the diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 493afbf12e78..aa5d96b4b386 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -858,10 +858,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { - kvm_mmu_sync_roots(vcpu); - - if (!skip_tlb_flush) + if (!skip_tlb_flush) { + kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } return 0; } From faff87588d8bfd9e56e9203412f0bb80455da7b9 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Fri, 29 Jun 2018 13:10:05 -0700 Subject: [PATCH 56/85] kvm: x86: Flush only affected TLB entries in kvm_mmu_invlpg* This needs a minor bug fix. The updated patch is as follows. Thanks, Junaid ------------------------------------------------------------------------------ kvm_mmu_invlpg() and kvm_mmu_invpcid_gva() only need to flush the TLB entries for the specific guest virtual address, instead of flushing all TLB entries associated with the VM. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 ++++++++ arch/x86/kvm/mmu.c | 14 +++++++++++--- arch/x86/kvm/svm.c | 8 ++++++++ arch/x86/kvm/vmx.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 262b0bc64dfc..dcdc9150bb76 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -985,6 +985,14 @@ struct kvm_x86_ops { void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + /* + * Flush any TLB entries associated with the given GVA. + * Does not need to flush GPA->HPA mappings. + * Can potentially get non-canonical addresses through INVLPGs, which + * the implementation may choose to ignore if appropriate. 
+ */ + void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); + void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9446a36a4ab7..f84c194e6db1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5226,6 +5226,10 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ + if (is_noncanonical_address(gva, vcpu)) + return; + mmu->invlpg(vcpu, gva, mmu->root_hpa); /* @@ -5242,7 +5246,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) if (VALID_PAGE(mmu->prev_root.hpa)) mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_x86_ops->tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5250,18 +5254,22 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool tlb_flush = false; if (pcid == kvm_get_active_pcid(vcpu)) { mmu->invlpg(vcpu, gva, mmu->root_hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + tlb_flush = true; } if (VALID_PAGE(mmu->prev_root.hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + tlb_flush = true; } + if (tlb_flush) + kvm_x86_ops->tlb_flush_gva(vcpu, gva); + ++vcpu->stat.invlpg; /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index be9931cad409..73e27a98456f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5434,6 +5434,13 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) svm->asid_generation--; } +static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + invlpga(gva, svm->vmcb->control.asid); +} + static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { } @@ -7086,6 +7093,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_rflags = svm_set_rflags, .tlb_flush = svm_flush_tlb, + .tlb_flush_gva = svm_flush_tlb_gva, .run = svm_vcpu_run, .handle_exit = handle_exit, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b81210051133..5aea5af02386 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1992,6 +1992,19 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return true; + + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + return true; + } + + return false; +} + static inline void vpid_sync_vcpu_single(int vpid) { if (vpid == 0) @@ -4833,6 +4846,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + int vpid = to_vmx(vcpu)->vpid; + + if (!vpid_sync_vcpu_addr(vpid, addr)) + vpid_sync_context(vpid); + + /* + * If VPIDs are not supported or enabled, then the above is a no-op. + * But we don't really need a TLB flush in that case anyway, because + * each VM entry/exit includes an implicit flush when VPID is 0. 
+ */ +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -13603,6 +13630,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, From b94742c958f0b97d304d4aecb4603a20ee9a2df3 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:20 -0700 Subject: [PATCH 57/85] kvm: x86: Add multi-entry LRU cache for previous CR3s Adds support for storing multiple previous CR3/root_hpa pairs maintained as an LRU cache, so that the lockless CR3 switch path can be used when switching back to any of them. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 10 +-- arch/x86/kvm/mmu.c | 111 ++++++++++++++++++++++---------- arch/x86/kvm/vmx.c | 12 ++-- 3 files changed, 92 insertions(+), 41 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dcdc9150bb76..4f1983640bda 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -335,6 +335,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_ROOT_INFO_INVALID \ ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) +#define KVM_MMU_NUM_PREV_ROOTS 3 + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -363,7 +365,7 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; - struct kvm_mmu_root_info prev_root; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; /* * Bitmap; bit set = permission fault @@ -1295,9 +1297,9 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } -#define KVM_MMU_ROOT_CURRENT BIT(0) -#define KVM_MMU_ROOT_PREVIOUS BIT(1) -#define KVM_MMU_ROOTS_ALL (~0UL) +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (~0UL) int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f84c194e6db1..687bfb03b9ca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3445,18 +3445,26 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; - bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS; + + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. 
*/ - if (!(free_active_root && VALID_PAGE(mmu->root_hpa)) && - !(free_prev_root && VALID_PAGE(mmu->prev_root.hpa))) - return; + if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && + VALID_PAGE(mmu->prev_roots[i].hpa)) + break; + + if (i == KVM_MMU_NUM_PREV_ROOTS) + return; + } spin_lock(&vcpu->kvm->mmu_lock); - if (free_prev_root) - mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa, - &invalid_list); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) + mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + &invalid_list); if (free_active_root) { if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && @@ -4064,6 +4072,38 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } +/* + * Find out if a previously cached root matching the new CR3/role is available. + * The current root is also inserted into the cache. + * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is + * returned. + * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and + * false is returned. This root should now be freed by the caller. + */ +static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) +{ + uint i; + struct kvm_mmu_root_info root; + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + root.cr3 = mmu->get_cr3(vcpu); + root.hpa = mmu->root_hpa; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + swap(root, mmu->prev_roots[i]); + + if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) && + page_header(root.hpa) != NULL && + new_role.word == page_header(root.hpa)->role.word) + break; + } + + mmu->root_hpa = root.hpa; + + return i < KVM_MMU_NUM_PREV_ROOTS; +} + static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, union kvm_mmu_page_role new_role, bool skip_tlb_flush) @@ -4077,18 +4117,10 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) { - gpa_t prev_cr3 = mmu->prev_root.cr3; - if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) return false; - swap(mmu->root_hpa, mmu->prev_root.hpa); - mmu->prev_root.cr3 = mmu->get_cr3(vcpu); - - if (new_cr3 == prev_cr3 && - VALID_PAGE(mmu->root_hpa) && - page_header(mmu->root_hpa) != NULL && - new_role.word == page_header(mmu->root_hpa)->role.word) { + if (cached_root_available(vcpu, new_cr3, new_role)) { /* * It is possible that the cached previous root page is * obsolete because of a change in the MMU @@ -4854,8 +4886,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) { if (reset_roots) { + uint i; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; } if (mmu_is_nested(vcpu)) @@ -5225,6 +5261,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + int i; /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ if (is_noncanonical_address(gva, vcpu)) @@ -5235,16 +5272,17 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) /* * INVLPG is required to invalidate any global mappings for the VA, * irrespective of PCID. 
Since it would take us roughly similar amount - * of work to determine whether the prev_root mapping of the VA is - * marked global, or to just sync it blindly, so we might as well just - * always sync it. + * of work to determine whether any of the prev_root mappings of the VA + * is marked global, or to just sync it blindly, so we might as well + * just always sync it. * - * Mappings not reachable via the current cr3 or the prev_root.cr3 will - * be synced when switching to that cr3, so nothing needs to be done - * here for them. + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. */ - if (VALID_PAGE(mmu->prev_root.hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (VALID_PAGE(mmu->prev_roots[i].hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); kvm_x86_ops->tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; @@ -5255,16 +5293,19 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = &vcpu->arch.mmu; bool tlb_flush = false; + uint i; if (pcid == kvm_get_active_pcid(vcpu)) { mmu->invlpg(vcpu, gva, mmu->root_hpa); tlb_flush = true; } - if (VALID_PAGE(mmu->prev_root.hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { - mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - tlb_flush = true; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (VALID_PAGE(mmu->prev_roots[i].hpa) && + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + tlb_flush = true; + } } if (tlb_flush) @@ -5273,9 +5314,9 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) ++vcpu->stat.invlpg; /* - * Mappings not reachable via the current cr3 or the prev_root.cr3 will - * be synced when switching to that cr3, so nothing needs to be done - * here for them. + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. 
*/ } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); @@ -5321,12 +5362,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { + uint i; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.mmu.translate_gpa = translate_gpa; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + return alloc_mmu_pages(vcpu); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5aea5af02386..97e4d71431a1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8788,6 +8788,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) bool pcid_enabled; gva_t gva; struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; @@ -8846,12 +8848,14 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } - if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) - == operand.pcid) - kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_PREVIOUS); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + kvm_mmu_free_roots(vcpu, roots_to_free); /* - * If neither the current cr3 nor the prev_root.cr3 use the + * If neither the current cr3 nor any of the prev_roots use the * given PCID, then nothing needs to be done here because a * resync will happen anyway before switching to any other CR3. */ From 208320ba103e01fd2f3a7b81e97c9c5bc85f0612 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:21 -0700 Subject: [PATCH 58/85] kvm: x86: Remove CR3_PCID_INVD flag It is a duplicate of X86_CR3_PCID_NOFLUSH. So just use that instead. 
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f1983640bda..75ab578c3f6a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -83,7 +83,6 @@ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4c4f4263420c..106482da6388 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4191,7 +4191,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) maxphyaddr = 36; rsvd = rsvd_bits(maxphyaddr, 63); if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) - rsvd &= ~CR3_PCID_INVD; + rsvd &= ~X86_CR3_PCID_NOFLUSH; } if (new_val & rsvd) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aa5d96b4b386..6cc29dd21519 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -852,8 +852,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); if (pcid_enabled) { - skip_tlb_flush = cr3 & CR3_PCID_INVD; - cr3 &= ~CR3_PCID_INVD; + skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; + cr3 &= ~X86_CR3_PCID_NOFLUSH; } #endif From 450917b654c12075bd2b07d1e375dfed1bc9ca1e Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Wed, 18 Jul 2018 06:12:04 +0000 Subject: [PATCH 59/85] KVM/MMU: Simplify __kvm_sync_page() function Merge check of "sp->role.cr4_pae != !!is_pae(vcpu))" and "vcpu-> arch.mmu.sync_page(vcpu, sp) == 0". kvm_mmu_prepare_zap_page() is called under both these conditions. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 687bfb03b9ca..22a7984c68a5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2136,12 +2136,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return false; - } - - if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { + if (sp->role.cr4_pae != !!is_pae(vcpu) + || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } From 3553ae5690a84a5baae5baa329467b3df2d99f72 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 Jul 2018 17:59:27 -0400 Subject: [PATCH 60/85] x86/kvm: Don't use pvqspinlock code if only 1 vCPU On a VM with only 1 vCPU, the locking fast path will always be successful. In this case, there is no need to use the the PV qspinlock code which has higher overhead on the unlock side than the native qspinlock code. 
Signed-off-by: Waiman Long Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..575c9a51e6ba 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -748,6 +748,10 @@ void __init kvm_spinlock_init(void) if (kvm_para_has_hint(KVM_HINTS_REALTIME)) return; + /* Don't use the pvqspinlock code if there is only 1 vCPU. */ + if (num_possible_cpus() == 1) + return; + __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); From eb914cfe72f4c948b2318b1381f6d2e08d43b63c Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:06 +0000 Subject: [PATCH 61/85] X86/Hyper-V: Add flush HvFlushGuestPhysicalAddressSpace hypercall support Hyper-V supports a pv hypercall HvFlushGuestPhysicalAddressSpace to flush nested VM address space mapping in l1 hypervisor and it's to reduce overhead of flushing ept tlb among vcpus. This patch is to implement it. Signed-off-by: Lan Tianyu Acked-by: K. Y. Srinivasan Signed-off-by: Paolo Bonzini --- arch/x86/hyperv/Makefile | 2 +- arch/x86/hyperv/nested.c | 53 ++++++++++++++++++++++++++++++ arch/x86/include/asm/hyperv-tlfs.h | 8 +++++ arch/x86/include/asm/mshyperv.h | 2 ++ 4 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 arch/x86/hyperv/nested.c diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index b173d404e3df..b21ee65c4101 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,2 +1,2 @@ -obj-y := hv_init.o mmu.o +obj-y := hv_init.o mmu.o nested.o obj-$(CONFIG_X86_64) += hv_apic.o diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c new file mode 100644 index 000000000000..08f914217518 --- /dev/null +++ b/arch/x86/hyperv/nested.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V nested virtualization code. + * + * Copyright (C) 2018, Microsoft, Inc. 
+ * + * Author : Lan Tianyu + */ + + +#include +#include +#include +#include + +int hyperv_flush_guest_mapping(u64 as) +{ + struct hv_guest_mapping_flush **flush_pcpu; + struct hv_guest_mapping_flush *flush; + u64 status; + unsigned long flags; + int ret = -ENOTSUPP; + + if (!hv_hypercall_pg) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE, + flush, NULL); + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + +fault: + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index b8c89265baf0..08e24f552030 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -309,6 +309,7 @@ struct ms_hyperv_tsc_page { #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 /* Nested features (CPUID 0x4000000A) EAX */ +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) struct hv_reenlightenment_control { @@ -350,6 +351,7 @@ struct hv_tsc_emulation_status { #define HVCALL_SEND_IPI_EX 0x0015 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 @@ -741,6 +743,12 @@ struct ipi_arg_ex { struct hv_vpset vp_set; }; +/* HvFlushGuestPhysicalAddressSpace hypercalls */ +struct hv_guest_mapping_flush { + u64 address_space; + u64 flags; +}; + /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ struct hv_tlb_flush { u64 address_space; diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 5a7375ed5f7c..2519f5de1e57 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -305,6 +305,7 @@ void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); +int hyperv_flush_guest_mapping(u64 as); #ifdef CONFIG_X86_64 void hv_apic_init(void); @@ -324,6 +325,7 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) { return NULL; } +static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE From 60cfce4c4f6f6741882032ee6f895e835533a16b Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:12 +0000 Subject: [PATCH 62/85] X86/Hyper-V: Add hyperv_nested_flush_guest_mapping ftrace support This patch is to add hyperv_nested_flush_guest_mapping support to trace hvFlushGuestPhysicalAddressSpace hypercall. Signed-off-by: Lan Tianyu Acked-by: K. Y. 
Srinivasan Signed-off-by: Paolo Bonzini --- arch/x86/hyperv/nested.c | 3 +++ arch/x86/include/asm/trace/hyperv.h | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c index 08f914217518..b8e60cc50461 100644 --- a/arch/x86/hyperv/nested.c +++ b/arch/x86/hyperv/nested.c @@ -14,6 +14,8 @@ #include #include +#include + int hyperv_flush_guest_mapping(u64 as) { struct hv_guest_mapping_flush **flush_pcpu; @@ -48,6 +50,7 @@ int hyperv_flush_guest_mapping(u64 as) ret = 0; fault: + trace_hyperv_nested_flush_guest_mapping(as, ret); return ret; } EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h index 4253bca99989..e1ffe61de8d6 100644 --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h @@ -28,6 +28,20 @@ TRACE_EVENT(hyperv_mmu_flush_tlb_others, __entry->addr, __entry->end) ); +TRACE_EVENT(hyperv_nested_flush_guest_mapping, + TP_PROTO(u64 as, int ret), + TP_ARGS(as, ret), + + TP_STRUCT__entry( + __field(u64, as) + __field(int, ret) + ), + TP_fast_assign(__entry->as = as; + __entry->ret = ret; + ), + TP_printk("address space %llx ret %d", __entry->as, __entry->ret) + ); + #endif /* CONFIG_HYPERV */ #undef TRACE_INCLUDE_PATH From b08660e59dbdb600c55953787ed2265a0b510f77 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:17 +0000 Subject: [PATCH 63/85] KVM: x86: Add tlb remote flush callback in kvm_x86_ops. This patch is to provide a way for platforms to register hv tlb remote flush callback and this helps to optimize operation of tlb flush among vcpus for nested virtualization case. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 11 +++++++++++ include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 3 ++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 75ab578c3f6a..150937e64f63 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -985,6 +985,7 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + int (*tlb_remote_flush)(struct kvm *kvm); /* * Flush any TLB entries associated with the given GVA. 
@@ -1145,6 +1146,16 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) return kvm_x86_ops->vm_free(kvm); } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + if (kvm_x86_ops->tlb_remote_flush && + !kvm_x86_ops->tlb_remote_flush(kvm)) + return 0; + else + return -ENOTSUPP; +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2840ac74a3d7..7c7362dd2faa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -834,6 +834,13 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif +#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + return -ENOTSUPP; +} +#endif + #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA void kvm_arch_register_noncoherent_dma(struct kvm *kvm); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8f461e0ed382..74544d20bbf8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -273,7 +273,8 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that * barrier here. */ - if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + if (!kvm_arch_flush_remote_tlb(kvm) + || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } From 877ad952be3d51445f6a74dd63708a9327c8f19d Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:23 +0000 Subject: [PATCH 64/85] KVM: vmx: Add tlb_remote_flush callback support Register tlb_remote_flush callback for vmx when hyperv capability of nested guest mapping flush is detected. The interface can help to reduce overhead when flush ept table among vcpus for nested VM. The tradition way is to send IPIs to all affected vcpus and executes INVEPT on each vcpus. It will trigger several vmexits for IPI and INVEPT emulation. Hyper-V provides such hypercall to do flush for all vcpus and call the hypercall when all ept table pointers of single VM are same. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 72 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 97e4d71431a1..6a95e6b1a43d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -188,12 +188,21 @@ module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + struct kvm_vmx { struct kvm kvm; unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; }; #define NR_AUTOLOAD_MSRS 8 @@ -863,6 +872,7 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; }; enum segment_cache_field { @@ -1357,6 +1367,48 @@ static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) * GUEST_IA32_RTIT_CTL = 0x00002814, */ } + +/* check_ept_pointer() should be under protection of ept_pointer_lock. 
*/ +static void check_ept_pointer_match(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + u64 tmp_eptp = INVALID_PAGE; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!VALID_PAGE(tmp_eptp)) { + tmp_eptp = to_vmx(vcpu)->ept_pointer; + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_MISMATCH; + return; + } + } + + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; +} + +static int vmx_hv_remote_flush_tlb(struct kvm *kvm) +{ + int ret; + + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) + check_ept_pointer_match(kvm); + + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + ret = -ENOTSUPP; + goto out; + } + + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + +out: + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + return ret; +} #else /* !IS_ENABLED(CONFIG_HYPERV) */ static inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} @@ -5041,6 +5093,7 @@ static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm *kvm = vcpu->kvm; unsigned long guest_cr3; u64 eptp; @@ -5048,11 +5101,20 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); + + if (kvm_x86_ops->tlb_remote_flush) { + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + to_vmx(vcpu)->ept_pointer = eptp; + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_CHECK; + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + } + if (enable_unrestricted_guest || is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else - guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -7622,6 +7684,12 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; +#endif + if (!cpu_has_vmx_ple()) { ple_gap = 0; ple_window = 0; @@ -10665,6 +10733,8 @@ free_vcpu: static int vmx_vm_init(struct kvm *kvm) { + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + if (!ple_gap) kvm->arch.pause_in_guest = true; return 0; From b9b33da2aa7429b0f61bcd218d34e1a277459fb4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jul 2018 17:44:41 +0200 Subject: [PATCH 65/85] KVM: try __get_user_pages_fast even if not in atomic context We are currently cutting hva_to_pfn_fast short if we do not want an immediate exit, which is represented by !async && !atomic. However, this is unnecessary, and __get_user_pages_fast is *much* faster because the regular get_user_pages takes pmd_lock/pte_lock. In fact, when many CPUs take a nested vmexit at the same time the contention on those locks is visible, and this patch removes about 25% (compared to 4.18) from vmexit.flat on a 16 vCPU nested guest. 
Suggested-by: Andrea Arcangeli Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 74544d20bbf8..f83239ac8be1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1343,18 +1343,16 @@ static inline int check_user_page_hwpoison(unsigned long addr) } /* - * The atomic path to get the writable pfn which will be stored in @pfn, - * true indicates success, otherwise false is returned. + * The fast path to get the writable pfn which will be stored in @pfn, + * true indicates success, otherwise false is returned. It's also the + * only part that runs if we can are in atomic context. */ -static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, - bool write_fault, bool *writable, kvm_pfn_t *pfn) +static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, + bool *writable, kvm_pfn_t *pfn) { struct page *page[1]; int npages; - if (!(async || atomic)) - return false; - /* * Fast pin a writable pfn only if it is a write fault request * or the caller allows to map a writable pfn for a read fault @@ -1498,7 +1496,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); - if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) + if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) return pfn; if (atomic) From 36090bf43a6b835a42f515cb515ff6fa293a25fe Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 27 Jul 2018 09:18:50 -0700 Subject: [PATCH 66/85] kvm: nVMX: Fix fault vector for VMX operation at CPL > 0 The fault that should be raised for a privilege level violation is #GP rather than #UD. Fixes: 727ba748e110b4 ("kvm: nVMX: Enforce cpl=0 for VMX instructions") Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6a95e6b1a43d..d19bb602ff07 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8096,7 +8096,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) /* CPL=0 must be checked manually. */ if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 1; } @@ -8160,7 +8160,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 0; } From e49fcb8b9ef26dfb2d02b173d790e1ef41177121 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 27 Jul 2018 13:44:45 -0700 Subject: [PATCH 67/85] kvm: nVMX: Fix fault priority for VMX operations When checking emulated VMX instructions for faults, the #UD for "IF (not in VMX operation)" should take precedence over the #GP for "ELSIF CPL > 0." 
Suggested-by: Eric Northup Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d19bb602ff07..ccc9d75124a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8159,15 +8159,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu) */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { + if (!to_vmx(vcpu)->nested.vmxon) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 0; } - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } return 1; } From e368b875a8a91125f1bd0ffac1100c305c0bdff4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:41 -0700 Subject: [PATCH 68/85] KVM: vmx: refactor segmentation code in vmx_save_host_state() Use local variables in vmx_save_host_state() to temporarily track the selector and base values for FS and GS, and reorganize the code so that the 64-bit vs 32-bit portions are contained within a single #ifdef. This refactoring paves the way for future patches to modify the updating of VMCS state with minimal changes to the code, and (hopefully) simplifies resolving a likely conflict with another in-flight patch[1] by being the whipping boy for future patches. [1] https://www.spinics.net/lists/kvm/msg171647.html Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 53 ++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ccc9d75124a6..06fd598ee8a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2662,8 +2662,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); - unsigned long fs_base, kernel_gs_base; #endif + unsigned long fs_base, gs_base; + u16 fs_sel, gs_sel; int i; if (vmx->host_state.loaded) @@ -2678,49 +2679,51 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; #ifdef CONFIG_X86_64 + savesegment(ds, vmx->host_state.ds_sel); + savesegment(es, vmx->host_state.es_sel); + + gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { save_fsgs_for_kvm(); - vmx->host_state.fs_sel = current->thread.fsindex; - vmx->host_state.gs_sel = current->thread.gsindex; + fs_sel = current->thread.fsindex; + gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - kernel_gs_base = current->thread.gsbase; + vmx->msr_host_kernel_gs_base = current->thread.gsbase; } else { -#endif - savesegment(fs, vmx->host_state.fs_sel); - savesegment(gs, vmx->host_state.gs_sel); -#ifdef CONFIG_X86_64 + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } + + if (is_long_mode(&vmx->vcpu)) + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = segment_base(fs_sel); + gs_base = segment_base(gs_sel); #endif - if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + + vmx->host_state.fs_sel = fs_sel; + if (!(fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, fs_sel); 
vmx->host_state.fs_reload_needed = 0; } else { vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + vmx->host_state.gs_sel = gs_sel; + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); else { vmcs_write16(HOST_GS_SELECTOR, 0); vmx->host_state.gs_ldt_reload_needed = 1; } -#ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); - vmcs_writel(HOST_FS_BASE, fs_base); - vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); + vmcs_writel(HOST_GS_BASE, gs_base); - vmx->msr_host_kernel_gs_base = kernel_gs_base; - if (is_long_mode(&vmx->vcpu)) - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, From bd9966de4e14fb559e89a06f7f5c9aab2cc028b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:42 -0700 Subject: [PATCH 69/85] KVM: vmx: track host_state.loaded using a loaded_vmcs pointer Using 'struct loaded_vmcs*' to track whether the CPU registers contain host or guest state kills two birds with one stone. 1. The (effective) boolean host_state.loaded is poorly named. It does not track whether or not host state is loaded into the CPU registers (which most readers would expect), but rather tracks if host state has been saved AND guest state is loaded. 2. Using a loaded_vmcs pointer provides a more robust framework for the optimized guest/host state switching, especially when consideration per-VMCS enhancements. To that end, WARN_ONCE if we try to switch to host state with a different VMCS than was last used to save host state. Resolve an occurrence of the new WARN by setting loaded_vmcs after the call to vmx_vcpu_put() in vmx_switch_vmcs(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06fd598ee8a6..935b84f40f65 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -804,18 +804,22 @@ struct vcpu_vmx { /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. */ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { unsigned nr; struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; } msr_autoload; + struct { - int loaded; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 u16 ds_sel, es_sel; @@ -2667,10 +2671,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; - if (vmx->host_state.loaded) + if (vmx->loaded_cpu_state) return; - vmx->host_state.loaded = 1; + vmx->loaded_cpu_state = vmx->loaded_vmcs; + /* * Set host fs and gs selectors. 
Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. @@ -2732,11 +2737,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - if (!vmx->host_state.loaded) + if (!vmx->loaded_cpu_state) return; + WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; + vmx->loaded_cpu_state = NULL; + #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -10596,8 +10604,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); put_cpu(); } From 678e315e78a780dbef384b92339c8414309dbc11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:43 -0700 Subject: [PATCH 70/85] KVM: vmx: add dedicated utility to access guest's kernel_gs_base When lazy save/restore of MSR_KERNEL_GS_BASE was introduced[1], the MSR was intercepted in all modes and was only restored for the host when the guest is in 64-bit mode. So at the time, going through the full host restore prior to accessing MSR_KERNEL_GS_BASE was necessary to load host state and was not a significant waste of cycles. Later, MSR_KERNEL_GS_BASE interception was disabled for a 64-bit guest[2], and then unconditionally saved/restored for the host[3]. As a result, loading full host state is overkill for accesses to MSR_KERNEL_GS_BASE, and completely unnecessary when the guest is not in 64-bit mode. Add a dedicated utility to read/write the guest's MSR_KERNEL_GS_BASE (outside of the save/restore flow) to minimize the overhead incurred when accessing the MSR. When setting EFER, only decache the MSR if the new EFER will disable long mode. Removing out-of-band usage of vmx_load_host_state() also eliminates, or at least reduces, potential corner cases in its usage, which in turn will (hopefuly) make it easier to reason about future changes to the save/restore flow, e.g. optimization of saving host state. 
[1] commit 44ea2b1758d8 ("KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area") [2] commit 5897297bc228 ("KVM: VMX: Don't intercept MSR_KERNEL_GS_BASE") [3] commit c8770e7ba63b ("KVM: VMX: Fix host userspace gsbase corruption") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 935b84f40f65..72c392525f61 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2772,13 +2772,31 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) load_fixmap_gdt(raw_smp_processor_id()); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +#ifdef CONFIG_X86_64 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - preempt_disable(); - __vmx_load_host_state(vmx); - preempt_enable(); + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, + vmx->msr_guest_kernel_gs_base); + preempt_enable(); + } + return vmx->msr_guest_kernel_gs_base; } +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); + } + vmx->msr_guest_kernel_gs_base = data; +} +#endif + static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -3857,8 +3875,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - msr_info->data = vmx->msr_guest_kernel_gs_base; + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: @@ -3958,8 +3975,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - vmx->msr_guest_kernel_gs_base = data; + vmx_write_guest_kernel_gs_base(vmx, data); break; #endif case MSR_IA32_SYSENTER_CS: @@ -4849,10 +4865,18 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) return; /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). + * MSR_KERNEL_GS_BASE is not intercepted when the guest is in + * 64-bit mode as a 64-bit kernel may frequently access the + * MSR. This means we need to manually save/restore the MSR + * when switching between guest and host state, but only if + * the guest is in 64-bit mode. Sync our cached value if the + * guest is transitioning to 32-bit mode and the CPU contains + * guest state, i.e. the cache is stale. */ - vmx_load_host_state(to_vmx(vcpu)); +#ifdef CONFIG_X86_64 + if (!(efer & EFER_LMA)) + (void)vmx_read_guest_kernel_gs_base(vmx); +#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); From 6d6095bd2c915f4c2ba6f25e7d16e38b06c904de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:44 -0700 Subject: [PATCH 71/85] KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state() Now that the vmx_load_host_state() wrapper is gone, i.e. 
the only time we call the core functions is when we're actually about to switch between guest/host, rename the functions that handle lazy state switching to vmx_prepare_switch_to_{guest,host}_state() to better document the full extent of their functionality. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 72c392525f61..ab27f9734ac0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2661,7 +2661,7 @@ static unsigned long segment_base(u16 selector) } #endif -static void vmx_save_host_state(struct kvm_vcpu *vcpu) +static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); #ifdef CONFIG_X86_64 @@ -2735,7 +2735,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->guest_msrs[i].mask); } -static void __vmx_load_host_state(struct vcpu_vmx *vmx) +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { if (!vmx->loaded_cpu_state) return; @@ -2938,7 +2938,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); - __vmx_load_host_state(to_vmx(vcpu)); + vmx_prepare_switch_to_host(to_vmx(vcpu)); } static bool emulation_required(struct kvm_vcpu *vcpu) @@ -6099,8 +6099,8 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in - * __vmx_load_host_state(), in case userspace uses the null selectors - * too (the expected case). + * vmx_prepare_switch_to_host(), in case userspace uses + * the null selectors too (the expected case). */ vmcs_write16(HOST_DS_SELECTOR, 0); vmcs_write16(HOST_ES_SELECTOR, 0); @@ -10565,9 +10565,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * - * We can't defer this to vmx_load_host_state() since that function - * may be executed in interrupt context, which saves and restore segments - * around it, nullifying its effect. + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. */ loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); @@ -11668,7 +11668,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set host-state according to L0's settings (vmcs12 is irrelevant here) * Some constant fields are set here by vmx_set_constant_host_state(). * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() + * is called. */ vmx_set_constant_host_state(vmx); @@ -13707,7 +13708,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_save_host_state, + .prepare_guest_switch = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, From fd1ec7723fbd560f924769b43cbdfe82dfd6a98e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:45 -0700 Subject: [PATCH 72/85] KVM: nVMX: remove a misleading comment regarding vmcs02 fields prepare_vmcs02() has an odd comment that says certain fields are "not in vmcs02". 
AFAICT the intent of the comment is to document that various VMCS fields are not handled by prepare_vmcs02(), e.g. HOST_{FS,GS}_{BASE,SELECTOR}. While technically true, the comment is misleading, e.g. it can lead the reader to think that KVM never writes those fields to vmcs02. Remove the comment altogether as the handling of FS and GS is not specific to nested VMX, and GUEST_PML_INDEX has been written by prepare_vmcs02() since commit "4e59516a12a6 (kvm: vmx: ensure VMCS is current while enabling PML)" Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ab27f9734ac0..a135c91d44f8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11741,11 +11741,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - /* - * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, - * HOST_FS_BASE, HOST_GS_BASE. - */ - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); From e920de8507c6c8760c775cd718627e7cbf57c3b7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:46 -0700 Subject: [PATCH 73/85] KVM: vmx: compute need to reload FS/GS/LDT on demand Remove fs_reload_needed and gs_ldt_reload_needed from host_state and instead compute whether we need to reload various state at the time we actually do the reload. The state that is tracked by the *_reload_needed variables is not any more volatile than the trackers themselves. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a135c91d44f8..c8a583ff7bf2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -824,8 +824,6 @@ struct vcpu_vmx { #ifdef CONFIG_X86_64 u16 ds_sel, es_sel; #endif - int gs_ldt_reload_needed; - int fs_reload_needed; } host_state; struct { int vm86_active; @@ -2681,7 +2679,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * allow segment selectors with cpl > 0 or ti == 1. 
*/ vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; #ifdef CONFIG_X86_64 savesegment(ds, vmx->host_state.ds_sel); @@ -2711,20 +2708,15 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx->host_state.fs_sel = fs_sel; - if (!(fs_sel & 7)) { + if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); - vmx->host_state.fs_reload_needed = 0; - } else { + else vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.fs_reload_needed = 1; - } vmx->host_state.gs_sel = gs_sel; if (!(gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else { + else vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; - } vmcs_writel(HOST_FS_BASE, fs_base); vmcs_writel(HOST_GS_BASE, gs_base); @@ -2749,7 +2741,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.gs_ldt_reload_needed) { + if (vmx->host_state.ldt_sel || (vmx->host_state.gs_sel & 7)) { kvm_load_ldt(vmx->host_state.ldt_sel); #ifdef CONFIG_X86_64 load_gs_index(vmx->host_state.gs_sel); @@ -2757,7 +2749,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) loadsegment(gs, vmx->host_state.gs_sel); #endif } - if (vmx->host_state.fs_reload_needed) + if (vmx->host_state.fs_sel & 7) loadsegment(fs, vmx->host_state.fs_sel); #ifdef CONFIG_X86_64 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { From d7ee039e2bab6341cdabf42d3036063cf91b40ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:47 -0700 Subject: [PATCH 74/85] KVM: vmx: move struct host_state usage to struct loaded_vmcs Make host_state a property of a loaded_vmcs so that it can be used as a cache of the VMCS fields, e.g. to lazily VMWRITE the corresponding VMCS field. Treating host_state as a cache does not work if it's not VMCS specific as the cache would become incoherent when switching between vmcs01 and vmcs02. Move vmcs_host_cr3 and vmcs_host_cr4 into host_state. Explicitly zero out host_state when allocating a new VMCS for a loaded_vmcs. Unlike the pre-existing vmcs_host_cr{3,4} usage, the segment information is not guaranteed to be (re)initialized when running a new nested VMCS, e.g. HOST_FS_BASE is not written in vmx_set_constant_host_state(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 72 ++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c8a583ff7bf2..d0d2374a3170 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -218,6 +218,21 @@ struct vmcs { char data[0]; }; +/* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + /* * Track a VMCS that may be loaded on a certain CPU. 
If it is (cpu!=-1), also * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs @@ -229,14 +244,13 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; }; struct shared_msr_entry { @@ -819,12 +833,6 @@ struct vcpu_vmx { struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; } msr_autoload; - struct { - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif - } host_state; struct { int vm86_active; ulong save_rflags; @@ -2662,6 +2670,7 @@ static unsigned long segment_base(u16 selector) static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif @@ -2673,16 +2682,17 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) return; vmx->loaded_cpu_state = vmx->loaded_vmcs; + host_state = &vmx->loaded_cpu_state->host_state; /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - vmx->host_state.ldt_sel = kvm_read_ldt(); + host_state->ldt_sel = kvm_read_ldt(); #ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); + savesegment(ds, host_state->ds_sel); + savesegment(es, host_state->es_sel); gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { @@ -2707,12 +2717,12 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - vmx->host_state.fs_sel = fs_sel; + host_state->fs_sel = fs_sel; if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); else vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.gs_sel = gs_sel; + host_state->gs_sel = gs_sel; if (!(gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, gs_sel); else @@ -2729,10 +2739,13 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { + struct vmcs_host_state *host_state; + if (!vmx->loaded_cpu_state) return; WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + host_state = &vmx->loaded_cpu_state->host_state; ++vmx->vcpu.stat.host_state_reload; vmx->loaded_cpu_state = NULL; @@ -2741,20 +2754,20 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.ldt_sel || (vmx->host_state.gs_sel & 7)) { - kvm_load_ldt(vmx->host_state.ldt_sel); + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { + kvm_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 - load_gs_index(vmx->host_state.gs_sel); + load_gs_index(host_state->gs_sel); #else - loadsegment(gs, vmx->host_state.gs_sel); + loadsegment(gs, host_state->gs_sel); #endif } - if (vmx->host_state.fs_sel & 7) - loadsegment(fs, vmx->host_state.fs_sel); + if (host_state->fs_sel & 7) + loadsegment(fs, host_state->fs_sel); #ifdef CONFIG_X86_64 - if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { - loadsegment(ds, vmx->host_state.ds_sel); - loadsegment(es, vmx->host_state.es_sel); + if (unlikely(host_state->ds_sel | host_state->es_sel)) { + loadsegment(ds, host_state->ds_sel); + 
loadsegment(es, host_state->es_sel); } #endif invalidate_tss_limit(); @@ -4574,6 +4587,9 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) evmcs->hv_enlightenments_control.msr_bitmap = 1; } } + + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + return 0; out_vmcs: @@ -6080,12 +6096,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 @@ -10356,15 +10372,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the From f3bbc0dcedf50d29d2d6fdfb16c5ec3e2368837f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:48 -0700 Subject: [PATCH 75/85] KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup The HOST_{FS,GS}_BASE fields are guaranteed to be written prior to VMENTER, by way of vmx_prepare_switch_to_guest(). Initialize the fields to zero for 64-bit kernels instead of pulling the base values from their respective MSRs. In addition to eliminating two RDMSRs, vmx_prepare_switch_to_guest() can safely assume the initial value of the fields is zero in all cases. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d0d2374a3170..60bf9a95219f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6361,9 +6361,6 @@ static void ept_set_mmio_spte_mask(void) */ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { -#ifdef CONFIG_X86_64 - unsigned long a; -#endif int i; if (enable_shadow_vmcs) { @@ -6418,15 +6415,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); From 8f21a0bbf36f58334695ae6e46c0cf906a217154 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:49 -0700 Subject: [PATCH 76/85] KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible On a 64-bit host, FS.sel and GS.sel are all but guaranteed to be 0, which in turn means they'll rarely change. 
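As an illustration, the write-through caching idiom that this and the neighbouring patches apply to the VMCS host-state fields can be sketched in isolation as follows; the struct and the write_hw_field()/update_cached_field() helpers are invented for the example and are not the kernel's vmcs_write16()/vmcs_writel() accessors.

  /*
   * Minimal sketch: keep a software copy of the last value written to an
   * expensive hardware field and skip the write when it has not changed.
   */
  #include <stdbool.h>
  #include <stdint.h>

  struct cached_field {
          uint16_t value;         /* last value written to the field */
          bool     written;       /* written at least once? */
  };

  static void write_hw_field(uint16_t val)
  {
          (void)val;              /* stand-in for the expensive VMWRITE */
  }

  static void update_cached_field(struct cached_field *c, uint16_t new_val)
  {
          if (c->written && c->value == new_val)
                  return;         /* unchanged, skip the hardware write */
          write_hw_field(new_val);
          c->value = new_val;
          c->written = true;
  }

In the kernel the cache lives in the vmcs_host_state structure introduced earlier in this series and is zeroed when the VMCS is allocated, so the zero-initialized fields roughly play the role of the explicit 'written' flag used above. With that idiom in mind: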
Skip the VMWRITE for the associated VMCS fields when loading host state if the selector hasn't changed since the last VMWRITE. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 60bf9a95219f..4937e11f971a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2717,16 +2717,20 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - host_state->fs_sel = fs_sel; - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else - vmcs_write16(HOST_FS_SELECTOR, 0); - host_state->gs_sel = gs_sel; - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else - vmcs_write16(HOST_GS_SELECTOR, 0); + if (unlikely(fs_sel != host_state->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host_state->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host_state->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host_state->gs_sel = gs_sel; + } vmcs_writel(HOST_FS_BASE, fs_base); vmcs_writel(HOST_GS_BASE, gs_base); From 5e079c7ece1060491ef4a45414823162cce91b3d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:50 -0700 Subject: [PATCH 77/85] KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible The host's FS.base and GS.base rarely change, e.g. ~0.1% of host/guest swaps on my system. Cache the last value written to the VMCS and skip the VMWRITE to the associated VMCS fields when loading host state if the value hasn't changed since the last VMWRITE. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4937e11f971a..b23ecc04ca51 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -226,6 +226,8 @@ struct vmcs { struct vmcs_host_state { unsigned long cr3; /* May not match real cr3 */ unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 @@ -2731,9 +2733,14 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmcs_write16(HOST_GS_SELECTOR, 0); host_state->gs_sel = gs_sel; } - - vmcs_writel(HOST_FS_BASE, fs_base); - vmcs_writel(HOST_GS_BASE, gs_base); + if (unlikely(fs_base != host_state->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host_state->fs_base = fs_base; + } + if (unlikely(gs_base != host_state->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host_state->gs_base = gs_base; + } for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, From c2a4eadf7747a1359a80ede64e4ae0e0ba64ca08 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Tue, 24 Jul 2018 08:17:07 +0000 Subject: [PATCH 78/85] KVM/MMU: Combine flushing remote tlb in mmu_set_spte() mmu_set_spte() flushes remote tlbs for drop_parent_pte/drop_spte() and set_spte() separately. This may introduce redundant flush. This patch is to combine these flushes and check flush request after calling set_spte(). 
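The resulting pattern is easy to show in isolation: record that a flush is needed at each intermediate step and issue a single flush at the end, also folding in any flush request reported by the final step. In the sketch below, do_flush() stands in for kvm_flush_remote_tlbs(); the NEEDS_FLUSH flag and the other helpers are invented purely for the example.

  /*
   * Illustrative only: coalesce several potential flush points into one.
   */
  #include <stdbool.h>

  #define NEEDS_FLUSH 0x1                 /* result flag from the final step */

  static void do_flush(void) { }          /* stand-in for the broadcast flush */
  static bool drop_old_mapping(void) { return true; }
  static int  install_new_mapping(void) { return NEEDS_FLUSH; }

  static void update_entry(void)
  {
          bool flush = false;

          /* Record the need to flush instead of flushing right away. */
          if (drop_old_mapping())
                  flush = true;

          /* The final step may also request a flush via its result flags. */
          if ((install_new_mapping() & NEEDS_FLUSH) || flush)
                  do_flush();
  }

This mirrors the change below, where the local flush variable is checked together with the SET_SPTE_NEED_REMOTE_TLB_FLUSH bit returned by set_spte(), so at most one remote TLB flush is issued per call.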
Signed-off-by: Lan Tianyu Reviewed-by: Junaid Shahid Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 22a7984c68a5..8f216321d33b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2901,6 +2901,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int rmap_count; int set_spte_ret; int ret = RET_PF_RETRY; + bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2917,12 +2918,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, child = page_header(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + flush = true; } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + flush = true; } else was_rmapped = 1; } @@ -2934,7 +2935,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) kvm_flush_remote_tlbs(vcpu->kvm); if (unlikely(is_mmio_spte(*sptep))) From ee6268ba3a6861b2806e569bff7fe91fbdf846dd Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Wed, 25 Jul 2018 16:32:14 +0800 Subject: [PATCH 79/85] KVM: x86: Skip pae_root shadow allocation if tdp enabled Considering the fact that the pae_root shadow is not needed when tdp is in use, skip the pae_root shadow page allocation to allow mmu creation even not being able to obtain memory from DMA32 zone when particular cgroup cpuset.mems or mempolicy control is applied. Signed-off-by: Liang Chen Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8f216321d33b..f5aef52b148b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5341,6 +5341,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; + if (tdp_enabled) + return 0; + /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first From 74fec5b9dbaa5e6fe776f6c73e6c00fb23dca844 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Mon, 23 Jul 2018 12:31:21 +0000 Subject: [PATCH 80/85] KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs() X86_CR4_OSXSAVE check belongs to sregs check and so move into kvm_valid_sregs(). 
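Illustratively, the consolidation amounts to keeping every architectural consistency check behind one predicate so that callers such as __set_sregs() no longer carry their own copies. In the sketch below, the CR4_OSXSAVE constant and the guest_has_xsave() helper are simplified stand-ins for the kernel's X86_CR4_OSXSAVE and guest_cpuid_has(vcpu, X86_FEATURE_XSAVE), and the structure is not the real struct kvm_sregs.

  /*
   * Minimal sketch of a single validity predicate for a register set.
   */
  #include <stdbool.h>
  #include <stdint.h>

  #define CR4_OSXSAVE (1u << 18)          /* CR4.OSXSAVE bit */

  struct sregs_example {
          uint64_t cr0, cr4, efer;
  };

  static bool guest_has_xsave(void)
  {
          return false;                   /* assumed CPUID state for the demo */
  }

  static bool sregs_are_valid(const struct sregs_example *s)
  {
          /* CR4.OSXSAVE may only be set if CPUID advertises XSAVE. */
          if ((s->cr4 & CR4_OSXSAVE) && !guest_has_xsave())
                  return false;

          /*
           * Other consistency checks, such as the existing EFER.LME/CR0.PG
           * long-mode check in kvm_valid_sregs(), belong here as well.
           */
          return true;
  }

Callers then need only a single "is valid" check, which is what __set_sregs() is reduced to in the diff below.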
Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6cc29dd21519..6b974802cadb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8049,6 +8049,10 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8079,10 +8083,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) struct desc_ptr dt; int ret = -EINVAL; - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - goto out; - if (kvm_valid_sregs(vcpu, sregs)) goto out; From 4180bf1b655a791a0a6ef93a2ffffc762722c782 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 23 Jul 2018 14:39:54 +0800 Subject: [PATCH 81/85] KVM: X86: Implement "send IPI" hypercall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using hypercall to send IPIs by one vmexit instead of one by one for xAPIC/x2APIC physical mode and one vmexit per-cluster for x2APIC cluster mode. Intel guest can enter x2apic cluster mode when interrupt remmaping is enabled in qemu, however, latest AMD EPYC still just supports xapic mode which can get great improvement by Exit-less IPIs. This patchset lets a guest send multicast IPIs, with at most 128 destinations per hypercall in 64-bit mode and 64 vCPUs per hypercall in 32-bit mode. Hardware: Xeon Skylake 2.5GHz, 2 sockets, 40 cores, 80 threads, the VM is 80 vCPUs, IPI microbenchmark(https://lkml.org/lkml/2017/12/19/141): x2apic cluster mode, vanilla Dry-run: 0, 2392199 ns Self-IPI: 6907514, 15027589 ns Normal IPI: 223910476, 251301666 ns Broadcast IPI: 0, 9282161150 ns Broadcast lock: 0, 8812934104 ns x2apic cluster mode, pv-ipi Dry-run: 0, 2449341 ns Self-IPI: 6720360, 15028732 ns Normal IPI: 228643307, 255708477 ns Broadcast IPI: 0, 7572293590 ns => 22% performance boost Broadcast lock: 0, 8316124651 ns x2apic physical mode, vanilla Dry-run: 0, 3135933 ns Self-IPI: 8572670, 17901757 ns Normal IPI: 226444334, 255421709 ns Broadcast IPI: 0, 19845070887 ns Broadcast lock: 0, 19827383656 ns x2apic physical mode, pv-ipi Dry-run: 0, 2446381 ns Self-IPI: 6788217, 15021056 ns Normal IPI: 219454441, 249583458 ns Broadcast IPI: 0, 7806540019 ns => 154% performance boost Broadcast lock: 0, 9143618799 ns Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/cpuid.txt | 4 +++ Documentation/virtual/kvm/hypercalls.txt | 20 ++++++++++++ arch/x86/include/asm/kvm_host.h | 4 +++ arch/x86/kvm/cpuid.c | 3 +- arch/x86/kvm/lapic.c | 40 ++++++++++++++++++++++++ arch/x86/kvm/x86.c | 3 ++ include/uapi/linux/kvm_para.h | 1 + 7 files changed, 74 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index ab022dcd0911..97ca1940a0dc 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -62,6 +62,10 @@ KVM_FEATURE_ASYNC_PF_VMEXIT || 10 || paravirtualized async PF VM exit || || can be enabled by setting bit 2 || || when writing to msr 0x4b564d02 ------------------------------------------------------------------------------ +KVM_FEATURE_PV_SEND_IPI || 11 
|| guest checks this feature bit + || || before using paravirtualized + || || send IPIs. +------------------------------------------------------------------------------ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index a890529c63ed..da24c138c8d1 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -121,3 +121,23 @@ compute the CLOCK_REALTIME for its clock, at the same instant. Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. + +6. KVM_HC_SEND_IPI +------------------------ +Architecture: x86 +Status: active +Purpose: Send IPIs to multiple vCPUs. + +a0: lower part of the bitmap of destination APIC IDs +a1: higher part of the bitmap of destination APIC IDs +a2: the lowest APIC ID in bitmap +a3: APIC ICR + +The hypercall lets a guest send multicast IPIs, with at most 128 +128 destinations per hypercall in 64-bit mode and 64 vCPUs per +hypercall in 32-bit mode. The destinations are represented by a +bitmap contained in the first two arguments (a0 and a1). Bit 0 of +a0 corresponds to the APIC ID in the third argument (a2), bit 1 +corresponds to the APIC ID a2+1, and so on. + +Returns the number of CPUs to which the IPIs were delivered successfully. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 150937e64f63..c18958ef17d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1457,6 +1457,10 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit); + void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7e042e3d47fd..7bcfa61375c0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -621,7 +621,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | - (1 << KVM_FEATURE_ASYNC_PF_VMEXIT); + (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | + (1 << KVM_FEATURE_PV_SEND_IPI); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b5cd8465d44f..f0d693122c24 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -547,6 +547,46 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, irq->trig_mode, dest_map); } +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit) +{ + int i; + struct kvm_apic_map *map; + struct kvm_vcpu *vcpu; + struct kvm_lapic_irq irq = {0}; + int cluster_size = op_64_bit ? 
64 : 32; + int count = 0; + + irq.vector = icr & APIC_VECTOR_MASK; + irq.delivery_mode = icr & APIC_MODE_MASK; + irq.level = (icr & APIC_INT_ASSERT) != 0; + irq.trig_mode = icr & APIC_INT_LEVELTRIG; + + if (icr & APIC_DEST_MASK) + return -KVM_EINVAL; + if (icr & APIC_SHORT_MASK) + return -KVM_EINVAL; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + /* Bits above cluster_size are masked in the caller. */ + for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + min += cluster_size; + for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + rcu_read_unlock(); + return count; +} + static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b974802cadb..3c83711c0ebe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6802,6 +6802,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_CLOCK_PAIRING: ret = kvm_pv_clock_pairing(vcpu, a0, a1); break; + case KVM_HC_SEND_IPI: + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); + break; #endif default: ret = -KVM_ENOSYS; diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index dcf629dd2889..f3893ef82b65 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -13,6 +13,7 @@ /* Return values for hypercalls */ #define KVM_ENOSYS 1000 #define KVM_EFAULT EFAULT +#define KVM_EINVAL EINVAL #define KVM_E2BIG E2BIG #define KVM_EPERM EPERM #define KVM_EOPNOTSUPP 95 From d63bae079b6426afc998c5ea76d9cde8d8c98303 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 23 Jul 2018 14:39:51 +0800 Subject: [PATCH 82/85] KVM: X86: Add kvm hypervisor init time platform setup callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kvm hypervisor init time platform setup callback which will be used to replace native apic hooks by pararvirtual hooks. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 575c9a51e6ba..39d79720380f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -624,12 +624,22 @@ static uint32_t __init kvm_detect(void) return kvm_cpuid_base(); } +static void __init kvm_apic_init(void) +{ +} + +static void __init kvm_init_platform(void) +{ + x86_platform.apic_post_init = kvm_apic_init; +} + const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, }; static __init int activate_jump_labels(void) From aaffcfd1e82d3378538408d0310b7424b98d8f81 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 23 Jul 2018 14:39:52 +0800 Subject: [PATCH 83/85] KVM: X86: Implement PV IPIs in linux guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement paravirtual apic hooks to enable PV IPIs for KVM if the "send IPI" hypercall is available. The hypercall lets a guest send IPIs, with at most 128 destinations per hypercall in 64-bit mode and 64 vCPUs per hypercall in 32-bit mode. 
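To make the argument encoding concrete, here is a rough sketch of how a set of destination APIC IDs could be packed into the hypercall arguments (a0 = low bitmap word, a1 = high bitmap word, a2 = lowest APIC ID). build_ipi_args() is an invented helper for illustration only; the real guest-side packing, including splitting the request into several hypercalls when the IDs span more than two longs, is done by __send_ipi_mask() in the diff below.

  struct ipi_args {
          unsigned long bitmap_low;       /* a0 */
          unsigned long bitmap_high;      /* a1 */
          unsigned long min_apic_id;      /* a2 */
  };

  /* Pack APIC IDs (all assumed to be >= base) relative to the base ID. */
  static int build_ipi_args(const unsigned int *apic_ids, int n,
                            unsigned int base, struct ipi_args *out)
  {
          const unsigned int bits = 8 * sizeof(unsigned long);

          out->bitmap_low = 0;
          out->bitmap_high = 0;
          out->min_apic_id = base;

          for (int i = 0; i < n; i++) {
                  unsigned int off = apic_ids[i] - base;

                  if (off >= 2 * bits)
                          return -1;      /* would need another hypercall */
                  if (off < bits)
                          out->bitmap_low |= 1UL << off;
                  else
                          out->bitmap_high |= 1UL << (off - bits);
          }
          return 0;
  }

On a 64-bit kernel the two long words cover the 128 destinations per hypercall mentioned above; with 32-bit longs they cover 64.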
Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kernel/kvm.c | 96 ++++++++++++++++++++++++++++ include/uapi/linux/kvm_para.h | 1 + 3 files changed, 98 insertions(+) diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 0ede697c3961..19980ec1a316 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -28,6 +28,7 @@ #define KVM_FEATURE_PV_UNHALT 7 #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 +#define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 39d79720380f..62cbd089f709 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -454,6 +454,98 @@ static void __init sev_map_percpu_data(void) } #ifdef CONFIG_SMP +#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) + +static void __send_ipi_mask(const struct cpumask *mask, int vector) +{ + unsigned long flags; + int cpu, apic_id, icr; + int min = 0, max = 0; +#ifdef CONFIG_X86_64 + __uint128_t ipi_bitmap = 0; +#else + u64 ipi_bitmap = 0; +#endif + + if (cpumask_empty(mask)) + return; + + local_irq_save(flags); + + switch (vector) { + default: + icr = APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr = APIC_DM_NMI; + break; + } + + for_each_cpu(cpu, mask) { + apic_id = per_cpu(x86_cpu_to_apicid, cpu); + if (!ipi_bitmap) { + min = max = apic_id; + } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { + ipi_bitmap <<= min - apic_id; + min = apic_id; + } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + max = apic_id < max ? max : apic_id; + } else { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + min = max = apic_id; + ipi_bitmap = 0; + } + __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); + } + + if (ipi_bitmap) { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + } + + local_irq_restore(flags); +} + +static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) +{ + __send_ipi_mask(mask, vector); +} + +static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask new_mask; + const struct cpumask *local_mask; + + cpumask_copy(&new_mask, mask); + cpumask_clear_cpu(this_cpu, &new_mask); + local_mask = &new_mask; + __send_ipi_mask(local_mask, vector); +} + +static void kvm_send_ipi_allbutself(int vector) +{ + kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); +} + +static void kvm_send_ipi_all(int vector) +{ + __send_ipi_mask(cpu_online_mask, vector); +} + +/* + * Set the IPI entry points + */ +static void kvm_setup_pv_ipi(void) +{ + apic->send_IPI_mask = kvm_send_ipi_mask; + apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + apic->send_IPI_allbutself = kvm_send_ipi_allbutself; + apic->send_IPI_all = kvm_send_ipi_all; + pr_info("KVM setup pv IPIs\n"); +} + static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); @@ -626,6 +718,10 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_init(void) { +#if defined(CONFIG_SMP) + if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) + kvm_setup_pv_ipi(); +#endif } static void __init kvm_init_platform(void) diff --git a/include/uapi/linux/kvm_para.h 
b/include/uapi/linux/kvm_para.h index f3893ef82b65..6c0ce49931e5 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -27,6 +27,7 @@ #define KVM_HC_MIPS_EXIT_VM 7 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 #define KVM_HC_CLOCK_PAIRING 9 +#define KVM_HC_SEND_IPI 10 /* * hypercalls use architecture specific From fd8ca6dac9b45db8503cf508880edd63e039e2f2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 6 Aug 2018 16:42:49 +0200 Subject: [PATCH 84/85] KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c Remove open-coded uses of set instructions to use CC_SET()/CC_OUT() in arch/x86/kvm/vmx.c. Signed-off-by: Uros Bizjak [Mark error paths as unlikely while touching this. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b23ecc04ca51..16f9373c01de 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -38,6 +38,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include #include #include #include @@ -1916,11 +1917,12 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; } operand = { vpid, 0, gva }; + bool error; - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) + : CC_OUT(na) (error) : "a"(&operand), "c"(ext) + : "memory"); + BUG_ON(error); } static inline void __invept(int ext, u64 eptp, gpa_t gpa) @@ -1928,11 +1930,12 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; + bool error; - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) + : CC_OUT(na) (error) : "a" (&operand), "c" (ext) + : "memory"); + BUG_ON(error); } static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) @@ -1948,12 +1951,12 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) static void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); } @@ -1970,15 +1973,15 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) static void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", vmcs, phys_addr); } @@ -2203,10 +2206,10 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : 
"=q"(error) : "a"(value), "d"(field) : "cc"); + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) + : CC_OUT(na) (error) : "a"(value), "d"(field)); if (unlikely(error)) vmwrite_error(field, value); } From 28a1f3ac1d0c8558ee4453d9634dad891a6e922e Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 14 Aug 2018 10:15:34 -0700 Subject: [PATCH 85/85] kvm: x86: Set highest physical address bits in non-present/reserved SPTEs Always set the 5 upper-most supported physical address bits to 1 for SPTEs that are marked as non-present or reserved, to make them unusable for L1TF attacks from the guest. Currently, this just applies to MMIO SPTEs. (We do not need to mark PTEs that are completely 0 as physical page 0 is already reserved.) This allows mitigation of L1TF without disabling hyper-threading by using shadow paging mode instead of EPT. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 43 ++++++++++++++++++++++++++++++++++++++----- arch/x86/kvm/x86.c | 8 ++++++-- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f5aef52b148b..27c2ab079a76 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -238,6 +238,17 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | PT64_EPT_EXECUTABLE_MASK; static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -327,9 +338,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, { unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -342,8 +357,14 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; - return (spte & ~mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | + shadow_nonpresent_or_rsvd_mask; + u64 gpa = spte & ~mask; + + gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) + & shadow_nonpresent_or_rsvd_mask; + + return gpa >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) @@ -400,7 +421,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static void kvm_mmu_clear_all_pte_masks(void) +static void kvm_mmu_reset_all_pte_masks(void) { shadow_user_mask = 0; shadow_accessed_mask = 0; @@ -410,6 +431,18 @@ static void kvm_mmu_clear_all_pte_masks(void) shadow_mmio_mask = 0; shadow_present_mask = 0; shadow_acc_track_mask = 0; + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. 
+ */ + if (boot_cpu_data.x86_phys_bits < + 52 - shadow_nonpresent_or_rsvd_mask_len) + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(boot_cpu_data.x86_phys_bits - + shadow_nonpresent_or_rsvd_mask_len, + boot_cpu_data.x86_phys_bits - 1); } static int is_cpuid_PSE36(void) @@ -5819,7 +5852,7 @@ int kvm_mmu_module_init(void) { int ret = -ENOMEM; - kvm_mmu_clear_all_pte_masks(); + kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c83711c0ebe..d294983ee1c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6536,8 +6536,12 @@ static void kvm_set_mmio_spte_mask(void) * Set the reserved bits and the present bit of an paging-structure * entry to generate page fault with PFER.RSV = 1. */ - /* Mask the reserved physical address bits. */ - mask = rsvd_bits(maxphyaddr, 51); + + /* + * Mask the uppermost physical address bit, which would be reserved as + * long as the supported physical address width is less than 52. + */ + mask = 1ull << 51; /* Set the present bit. */ mask |= 1ull;