diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5ec15e4160b1..98db78a26957 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -57,6 +57,9 @@ struct kvm_mmu_page { atomic_t write_flooding_count; bool tdp_mmu_page; + + /* Used for freeing the page asyncronously if it is a TDP MMU page. */ + struct rcu_head rcu_head; }; extern struct kmem_cache *mmu_page_header_cache; diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 1a09d212186b..e5f148106e20 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); } static gfn_t round_gfn_for_level(gfn_t gfn, int level) @@ -35,7 +35,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, iter->root_level = root_level; iter->min_level = min_level; iter->level = root_level; - iter->pt_path[iter->level - 1] = root_pt; + iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); @@ -48,7 +48,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, * address of the child page table referenced by the SPTE. Returns null if * there is no such entry. */ -u64 *spte_to_child_pt(u64 spte, int level) +tdp_ptep_t spte_to_child_pt(u64 spte, int level) { /* * There's no child entry if this entry isn't present or is a @@ -57,7 +57,7 @@ u64 *spte_to_child_pt(u64 spte, int level) if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) return NULL; - return __va(spte_to_pfn(spte) << PAGE_SHIFT); + return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT); } /* @@ -66,7 +66,7 @@ u64 *spte_to_child_pt(u64 spte, int level) */ static bool try_step_down(struct tdp_iter *iter) { - u64 *child_pt; + tdp_ptep_t child_pt; if (iter->level == iter->min_level) return false; @@ -75,7 +75,7 @@ static bool try_step_down(struct tdp_iter *iter) * Reread the SPTE before stepping down to avoid traversing into page * tables that are no longer linked from this entry. */ - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); child_pt = spte_to_child_pt(iter->old_spte, iter->level); if (!child_pt) @@ -109,7 +109,7 @@ static bool try_step_side(struct tdp_iter *iter) iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); iter->next_last_level_gfn = iter->gfn; iter->sptep++; - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); return true; } @@ -159,7 +159,7 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -u64 *tdp_iter_root_pt(struct tdp_iter *iter) +tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) { return iter->pt_path[iter->root_level - 1]; } diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index d480c540ee27..4cc177d75c4a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -7,6 +7,8 @@ #include "mmu.h" +typedef u64 __rcu *tdp_ptep_t; + /* * A TDP iterator performs a pre-order walk over a TDP paging structure. */ @@ -23,9 +25,9 @@ struct tdp_iter { */ gfn_t yielded_gfn; /* Pointers to the page tables traversed to reach the current SPTE */ - u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; /* A pointer to the current SPTE */ - u64 *sptep; + tdp_ptep_t sptep; /* The lowest GFN mapped by the current SPTE */ gfn_t gfn; /* The level of the root page given to the iterator */ @@ -55,11 +57,11 @@ struct tdp_iter { #define for_each_tdp_pte(iter, root, root_level, start, end) \ for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) -u64 *spte_to_child_pt(u64 pte, int level); +tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -u64 *tdp_iter_root_pt(struct tdp_iter *iter); +tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a75e92164a8b..9e4009068920 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -42,6 +42,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) return; WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); + + /* + * Ensure that all the outstanding RCU callbacks to free shadow pages + * can run before the VM is torn down. + */ + rcu_barrier(); } static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) @@ -196,6 +202,28 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) return __pa(root->spt); } +static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) +{ + free_page((unsigned long)sp->spt); + kmem_cache_free(mmu_page_header_cache, sp); +} + +/* + * This is called through call_rcu in order to free TDP page table memory + * safely with respect to other kernel threads that may be operating on + * the memory. + * By only accessing TDP MMU page table memory in an RCU read critical + * section, and freeing it after a grace period, lockless access to that + * memory won't use it after it is freed. + */ +static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) +{ + struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, + rcu_head); + + tdp_mmu_free_sp(sp); +} + static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level); @@ -269,8 +297,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt) kvm_flush_remote_tlbs_with_address(kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - free_page((unsigned long)pt); - kmem_cache_free(mmu_page_header_cache, sp); + call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } /** @@ -390,13 +417,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - u64 *root_pt = tdp_iter_root_pt(iter); + tdp_ptep_t root_pt = tdp_iter_root_pt(iter); struct kvm_mmu_page *root = sptep_to_sp(root_pt); int as_id = kvm_mmu_page_as_id(root); lockdep_assert_held(&kvm->mmu_lock); - WRITE_ONCE(*iter->sptep, new_spte); + WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, iter->level); @@ -466,10 +493,13 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, return false; if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + rcu_read_unlock(); + if (flush) kvm_flush_remote_tlbs(kvm); cond_resched_lock(&kvm->mmu_lock); + rcu_read_lock(); WARN_ON(iter->gfn > iter->next_last_level_gfn); @@ -500,6 +530,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, struct tdp_iter iter; bool flush_needed = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { @@ -523,6 +555,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte(kvm, &iter, 0); flush_needed = true; } + + rcu_read_unlock(); return flush_needed; } @@ -568,13 +602,15 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (unlikely(is_noslot_pfn(pfn))) { new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); - trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); + trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, + new_spte); } else { make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, pfn, iter->old_spte, prefault, true, map_writable, !shadow_accessed_mask, &new_spte); - trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); } if (new_spte == iter->old_spte) @@ -597,7 +633,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (unlikely(is_mmio_spte(new_spte))) ret = RET_PF_EMULATE; - trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); if (!prefault) vcpu->stat.pf_fixed++; @@ -635,6 +672,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); + + rcu_read_lock(); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { if (nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(iter.old_spte, gfn, @@ -660,7 +700,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, * because the new value informs the !present * path below. */ - iter.old_spte = READ_ONCE(*iter.sptep); + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); } if (!is_shadow_present_pte(iter.old_spte)) { @@ -678,11 +718,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } } - if (WARN_ON(iter.level != level)) + if (WARN_ON(iter.level != level)) { + rcu_read_unlock(); return RET_PF_RETRY; + } ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, pfn, prefault); + rcu_read_unlock(); return ret; } @@ -753,6 +796,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, int young = 0; u64 new_spte = 0; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { /* * If we have a non-accessed entry we don't need to change the @@ -784,6 +829,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, trace_kvm_age_page(iter.gfn, iter.level, slot, young); } + rcu_read_unlock(); + return young; } @@ -829,6 +876,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, u64 new_spte; int need_flush = 0; + rcu_read_lock(); + WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); @@ -857,6 +906,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, if (need_flush) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + rcu_read_unlock(); + return 0; } @@ -880,6 +931,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); for_each_tdp_pte_min_level(iter, root->spt, root->role.level, @@ -897,6 +950,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); spte_set = true; } + + rcu_read_unlock(); return spte_set; } @@ -938,6 +993,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) continue; @@ -957,6 +1014,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); spte_set = true; } + + rcu_read_unlock(); return spte_set; } @@ -998,6 +1057,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, struct tdp_iter iter; u64 new_spte; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) @@ -1023,6 +1084,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); } + + rcu_read_unlock(); } /* @@ -1062,6 +1125,8 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) continue; @@ -1076,6 +1141,7 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, spte_set = true; } + rcu_read_unlock(); return spte_set; } @@ -1113,6 +1179,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm, kvm_pfn_t pfn; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { spte_set = false; @@ -1133,6 +1201,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm, spte_set = true; } + rcu_read_unlock(); if (spte_set) kvm_flush_remote_tlbs(kvm); } @@ -1169,6 +1238,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { if (!is_writable_pte(iter.old_spte)) break; @@ -1180,6 +1251,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, spte_set = true; } + rcu_read_unlock(); + return spte_set; } @@ -1220,10 +1293,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; + rcu_read_lock(); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } + rcu_read_unlock(); + return leaf; }