diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b160652f7eee..8637bffbdb4a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -446,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte) } #endif +static bool spte_is_locklessly_modifiable(u64 spte) +{ + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); +} + static bool spte_has_volatile_bits(u64 spte) { + /* + * Always atomicly update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (spte_is_locklessly_modifiable(spte)) + return true; + if (!shadow_accessed_mask) return false; @@ -489,7 +503,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { - u64 mask, old_spte = *sptep; + u64 old_spte = *sptep; bool ret = false; WARN_ON(!is_rmap_spte(new_spte)); @@ -499,17 +513,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) return ret; } - new_spte |= old_spte & shadow_dirty_mask; - - mask = shadow_accessed_mask; - if (is_writable_pte(old_spte)) - mask |= shadow_dirty_mask; - - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) + if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); + /* + * For the spte updated out of mmu-lock is safe, since + * we always atomicly update it, see the comments in + * spte_has_volatile_bits(). + */ if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) ret = true; @@ -1085,11 +1098,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) kvm_flush_remote_tlbs(vcpu->kvm); } -static bool spte_is_locklessly_modifiable(u64 spte) -{ - return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); -} - /* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte writ-protection is caused by protecting shadow page table. @@ -2677,18 +2685,114 @@ exit: return ret; } +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +{ + /* + * #PF can be fast only if the shadow page table is present and it + * is caused by write-protect, that means we just need change the + * W bit of the spte which can be done out of mmu-lock. + */ + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + return true; +} + +static bool +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + gfn_t gfn; + + WARN_ON(!sp->role.direct); + + /* + * The gfn of direct spte is stable since it is calculated + * by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + mark_page_dirty(vcpu->kvm, gfn); + + return true; +} + +/* + * Return value: + * - true: let the vcpu to access on the same address again. + * - false: let the real page fault path to fix it. + */ +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, + u32 error_code) +{ + struct kvm_shadow_walk_iterator iterator; + bool ret = false; + u64 spte = 0ull; + + if (!page_fault_can_be_fast(vcpu, error_code)) + return false; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || iterator.level < level) + break; + + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_rmap_spte(spte)) { + ret = true; + goto exit; + } + + if (!is_last_spte(spte, level)) + goto exit; + + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + ret = true; + goto exit; + } + + /* + * Currently, to simplify the code, only the spte write-protected + * by dirty-log can be fast fixed. + */ + if (!spte_is_locklessly_modifiable(spte)) + goto exit; + + /* + * Currently, fast page fault only works for direct mapping since + * the gfn is not stable for indirect shadow page. + * See Documentation/virtual/kvm/locking.txt to get more detail. + */ + ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); +exit: + walk_shadow_page_lockless_end(vcpu); + + return ret; +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, - bool prefault) +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + gfn_t gfn, bool prefault) { int r; int level; int force_pt_level; pfn_t pfn; unsigned long mmu_seq; - bool map_writable; + bool map_writable, write = error_code & PFERR_WRITE_MASK; force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); if (likely(!force_pt_level)) { @@ -2705,6 +2809,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, v, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -3093,7 +3200,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn, prefault); + error_code, gfn, prefault); } static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) @@ -3173,6 +3280,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, gpa, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb();