| From 80f117e4a498a3505038c40ed3c9d6ace5a2f8a9 Mon Sep 17 00:00:00 2001 |
| From: Yu Zhao <yuzhao@google.com> |
| Date: Sat, 1 Apr 2023 16:36:37 -0600 |
| Subject: [PATCH] BACKPORT: FROMLIST: kvm/x86: add kvm_arch_test_clear_young() |
| |
Add kvm_arch_test_clear_young() for the vast majority of VMs that are
not nested and run on hardware that sets the accessed bit in TDP MMU
PTEs.
| |
It relies on two techniques, RCU and cmpxchg, to safely test and clear
the accessed bit without taking the MMU lock. The former protects KVM
page tables from being freed, while the latter clears the accessed bit
atomically against both the hardware and other software page table
walkers.
| |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| (am from https://patchwork.kernel.org/patch/13144311/) |
| |
| BUG=b:266976439 |
| UPSTREAM-TASK=b:266738578 |
| TEST=built |
| |
| Change-Id: I46afae96afd8b2b6865a24d42eb7aba6935f1573 |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392742 |
| Commit-Queue: Yu Zhao <yuzhao@chromium.org> |
| Tested-by: Yu Zhao <yuzhao@chromium.org> |
| Reviewed-by: T.J. Alumbaugh <talumbau@google.com> |
| --- |
| arch/x86/include/asm/kvm_host.h | 15 ++++++++++++ |
| arch/x86/kvm/mmu/spte.h | 1 - |
| arch/x86/kvm/mmu/tdp_mmu.c | 41 +++++++++++++++++++++++++++++++++ |
| 3 files changed, 56 insertions(+), 1 deletion(-) |
| |
| diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h |
| index ffe2914075fd6d88d13ef0bb4e255d6538e23474..35907e9a13448c803a9120c85dd2153f428998b0 100644 |
| --- a/arch/x86/include/asm/kvm_host.h |
| +++ b/arch/x86/include/asm/kvm_host.h |
| @@ -1383,6 +1383,12 @@ struct kvm_arch { |
| * the MMU lock in read mode + the tdp_mmu_pages_lock or |
| * the MMU lock in write mode |
| * |
| + * kvm_arch_test_clear_young() is a special case. It relies on two |
| + * techniques, RCU and cmpxchg, to safely test and clear the accessed |
| + * bit without taking the MMU lock. The former protects KVM page tables |
+ * from being freed, while the latter clears the accessed bit atomically
| + * against both the hardware and other software page table walkers. |
| + * |
| * Roots will remain in the list until their tdp_mmu_root_count |
| * drops to zero, at which point the thread that decremented the |
* count to zero should remove the root from the list and clean
| @@ -1774,6 +1780,7 @@ struct kvm_arch_async_pf { |
| |
| extern u32 __read_mostly kvm_nr_uret_msrs; |
| extern u64 __read_mostly host_efer; |
| +extern u64 __read_mostly shadow_accessed_mask; |
| extern bool __read_mostly allow_smaller_maxphyaddr; |
| extern bool __read_mostly enable_apicv; |
| extern struct kvm_x86_ops kvm_x86_ops; |
| @@ -2212,6 +2219,14 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); |
| KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ |
| KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) |
| |
| +/* see the comments on the generic kvm_arch_has_test_clear_young() */ |
| +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young |
| +static inline bool kvm_arch_has_test_clear_young(void) |
| +{ |
| + return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) && |
| + (!IS_REACHABLE(CONFIG_KVM) || (tdp_enabled && shadow_accessed_mask)); |
| +} |
| + |
| /* |
| * KVM previously used a u32 field in kvm_run to indicate the hypercall was |
| * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the |
| diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h |
| index 1279db2eab44c7887f09d40ab2c895bf409122eb..a82c4fa1c47b9e155e3a805f0705fd7264226e13 100644 |
| --- a/arch/x86/kvm/mmu/spte.h |
| +++ b/arch/x86/kvm/mmu/spte.h |
| @@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask; |
| extern u64 __read_mostly shadow_nx_mask; |
| extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ |
| extern u64 __read_mostly shadow_user_mask; |
| -extern u64 __read_mostly shadow_accessed_mask; |
| extern u64 __read_mostly shadow_dirty_mask; |
| extern u64 __read_mostly shadow_mmio_value; |
| extern u64 __read_mostly shadow_mmio_mask; |
| diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c |
| index 08340219c35a40291abc29856a0f396834e0e742..19b2d21d5125253c1b7176f14bf380db04277e3c 100644 |
| --- a/arch/x86/kvm/mmu/tdp_mmu.c |
| +++ b/arch/x86/kvm/mmu/tdp_mmu.c |
| @@ -1232,6 +1232,47 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) |
| return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); |
| } |
| |
| +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, |
| + gfn_t lsb_gfn, unsigned long *bitmap) |
| +{ |
| + struct kvm_mmu_page *root; |
| + |
| + if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) |
| + return false; |
| + |
| + if (kvm_memslots_have_rmaps(kvm)) |
| + return false; |
| + |
| + /* see the comments on kvm_arch->tdp_mmu_roots */ |
| + rcu_read_lock(); |
| + |
| + list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { |
| + struct tdp_iter iter; |
| + |
| + if (kvm_mmu_page_as_id(root) != range->slot->as_id) |
| + continue; |
| + |
| + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { |
| + u64 *sptep = rcu_dereference(iter.sptep); |
| + u64 new_spte = iter.old_spte & ~shadow_accessed_mask; |
| + |
| + VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep))); |
| + VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end); |
| + |
| + if (new_spte == iter.old_spte) |
| + continue; |
| + |
| + /* see the comments on the generic kvm_arch_has_test_clear_young() */ |
| + if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap)) |
| + cmpxchg64(sptep, iter.old_spte, new_spte); |
| + } |
| + } |
| + |
| + rcu_read_unlock(); |
| + |
| + return true; |
| +} |
| + |
| static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, |
| struct kvm_gfn_range *range) |
| { |
| -- |
| 2.34.1 |
| |