From 80f117e4a498a3505038c40ed3c9d6ace5a2f8a9 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 1 Apr 2023 16:36:37 -0600
Subject: [PATCH] BACKPORT: FROMLIST: kvm/x86: add kvm_arch_test_clear_young()

This patch adds kvm_arch_test_clear_young() for the vast majority of
VMs that are not nested and run on hardware that sets the accessed bit
in TDP MMU PTEs.

It relies on two techniques, RCU and cmpxchg, to safely test and clear
the accessed bit without taking the MMU lock. The former protects KVM
page tables from being freed while the latter clears the accessed bit
atomically against both the hardware and other software page table
walkers.
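
In outline, the lockless pattern looks like this (an illustrative
sketch of the idea, not the exact code in the diff below; sptep and the
old_spte/new_spte locals stand in for a leaf PTE reached while walking
a TDP MMU page table under RCU):

	rcu_read_lock();	/* the page table cannot be freed under us */

	old_spte = READ_ONCE(*sptep);
	new_spte = old_spte & ~shadow_accessed_mask;
	if (new_spte != old_spte)
		/*
		 * If a hardware or software walker changed the PTE in
		 * the meantime, the cmpxchg fails and the accessed bit
		 * stays set; it can be cleared on a later pass.
		 */
		cmpxchg64(sptep, old_spte, new_spte);

	rcu_read_unlock();
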
Signed-off-by: Yu Zhao <yuzhao@google.com>
(am from https://patchwork.kernel.org/patch/13144311/)

BUG=b:266976439
UPSTREAM-TASK=b:266738578
TEST=built

Change-Id: I46afae96afd8b2b6865a24d42eb7aba6935f1573
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392742
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: T.J. Alumbaugh <talumbau@google.com>
---
 arch/x86/include/asm/kvm_host.h | 15 ++++++++++++
 arch/x86/kvm/mmu/spte.h         |  1 -
 arch/x86/kvm/mmu/tdp_mmu.c      | 41 +++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ffe2914075fd6d88d13ef0bb4e255d6538e23474..35907e9a13448c803a9120c85dd2153f428998b0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1383,6 +1383,12 @@ struct kvm_arch {
 	 * the MMU lock in read mode + the tdp_mmu_pages_lock or
 	 * the MMU lock in write mode
 	 *
+	 * kvm_arch_test_clear_young() is a special case. It relies on two
+	 * techniques, RCU and cmpxchg, to safely test and clear the accessed
+	 * bit without taking the MMU lock. The former protects KVM page tables
+	 * from being freed while the latter clears the accessed bit atomically
+	 * against both the hardware and other software page table walkers.
+	 *
 	 * Roots will remain in the list until their tdp_mmu_root_count
 	 * drops to zero, at which point the thread that decremented the
 	 * count to zero should removed the root from the list and clean
@@ -1774,6 +1780,7 @@ struct kvm_arch_async_pf {
 
 extern u32 __read_mostly kvm_nr_uret_msrs;
 extern u64 __read_mostly host_efer;
+extern u64 __read_mostly shadow_accessed_mask;
 extern bool __read_mostly allow_smaller_maxphyaddr;
 extern bool __read_mostly enable_apicv;
 extern struct kvm_x86_ops kvm_x86_ops;
@@ -2212,6 +2219,14 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
 			 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\
 			 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
 
+/* see the comments on the generic kvm_arch_has_test_clear_young() */
+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) &&
+	       (!IS_REACHABLE(CONFIG_KVM) || (tdp_enabled && shadow_accessed_mask));
+}
+
 /*
  * KVM previously used a u32 field in kvm_run to indicate the hypercall was
  * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1279db2eab44c7887f09d40ab2c895bf409122eb..a82c4fa1c47b9e155e3a805f0705fd7264226e13 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
-extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
 extern u64 __read_mostly shadow_mmio_mask;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 08340219c35a40291abc29856a0f396834e0e742..19b2d21d5125253c1b7176f14bf380db04277e3c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1232,6 +1232,47 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
 }
 
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range,
+			       gfn_t lsb_gfn, unsigned long *bitmap)
+{
+	struct kvm_mmu_page *root;
+
+	if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young()))
+		return false;
+
+	if (kvm_memslots_have_rmaps(kvm))
+		return false;
+
+	/* see the comments on kvm_arch->tdp_mmu_roots */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+		struct tdp_iter iter;
+
+		if (kvm_mmu_page_as_id(root) != range->slot->as_id)
+			continue;
+
+		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
+			u64 *sptep = rcu_dereference(iter.sptep);
+			u64 new_spte = iter.old_spte & ~shadow_accessed_mask;
+
+			VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep)));
+			VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end);
+
+			if (new_spte == iter.old_spte)
+				continue;
+
+			/* see the comments on the generic kvm_arch_has_test_clear_young() */
+			if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap))
+				cmpxchg64(sptep, iter.old_spte, new_spte);
+		}
+	}
+
+	rcu_read_unlock();
+
+	return true;
+}
+
 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
 			 struct kvm_gfn_range *range)
 {
-- 
2.34.1