blob: c8aa21ca12b80399d245e2924494c7f54f38c871 [file] [log] [blame]
From 858300686151930c31c3217aad26890342fafc34 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 1 Apr 2023 14:10:45 -0600
Subject: [PATCH] BACKPORT: FROMLIST: mm/kvm: add
mmu_notifier_test_clear_young()
mmu_notifier_test_clear_young() allows the caller to safely test and
clear the accessed bit in KVM PTEs without taking the MMU lock.
This patch adds the generic infrastructure that invokes the arch-specific
implementations added by subsequent patches. Those implementations generally
rely on two techniques: RCU and cmpxchg. The former protects KVM page
tables from being freed while the latter clears the accessed bit
atomically against both the hardware and other software page table
walkers.
mmu_notifier_test_clear_young() follows two design patterns: fallback
and batching. For any unsupported cases, it can optionally fall back
to mmu_notifier_ops->clear_young(). For a range of KVM PTEs, it can
test or test and clear their accessed bits according to a bitmap
provided by the caller.
mmu_notifier_test_clear_young() always returns 0 if fallback is not
allowed. If fallback happens, its return value is similar to that of
mmu_notifier_clear_young().
The bitmap parameter has the following specifications:
1. The number of bits should be at least (end-start)/PAGE_SIZE.
2. The offset of each bit is relative to the end. E.g., the offset
corresponding to addr is (end-addr)/PAGE_SIZE-1. This is convenient
for batching while forward looping.
3. For each KVM PTE with the accessed bit set (young), arch-specific
implementations flip the corresponding bit in the bitmap. They only
clear the accessed bit if the old value of that bitmap bit is 1. A
caller can test or test and clear the accessed bit by setting the
corresponding bit in the bitmap to 0 or 1, and the new value will be
1 or 0 for a young KVM PTE.
Signed-off-by: Yu Zhao <yuzhao@google.com>
(am from https://patchwork.kernel.org/patch/13144312/)
BUG=b:266976439
UPSTREAM-TASK=b:266738578
TEST=built
Change-Id: I542f491965336527c0da22917775421da5a53dfd
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392681
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: T.J. Alumbaugh <talumbau@google.com>
Kcr-patch: 3995af79d3483c6c6afa84c99ce0a8eabfb03102e0c6aac12fd840ff.patch
---
include/linux/kvm_host.h | 28 +++++++++++++++++
include/linux/mmu_notifier.h | 40 +++++++++++++++++++++++++
mm/mmu_notifier.c | 26 ++++++++++++++++
virt/kvm/kvm_main.c | 58 ++++++++++++++++++++++++++++++++++++
4 files changed, 152 insertions(+)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 94c9dafd08132baca34a2e6afa22864586d46ed3..b359084959dd167669482a1b92c74478ea6ece6b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -271,6 +271,8 @@ struct kvm_gfn_range {
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range,
+ gfn_t lsb_gfn, unsigned long *bitmap);
#endif
enum {
@@ -2399,6 +2401,32 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
/* Max number of entries allowed for each kvm dirty ring */
#define KVM_DIRTY_RING_MAX_ENTRIES 65536
+/*
+ * Architectures that implement kvm_arch_test_clear_young() should override
+ * kvm_arch_has_test_clear_young().
+ *
+ * kvm_arch_has_test_clear_young() is allowed to return false positives: it can
+ * return true if kvm_arch_test_clear_young() is supported but disabled due to
+ * some runtime constraint. In this case, kvm_arch_test_clear_young() should
+ * return false.
+ *
+ * The last parameter to kvm_arch_test_clear_young() is a bitmap with the
+ * following specifications:
+ * 1. The offset of each bit is relative to the second-to-last parameter,
+ * lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn, so bit 0
+ * stands for lsb_gfn itself and higher bits stand for lower gfns. This is
+ * convenient for batching while forward looping.
+ * 2. For each KVM PTE with the accessed bit set, the implementation should flip
+ * the corresponding bit in the bitmap. It should only clear the accessed bit
+ * if the old value of that bitmap bit is 1. This allows the caller to test
+ * or test and clear the accessed bit.
+ */
+#ifndef kvm_arch_has_test_clear_young
+/* Generic default, used unless the architecture defines its own: no support. */
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+ return false;
+}
+#endif
+
static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
gpa_t gpa, gpa_t size,
bool is_write, bool is_exec,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d39ebb10caeb68453af7595a741bd5fe2c390b40..fce9dd80933b664eae2bcb4b89894d7f519cff1e 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -122,6 +122,11 @@ struct mmu_notifier_ops {
struct mm_struct *mm,
unsigned long address);
+ /* see the comments on mmu_notifier_test_clear_young() */
+ bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ unsigned long *bitmap);
+
/*
* invalidate_range_start() and invalidate_range_end() must be
* paired and are called only when the mmap_lock and/or the
@@ -383,6 +388,9 @@ extern int __mmu_notifier_clear_young(struct mm_struct *mm,
unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
+extern int __mmu_notifier_test_clear_young(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool fallback, unsigned long *bitmap);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
@@ -428,6 +436,31 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
return 0;
}
+/*
+ * Test, or test and clear, the accessed bit of the KVM PTEs that map the
+ * range from start up to (but not including) end, reporting the result
+ * through the caller-provided bitmap.
+ *
+ * The return value is always 0 if fallback is not allowed. If fallback
+ * happens, it is similar to that of mmu_notifier_clear_young().
+ *
+ * Rules for the bitmap:
+ * 1. It should hold at least (end-start)/PAGE_SIZE bits.
+ * 2. Bits are indexed backwards from the end: the bit for addr sits at offset
+ *    (end-addr)/PAGE_SIZE-1. This is convenient for batching while forward
+ *    looping.
+ * 3. Every young KVM PTE (accessed bit set) flips its bit in the bitmap, and
+ *    its accessed bit is cleared only if the old value of that bitmap bit is
+ *    1. Hence a caller presets a bit to 0 to only test (it reads back 1 for a
+ *    young KVM PTE) or to 1 to test and clear (it reads back 0 for a young
+ *    KVM PTE).
+ */
+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm,
+						unsigned long start, unsigned long end,
+						bool fallback, unsigned long *bitmap)
+{
+	/* nothing subscribed to this mm: nothing to test or clear */
+	if (!mm_has_notifiers(mm))
+		return 0;
+
+	return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap);
+}
+
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
@@ -612,6 +645,13 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
return 0;
}
+/*
+ * Stub variant: ignores all of its arguments, leaves the bitmap untouched and
+ * always returns 0.
+ * NOTE(review): presumably the definition used when MMU notifiers are compiled
+ * out -- confirm against the enclosing preprocessor conditional.
+ */
+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool fallback, unsigned long *bitmap)
+{
+ return 0;
+}
+
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8982e6139d074bb38777f1e84f24c84f079c4a2f..25e2cd1b3688c09c4b83ef060f6be2d6fba1baa5 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -424,6 +424,32 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
return young;
}
+/*
+ * Out-of-line part of mmu_notifier_test_clear_young(); see the comments on
+ * that function for the meaning of the bitmap.
+ *
+ * Each notifier subscribed to the mm is first offered ->test_clear_young().
+ * A notifier that lacks the callback, or whose callback returns false, is
+ * handled through ->clear_young() instead, but only if fallback is allowed
+ * and the notifier provides it. The return value ORs together the results of
+ * those ->clear_young() calls, so it stays 0 when no fallback happened.
+ */
+int __mmu_notifier_test_clear_young(struct mm_struct *mm,
+				    unsigned long start, unsigned long end,
+				    bool fallback, unsigned long *bitmap)
+{
+	struct mmu_notifier *mn;
+	int young = 0;
+	int key = srcu_read_lock(&srcu);
+
+	hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list,
+				  hlist, srcu_read_lock_held(&srcu)) {
+		bool handled = mn->ops->test_clear_young &&
+			       mn->ops->test_clear_young(mn, mm, start, end, bitmap);
+
+		if (!handled && fallback && mn->ops->clear_young)
+			young |= mn->ops->clear_young(mn, mm, start, end);
+	}
+
+	srcu_read_unlock(&srcu, key);
+
+	return young;
+}
+
static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
const struct mmu_notifier_range *range)
{
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a57eb18cd8a665f5c60db60d715cef421c278538..9fc306a69a53a20158ca89872d308d162b0b66ba 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -907,6 +907,63 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
kvm_test_age_gfn);
}
+/*
+ * Test, or test and clear, the accessed bit of the KVM PTEs backing the HVA
+ * range from start up to (but not including) end, in every address space.
+ *
+ * Each memslot overlapping the range is visited under kvm->srcu. The request
+ * is clamped to the memslot, converted to a gfn range and handed to
+ * kvm_arch_test_clear_young() together with lsb_gfn, the gfn that bit 0 of
+ * the bitmap stands for; see the comments on the generic
+ * kvm_arch_has_test_clear_young() for the bitmap layout.
+ *
+ * Returns true if every arch call succeeded, which includes the case where no
+ * memslot overlaps the range. Returns false as soon as one arch call fails:
+ * the walk stops right there, so that a later address space can neither keep
+ * modifying the bitmap nor overwrite the failure with a success. The caller
+ * relies on the failure being reported to decide whether to fall back.
+ */
+static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start,
+				 unsigned long end, unsigned long *bitmap)
+{
+	int i;
+	int key;
+	bool success = true;
+
+	trace_kvm_age_hva(start, end);
+
+	key = srcu_read_lock(&kvm->srcu);
+
+	/* testing "success" here turns the inner "break" into a full stop */
+	for (i = 0; success && i < KVM_MAX_NR_ADDRESS_SPACES; i++) {
+		struct interval_tree_node *node;
+		struct kvm_memslots *slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) {
+			gfn_t lsb_gfn;
+			unsigned long hva_start, hva_end;
+			struct kvm_gfn_range range = {
+				.slot = container_of(node, struct kvm_memory_slot,
+						     hva_node[slots->node_idx]),
+			};
+
+			/* clamp the request to the part backed by this memslot */
+			hva_start = max(start, range.slot->userspace_addr);
+			hva_end = min(end, range.slot->userspace_addr +
+					   range.slot->npages * PAGE_SIZE);
+
+			/*
+			 * hva_end is exclusive; PAGE_SIZE - 1 is added before the
+			 * conversion so that a partially covered last page is
+			 * still included (assuming hva_to_gfn_memslot() truncates
+			 * to a page boundary -- confirm against its definition).
+			 */
+			range.start = hva_to_gfn_memslot(hva_start, range.slot);
+			range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, range.slot);
+
+			if (WARN_ON_ONCE(range.end <= range.start))
+				continue;
+
+			/* see the comments on the generic kvm_arch_has_test_clear_young() */
+			lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot);
+
+			success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap);
+			if (!success)
+				break;
+		}
+	}
+
+	srcu_read_unlock(&kvm->srcu, key);
+
+	return success;
+}
+
+/*
+ * ->test_clear_young() callback of the KVM MMU notifier.
+ *
+ * Returns false without touching the bitmap when
+ * kvm_arch_has_test_clear_young() reports no support; otherwise returns the
+ * result of kvm_test_clear_young() on the VM that owns this notifier. The mm
+ * argument is not used.
+ */
+static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
+					      unsigned long start, unsigned long end,
+					      unsigned long *bitmap)
+{
+	if (!kvm_arch_has_test_clear_young())
+		return false;
+
+	return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap);
+}
+
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
@@ -924,6 +981,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
.clear_young = kvm_mmu_notifier_clear_young,
.test_young = kvm_mmu_notifier_test_young,
+ .test_clear_young = kvm_mmu_notifier_test_clear_young,
.release = kvm_mmu_notifier_release,
};
--
2.45.1.288.g0e0cd299f1-goog