| From 8bd561bd451a2035e26637f0fe5b7523857afe86 Mon Sep 17 00:00:00 2001 |
| From: Yu Zhao <yuzhao@google.com> |
| Date: Sat, 1 Apr 2023 14:10:45 -0600 |
| Subject: [PATCH] BACKPORT: FROMLIST: mm/kvm: add |
| mmu_notifier_test_clear_young() |
| |
| mmu_notifier_test_clear_young() allows the caller to safely test and |
| clear the accessed bit in KVM PTEs without taking the MMU lock. |
| |
| This patch adds the generic infrastructure that the subsequent |
| arch-specific patches plug into. The arch-specific implementations |
| generally rely on two techniques: RCU and cmpxchg. The former protects |
| KVM page tables from being freed while the latter clears the accessed |
| bit atomically against both the hardware and other software page table |
| walkers. |
| |
| mmu_notifier_test_clear_young() follows two design patterns: fallback |
| and batching. For any unsupported cases, it can optionally fall back |
| to mmu_notifier_ops->clear_young(). For a range of KVM PTEs, it can |
| test or test and clear their accessed bits according to a bitmap |
| provided by the caller. |
| |
| mmu_notifier_test_clear_young() always returns 0 if fallback is not |
| allowed. If fallback happens, its return value is similar to that of |
| mmu_notifier_clear_young(). |
| |
| The bitmap parameter has the following specifications: |
| 1. The number of bits should be at least (end-start)/PAGE_SIZE. |
| 2. The offset of each bit is relative to the end. E.g., the offset |
| corresponding to addr is (end-addr)/PAGE_SIZE-1. This is convenient |
| for batching while forward looping. |
| 3. For each KVM PTE with the accessed bit set (young), arch-specific |
| implementations flip the corresponding bit in the bitmap, and clear |
| the accessed bit only if the old value of that bitmap bit is 1. A |
| caller can test, or test and clear, the accessed bit by setting the |
| corresponding bit in the bitmap to 0 or 1; the new value will be 1 |
| or 0, respectively, for a young KVM PTE. |
| |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| (am from https://patchwork.kernel.org/patch/13144312/) |
| |
| BUG=b:266976439 |
| UPSTREAM-TASK=b:266738578 |
| TEST=built |
| |
| Change-Id: I542f491965336527c0da22917775421da5a53dfd |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392681 |
| Commit-Queue: Yu Zhao <yuzhao@chromium.org> |
| Tested-by: Yu Zhao <yuzhao@chromium.org> |
| Reviewed-by: T.J. Alumbaugh <talumbau@google.com> |
| --- |
| include/linux/kvm_host.h | 64 ++++++++++++++++++++++++++++++++++++ |
| include/linux/mmu_notifier.h | 40 ++++++++++++++++++++++ |
| mm/mmu_notifier.c | 26 +++++++++++++++ |
| virt/kvm/kvm_main.c | 58 ++++++++++++++++++++++++++++++++ |
| 4 files changed, 188 insertions(+) |
| |
| diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h |
| index 0e571e973bc283c50d910d4fbda03cb49b741f16..58338dcb2a01bc91356d8d15602c6914196dfe8a 100644 |
| --- a/include/linux/kvm_host.h |
| +++ b/include/linux/kvm_host.h |
| @@ -267,6 +267,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); |
| bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); |
| bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); |
| bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); |
| +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, |
| + gfn_t lsb_gfn, unsigned long *bitmap); |
| #endif |
| |
| enum { |
| @@ -2288,4 +2290,30 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) |
| /* Max number of entries allowed for each kvm dirty ring */ |
| #define KVM_DIRTY_RING_MAX_ENTRIES 65536 |
| |
| +/* |
| + * Architectures that implement kvm_arch_test_clear_young() should override |
| + * kvm_arch_has_test_clear_young() by defining a macro of the same name. |
| + * |
| + * kvm_arch_has_test_clear_young() is allowed to return false positives: it can |
| + * return true if kvm_arch_test_clear_young() is supported but disabled due to |
| + * some runtime constraint. In this case, kvm_arch_test_clear_young() should |
| + * return false. |
| + * |
| + * The last parameter to kvm_arch_test_clear_young() is a bitmap with the |
| + * following specifications: |
| + * 1. The offset of each bit is relative to the second-to-last parameter, |
| + * lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn. This is |
| + * convenient for batching while forward looping. |
| + * 2. For each KVM PTE with the accessed bit set, the implementation should flip |
| + * the corresponding bit in the bitmap. It should only clear the accessed bit |
| + * if the old value of that bitmap bit is 1. This allows the caller to test, |
| + * or test and clear, the accessed bit. |
| + */ |
| +#ifndef kvm_arch_has_test_clear_young |
| +static inline bool kvm_arch_has_test_clear_young(void) |
| +{ |
| + return false; |
| +} |
| +#endif |
| + |
| #endif |
| diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h |
| index 64a3e051c3c43897dca74ea44e45cc32c183e06f..ea68460a8b7ea4d7016c112a6f77121fde71f0dd 100644 |
| --- a/include/linux/mmu_notifier.h |
| +++ b/include/linux/mmu_notifier.h |
| @@ -122,6 +122,11 @@ struct mmu_notifier_ops { |
| struct mm_struct *mm, |
| unsigned long address); |
| |
| + /* see the comments on mmu_notifier_test_clear_young() */ |
| + bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + unsigned long *bitmap); |
| + |
| /* |
| * change_pte is called in cases that pte mapping to page is changed: |
| * for example, when ksm remaps pte to point to a new shared page. |
| @@ -392,6 +397,9 @@ extern int __mmu_notifier_clear_young(struct mm_struct *mm, |
| unsigned long end); |
| extern int __mmu_notifier_test_young(struct mm_struct *mm, |
| unsigned long address); |
| +extern int __mmu_notifier_test_clear_young(struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + bool fallback, unsigned long *bitmap); |
| extern void __mmu_notifier_change_pte(struct mm_struct *mm, |
| unsigned long address, pte_t pte); |
| extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); |
| @@ -440,6 +448,31 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm, |
| return 0; |
| } |
| |
| +/* |
| + * Test, and optionally clear, the accessed bit in the KVM PTEs of [start, end). |
| + * Returns 0 if fallback is not allowed. If fallback happens, the return value |
| + * is similar to that of mmu_notifier_clear_young(). |
| + * |
| + * The bitmap has the following specifications: |
| + * 1. The number of bits should be at least (end-start)/PAGE_SIZE. |
| + * 2. The offset of each bit is relative to the end. E.g., the offset |
| + * corresponding to addr is (end-addr)/PAGE_SIZE-1. This is convenient for |
| + * batching while forward looping. |
| + * 3. For each young KVM PTE (accessed bit set), this function flips the |
| + * corresponding bitmap bit, and clears the accessed bit only if the old |
| + * value of that bitmap bit is 1. I.e., a caller sets a bit to 0 to test, or |
| + * to 1 to test and clear; a young KVM PTE then yields 1 or 0, respectively. |
| + */ |
| +static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + bool fallback, unsigned long *bitmap) |
| +{ |
| + if (mm_has_notifiers(mm)) |
| + return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap); |
| + |
| + return 0; |
| +} |
| + |
| static inline void mmu_notifier_change_pte(struct mm_struct *mm, |
| unsigned long address, pte_t pte) |
| { |
| @@ -690,6 +723,13 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm, |
| return 0; |
| } |
| |
| +static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + bool fallback, unsigned long *bitmap) |
| +{ |
| + return 0; /* stub (presumably !CONFIG_MMU_NOTIFIER - confirm): no notifiers */ |
| +} |
| + |
| static inline void mmu_notifier_change_pte(struct mm_struct *mm, |
| unsigned long address, pte_t pte) |
| { |
| diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c |
| index 50c0dde1354f456366786cfa35d555df5398ad96..a4a2906074b6ca16b1deef4adaf3db1b700aa105 100644 |
| --- a/mm/mmu_notifier.c |
| +++ b/mm/mmu_notifier.c |
| @@ -441,6 +441,32 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, |
| srcu_read_unlock(&srcu, id); |
| } |
| |
| +/* out-of-line body of mmu_notifier_test_clear_young(); see its comments */ |
| +int __mmu_notifier_test_clear_young(struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + bool fallback, unsigned long *bitmap) |
| +{ |
| + int key; |
| + struct mmu_notifier *mn; |
| + int young = 0; /* only ever set by the ->clear_young() fallback below */ |
| + |
| + key = srcu_read_lock(&srcu); |
| + |
| + hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list, |
| + hlist, srcu_read_lock_held(&srcu)) { |
| + if (mn->ops->test_clear_young && |
| + mn->ops->test_clear_young(mn, mm, start, end, bitmap)) |
| + continue; |
| + /* not handled by ->test_clear_young(): optionally use ->clear_young() */ |
| + if (fallback && mn->ops->clear_young) |
| + young |= mn->ops->clear_young(mn, mm, start, end); |
| + } |
| + |
| + srcu_read_unlock(&srcu, key); |
| + |
| + return young; |
| +} |
| + |
| static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, |
| const struct mmu_notifier_range *range) |
| { |
| diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c |
| index 479802a892d4f7aace4d72a38e8fab18e5e7dc03..642534d67a7a6f37fffd370f7fa32569299bb511 100644 |
| --- a/virt/kvm/kvm_main.c |
| +++ b/virt/kvm/kvm_main.c |
| @@ -880,6 +880,63 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, |
| kvm_test_age_gfn); |
| } |
| |
| +/* |
| + * Let the arch code test/clear accessed bits, per @bitmap, in every memslot |
| + * overlapping [start, end). Returns false as soon as any range fails. |
| + */ |
| +static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start, |
| + unsigned long end, unsigned long *bitmap) |
| +{ |
| + int i, key; |
| + bool success = true; |
| + |
| + trace_kvm_age_hva(start, end); |
| + key = srcu_read_lock(&kvm->srcu); |
| + |
| + /* Re-check success: the break below only leaves the inner memslot loop. */ |
| + for (i = 0; success && i < KVM_ADDRESS_SPACE_NUM; i++) { |
| + struct interval_tree_node *node; |
| + struct kvm_memslots *slots = __kvm_memslots(kvm, i); |
| + |
| + kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) { |
| + gfn_t lsb_gfn; |
| + unsigned long hva_start, hva_end; |
| + struct kvm_gfn_range range = { |
| + .slot = container_of(node, struct kvm_memory_slot, |
| + hva_node[slots->node_idx]), |
| + }; |
| + |
| + /* clamp the requested HVA range to this memslot */ |
| + hva_start = max(start, range.slot->userspace_addr); |
| + hva_end = min(end, range.slot->userspace_addr + |
| + range.slot->npages * PAGE_SIZE); |
| + range.start = hva_to_gfn_memslot(hva_start, range.slot); |
| + range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, range.slot); |
| + |
| + if (WARN_ON_ONCE(range.end <= range.start)) |
| + continue; |
| + /* bitmap bit 0 maps to the page at end - 1; offsets are lsb_gfn - gfn */ |
| + lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot); |
| + success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap); |
| + if (!success) |
| + break; |
| + } |
| + } |
| + |
| + srcu_read_unlock(&kvm->srcu, key); |
| + return success; |
| +} |
| + |
| +static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + unsigned long *bitmap) |
| +{ |
| + if (kvm_arch_has_test_clear_young()) |
| + return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap); |
| + /* no arch support: false lets the notifier core try ->clear_young() */ |
| + return false; |
| +} |
| + |
| static void kvm_mmu_notifier_release(struct mmu_notifier *mn, |
| struct mm_struct *mm) |
| { |
| @@ -898,6 +955,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { |
| .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |
| .clear_young = kvm_mmu_notifier_clear_young, |
| .test_young = kvm_mmu_notifier_test_young, |
| + .test_clear_young = kvm_mmu_notifier_test_clear_young, |
| .change_pte = kvm_mmu_notifier_change_pte, |
| .release = kvm_mmu_notifier_release, |
| }; |
| -- |
| 2.34.1 |
| |