| From 858300686151930c31c3217aad26890342fafc34 Mon Sep 17 00:00:00 2001 |
| From: Yu Zhao <yuzhao@google.com> |
| Date: Sat, 1 Apr 2023 14:10:45 -0600 |
| Subject: [PATCH] BACKPORT: FROMLIST: mm/kvm: add |
| mmu_notifier_test_clear_young() |
| |
| mmu_notifier_test_clear_young() allows the caller to safely test and |
| clear the accessed bit in KVM PTEs without taking the MMU lock. |
| |
| This patch adds the generic infrastructure; the arch-specific |
| implementations follow in subsequent patches. Those implementations |
| generally rely on two techniques: RCU and cmpxchg. The former protects |
| KVM page tables from being freed while the latter clears the accessed |
| bit atomically against both the hardware and other software page table |
| walkers. |
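| |
| As a rough illustration only (not part of this patch), a cmpxchg-based |
| test-and-clear of an accessed bit could look like the sketch below; the |
| PTE_ACCESSED mask, the ptep type and the helper name are placeholders, |
| since the real bit layout and accessors are arch-specific: |
| |
|   /* Sketch: atomically clear a placeholder accessed bit via cmpxchg. */ |
|   static bool pte_test_clear_young_atomic(unsigned long *ptep) |
|   { |
|           unsigned long old = READ_ONCE(*ptep); |
| |
|           do { |
|                   /* Nothing to do if the accessed bit is already clear. */ |
|                   if (!(old & PTE_ACCESSED)) |
|                           return false; |
|           } while (!try_cmpxchg(ptep, &old, old & ~PTE_ACCESSED)); |
| |
|           return true; |
|   } |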
| |
| mmu_notifier_test_clear_young() follows two design patterns: fallback |
| and batching. For any unsupported cases, it can optionally fall back |
| to mmu_notifier_ops->clear_young(). For a range of KVM PTEs, it can |
| test or test and clear their accessed bits according to a bitmap |
| provided by the caller. |
| |
| mmu_notifier_test_clear_young() always returns 0 if fallback is not |
| allowed. If fallback happens, its return value is similar to that of |
| mmu_notifier_clear_young(). |
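| |
| For instance (a caller-side sketch only, not taken from this patch; the |
| surrounding variables are assumed), a caller that also wants results |
| from secondary MMUs lacking mmu_notifier_ops->test_clear_young() would |
| pass fallback=true: |
| |
|   /* Notifiers without ->test_clear_young() fall back to ->clear_young(); |
|    * their accumulated result is returned, as with mmu_notifier_clear_young(). |
|    */ |
|   int young = mmu_notifier_test_clear_young(mm, start, end, true, bitmap); |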
| |
| The bitmap parameter has the following specifications: |
| 1. The number of bits should be at least (end-start)/PAGE_SIZE. |
| 2. The offset of each bit is relative to the end. E.g., the offset |
| corresponding to addr is (end-addr)/PAGE_SIZE-1. This is convenient |
| for batching while forward looping. |
| 3. For each KVM PTE with the accessed bit set (young), arch-specific |
| implementations flip the corresponding bit in the bitmap. An |
| implementation only clears the accessed bit if the old value of the |
| bitmap bit is 1. A caller can therefore test or test and clear the |
| accessed bit by presetting the corresponding bit in the bitmap to 0 or |
| 1, and the new value will be 1 or 0 for a young KVM PTE (see the |
| caller-side sketch after this list). |
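| |
| As a caller-side sketch (illustrative only, not part of this patch; the |
| helper name and the test-only usage are assumptions), the bit index math |
| from rule 2 and the test-only mode from rule 3 look like this: |
| |
|   /* Test, without clearing, whether the KVM PTE backing addr was young. */ |
|   static bool range_addr_was_young(struct mm_struct *mm, unsigned long start, |
|                                    unsigned long end, unsigned long addr) |
|   { |
|           bool young; |
|           unsigned long nr = (end - start) / PAGE_SIZE; |
|           /* All bits start at 0, so the call only tests the accessed bits. */ |
|           unsigned long *bitmap = bitmap_zalloc(nr, GFP_KERNEL); |
| |
|           if (!bitmap) |
|                   return false; |
| |
|           mmu_notifier_test_clear_young(mm, start, end, false, bitmap); |
| |
|           /* Per rule 2, the bit for addr sits at (end-addr)/PAGE_SIZE-1. */ |
|           young = test_bit((end - addr) / PAGE_SIZE - 1, bitmap); |
| |
|           bitmap_free(bitmap); |
|           return young; |
|   } |
| |
| Presetting the bits to 1 instead (e.g., with bitmap_fill()) would also |
| clear the accessed bits, with a young KVM PTE then indicated by a 0 bit. |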
| |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| (am from https://patchwork.kernel.org/patch/13144312/) |
| |
| BUG=b:266976439 |
| UPSTREAM-TASK=b:266738578 |
| TEST=built |
| |
| Change-Id: I542f491965336527c0da22917775421da5a53dfd |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392681 |
| Commit-Queue: Yu Zhao <yuzhao@chromium.org> |
| Tested-by: Yu Zhao <yuzhao@chromium.org> |
| Reviewed-by: T.J. Alumbaugh <talumbau@google.com> |
| Kcr-patch: 3995af79d3483c6c6afa84c99ce0a8eabfb03102e0c6aac12fd840ff.patch |
| --- |
| include/linux/kvm_host.h | 28 +++++++++++++++++ |
| include/linux/mmu_notifier.h | 40 +++++++++++++++++++++++++ |
| mm/mmu_notifier.c | 26 ++++++++++++++++ |
| virt/kvm/kvm_main.c | 58 ++++++++++++++++++++++++++++++++++++ |
| 4 files changed, 152 insertions(+) |
| |
| diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h |
| index 94c9dafd08132baca34a2e6afa22864586d46ed3..b359084959dd167669482a1b92c74478ea6ece6b 100644 |
| --- a/include/linux/kvm_host.h |
| +++ b/include/linux/kvm_host.h |
| @@ -271,6 +271,8 @@ struct kvm_gfn_range { |
| bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); |
| bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); |
| bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); |
| +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, |
| + gfn_t lsb_gfn, unsigned long *bitmap); |
| #endif |
| |
| enum { |
| @@ -2399,6 +2401,32 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) |
| /* Max number of entries allowed for each kvm dirty ring */ |
| #define KVM_DIRTY_RING_MAX_ENTRIES 65536 |
| |
| +/* |
| + * Architectures that implement kvm_arch_test_clear_young() should override |
| + * kvm_arch_has_test_clear_young(). |
| + * |
| + * kvm_arch_has_test_clear_young() is allowed to return a false positive. It can |
| + * return true if kvm_arch_test_clear_young() is supported but disabled due to |
| + * some runtime constraint. In this case, kvm_arch_test_clear_young() should |
| + * return false. |
| + * |
| + * The last parameter to kvm_arch_test_clear_young() is a bitmap with the |
| + * following specifications: |
| + * 1. The offset of each bit is relative to the second-to-last parameter |
| + * lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn. This is |
| + * convenient for batching while forward looping. |
| + * 2. For each KVM PTE with the accessed bit set, the implementation should flip |
| + * the corresponding bit in the bitmap. It should only clear the accessed bit |
| + * if the old value is 1. This allows the caller to test or test and clear |
| + * the accessed bit. |
| + */ |
| +#ifndef kvm_arch_has_test_clear_young |
| +static inline bool kvm_arch_has_test_clear_young(void) |
| +{ |
| + return false; |
| +} |
| +#endif |
| + |
| static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, |
| gpa_t gpa, gpa_t size, |
| bool is_write, bool is_exec, |
| diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h |
| index d39ebb10caeb68453af7595a741bd5fe2c390b40..fce9dd80933b664eae2bcb4b89894d7f519cff1e 100644 |
| --- a/include/linux/mmu_notifier.h |
| +++ b/include/linux/mmu_notifier.h |
| @@ -122,6 +122,11 @@ struct mmu_notifier_ops { |
| struct mm_struct *mm, |
| unsigned long address); |
| |
| + /* see the comments on mmu_notifier_test_clear_young() */ |
| + bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + unsigned long *bitmap); |
| + |
| /* |
| * invalidate_range_start() and invalidate_range_end() must be |
| * paired and are called only when the mmap_lock and/or the |
| @@ -383,6 +388,9 @@ extern int __mmu_notifier_clear_young(struct mm_struct *mm, |
| unsigned long end); |
| extern int __mmu_notifier_test_young(struct mm_struct *mm, |
| unsigned long address); |
| +extern int __mmu_notifier_test_clear_young(struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + bool fallback, unsigned long *bitmap); |
| extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); |
| extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); |
| extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, |
| @@ -428,6 +436,31 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm, |
| return 0; |
| } |
| |
| +/* |
| + * This function always returns 0 if fallback is not allowed. If fallback |
| + * happens, its return value is similar to that of mmu_notifier_clear_young(). |
| + * |
| + * The bitmap has the following specifications: |
| + * 1. The number of bits should be at least (end-start)/PAGE_SIZE. |
| + * 2. The offset of each bit is relative to the end. E.g., the offset |
| + * corresponding to addr is (end-addr)/PAGE_SIZE-1. This is convenient for |
| + * batching while forward looping. |
| + * 3. For each KVM PTE with the accessed bit set (young), this function flips |
| + * the corresponding bit in the bitmap. It only clears the accessed bit if |
| + * the old value is 1. A caller can test or test and clear the accessed bit |
| + * by setting the corresponding bit in the bitmap to 0 or 1, and the new |
| + * value will be 1 or 0 for a young KVM PTE. |
| + */ |
| +static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + bool fallback, unsigned long *bitmap) |
| +{ |
| + if (mm_has_notifiers(mm)) |
| + return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap); |
| + |
| + return 0; |
| +} |
| + |
| static inline void |
| mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) |
| { |
| @@ -612,6 +645,13 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm, |
| return 0; |
| } |
| |
| +static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + bool fallback, unsigned long *bitmap) |
| +{ |
| + return 0; |
| +} |
| + |
| static inline void |
| mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) |
| { |
| diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c |
| index 8982e6139d074bb38777f1e84f24c84f079c4a2f..25e2cd1b3688c09c4b83ef060f6be2d6fba1baa5 100644 |
| --- a/mm/mmu_notifier.c |
| +++ b/mm/mmu_notifier.c |
| @@ -424,6 +424,32 @@ int __mmu_notifier_test_young(struct mm_struct *mm, |
| return young; |
| } |
| |
| +/* see the comments on mmu_notifier_test_clear_young() */ |
| +int __mmu_notifier_test_clear_young(struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + bool fallback, unsigned long *bitmap) |
| +{ |
| + int key; |
| + struct mmu_notifier *mn; |
| + int young = 0; |
| + |
| + key = srcu_read_lock(&srcu); |
| + |
| + hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list, |
| + hlist, srcu_read_lock_held(&srcu)) { |
| + if (mn->ops->test_clear_young && |
| + mn->ops->test_clear_young(mn, mm, start, end, bitmap)) |
| + continue; |
| + |
| + if (fallback && mn->ops->clear_young) |
| + young |= mn->ops->clear_young(mn, mm, start, end); |
| + } |
| + |
| + srcu_read_unlock(&srcu, key); |
| + |
| + return young; |
| +} |
| + |
| static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, |
| const struct mmu_notifier_range *range) |
| { |
| diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c |
| index a57eb18cd8a665f5c60db60d715cef421c278538..9fc306a69a53a20158ca89872d308d162b0b66ba 100644 |
| --- a/virt/kvm/kvm_main.c |
| +++ b/virt/kvm/kvm_main.c |
| @@ -907,6 +907,63 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, |
| kvm_test_age_gfn); |
| } |
| |
| +static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start, |
| + unsigned long end, unsigned long *bitmap) |
| +{ |
| + int i; |
| + int key; |
| + bool success = true; |
| + |
| + trace_kvm_age_hva(start, end); |
| + |
| + key = srcu_read_lock(&kvm->srcu); |
| + |
| + for (i = 0; i < KVM_MAX_NR_ADDRESS_SPACES; i++) { |
| + struct interval_tree_node *node; |
| + struct kvm_memslots *slots = __kvm_memslots(kvm, i); |
| + |
| + kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) { |
| + gfn_t lsb_gfn; |
| + unsigned long hva_start, hva_end; |
| + struct kvm_gfn_range range = { |
| + .slot = container_of(node, struct kvm_memory_slot, |
| + hva_node[slots->node_idx]), |
| + }; |
| + |
| + hva_start = max(start, range.slot->userspace_addr); |
| + hva_end = min(end, range.slot->userspace_addr + |
| + range.slot->npages * PAGE_SIZE); |
| + |
| + range.start = hva_to_gfn_memslot(hva_start, range.slot); |
| + range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, range.slot); |
| + |
| + if (WARN_ON_ONCE(range.end <= range.start)) |
| + continue; |
| + |
| + /* see the comments on the generic kvm_arch_has_test_clear_young() */ |
| + lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot); |
| + |
| + success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap); |
| + if (!success) |
| + break; |
| + } |
| + } |
| + |
| + srcu_read_unlock(&kvm->srcu, key); |
| + |
| + return success; |
| +} |
| + |
| +static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, |
| + unsigned long start, unsigned long end, |
| + unsigned long *bitmap) |
| +{ |
| + if (kvm_arch_has_test_clear_young()) |
| + return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap); |
| + |
| + return false; |
| +} |
| + |
| static void kvm_mmu_notifier_release(struct mmu_notifier *mn, |
| struct mm_struct *mm) |
| { |
| @@ -924,6 +981,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { |
| .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |
| .clear_young = kvm_mmu_notifier_clear_young, |
| .test_young = kvm_mmu_notifier_test_young, |
| + .test_clear_young = kvm_mmu_notifier_test_clear_young, |
| .release = kvm_mmu_notifier_release, |
| }; |
| |
| -- |
| 2.45.1.288.g0e0cd299f1-goog |
| |