| From c04f6bcc544d4865846c46a469a446517920ddde Mon Sep 17 00:00:00 2001 |
| From: Yu Zhao <yuzhao@google.com> |
| Date: Wed, 23 Jun 2021 19:34:03 -0600 |
| Subject: [PATCH] CHROMIUM: mm: multigenerational lru: scan kvm mmu pages |
| |
| For each KVM instance, we add a list of its MMU pages. Each KVM MMU |
| page is reference counted and is not freed until an RCU grace period |
| has expired. This allows us to walk the MMU page list of each KVM |
| instance under the RCU read lock, and to drop that lock and |
| reschedule as long as we hold a reference to an MMU page on the list. |
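| |
| As a rough sketch of that pattern (illustrative only, not part of the |
| diff below; "entry", entry_put() and entry_free_rcu() are made-up |
| stand-ins for kvm_mmu_page, kvm_mmu_put_page() and |
| kvm_mmu_free_page_rcu(), and the usual kernel list/RCU/atomic |
| primitives are assumed): |
| |
|   struct entry {                          /* stands in for kvm_mmu_page */ |
|           atomic_t ref_count; |
|           struct list_head link;          /* on the RCU-protected list */ |
|           struct rcu_head rcu_head; |
|   }; |
| |
|   static void entry_free_rcu(struct rcu_head *rcu) |
|   { |
|           kfree(container_of(rcu, struct entry, rcu_head)); |
|   } |
| |
|   /* Drop a reference; the final put unlinks the entry and defers the |
|    * actual freeing until an RCU grace period has passed. */ |
|   static void entry_put(spinlock_t *lock, struct entry *e) |
|   { |
|           if (!atomic_dec_and_test(&e->ref_count)) |
|                   return; |
|           spin_lock(lock); |
|           list_del_rcu(&e->link); |
|           spin_unlock(lock); |
|           call_rcu(&e->rcu_head, entry_free_rcu); |
|   } |
| |
|   static void walk_entries(spinlock_t *lock, struct list_head *head) |
|   { |
|           struct entry *e; |
| |
|           rcu_read_lock(); |
|           list_for_each_entry_rcu(e, head, link) { |
|                   /* ... scan e under the RCU read lock ... */ |
| |
|                   if (!atomic_inc_not_zero(&e->ref_count)) |
|                           continue;       /* e is being torn down */ |
| |
|                   /* The reference keeps e and its forward pointer |
|                    * valid, so we can drop the RCU read lock. */ |
|                   rcu_read_unlock(); |
|                   cond_resched(); |
|                   rcu_read_lock(); |
| |
|                   entry_put(lock, e); |
|           } |
|           rcu_read_unlock(); |
|   } |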
| |
| For data pages, we take a reference with get_page_unless_zero() and |
| verify that it is still the right page, using cmpxchg64 to make sure |
| that the KVM PTE hasn't changed. |
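| |
| Condensed from kvm_clear_young_walk() in the diff below, this is |
| roughly the per-SPTE step for a present leaf SPTE with A/D bits |
| enabled (the access-tracking case the patch also handles is left |
| out, and the return convention is made up for the sketch; the patch |
| routes the page through walk->get_page()/walk->update_page()): |
| |
|   /* Returns the pinned page if it was young and its SPTE was aged, |
|    * or NULL otherwise.  mmu_spte_get_lockless(), spte_to_pfn() and |
|    * shadow_accessed_mask are existing KVM MMU internals. */ |
|   static struct page *spte_test_and_clear_young(u64 *sptep) |
|   { |
|           u64 old = mmu_spte_get_lockless(sptep); |
|           u64 new = old & ~shadow_accessed_mask; |
|           struct page *page; |
| |
|           if (new == old)                 /* already old */ |
|                   return NULL; |
| |
|           page = pfn_to_page(spte_to_pfn(old)); |
|           if (!get_page_unless_zero(page))        /* being freed */ |
|                   return NULL; |
| |
|           /* Count the page as young only if the SPTE still holds the |
|            * value we read, i.e. it still maps the page we pinned. */ |
|           if (cmpxchg64(sptep, old, new) != old) { |
|                   put_page(page); |
|                   return NULL; |
|           } |
| |
|           return page;            /* caller ages it, then put_page() */ |
|   } |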
| |
| BUG=b:123039911 |
| TEST=Ran crostini.DiskIOPerf and crostini.VimCompile |
| |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| Change-Id: Ie047b349964063356fbbc604d0a935508ced5096 |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987928 |
| Tested-by: Yu Zhao <yuzhao@chromium.org> |
| Reviewed-by: Sonny Rao <sonnyrao@chromium.org> |
| Reviewed-by: Yu Zhao <yuzhao@chromium.org> |
| Commit-Queue: Yu Zhao <yuzhao@chromium.org> |
| --- |
| arch/x86/include/asm/kvm_host.h | 2 + |
| arch/x86/kvm/mmu/mmu.c | 111 ++++++++++++++++++++++++++++++-- |
| arch/x86/kvm/mmu/mmu_internal.h | 2 + |
| include/linux/mmu_notifier.h | 25 +++++++ |
| mm/mmu_notifier.c | 16 +++++ |
| mm/vmscan.c | 109 +++++++++++++++++++++++++++++++ |
| virt/kvm/kvm_main.c | 14 ++++ |
| 7 files changed, 274 insertions(+), 5 deletions(-) |
| |
| diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h |
| index 974cbfb1eefe..0ac920f6adcb 100644 |
| --- a/arch/x86/include/asm/kvm_host.h |
| +++ b/arch/x86/include/asm/kvm_host.h |
| @@ -1195,6 +1195,8 @@ struct kvm_arch { |
| hpa_t hv_root_tdp; |
| spinlock_t hv_root_tdp_lock; |
| #endif |
| + spinlock_t mmu_page_list_lock; |
| + struct list_head mmu_page_list; |
| }; |
| |
| struct kvm_vm_stat { |
| diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c |
| index 66f7f5bc3482..a68790370712 100644 |
| --- a/arch/x86/kvm/mmu/mmu.c |
| +++ b/arch/x86/kvm/mmu/mmu.c |
| @@ -1650,17 +1650,37 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr) |
| percpu_counter_add(&kvm_total_used_mmu_pages, nr); |
| } |
| |
| -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) |
| +static void kvm_mmu_free_page_rcu(struct rcu_head *rcu_head) |
| { |
| - MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); |
| - hlist_del(&sp->hash_link); |
| - list_del(&sp->link); |
| + struct kvm_mmu_page *sp = container_of(rcu_head, struct kvm_mmu_page, |
| + rcu_head); |
| + |
| free_page((unsigned long)sp->spt); |
| if (!sp->role.direct) |
| free_page((unsigned long)sp->gfns); |
| kmem_cache_free(mmu_page_header_cache, sp); |
| } |
| |
| +static void kvm_mmu_put_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
| +{ |
| + if (!atomic_dec_and_test(&sp->ref_count)) |
| + return; |
| + |
| + spin_lock(&kvm->arch.mmu_page_list_lock); |
| + list_del_rcu(&sp->mmu_page_list); |
| + spin_unlock(&kvm->arch.mmu_page_list_lock); |
| + |
| + call_rcu(&sp->rcu_head, kvm_mmu_free_page_rcu); |
| +} |
| + |
| +static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
| +{ |
| + MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); |
| + hlist_del(&sp->hash_link); |
| + list_del(&sp->link); |
| + kvm_mmu_put_page(kvm, sp); |
| +} |
| + |
| static unsigned kvm_page_table_hashfn(gfn_t gfn) |
| { |
| return hash_64(gfn, KVM_MMU_HASH_SHIFT); |
| @@ -2110,6 +2130,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
| if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) |
| kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); |
| } |
| + atomic_set(&sp->ref_count, 1); |
| + spin_lock(&vcpu->kvm->arch.mmu_page_list_lock); |
| + list_add_tail_rcu(&sp->mmu_page_list, &vcpu->kvm->arch.mmu_page_list); |
| + spin_unlock(&vcpu->kvm->arch.mmu_page_list_lock); |
| trace_kvm_mmu_get_page(sp, true); |
| out: |
| kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
| @@ -2388,7 +2412,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, |
| |
| list_for_each_entry_safe(sp, nsp, invalid_list, link) { |
| WARN_ON(!sp->role.invalid || sp->root_count); |
| - kvm_mmu_free_page(sp); |
| + kvm_mmu_free_page(kvm, sp); |
| } |
| } |
| |
| @@ -5548,6 +5572,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) |
| node->track_write = kvm_mmu_pte_write; |
| node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; |
| kvm_page_track_register_notifier(kvm, node); |
| + INIT_LIST_HEAD(&kvm->arch.mmu_page_list); |
| + spin_lock_init(&kvm->arch.mmu_page_list_lock); |
| } |
| |
| void kvm_mmu_uninit_vm(struct kvm *kvm) |
| @@ -6133,3 +6159,78 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) |
| if (kvm->arch.nx_lpage_recovery_thread) |
| kthread_stop(kvm->arch.nx_lpage_recovery_thread); |
| } |
| + |
| +static void kvm_clear_young_walk(struct kvm *kvm, struct mmu_notifier_walk *walk, |
| + struct kvm_mmu_page *sp) |
| +{ |
| + int i; |
| + |
| + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { |
| + u64 old, new; |
| + struct page *page; |
| + kvm_pfn_t pfn = -1; |
| + |
| + new = old = mmu_spte_get_lockless(sp->spt + i); |
| + |
| + if (is_shadow_present_pte(old)) { |
| + if (!is_last_spte(old, sp->role.level)) |
| + continue; |
| + |
| + pfn = spte_to_pfn(old); |
| + |
| + if (spte_ad_enabled(old)) |
| + new = old & ~shadow_accessed_mask; |
| + else if (!is_access_track_spte(old)) |
| + new = mark_spte_for_access_track(old); |
| + } |
| + |
| + page = walk->get_page(walk->private, pfn, new != old); |
| + if (!page) |
| + continue; |
| + |
| + if (new != old && cmpxchg64(sp->spt + i, old, new) == old) |
| + walk->update_page(walk->private, page); |
| + |
| + put_page(page); |
| + } |
| +} |
| + |
| +void kvm_arch_mmu_clear_young_walk(struct kvm *kvm, struct mmu_notifier_walk *walk) |
| +{ |
| + struct kvm_mmu_page *sp; |
| + bool started = false; |
| + |
| + rcu_read_lock(); |
| + |
| + list_for_each_entry_rcu(sp, &kvm->arch.mmu_page_list, mmu_page_list) { |
| + if (is_obsolete_sp(kvm, sp) || sp->role.invalid || |
| + sp->role.level > PG_LEVEL_2M) |
| + continue; |
| + |
| + if (!started && !walk->start_batch(kvm->mm, walk->private)) |
| + break; |
| + |
| + started = true; |
| + |
| + kvm_clear_young_walk(kvm, walk, sp); |
| + |
| + if (!walk->end_batch(walk->private, false)) |
| + continue; |
| + |
| + started = false; |
| + |
| + if (!atomic_inc_not_zero(&sp->ref_count)) |
| + continue; |
| + |
| + rcu_read_unlock(); |
| + cond_resched(); |
| + rcu_read_lock(); |
| + |
| + kvm_mmu_put_page(kvm, sp); |
| + } |
| + |
| + if (started && !walk->end_batch(walk->private, true)) |
| + VM_BUG_ON(true); |
| + |
| + rcu_read_unlock(); |
| +} |
| diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h |
| index 35567293c1fd..a79b647fe9bb 100644 |
| --- a/arch/x86/kvm/mmu/mmu_internal.h |
| +++ b/arch/x86/kvm/mmu/mmu_internal.h |
| @@ -74,7 +74,9 @@ struct kvm_mmu_page { |
| bool tdp_mmu_page; |
| |
| /* Used for freeing the page asynchronously if it is a TDP MMU page. */ |
| + atomic_t ref_count; |
| struct rcu_head rcu_head; |
| + struct list_head mmu_page_list; |
| #endif |
| }; |
| |
| diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h |
| index 45fc2c81e370..16bb73aa31b9 100644 |
| --- a/include/linux/mmu_notifier.h |
| +++ b/include/linux/mmu_notifier.h |
| @@ -59,6 +59,14 @@ enum mmu_notifier_event { |
| MMU_NOTIFY_EXCLUSIVE, |
| }; |
| |
| +struct mmu_notifier_walk { |
| + bool (*start_batch)(struct mm_struct *mm, void *priv); |
| + bool (*end_batch)(void *priv, bool last); |
| + struct page *(*get_page)(void *priv, unsigned long pfn, bool young); |
| + void (*update_page)(void *priv, struct page *page); |
| + void *private; |
| +}; |
| + |
| #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) |
| |
| struct mmu_notifier_ops { |
| @@ -112,6 +120,9 @@ struct mmu_notifier_ops { |
| unsigned long start, |
| unsigned long end); |
| |
| + void (*clear_young_walk)(struct mmu_notifier *mn, |
| + struct mmu_notifier_walk *walk); |
| + |
| /* |
| * test_young is called to check the young/accessed bitflag in |
| * the secondary pte. This is used to know if the page is |
| @@ -391,6 +402,8 @@ extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, |
| extern int __mmu_notifier_clear_young(struct mm_struct *mm, |
| unsigned long start, |
| unsigned long end); |
| +extern void __mmu_notifier_clear_young_walk(struct mm_struct *mm, |
| + struct mmu_notifier_walk *walk); |
| extern int __mmu_notifier_test_young(struct mm_struct *mm, |
| unsigned long address); |
| extern void __mmu_notifier_change_pte(struct mm_struct *mm, |
| @@ -433,6 +446,13 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm, |
| return 0; |
| } |
| |
| +static inline void mmu_notifier_clear_young_walk(struct mm_struct *mm, |
| + struct mmu_notifier_walk *walk) |
| +{ |
| + if (mm_has_notifiers(mm)) |
| + __mmu_notifier_clear_young_walk(mm, walk); |
| +} |
| + |
| static inline int mmu_notifier_test_young(struct mm_struct *mm, |
| unsigned long address) |
| { |
| @@ -687,6 +707,11 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, |
| return 0; |
| } |
| |
| +static inline void mmu_notifier_clear_young_walk(struct mm_struct *mm, |
| + struct mmu_notifier_walk *walk) |
| +{ |
| +} |
| + |
| static inline int mmu_notifier_test_young(struct mm_struct *mm, |
| unsigned long address) |
| { |
| diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c |
| index 459d195d2ff6..8b93671f6cbc 100644 |
| --- a/mm/mmu_notifier.c |
| +++ b/mm/mmu_notifier.c |
| @@ -402,6 +402,22 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, |
| return young; |
| } |
| |
| +void __mmu_notifier_clear_young_walk(struct mm_struct *mm, |
| + struct mmu_notifier_walk *walk) |
| +{ |
| + int id; |
| + struct mmu_notifier *subscription; |
| + |
| + id = srcu_read_lock(&srcu); |
| + hlist_for_each_entry_rcu(subscription, |
| + &mm->notifier_subscriptions->list, hlist, |
| + srcu_read_lock_held(&srcu)) { |
| + if (subscription->ops->clear_young_walk) |
| + subscription->ops->clear_young_walk(subscription, walk); |
| + } |
| + srcu_read_unlock(&srcu, id); |
| +} |
| + |
| int __mmu_notifier_test_young(struct mm_struct *mm, |
| unsigned long address) |
| { |
| diff --git a/mm/vmscan.c b/mm/vmscan.c |
| index 8b4455a10abf..596034091065 100644 |
| --- a/mm/vmscan.c |
| +++ b/mm/vmscan.c |
| @@ -54,6 +54,7 @@ |
| #include <linux/shmem_fs.h> |
| #include <linux/ctype.h> |
| #include <linux/debugfs.h> |
| +#include <linux/mmu_notifier.h> |
| |
| #include <asm/tlbflush.h> |
| #include <asm/div64.h> |
| @@ -3721,6 +3722,113 @@ static void walk_mm(struct mm_walk_args *args, struct mm_struct *mm) |
| cond_resched(); |
| } while (err == -EAGAIN && args->next_addr && |
| !mm_is_oom_victim(mm) && !mm_has_migrated(mm, memcg)); |
| + |
| +} |
| + |
| +static bool mmu_notifier_start_batch(struct mm_struct *mm, void *priv) |
| +{ |
| + struct mm_walk_args *args = priv; |
| + struct mem_cgroup *memcg = args->memcg; |
| + |
| + VM_BUG_ON(!rcu_read_lock_held()); |
| + |
| +#ifdef CONFIG_MEMCG |
| + if (memcg && atomic_read(&memcg->moving_account)) { |
| + args->mm_stats[MM_LOCK_CONTENTION]++; |
| + return false; |
| + } |
| +#endif |
| + return !mm_is_oom_victim(mm) && !mm_has_migrated(mm, memcg); |
| +} |
| + |
| +static bool mmu_notifier_end_batch(void *priv, bool last) |
| +{ |
| + struct lruvec *lruvec; |
| + struct mm_walk_args *args = priv; |
| + |
| + VM_BUG_ON(!rcu_read_lock_held()); |
| + |
| + if (!last && args->batch_size < MAX_BATCH_SIZE) |
| + return false; |
| + |
| + lruvec = mem_cgroup_lruvec(args->memcg, NODE_DATA(args->node_id)); |
| + reset_batch_size(lruvec, args); |
| + |
| + return true; |
| +} |
| + |
| +static struct page *mmu_notifier_get_page(void *priv, unsigned long pfn, bool young) |
| +{ |
| + struct page *page; |
| + struct mm_walk_args *args = priv; |
| + |
| + if (pfn == -1 || is_zero_pfn(pfn)) { |
| + args->mm_stats[MM_LEAF_HOLE]++; |
| + return NULL; |
| + } |
| + |
| + if (!young) { |
| + args->mm_stats[MM_LEAF_OLD]++; |
| + return NULL; |
| + } |
| + |
| + VM_BUG_ON(!pfn_valid(pfn)); |
| + if (pfn < args->start_pfn || pfn >= args->end_pfn) { |
| + args->mm_stats[MM_LEAF_OTHER_NODE]++; |
| + return NULL; |
| + } |
| + |
| + page = compound_head(pfn_to_page(pfn)); |
| + if (page_to_nid(page) != args->node_id) { |
| + args->mm_stats[MM_LEAF_OTHER_NODE]++; |
| + return NULL; |
| + } |
| + |
| + if (page_memcg_rcu(page) != args->memcg) { |
| + args->mm_stats[MM_LEAF_OTHER_MEMCG]++; |
| + return NULL; |
| + } |
| + |
| + if (!PageLRU(page)) { |
| + args->mm_stats[MM_LEAF_HOLE]++; |
| + return NULL; |
| + } |
| + |
| + return get_page_unless_zero(page) ? page : NULL; |
| +} |
| + |
| +static void mmu_notifier_update_page(void *priv, struct page *page) |
| +{ |
| + struct mm_walk_args *args = priv; |
| + int old_gen, new_gen = lru_gen_from_seq(args->max_seq); |
| + |
| + if (page_memcg_rcu(page) != args->memcg) { |
| + args->mm_stats[MM_LEAF_OTHER_MEMCG]++; |
| + return; |
| + } |
| + |
| + if (!PageLRU(page)) { |
| + args->mm_stats[MM_LEAF_HOLE]++; |
| + return; |
| + } |
| + |
| + old_gen = page_update_gen(page, new_gen); |
| + if (old_gen >= 0 && old_gen != new_gen) |
| + update_batch_size(page, old_gen, new_gen, args); |
| + args->mm_stats[MM_LEAF_YOUNG]++; |
| +} |
| + |
| +static void call_mmu_notifier(struct mm_walk_args *args, struct mm_struct *mm) |
| +{ |
| + struct mmu_notifier_walk walk = { |
| + .start_batch = mmu_notifier_start_batch, |
| + .end_batch = mmu_notifier_end_batch, |
| + .get_page = mmu_notifier_get_page, |
| + .update_page = mmu_notifier_update_page, |
| + .private = args, |
| + }; |
| + |
| + mmu_notifier_clear_young_walk(mm, &walk); |
| } |
| |
| static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool front) |
| @@ -3912,6 +4020,7 @@ static bool walk_mm_list(struct lruvec *lruvec, unsigned long max_seq, |
| last = get_next_mm(args, &mm); |
| if (mm) { |
| walk_mm(args, mm); |
| + call_mmu_notifier(args, mm); |
| } |
| |
| cond_resched(); |
| diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c |
| index 986959833d70..64f8a088dcf9 100644 |
| --- a/virt/kvm/kvm_main.c |
| +++ b/virt/kvm/kvm_main.c |
| @@ -733,6 +733,19 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, |
| return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); |
| } |
| |
| +__weak void kvm_arch_mmu_clear_young_walk(struct kvm *kvm, |
| + struct mmu_notifier_walk *walk) |
| +{ |
| +} |
| + |
| +static void kvm_mmu_notifier_clear_young_walk(struct mmu_notifier *mn, |
| + struct mmu_notifier_walk *walk) |
| +{ |
| + struct kvm *kvm = mmu_notifier_to_kvm(mn); |
| + |
| + kvm_arch_mmu_clear_young_walk(kvm, walk); |
| +} |
| + |
| static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, |
| struct mm_struct *mm, |
| unsigned long address) |
| @@ -760,6 +773,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { |
| .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, |
| .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |
| .clear_young = kvm_mmu_notifier_clear_young, |
| + .clear_young_walk = kvm_mmu_notifier_clear_young_walk, |
| .test_young = kvm_mmu_notifier_test_young, |
| .change_pte = kvm_mmu_notifier_change_pte, |
| .release = kvm_mmu_notifier_release, |
| -- |
| 2.32.0.402.g57bb445576-goog |
| |