From c04f6bcc544d4865846c46a469a446517920ddde Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 23 Jun 2021 19:34:03 -0600
Subject: [PATCH] CHROMIUM: mm: multigenerational lru: scan kvm mmu pages
For each kvm instance, add a list of its mmu pages. Each kvm mmu page
is reference counted and is not freed until an rcu grace period has
expired. This allows us to walk the kvm mmu page list under the rcu
read lock, and to reschedule as long as we hold a reference to an mmu
page on the list (after dropping the rcu read lock).

For data pages, we call get_page_unless_zero() and verify that we got
the right page, using cmpxchg64 to make sure the kvm pte hasn't
changed.
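
The core pattern looks roughly like this. It is a minimal sketch with
hypothetical names (my_page, my_pages, my_pages_lock, my_put_page());
the actual kvm code is in the diff below:

  #include <linux/atomic.h>
  #include <linux/rculist.h>
  #include <linux/sched.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>

  struct my_page {
          atomic_t ref_count;        /* pins the page across a resched */
          struct list_head list;     /* on the per-instance page list */
          struct rcu_head rcu_head;  /* defers the actual free */
  };

  static LIST_HEAD(my_pages);
  static DEFINE_SPINLOCK(my_pages_lock);

  static void my_free_page_rcu(struct rcu_head *rcu)
  {
          kfree(container_of(rcu, struct my_page, rcu_head));
  }

  static void my_put_page(struct my_page *p)
  {
          if (!atomic_dec_and_test(&p->ref_count))
                  return;

          spin_lock(&my_pages_lock);
          list_del_rcu(&p->list);
          spin_unlock(&my_pages_lock);

          call_rcu(&p->rcu_head, my_free_page_rcu);
  }

  static void my_walk_pages(void)
  {
          struct my_page *p;

          rcu_read_lock();

          list_for_each_entry_rcu(p, &my_pages, list) {
                  /* ... scan p under the rcu read lock ... */

                  /* pin p so it survives while we sleep */
                  if (!atomic_inc_not_zero(&p->ref_count))
                          continue;

                  rcu_read_unlock();
                  cond_resched();
                  rcu_read_lock();

                  /*
                   * p stays a valid cursor: even if the reference drops
                   * to zero here, the page is only freed after an rcu
                   * grace period, i.e. not within this read-side
                   * critical section.
                   */
                  my_put_page(p);
          }

          rcu_read_unlock();
  }
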
BUG=b:123039911
TEST=Ran crostini.DiskIOPerf and crostini.VimCompile
Signed-off-by: Yu Zhao <yuzhao@google.com>
Change-Id: Ie047b349964063356fbbc604d0a935508ced5096
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987928
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: Sonny Rao <sonnyrao@chromium.org>
Reviewed-by: Yu Zhao <yuzhao@chromium.org>
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
---
arch/x86/include/asm/kvm_host.h | 2 +
arch/x86/kvm/mmu/mmu.c | 111 ++++++++++++++++++++++++++++++--
arch/x86/kvm/mmu/mmu_internal.h | 2 +
include/linux/mmu_notifier.h | 25 +++++++
mm/mmu_notifier.c | 16 +++++
mm/vmscan.c | 109 +++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 14 ++++
7 files changed, 274 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 974cbfb1eefe..0ac920f6adcb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1195,6 +1195,8 @@ struct kvm_arch {
hpa_t hv_root_tdp;
spinlock_t hv_root_tdp_lock;
#endif
+ spinlock_t mmu_page_list_lock;
+ struct list_head mmu_page_list;
};
struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 66f7f5bc3482..a68790370712 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1650,17 +1650,37 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page_rcu(struct rcu_head *rcu_head)
{
- MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
- hlist_del(&sp->hash_link);
- list_del(&sp->link);
+ struct kvm_mmu_page *sp = container_of(rcu_head, struct kvm_mmu_page,
+ rcu_head);
+
free_page((unsigned long)sp->spt);
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
kmem_cache_free(mmu_page_header_cache, sp);
}
+static void kvm_mmu_put_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (!atomic_dec_and_test(&sp->ref_count))
+ return;
+
+ spin_lock(&kvm->arch.mmu_page_list_lock);
+ list_del_rcu(&sp->mmu_page_list);
+ spin_unlock(&kvm->arch.mmu_page_list_lock);
+
+ call_rcu(&sp->rcu_head, kvm_mmu_free_page_rcu);
+}
+
+static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
+ hlist_del(&sp->hash_link);
+ list_del(&sp->link);
+ kvm_mmu_put_page(kvm, sp);
+}
+
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
return hash_64(gfn, KVM_MMU_HASH_SHIFT);
@@ -2110,6 +2130,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
}
+ atomic_set(&sp->ref_count, 1);
+ spin_lock(&vcpu->kvm->arch.mmu_page_list_lock);
+ list_add_tail_rcu(&sp->mmu_page_list, &vcpu->kvm->arch.mmu_page_list);
+ spin_unlock(&vcpu->kvm->arch.mmu_page_list_lock);
trace_kvm_mmu_get_page(sp, true);
out:
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -2388,7 +2412,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
list_for_each_entry_safe(sp, nsp, invalid_list, link) {
WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_free_page(sp);
+ kvm_mmu_free_page(kvm, sp);
}
}
@@ -5548,6 +5572,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
+ INIT_LIST_HEAD(&kvm->arch.mmu_page_list);
+ spin_lock_init(&kvm->arch.mmu_page_list_lock);
}
void kvm_mmu_uninit_vm(struct kvm *kvm)
@@ -6133,3 +6159,78 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
if (kvm->arch.nx_lpage_recovery_thread)
kthread_stop(kvm->arch.nx_lpage_recovery_thread);
}
+
+static void kvm_clear_young_walk(struct kvm *kvm, struct mmu_notifier_walk *walk,
+ struct kvm_mmu_page *sp)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ u64 old, new;
+ struct page *page;
+ kvm_pfn_t pfn = -1;
+
+ new = old = mmu_spte_get_lockless(sp->spt + i);
+
+ if (is_shadow_present_pte(old)) {
+ if (!is_last_spte(old, sp->role.level))
+ continue;
+
+ pfn = spte_to_pfn(old);
+
+ if (spte_ad_enabled(old))
+ new = old & ~shadow_accessed_mask;
+ else if (!is_access_track_spte(old))
+ new = mark_spte_for_access_track(old);
+ }
+
+ page = walk->get_page(walk->private, pfn, new != old);
+ if (!page)
+ continue;
+
+ if (new != old && cmpxchg64(sp->spt + i, old, new) == old)
+ walk->update_page(walk->private, page);
+
+ put_page(page);
+ }
+}
+
+void kvm_arch_mmu_clear_young_walk(struct kvm *kvm, struct mmu_notifier_walk *walk)
+{
+ struct kvm_mmu_page *sp;
+ bool started = false;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sp, &kvm->arch.mmu_page_list, mmu_page_list) {
+ if (is_obsolete_sp(kvm, sp) || sp->role.invalid ||
+ sp->role.level > PG_LEVEL_2M)
+ continue;
+
+ if (!started && !walk->start_batch(kvm->mm, walk->private))
+ break;
+
+ started = true;
+
+ kvm_clear_young_walk(kvm, walk, sp);
+
+ if (!walk->end_batch(walk->private, false))
+ continue;
+
+ started = false;
+
+ if (!atomic_inc_not_zero(&sp->ref_count))
+ continue;
+
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+
+ kvm_mmu_put_page(kvm, sp);
+ }
+
+ if (started && !walk->end_batch(walk->private, true))
+ VM_BUG_ON(true);
+
+ rcu_read_unlock();
+}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 35567293c1fd..a79b647fe9bb 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -74,7 +74,9 @@ struct kvm_mmu_page {
bool tdp_mmu_page;
/* Used for freeing the page asynchronously if it is a TDP MMU page. */
+ atomic_t ref_count;
struct rcu_head rcu_head;
+ struct list_head mmu_page_list;
#endif
};
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 45fc2c81e370..16bb73aa31b9 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -59,6 +59,14 @@ enum mmu_notifier_event {
MMU_NOTIFY_EXCLUSIVE,
};
+struct mmu_notifier_walk {
+ bool (*start_batch)(struct mm_struct *mm, void *priv);
+ bool (*end_batch)(void *priv, bool last);
+ struct page *(*get_page)(void *priv, unsigned long pfn, bool young);
+ void (*update_page)(void *priv, struct page *page);
+ void *private;
+};
+
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
struct mmu_notifier_ops {
@@ -112,6 +120,9 @@ struct mmu_notifier_ops {
unsigned long start,
unsigned long end);
+ void (*clear_young_walk)(struct mmu_notifier *mn,
+ struct mmu_notifier_walk *walk);
+
/*
* test_young is called to check the young/accessed bitflag in
* the secondary pte. This is used to know if the page is
@@ -391,6 +402,8 @@ extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
unsigned long start,
unsigned long end);
+extern void __mmu_notifier_clear_young_walk(struct mm_struct *mm,
+ struct mmu_notifier_walk *walk);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -433,6 +446,13 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm,
return 0;
}
+static inline void mmu_notifier_clear_young_walk(struct mm_struct *mm,
+ struct mmu_notifier_walk *walk)
+{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_clear_young_walk(mm, walk);
+}
+
static inline int mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
@@ -687,6 +707,11 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
return 0;
}
+static inline void mmu_notifier_clear_young_walk(struct mm_struct *mm,
+ struct mmu_notifier_walk *walk)
+{
+}
+
static inline int mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 459d195d2ff6..8b93671f6cbc 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -402,6 +402,22 @@ int __mmu_notifier_clear_young(struct mm_struct *mm,
return young;
}
+void __mmu_notifier_clear_young_walk(struct mm_struct *mm,
+ struct mmu_notifier_walk *walk)
+{
+ int id;
+ struct mmu_notifier *subscription;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
+ if (subscription->ops->clear_young_walk)
+ subscription->ops->clear_young_walk(subscription, walk);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8b4455a10abf..596034091065 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,7 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -3721,6 +3722,113 @@ static void walk_mm(struct mm_walk_args *args, struct mm_struct *mm)
cond_resched();
} while (err == -EAGAIN && args->next_addr &&
!mm_is_oom_victim(mm) && !mm_has_migrated(mm, memcg));
+
+}
+
+static bool mmu_notifier_start_batch(struct mm_struct *mm, void *priv)
+{
+ struct mm_walk_args *args = priv;
+ struct mem_cgroup *memcg = args->memcg;
+
+ VM_BUG_ON(!rcu_read_lock_held());
+
+#ifdef CONFIG_MEMCG
+ if (memcg && atomic_read(&memcg->moving_account)) {
+ args->mm_stats[MM_LOCK_CONTENTION]++;
+ return false;
+ }
+#endif
+ return !mm_is_oom_victim(mm) && !mm_has_migrated(mm, memcg);
+}
+
+static bool mmu_notifier_end_batch(void *priv, bool last)
+{
+ struct lruvec *lruvec;
+ struct mm_walk_args *args = priv;
+
+ VM_BUG_ON(!rcu_read_lock_held());
+
+ if (!last && args->batch_size < MAX_BATCH_SIZE)
+ return false;
+
+ lruvec = mem_cgroup_lruvec(args->memcg, NODE_DATA(args->node_id));
+ reset_batch_size(lruvec, args);
+
+ return true;
+}
+
+static struct page *mmu_notifier_get_page(void *priv, unsigned long pfn, bool young)
+{
+ struct page *page;
+ struct mm_walk_args *args = priv;
+
+ if (pfn == -1 || is_zero_pfn(pfn)) {
+ args->mm_stats[MM_LEAF_HOLE]++;
+ return NULL;
+ }
+
+ if (!young) {
+ args->mm_stats[MM_LEAF_OLD]++;
+ return NULL;
+ }
+
+ VM_BUG_ON(!pfn_valid(pfn));
+ if (pfn < args->start_pfn || pfn >= args->end_pfn) {
+ args->mm_stats[MM_LEAF_OTHER_NODE]++;
+ return NULL;
+ }
+
+ page = compound_head(pfn_to_page(pfn));
+ if (page_to_nid(page) != args->node_id) {
+ args->mm_stats[MM_LEAF_OTHER_NODE]++;
+ return NULL;
+ }
+
+ if (page_memcg_rcu(page) != args->memcg) {
+ args->mm_stats[MM_LEAF_OTHER_MEMCG]++;
+ return NULL;
+ }
+
+ if (!PageLRU(page)) {
+ args->mm_stats[MM_LEAF_HOLE]++;
+ return NULL;
+ }
+
+ return get_page_unless_zero(page) ? page : NULL;
+}
+
+static void mmu_notifier_update_page(void *priv, struct page *page)
+{
+ struct mm_walk_args *args = priv;
+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
+
+ if (page_memcg_rcu(page) != args->memcg) {
+ args->mm_stats[MM_LEAF_OTHER_MEMCG]++;
+ return;
+ }
+
+ if (!PageLRU(page)) {
+ args->mm_stats[MM_LEAF_HOLE]++;
+ return;
+ }
+
+ old_gen = page_update_gen(page, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(page, old_gen, new_gen, args);
+ args->mm_stats[MM_LEAF_YOUNG]++;
+}
+
+static void call_mmu_notifier(struct mm_walk_args *args, struct mm_struct *mm)
+{
+ struct mmu_notifier_walk walk = {
+ .start_batch = mmu_notifier_start_batch,
+ .end_batch = mmu_notifier_end_batch,
+ .get_page = mmu_notifier_get_page,
+ .update_page = mmu_notifier_update_page,
+ .private = args,
+ };
+
+ mmu_notifier_clear_young_walk(mm, &walk);
}
static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool front)
@@ -3912,6 +4020,7 @@ static bool walk_mm_list(struct lruvec *lruvec, unsigned long max_seq,
last = get_next_mm(args, &mm);
if (mm) {
walk_mm(args, mm);
+ call_mmu_notifier(args, mm);
}
cond_resched();
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 986959833d70..64f8a088dcf9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -733,6 +733,19 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}
+__weak void kvm_arch_mmu_clear_young_walk(struct kvm *kvm,
+ struct mmu_notifier_walk *walk)
+{
+}
+
+static void kvm_mmu_notifier_clear_young_walk(struct mmu_notifier *mn,
+ struct mmu_notifier_walk *walk)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+ kvm_arch_mmu_clear_young_walk(kvm, walk);
+}
+
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address)
@@ -760,6 +773,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
.clear_young = kvm_mmu_notifier_clear_young,
+ .clear_young_walk = kvm_mmu_notifier_clear_young_walk,
.test_young = kvm_mmu_notifier_test_young,
.change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,
--
2.32.0.402.g57bb445576-goog