From a4c85efd7479140185f14af30e9f5c3860f4dc16 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 5 Apr 2021 11:44:28 -0600
Subject: [PATCH] BACKPORT: FROMLIST: mm: multigenerational lru: activation

For pages mapped upon page faults, the accessed bit is set during the
initial faults. We add them to the per-zone lists indexed by max_seq,
i.e., the youngest generation, so that eviction will not consider them
before the aging has scanned them. Readahead pages allocated in the
page fault path will also be added to the youngest generation, since
it is assumed that they may be needed soon.
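
As a side note on the indexing above: generations occupy a small ring
of per-zone lists, and the youngest generation is simply the slot
selected by max_seq. Below is a minimal standalone sketch of that
indexing; MAX_NR_GENS, gen_from_seq() and the sequence numbers are
made up for illustration, and the real helper is lru_gen_from_seq()
in the mm_inline.h context further down.

  #include <stdio.h>

  /* Hypothetical value; the real one is derived from Kconfig. */
  #define MAX_NR_GENS 4UL

  /* Ring-buffer slot of a generation, as lru_gen_from_seq() does. */
  static unsigned long gen_from_seq(unsigned long seq)
  {
          return seq % MAX_NR_GENS;
  }

  int main(void)
  {
          unsigned long min_seq = 21, max_seq = 23; /* made-up sequence numbers */

          /* Faulted-in pages (and fault-path readahead) join the youngest
           * generation; eviction scans the oldest generation first. */
          printf("youngest generation lists: slot %lu\n", gen_from_seq(max_seq));
          printf("oldest generation lists:   slot %lu\n", gen_from_seq(min_seq));
          return 0;
  }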

For pages accessed multiple times via file descriptors, instead of
activating them upon the second access, we activate them based on the
refault rates of their tiers. Each generation contains at most
MAX_NR_TIERS tiers, which require an additional MAX_NR_TIERS-2 bits in
page->flags. Pages accessed N times via file descriptors belong to
tier order_base_2(N). Tier 0 is the base tier; it contains pages that
were read ahead, were accessed once via file descriptors, or were
accessed only via page tables. Pages from the base tier are evicted
regardless of the refault rate. Pages from upper tiers that have
higher refault rates than the base tier will be moved to the next
generation. A feedback loop modeled after the PID controller monitors
the refault rates across all tiers and decides which upper tiers to
activate pages from in the reclaim path (a small sketch of this
arithmetic follows the list below). The advantages of this model are:
1) It has a negligible cost in the buffered IO access path because
activations are done optionally in the reclaim path.
2) It takes mapped pages into account and avoids overprotecting
pages accessed multiple times via file descriptors.
3) More tiers offer better protection to pages accessed more than
twice when workloads doing intensive buffered IO are under memory
pressure.
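
The tier arithmetic above can be illustrated with a small standalone
sketch. This is not the kernel code: order_base_2() is re-implemented
here as a round-up log2, the helper names are made up, and the real
feedback loop additionally applies per-tier gains and a
SWAP_CLUSTER_MAX floor that are omitted here.

  #include <stdio.h>

  /* Round-up log2 for n >= 1, mirroring the kernel's order_base_2(). */
  static int order_base_2(unsigned long n)
  {
          int order = 0;

          while ((1UL << order) < n)
                  order++;
          return order;
  }

  /*
   * Compare the refault rate of an upper tier (the process variable, PV)
   * against the base tier (the setpoint, SP) by cross-multiplication:
   * evicting pages from the upper tier is allowed only if its rate is
   * not higher than the base tier's.
   */
  static int upper_tier_evictable(unsigned long sp_refaulted, unsigned long sp_total,
                                  unsigned long pv_refaulted, unsigned long pv_total)
  {
          return pv_refaulted * (sp_total ? sp_total : 1) <=
                 sp_refaulted * (pv_total ? pv_total : 1);
  }

  int main(void)
  {
          unsigned long n;

          /* Accessed 1x -> tier 0, 2x -> tier 1, 3-4x -> tier 2, 5-8x -> tier 3. */
          for (n = 1; n <= 8; n++)
                  printf("accessed %lu time(s) -> tier %d\n", n, order_base_2(n));

          /* Base tier: 10 refaults out of 100; tier 2: 30 out of 100 -> protect it. */
          printf("evict pages from the upper tier? %d\n",
                 upper_tier_evictable(10, 100, 30, 100));
          return 0;
  }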

Finally, we need to make sure deactivation works when the
multigenerational lru is enabled. We cannot rely on PageActive()
because it is not set on pages in active generations; this spares the
aging the trouble of clearing it when active generations become
inactive. So we deactivate pages unconditionally, since deactivation
is not a hot code path worth additional optimizations.
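
One more aspect of the feedback loop described above is worth
illustrating: when per-generation counters are carried over (see
reset_controller_pos() in the mm/vmscan.c hunk below), the counters of
the expiring period are folded into the running averages and the sums
are halved, so older refault rates contribute with weights 1/2, 1/4,
1/8, ... Here is a minimal sketch of that decay; the struct and helper
names are made up and do not match the kernel's.

  #include <stdio.h>

  /* Per-tier running averages, decayed by half at each carry-over. */
  struct tier_stats {
          unsigned long avg_refaulted;
          unsigned long avg_total;
  };

  static void carry_over(struct tier_stats *stats, unsigned long refaulted,
                         unsigned long evicted_plus_activated)
  {
          stats->avg_refaulted = (stats->avg_refaulted + refaulted) / 2;
          stats->avg_total = (stats->avg_total + evicted_plus_activated) / 2;
  }

  int main(void)
  {
          struct tier_stats stats = { 0, 0 };

          /* Three aging rounds: 40/400, 10/400 and 80/400 refaults. */
          carry_over(&stats, 40, 400);
          carry_over(&stats, 10, 400);
          carry_over(&stats, 80, 400);

          /* The most recent round dominates the decayed average. */
          printf("avg: %lu refaulted out of %lu\n",
                 stats.avg_refaulted, stats.avg_total);
          return 0;
  }
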
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
(am from https://lore.kernel.org/patchwork/patch/1432183/)
BUG=b:123039911
TEST=Built
Change-Id: Ibc9c90757fd095cdcc0a49823ada6b55f17ffc06
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987922
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: Sonny Rao <sonnyrao@chromium.org>
Reviewed-by: Yu Zhao <yuzhao@chromium.org>
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
---
include/linux/memcontrol.h | 20 -------
include/linux/mm.h | 30 ++++++++++
include/linux/mm_inline.h | 40 +++++++++++++
include/linux/sched.h | 2 +-
mm/memcontrol.c | 2 +-
mm/memory.c | 4 +-
mm/swap.c | 11 +++-
mm/vmscan.c | 91 +++++++++++++++++++++++++++++-
mm/workingset.c | 112 +++++++++++++++++++++++++++++++++++++
9 files changed, 285 insertions(+), 27 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bfe5c486f4ad..1b9705b10457 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -913,18 +913,6 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);
-static inline void mem_cgroup_enter_user_fault(void)
-{
- WARN_ON(current->in_user_fault);
- current->in_user_fault = 1;
-}
-
-static inline void mem_cgroup_exit_user_fault(void)
-{
- WARN_ON(!current->in_user_fault);
- current->in_user_fault = 0;
-}
-
static inline bool task_in_memcg_oom(struct task_struct *p)
{
return p->memcg_in_oom;
@@ -1350,14 +1338,6 @@ static inline void mem_cgroup_handle_over_high(void)
{
}
-static inline void mem_cgroup_enter_user_fault(void)
-{
-}
-
-static inline void mem_cgroup_exit_user_fault(void)
-{
-}
-
static inline bool task_in_memcg_oom(struct task_struct *p)
{
return false;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8e9f77de1c61..1d3d83619836 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1781,6 +1781,23 @@ void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
+
+static inline void task_enter_user_fault(void)
+{
+ WARN_ON(current->in_user_fault);
+ current->in_user_fault = 1;
+}
+
+static inline void task_exit_user_fault(void)
+{
+ WARN_ON(!current->in_user_fault);
+ current->in_user_fault = 0;
+}
+
+static inline bool task_in_user_fault(void)
+{
+ return current->in_user_fault;
+}
#else
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
@@ -1802,6 +1819,19 @@ static inline void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows) { }
+
+static inline void task_enter_user_fault(void)
+{
+}
+
+static inline void task_exit_user_fault(void)
+{
+}
+
+static inline bool task_in_user_fault(void)
+{
+ return false;
+}
#endif
static inline void unmap_shared_mapping_range(struct address_space *mapping,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 4c5791f52653..2124ad9bccdb 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq)
return seq % MAX_NR_GENS;
}
+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
+static inline int lru_tier_from_usage(int usage)
+{
+ return order_base_2(usage + 1);
+}
+
/* Return a proper index regardless whether we keep a full history of stats. */
static inline int hist_from_seq_or_gen(int seq_or_gen)
{
@@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
return true;
}
+/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */
+static inline int page_tier_usage(struct page *page)
+{
+ unsigned long flags = READ_ONCE(page->flags);
+
+ return flags & BIT(PG_workingset) ?
+ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
+}
+
+/* Increment the usage counter after a page is accessed via file descriptors. */
+static inline void page_inc_usage(struct page *page)
+{
+ unsigned long usage;
+ unsigned long old_flags, new_flags;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+
+ if (!(old_flags & BIT(PG_workingset))) {
+ new_flags = old_flags | BIT(PG_workingset);
+ continue;
+ }
+
+ usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF);
+
+ new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
#else /* CONFIG_LRU_GEN */
static inline bool lru_gen_enabled(void)
@@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
return false;
}
+static inline void page_inc_usage(struct page *page)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static __always_inline void add_page_to_lru_list(struct page *page,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c135cc87bf57..370d53a46efe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -840,7 +840,7 @@ struct task_struct {
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_MMU
unsigned in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ae1f5d0cb581..d36723fd9ed7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1824,7 +1824,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
* victim and then we have to bail out from the charge path.
*/
if (memcg->oom_kill_disable) {
- if (!current->in_user_fault)
+ if (!task_in_user_fault())
return OOM_SKIPPED;
css_get(&memcg->css);
current->memcg_in_oom = memcg;
diff --git a/mm/memory.c b/mm/memory.c
index 747a01d495f2..990869538ffd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4774,7 +4774,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
- mem_cgroup_enter_user_fault();
+ task_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
@@ -4782,7 +4782,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
- mem_cgroup_exit_user_fault();
+ task_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
diff --git a/mm/swap.c b/mm/swap.c
index 19600430e536..09d78e0c81fa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -433,6 +433,8 @@ void mark_page_accessed(struct page *page)
* this list is never rotated or maintained, so marking an
* evictable page accessed has no effect.
*/
+ } else if (lru_gen_enabled()) {
+ page_inc_usage(page);
} else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
@@ -468,6 +470,10 @@ void lru_cache_add(struct page *page)
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (lru_gen_enabled() && !PageActive(page) && !PageUnevictable(page) &&
+ task_in_user_fault() && !(current->flags & PF_MEMALLOC))
+ SetPageActive(page);
+
get_page(page);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
@@ -569,7 +575,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageActive(page) && !PageUnevictable(page)) {
+ if ((PageActive(page) || lru_gen_enabled()) && !PageUnevictable(page)) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
@@ -684,7 +690,8 @@ void deactivate_file_page(struct page *page)
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ if (PageLRU(page) && (PageActive(page) || lru_gen_enabled()) &&
+ !PageUnevictable(page)) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1a406475054a..1778715462b5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
- mem_cgroup_swapout(page, swap);
+
+ /* get a shadow entry before page_memcg() is cleared */
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
+ mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
@@ -2827,6 +2829,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
}
+/******************************************************************************
+ * refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop modeled after the PID controller. Currently supports the
+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
+ * added if necessary. The setpoint (SP) is the desired position; the process
+ * variable (PV) is the measured position. The error is the difference between
+ * the SP and the PV. A positive error results in a positive control output
+ * correction, which, in our case, is to allow eviction.
+ *
+ * The P term is the current refault rate refaulted/(evicted+activated), which
+ * has a weight of 1. The I term is the arithmetic mean of the last N refault
+ * rates, weighted by geometric series 1/2, 1/4, ..., 1/(1<<N).
+ *
+ * Our goal is to make sure upper tiers have refault rates similar to the base
+ * tier's. That is, we try to be fair to all tiers by maintaining similar
+ * refault rates across them.
+ */
+struct controller_pos {
+ unsigned long refaulted;
+ unsigned long total;
+ int gain;
+};
+
+static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec,
+ int type, int tier, int gain)
+{
+ struct lrugen *lrugen = &lruvec->evictable;
+ int hist = hist_from_seq_or_gen(lrugen->min_seq[type]);
+
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ pos->total = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ pos->total += lrugen->activated[hist][type][tier - 1];
+ pos->gain = gain;
+}
+
+static void reset_controller_pos(struct lruvec *lruvec, int gen, int type)
+{
+ int tier;
+ int hist = hist_from_seq_or_gen(gen);
+ struct lrugen *lrugen = &lruvec->evictable;
+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (!carryover && NR_STAT_GENS == 1)
+ return;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ if (carryover) {
+ unsigned long sum;
+
+ sum = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+ sum = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ sum += lrugen->activated[hist][type][tier - 1];
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+
+ if (NR_STAT_GENS > 1)
+ continue;
+ }
+
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+ if (tier)
+ WRITE_ONCE(lrugen->activated[hist][type][tier - 1], 0);
+ }
+}
+
+static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv)
+{
+ /*
+ * Allow eviction if the PV has a limited number of refaulted pages or a
+ * lower refault rate than the SP.
+ */
+ return pv->refaulted < SWAP_CLUSTER_MAX ||
+ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
+ sp->refaulted * max(pv->total, 1UL) * pv->gain;
+}
+
/******************************************************************************
* state change
******************************************************************************/
diff --git a/mm/workingset.c b/mm/workingset.c
index 2286ddc809b1..5e1afe9b6790 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da
return val >> MEM_CGROUP_ID_SHIFT;
}
+#ifdef CONFIG_LRU_GEN
+
+#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT
+#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations"
+#endif
+
+static void page_set_usage(struct page *page, int usage)
+{
+ unsigned long old_flags, new_flags;
+
+ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
+
+ if (!usage)
+ return;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+ new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS |
+ ((usage - 1UL) << LRU_USAGE_PGOFF);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
+/* Return a token to be stored in the shadow entry of a page being evicted. */
+static void *lru_gen_eviction(struct page *page)
+{
+ int hist, tier;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ int type = page_is_file_lru(page);
+ int usage = page_tier_usage(page);
+ struct mem_cgroup *memcg = page_memcg(page);
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_USAGE_SHIFT) | usage;
+
+ hist = hist_from_seq_or_gen(min_seq);
+ tier = lru_tier_from_usage(usage);
+ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token);
+}
+
+/* Account a refaulted page based on the token stored in its shadow entry. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+ int hist, tier, usage;
+ int memcg_id;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ int type = page_is_file_lru(page);
+
+ token = unpack_shadow(shadow, &memcg_id, &pgdat);
+ if (page_pgdat(page) != pgdat)
+ return;
+
+ rcu_read_lock();
+ memcg = page_memcg_rcu(page);
+ if (mem_cgroup_id(memcg) != memcg_id)
+ goto unlock;
+
+ usage = token & (BIT(LRU_USAGE_SHIFT) - 1);
+ token >>= LRU_USAGE_SHIFT;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT)))
+ goto unlock;
+
+ page_set_usage(page, usage);
+
+ hist = hist_from_seq_or_gen(min_seq);
+ tier = lru_tier_from_usage(usage);
+ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
+ if (tier)
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct page *page)
+{
+ return NULL;
+}
+
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
@@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (lru_gen_enabled())
+ return lru_gen_eviction(page);
+
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
@@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow)
bool workingset;
int memcgid;
+ if (lru_gen_enabled()) {
+ lru_gen_refault(page, shadow);
+ return;
+ }
+
eviction = unpack_shadow(shadow, &memcgid, &pgdat);
rcu_read_lock();
--
2.32.0.402.g57bb445576-goog