| From a4c85efd7479140185f14af30e9f5c3860f4dc16 Mon Sep 17 00:00:00 2001 |
| From: Yu Zhao <yuzhao@google.com> |
| Date: Mon, 5 Apr 2021 11:44:28 -0600 |
| Subject: [PATCH] BACKPORT: FROMLIST: mm: multigenerational lru: activation |
| |
For pages mapped upon page faults, the accessed bit is set during the
initial faults. We add them to the per-zone lists indexed by max_seq,
i.e., the youngest generation, so that eviction will not consider them
before the aging has scanned them. Readahead pages allocated in the
page fault path are also added to the youngest generation, since it
is assumed that they may be needed soon.
| |
For pages accessed multiple times via file descriptors, instead of
activating them upon the second access, we activate them based on the
refault rates of their tiers. Each generation contains at most
MAX_NR_TIERS tiers, and tracking them requires an additional
MAX_NR_TIERS-2 bits in page->flags. Pages accessed N times via file
descriptors belong to tier order_base_2(N). Tier 0 is the base tier;
it contains pages that were read ahead, accessed once via file
descriptors, or accessed only via page tables. Pages from the base
tier are evicted regardless of the refault rate. Pages from upper
tiers that have higher refault rates than the base tier are moved to
the next generation. A feedback loop modeled after the PID controller
monitors refault rates across all tiers and decides from which upper
tiers to activate pages in the reclaim path; the tier mapping and the
rate comparison are sketched after the list below. The advantages of
this model are:
1) It has a negligible cost in the buffered IO access path because
   activations are done optionally in the reclaim path.
2) It takes mapped pages into account and avoids overprotecting
   pages accessed multiple times via file descriptors.
3) More tiers offer better protection to pages accessed more than
   twice when workloads doing intensive buffered IO are under memory
   pressure.
| |
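The reclaim-path decision itself is a cross-multiplied comparison of
refault rates, so no division is needed. Below is a minimal standalone
sketch of the check, simplified from positive_ctrl_err() in this patch,
with SWAP_CLUSTER_MAX assumed to be its usual value of 32:

  /*
   * Illustrative sketch: allow eviction if the measured tier (PV) has
   * few refaulted pages or a refault rate, refaulted/total, no higher
   * than the base tier's (SP), with each side weighted by its gain.
   * max(total, 1) guards against an empty tier.
   */
  #include <stdbool.h>

  #define SWAP_CLUSTER_MAX 32UL   /* assumed kernel default */

  struct controller_pos {
          unsigned long refaulted;
          unsigned long total;
          int gain;
  };

  static unsigned long max_ul(unsigned long a, unsigned long b)
  {
          return a > b ? a : b;
  }

  static bool positive_ctrl_err(struct controller_pos *sp,
                                struct controller_pos *pv)
  {
          return pv->refaulted < SWAP_CLUSTER_MAX ||
                 pv->refaulted * max_ul(sp->total, 1) * sp->gain <=
                 sp->refaulted * max_ul(pv->total, 1) * pv->gain;
  }
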
Finally, we need to make sure deactivation works when the
multigenerational lru is enabled. We cannot use PageActive() because
it is not set on pages from active generations, in order to spare the
aging the trouble of clearing it when active generations become
inactive. So we deactivate pages unconditionally, since deactivation
is not a hot code path that warrants additional optimization.
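
On the eviction/refault side (mm/workingset.c below), a page's usage
and the eviction-time min_seq are packed into the shadow-entry token
and recovered on refault. A simplified round-trip sketch, with the
field width LRU_USAGE_SHIFT assumed to be 3 purely for illustration:

  /*
   * Illustrative sketch of the token layout used by
   * lru_gen_eviction() and lru_gen_refault(): the low bits hold the
   * usage, the remaining bits hold min_seq at eviction time.
   */
  #define LRU_USAGE_SHIFT 3   /* assumed width of the usage field */

  static unsigned long pack_token(unsigned long min_seq, int usage)
  {
          return (min_seq << LRU_USAGE_SHIFT) | usage;
  }

  static int token_usage(unsigned long token)
  {
          return token & ((1UL << LRU_USAGE_SHIFT) - 1);
  }

  static unsigned long token_seq(unsigned long token)
  {
          return token >> LRU_USAGE_SHIFT;
  }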
| |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> |
| (am from https://lore.kernel.org/patchwork/patch/1432183/) |
| |
| BUG=b:123039911 |
| TEST=Built |
| |
| Change-Id: Ibc9c90757fd095cdcc0a49823ada6b55f17ffc06 |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987922 |
| Tested-by: Yu Zhao <yuzhao@chromium.org> |
| Reviewed-by: Sonny Rao <sonnyrao@chromium.org> |
| Reviewed-by: Yu Zhao <yuzhao@chromium.org> |
| Commit-Queue: Yu Zhao <yuzhao@chromium.org> |
| --- |
| include/linux/memcontrol.h | 20 ------- |
| include/linux/mm.h | 30 ++++++++++ |
| include/linux/mm_inline.h | 40 +++++++++++++ |
| include/linux/sched.h | 2 +- |
| mm/memcontrol.c | 2 +- |
| mm/memory.c | 4 +- |
| mm/swap.c | 11 +++- |
| mm/vmscan.c | 91 +++++++++++++++++++++++++++++- |
| mm/workingset.c | 112 +++++++++++++++++++++++++++++++++++++ |
| 9 files changed, 285 insertions(+), 27 deletions(-) |
| |
| diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h |
| index bfe5c486f4ad..1b9705b10457 100644 |
| --- a/include/linux/memcontrol.h |
| +++ b/include/linux/memcontrol.h |
| @@ -913,18 +913,6 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, |
| |
| void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); |
| |
| -static inline void mem_cgroup_enter_user_fault(void) |
| -{ |
| - WARN_ON(current->in_user_fault); |
| - current->in_user_fault = 1; |
| -} |
| - |
| -static inline void mem_cgroup_exit_user_fault(void) |
| -{ |
| - WARN_ON(!current->in_user_fault); |
| - current->in_user_fault = 0; |
| -} |
| - |
| static inline bool task_in_memcg_oom(struct task_struct *p) |
| { |
| return p->memcg_in_oom; |
| @@ -1350,14 +1338,6 @@ static inline void mem_cgroup_handle_over_high(void) |
| { |
| } |
| |
| -static inline void mem_cgroup_enter_user_fault(void) |
| -{ |
| -} |
| - |
| -static inline void mem_cgroup_exit_user_fault(void) |
| -{ |
| -} |
| - |
| static inline bool task_in_memcg_oom(struct task_struct *p) |
| { |
| return false; |
| diff --git a/include/linux/mm.h b/include/linux/mm.h |
| index 8e9f77de1c61..1d3d83619836 100644 |
| --- a/include/linux/mm.h |
| +++ b/include/linux/mm.h |
| @@ -1781,6 +1781,23 @@ void unmap_mapping_pages(struct address_space *mapping, |
| pgoff_t start, pgoff_t nr, bool even_cows); |
| void unmap_mapping_range(struct address_space *mapping, |
| loff_t const holebegin, loff_t const holelen, int even_cows); |
| + |
| +static inline void task_enter_user_fault(void) |
| +{ |
| + WARN_ON(current->in_user_fault); |
| + current->in_user_fault = 1; |
| +} |
| + |
| +static inline void task_exit_user_fault(void) |
| +{ |
| + WARN_ON(!current->in_user_fault); |
| + current->in_user_fault = 0; |
| +} |
| + |
| +static inline bool task_in_user_fault(void) |
| +{ |
| + return current->in_user_fault; |
| +} |
| #else |
| static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, |
| unsigned long address, unsigned int flags, |
| @@ -1802,6 +1819,19 @@ static inline void unmap_mapping_pages(struct address_space *mapping, |
| pgoff_t start, pgoff_t nr, bool even_cows) { } |
| static inline void unmap_mapping_range(struct address_space *mapping, |
| loff_t const holebegin, loff_t const holelen, int even_cows) { } |
| + |
| +static inline void task_enter_user_fault(void) |
| +{ |
| +} |
| + |
| +static inline void task_exit_user_fault(void) |
| +{ |
| +} |
| + |
| +static inline bool task_in_user_fault(void) |
| +{ |
| + return false; |
| +} |
| #endif |
| |
| static inline void unmap_shared_mapping_range(struct address_space *mapping, |
| diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h |
| index 4c5791f52653..2124ad9bccdb 100644 |
| --- a/include/linux/mm_inline.h |
| +++ b/include/linux/mm_inline.h |
| @@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq) |
| return seq % MAX_NR_GENS; |
| } |
| |
| +/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */ |
| +static inline int lru_tier_from_usage(int usage) |
| +{ |
| + return order_base_2(usage + 1); |
| +} |
| + |
| /* Return a proper index regardless whether we keep a full history of stats. */ |
| static inline int hist_from_seq_or_gen(int seq_or_gen) |
| { |
| @@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec) |
| return true; |
| } |
| |
| +/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */ |
| +static inline int page_tier_usage(struct page *page) |
| +{ |
| + unsigned long flags = READ_ONCE(page->flags); |
| + |
| + return flags & BIT(PG_workingset) ? |
| + ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0; |
| +} |
| + |
| +/* Increment the usage counter after a page is accessed via file descriptors. */ |
| +static inline void page_inc_usage(struct page *page) |
| +{ |
| + unsigned long usage; |
| + unsigned long old_flags, new_flags; |
| + |
| + do { |
| + old_flags = READ_ONCE(page->flags); |
| + |
| + if (!(old_flags & BIT(PG_workingset))) { |
| + new_flags = old_flags | BIT(PG_workingset); |
| + continue; |
| + } |
| + |
| + usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF); |
| + |
| + new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK); |
| + } while (new_flags != old_flags && |
| + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); |
| +} |
| + |
| #else /* CONFIG_LRU_GEN */ |
| |
| static inline bool lru_gen_enabled(void) |
| @@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec) |
| return false; |
| } |
| |
| +static inline void page_inc_usage(struct page *page) |
| +{ |
| +} |
| + |
| #endif /* CONFIG_LRU_GEN */ |
| |
| static __always_inline void add_page_to_lru_list(struct page *page, |
| diff --git a/include/linux/sched.h b/include/linux/sched.h |
| index c135cc87bf57..370d53a46efe 100644 |
| --- a/include/linux/sched.h |
| +++ b/include/linux/sched.h |
| @@ -840,7 +840,7 @@ struct task_struct { |
| #ifndef TIF_RESTORE_SIGMASK |
| unsigned restore_sigmask:1; |
| #endif |
| -#ifdef CONFIG_MEMCG |
| +#ifdef CONFIG_MMU |
| unsigned in_user_fault:1; |
| #endif |
| #ifdef CONFIG_COMPAT_BRK |
| diff --git a/mm/memcontrol.c b/mm/memcontrol.c |
| index ae1f5d0cb581..d36723fd9ed7 100644 |
| --- a/mm/memcontrol.c |
| +++ b/mm/memcontrol.c |
| @@ -1824,7 +1824,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int |
| * victim and then we have to bail out from the charge path. |
| */ |
| if (memcg->oom_kill_disable) { |
| - if (!current->in_user_fault) |
| + if (!task_in_user_fault()) |
| return OOM_SKIPPED; |
| css_get(&memcg->css); |
| current->memcg_in_oom = memcg; |
| diff --git a/mm/memory.c b/mm/memory.c |
| index 747a01d495f2..990869538ffd 100644 |
| --- a/mm/memory.c |
| +++ b/mm/memory.c |
| @@ -4774,7 +4774,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
| * space. Kernel faults are handled more gracefully. |
| */ |
| if (flags & FAULT_FLAG_USER) |
| - mem_cgroup_enter_user_fault(); |
| + task_enter_user_fault(); |
| |
| if (unlikely(is_vm_hugetlb_page(vma))) |
| ret = hugetlb_fault(vma->vm_mm, vma, address, flags); |
| @@ -4782,7 +4782,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
| ret = __handle_mm_fault(vma, address, flags); |
| |
| if (flags & FAULT_FLAG_USER) { |
| - mem_cgroup_exit_user_fault(); |
| + task_exit_user_fault(); |
| /* |
| * The task may have entered a memcg OOM situation but |
| * if the allocation error was handled gracefully (no |
| diff --git a/mm/swap.c b/mm/swap.c |
| index 19600430e536..09d78e0c81fa 100644 |
| --- a/mm/swap.c |
| +++ b/mm/swap.c |
| @@ -433,6 +433,8 @@ void mark_page_accessed(struct page *page) |
| * this list is never rotated or maintained, so marking an |
| * evictable page accessed has no effect. |
| */ |
| + } else if (lru_gen_enabled()) { |
| + page_inc_usage(page); |
| } else if (!PageActive(page)) { |
| /* |
| * If the page is on the LRU, queue it for activation via |
| @@ -468,6 +470,10 @@ void lru_cache_add(struct page *page) |
| VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); |
| VM_BUG_ON_PAGE(PageLRU(page), page); |
| |
| + if (lru_gen_enabled() && !PageActive(page) && !PageUnevictable(page) && |
| + task_in_user_fault() && !(current->flags & PF_MEMALLOC)) |
| + SetPageActive(page); |
| + |
| get_page(page); |
| local_lock(&lru_pvecs.lock); |
| pvec = this_cpu_ptr(&lru_pvecs.lru_add); |
| @@ -569,7 +575,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) |
| |
| static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) |
| { |
| - if (PageActive(page) && !PageUnevictable(page)) { |
| + if ((PageActive(page) || lru_gen_enabled()) && !PageUnevictable(page)) { |
| int nr_pages = thp_nr_pages(page); |
| |
| del_page_from_lru_list(page, lruvec); |
| @@ -684,7 +690,8 @@ void deactivate_file_page(struct page *page) |
| */ |
| void deactivate_page(struct page *page) |
| { |
| - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { |
| + if (PageLRU(page) && (PageActive(page) || lru_gen_enabled()) && |
| + !PageUnevictable(page)) { |
| struct pagevec *pvec; |
| |
| local_lock(&lru_pvecs.lock); |
| diff --git a/mm/vmscan.c b/mm/vmscan.c |
| index 1a406475054a..1778715462b5 100644 |
| --- a/mm/vmscan.c |
| +++ b/mm/vmscan.c |
| @@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, |
| |
| if (PageSwapCache(page)) { |
| swp_entry_t swap = { .val = page_private(page) }; |
| - mem_cgroup_swapout(page, swap); |
| + |
| + /* get a shadow entry before page_memcg() is cleared */ |
| if (reclaimed && !mapping_exiting(mapping)) |
| shadow = workingset_eviction(page, target_memcg); |
| + mem_cgroup_swapout(page, swap); |
| __delete_from_swap_cache(page, swap, shadow); |
| xa_unlock_irqrestore(&mapping->i_pages, flags); |
| put_swap_page(page, swap); |
| @@ -2827,6 +2829,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) |
| get_nr_gens(lruvec, 1) <= MAX_NR_GENS; |
| } |
| |
| +/****************************************************************************** |
| + * refault feedback loop |
| + ******************************************************************************/ |
| + |
| +/* |
| + * A feedback loop modeled after the PID controller. Currently supports the |
| + * proportional (P) and the integral (I) terms; the derivative (D) term can be |
| + * added if necessary. The setpoint (SP) is the desired position; the process |
| + * variable (PV) is the measured position. The error is the difference between |
| + * the SP and the PV. A positive error results in a positive control output |
| + * correction, which, in our case, is to allow eviction. |
| + * |
| + * The P term is the current refault rate refaulted/(evicted+activated), which |
| + * has a weight of 1. The I term is the arithmetic mean of the last N refault |
| + * rates, weighted by geometric series 1/2, 1/4, ..., 1/(1<<N). |
| + * |
| + * Our goal is to make sure upper tiers have similar refault rates as the base |
| + * tier. That is we try to be fair to all tiers by maintaining similar refault |
| + * rates across them. |
| + */ |
| +struct controller_pos { |
| + unsigned long refaulted; |
| + unsigned long total; |
| + int gain; |
| +}; |
| + |
| +static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec, |
| + int type, int tier, int gain) |
| +{ |
| + struct lrugen *lrugen = &lruvec->evictable; |
| + int hist = hist_from_seq_or_gen(lrugen->min_seq[type]); |
| + |
| + pos->refaulted = lrugen->avg_refaulted[type][tier] + |
| + atomic_long_read(&lrugen->refaulted[hist][type][tier]); |
| + pos->total = lrugen->avg_total[type][tier] + |
| + atomic_long_read(&lrugen->evicted[hist][type][tier]); |
| + if (tier) |
| + pos->total += lrugen->activated[hist][type][tier - 1]; |
| + pos->gain = gain; |
| +} |
| + |
| +static void reset_controller_pos(struct lruvec *lruvec, int gen, int type) |
| +{ |
| + int tier; |
| + int hist = hist_from_seq_or_gen(gen); |
| + struct lrugen *lrugen = &lruvec->evictable; |
| + bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); |
| + |
| + if (!carryover && NR_STAT_GENS == 1) |
| + return; |
| + |
| + for (tier = 0; tier < MAX_NR_TIERS; tier++) { |
| + if (carryover) { |
| + unsigned long sum; |
| + |
| + sum = lrugen->avg_refaulted[type][tier] + |
| + atomic_long_read(&lrugen->refaulted[hist][type][tier]); |
| + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); |
| + |
| + sum = lrugen->avg_total[type][tier] + |
| + atomic_long_read(&lrugen->evicted[hist][type][tier]); |
| + if (tier) |
| + sum += lrugen->activated[hist][type][tier - 1]; |
| + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); |
| + |
| + if (NR_STAT_GENS > 1) |
| + continue; |
| + } |
| + |
| + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); |
| + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); |
| + if (tier) |
| + WRITE_ONCE(lrugen->activated[hist][type][tier - 1], 0); |
| + } |
| +} |
| + |
| +static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv) |
| +{ |
| + /* |
| + * Allow eviction if the PV has a limited number of refaulted pages or a |
| + * lower refault rate than the SP. |
| + */ |
| + return pv->refaulted < SWAP_CLUSTER_MAX || |
| + pv->refaulted * max(sp->total, 1UL) * sp->gain <= |
| + sp->refaulted * max(pv->total, 1UL) * pv->gain; |
| +} |
| + |
| /****************************************************************************** |
| * state change |
| ******************************************************************************/ |
| diff --git a/mm/workingset.c b/mm/workingset.c |
| index 2286ddc809b1..5e1afe9b6790 100644 |
| --- a/mm/workingset.c |
| +++ b/mm/workingset.c |
| @@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da |
| return val >> MEM_CGROUP_ID_SHIFT; |
| } |
| |
| +#ifdef CONFIG_LRU_GEN |
| + |
| +#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT |
| +#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations" |
| +#endif |
| + |
| +static void page_set_usage(struct page *page, int usage) |
| +{ |
| + unsigned long old_flags, new_flags; |
| + |
| + VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH)); |
| + |
| + if (!usage) |
| + return; |
| + |
| + do { |
| + old_flags = READ_ONCE(page->flags); |
| + new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS | |
| + ((usage - 1UL) << LRU_USAGE_PGOFF); |
| + } while (new_flags != old_flags && |
| + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); |
| +} |
| + |
| +/* Return a token to be stored in the shadow entry of a page being evicted. */ |
| +static void *lru_gen_eviction(struct page *page) |
| +{ |
| + int hist, tier; |
| + unsigned long token; |
| + unsigned long min_seq; |
| + struct lruvec *lruvec; |
| + struct lrugen *lrugen; |
| + int type = page_is_file_lru(page); |
| + int usage = page_tier_usage(page); |
| + struct mem_cgroup *memcg = page_memcg(page); |
| + struct pglist_data *pgdat = page_pgdat(page); |
| + |
| + lruvec = mem_cgroup_lruvec(memcg, pgdat); |
| + lrugen = &lruvec->evictable; |
| + min_seq = READ_ONCE(lrugen->min_seq[type]); |
| + token = (min_seq << LRU_USAGE_SHIFT) | usage; |
| + |
| + hist = hist_from_seq_or_gen(min_seq); |
| + tier = lru_tier_from_usage(usage); |
| + atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]); |
| + |
| + return pack_shadow(mem_cgroup_id(memcg), pgdat, token); |
| +} |
| + |
| +/* Account a refaulted page based on the token stored in its shadow entry. */ |
| +static void lru_gen_refault(struct page *page, void *shadow) |
| +{ |
| + int hist, tier, usage; |
| + int memcg_id; |
| + unsigned long token; |
| + unsigned long min_seq; |
| + struct lruvec *lruvec; |
| + struct lrugen *lrugen; |
| + struct pglist_data *pgdat; |
| + struct mem_cgroup *memcg; |
| + int type = page_is_file_lru(page); |
| + |
| + token = unpack_shadow(shadow, &memcg_id, &pgdat); |
| + if (page_pgdat(page) != pgdat) |
| + return; |
| + |
| + rcu_read_lock(); |
| + memcg = page_memcg_rcu(page); |
| + if (mem_cgroup_id(memcg) != memcg_id) |
| + goto unlock; |
| + |
| + usage = token & (BIT(LRU_USAGE_SHIFT) - 1); |
| + token >>= LRU_USAGE_SHIFT; |
| + |
| + lruvec = mem_cgroup_lruvec(memcg, pgdat); |
| + lrugen = &lruvec->evictable; |
| + min_seq = READ_ONCE(lrugen->min_seq[type]); |
| + if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT))) |
| + goto unlock; |
| + |
| + page_set_usage(page, usage); |
| + |
| + hist = hist_from_seq_or_gen(min_seq); |
| + tier = lru_tier_from_usage(usage); |
| + atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]); |
| + inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type); |
| + if (tier) |
| + inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type); |
| +unlock: |
| + rcu_read_unlock(); |
| +} |
| + |
| +#else /* CONFIG_LRU_GEN */ |
| + |
| +static void *lru_gen_eviction(struct page *page) |
| +{ |
| + return NULL; |
| +} |
| + |
| +static void lru_gen_refault(struct page *page, void *shadow) |
| +{ |
| +} |
| + |
| +#endif /* CONFIG_LRU_GEN */ |
| + |
| /** |
| * workingset_age_nonresident - age non-resident entries as LRU ages |
| * @lruvec: the lruvec that was aged |
| @@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) |
| VM_BUG_ON_PAGE(page_count(page), page); |
| VM_BUG_ON_PAGE(!PageLocked(page), page); |
| |
| + if (lru_gen_enabled()) |
| + return lru_gen_eviction(page); |
| + |
| lruvec = mem_cgroup_lruvec(target_memcg, pgdat); |
| /* XXX: target_memcg can be NULL, go through lruvec */ |
| memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); |
| @@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow) |
| bool workingset; |
| int memcgid; |
| |
| + if (lru_gen_enabled()) { |
| + lru_gen_refault(page, shadow); |
| + return; |
| + } |
| + |
| eviction = unpack_shadow(shadow, &memcgid, &pgdat); |
| |
| rcu_read_lock(); |
| -- |
| 2.32.0.402.g57bb445576-goog |
| |