From a4c85efd7479140185f14af30e9f5c3860f4dc16 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 5 Apr 2021 11:44:28 -0600
Subject: [PATCH] BACKPORT: FROMLIST: mm: multigenerational lru: activation

For pages mapped upon page faults, the accessed bit is set during the
initial faults. We add them to the per-zone lists indexed by max_seq,
i.e., the youngest generation, so that eviction will not consider them
before the aging has scanned them. Readahead pages allocated in the
page fault path will also be added to the youngest generation, since
it is assumed that they may be needed soon.
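
As a side note on the indexing above: generations occupy a small ring
of per-zone lists, and the youngest generation is simply the slot
selected by max_seq. Below is a minimal standalone sketch of that
indexing; MAX_NR_GENS, gen_from_seq() and the sequence numbers are
made up for illustration, and the real helper is lru_gen_from_seq()
in the mm_inline.h context further down.

  #include <stdio.h>

  /* Hypothetical value; the real one is derived from Kconfig. */
  #define MAX_NR_GENS 4UL

  /* Ring-buffer slot of a generation, as lru_gen_from_seq() does. */
  static unsigned long gen_from_seq(unsigned long seq)
  {
          return seq % MAX_NR_GENS;
  }

  int main(void)
  {
          unsigned long min_seq = 21, max_seq = 23; /* made-up sequence numbers */

          /* Faulted-in pages (and fault-path readahead) join the youngest
           * generation; eviction scans the oldest generation first. */
          printf("youngest generation lists: slot %lu\n", gen_from_seq(max_seq));
          printf("oldest generation lists:   slot %lu\n", gen_from_seq(min_seq));
          return 0;
  }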

For pages accessed multiple times via file descriptors, instead of
activating them upon the second access, we activate them based on the
refault rates of their tiers. Each generation contains at most
MAX_NR_TIERS tiers, which require an additional MAX_NR_TIERS-2 bits in
page->flags. Pages accessed N times via file descriptors belong to
tier order_base_2(N). Tier 0 is the base tier; it contains pages that
were read ahead, were accessed once via file descriptors, or were
accessed only via page tables. Pages from the base tier are evicted
regardless of the refault rate. Pages from upper tiers that have
higher refault rates than the base tier will be moved to the next
generation. A feedback loop modeled after the PID controller monitors
the refault rates across all tiers and decides which upper tiers to
activate pages from in the reclaim path (a small sketch of this
arithmetic follows the list below). The advantages of this model are:
1) It has a negligible cost in the buffered IO access path because
activations are done optionally in the reclaim path.
2) It takes mapped pages into account and avoids overprotecting
pages accessed multiple times via file descriptors.
3) More tiers offer better protection to pages accessed more than
twice when workloads doing intensive buffered IO are under memory
pressure.
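
The tier arithmetic above can be illustrated with a small standalone
sketch. This is not the kernel code: order_base_2() is re-implemented
here as a round-up log2, the helper names are made up, and the real
feedback loop additionally applies per-tier gains and a
SWAP_CLUSTER_MAX floor that are omitted here.

  #include <stdio.h>

  /* Round-up log2 for n >= 1, mirroring the kernel's order_base_2(). */
  static int order_base_2(unsigned long n)
  {
          int order = 0;

          while ((1UL << order) < n)
                  order++;
          return order;
  }

  /*
   * Compare the refault rate of an upper tier (the process variable, PV)
   * against the base tier (the setpoint, SP) by cross-multiplication:
   * evicting pages from the upper tier is allowed only if its rate is
   * not higher than the base tier's.
   */
  static int upper_tier_evictable(unsigned long sp_refaulted, unsigned long sp_total,
                                  unsigned long pv_refaulted, unsigned long pv_total)
  {
          return pv_refaulted * (sp_total ? sp_total : 1) <=
                 sp_refaulted * (pv_total ? pv_total : 1);
  }

  int main(void)
  {
          unsigned long n;

          /* Accessed 1x -> tier 0, 2x -> tier 1, 3-4x -> tier 2, 5-8x -> tier 3. */
          for (n = 1; n <= 8; n++)
                  printf("accessed %lu time(s) -> tier %d\n", n, order_base_2(n));

          /* Base tier: 10 refaults out of 100; tier 2: 30 out of 100 -> protect it. */
          printf("evict pages from the upper tier? %d\n",
                 upper_tier_evictable(10, 100, 30, 100));
          return 0;
  }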

Finally, we need to make sure deactivation works when the
multigenerational lru is enabled. We cannot rely on PageActive()
because it is not set on pages in active generations; this spares the
aging the trouble of clearing it when active generations become
inactive. So we deactivate pages unconditionally, since deactivation
is not a hot code path worth additional optimizations.
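
One more aspect of the feedback loop described above is worth
illustrating: when per-generation counters are carried over (see
reset_controller_pos() in the mm/vmscan.c hunk below), the counters of
the expiring period are folded into the running averages and the sums
are halved, so older refault rates contribute with weights 1/2, 1/4,
1/8, ... Here is a minimal sketch of that decay; the struct and helper
names are made up and do not match the kernel's.

  #include <stdio.h>

  /* Per-tier running averages, decayed by half at each carry-over. */
  struct tier_stats {
          unsigned long avg_refaulted;
          unsigned long avg_total;
  };

  static void carry_over(struct tier_stats *stats, unsigned long refaulted,
                         unsigned long evicted_plus_activated)
  {
          stats->avg_refaulted = (stats->avg_refaulted + refaulted) / 2;
          stats->avg_total = (stats->avg_total + evicted_plus_activated) / 2;
  }

  int main(void)
  {
          struct tier_stats stats = { 0, 0 };

          /* Three aging rounds: 40/400, 10/400 and 80/400 refaults. */
          carry_over(&stats, 40, 400);
          carry_over(&stats, 10, 400);
          carry_over(&stats, 80, 400);

          /* The most recent round dominates the decayed average. */
          printf("avg: %lu refaulted out of %lu\n",
                 stats.avg_refaulted, stats.avg_total);
          return 0;
  }
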
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
(am from https://lore.kernel.org/patchwork/patch/1432183/)
BUG=b:123039911
TEST=Built
Change-Id: Ibc9c90757fd095cdcc0a49823ada6b55f17ffc06
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987922
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: Sonny Rao <sonnyrao@chromium.org>
Reviewed-by: Yu Zhao <yuzhao@chromium.org>
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
---
include/linux/memcontrol.h | 20 -------
include/linux/mm.h | 30 ++++++++++
include/linux/mm_inline.h | 40 +++++++++++++
include/linux/sched.h | 2 +-
mm/memcontrol.c | 2 +-
mm/memory.c | 4 +-
mm/swap.c | 11 +++-
mm/vmscan.c | 91 +++++++++++++++++++++++++++++-
mm/workingset.c | 112 +++++++++++++++++++++++++++++++++++++
9 files changed, 285 insertions(+), 27 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bfe5c486f4ad..1b9705b10457 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -913,18 +913,6 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);
-static inline void mem_cgroup_enter_user_fault(void)
-{
- WARN_ON(current->in_user_fault);
- current->in_user_fault = 1;
-}
-
-static inline void mem_cgroup_exit_user_fault(void)
-{
- WARN_ON(!current->in_user_fault);
- current->in_user_fault = 0;
-}
-
static inline bool task_in_memcg_oom(struct task_struct *p)
{
return p->memcg_in_oom;
@@ -1350,14 +1338,6 @@ static inline void mem_cgroup_handle_over_high(void)
{
}
-static inline void mem_cgroup_enter_user_fault(void)
-{
-}
-
-static inline void mem_cgroup_exit_user_fault(void)
-{
-}
-
static inline bool task_in_memcg_oom(struct task_struct *p)
{
return false;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8e9f77de1c61..1d3d83619836 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1781,6 +1781,23 @@ void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
+
+static inline void task_enter_user_fault(void)
+{
+ WARN_ON(current->in_user_fault);
+ current->in_user_fault = 1;
+}
+
+static inline void task_exit_user_fault(void)
+{
+ WARN_ON(!current->in_user_fault);
+ current->in_user_fault = 0;
+}
+
+static inline bool task_in_user_fault(void)
+{
+ return current->in_user_fault;
+}
#else
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
@@ -1802,6 +1819,19 @@ static inline void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows) { }
+
+static inline void task_enter_user_fault(void)
+{
+}
+
+static inline void task_exit_user_fault(void)
+{
+}
+
+static inline bool task_in_user_fault(void)
+{
+ return false;
+}
#endif
static inline void unmap_shared_mapping_range(struct address_space *mapping,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 4c5791f52653..2124ad9bccdb 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq)
return seq % MAX_NR_GENS;
}
+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
+static inline int lru_tier_from_usage(int usage)
+{
+ return order_base_2(usage + 1);
+}
+
/* Return a proper index regardless whether we keep a full history of stats. */
static inline int hist_from_seq_or_gen(int seq_or_gen)
{
@@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
return true;
}
+/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */
+static inline int page_tier_usage(struct page *page)
+{
+ unsigned long flags = READ_ONCE(page->flags);
+
+ return flags & BIT(PG_workingset) ?
+ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
+}
+
+/* Increment the usage counter after a page is accessed via file descriptors. */
+static inline void page_inc_usage(struct page *page)
+{
+ unsigned long usage;
+ unsigned long old_flags, new_flags;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+
+ if (!(old_flags & BIT(PG_workingset))) {
+ new_flags = old_flags | BIT(PG_workingset);
+ continue;
+ }
+
+ usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF);
+
+ new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
#else /* CONFIG_LRU_GEN */
static inline bool lru_gen_enabled(void)
@@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
return false;
}
+static inline void page_inc_usage(struct page *page)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static __always_inline void add_page_to_lru_list(struct page *page,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c135cc87bf57..370d53a46efe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -840,7 +840,7 @@ struct task_struct {
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_MMU
unsigned in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ae1f5d0cb581..d36723fd9ed7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1824,7 +1824,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
* victim and then we have to bail out from the charge path.
*/
if (memcg->oom_kill_disable) {
- if (!current->in_user_fault)
+ if (!task_in_user_fault())
return OOM_SKIPPED;
css_get(&memcg->css);
current->memcg_in_oom = memcg;
diff --git a/mm/memory.c b/mm/memory.c
index 747a01d495f2..990869538ffd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4774,7 +4774,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
- mem_cgroup_enter_user_fault();
+ task_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
@@ -4782,7 +4782,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
- mem_cgroup_exit_user_fault();
+ task_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
diff --git a/mm/swap.c b/mm/swap.c
index 19600430e536..09d78e0c81fa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -433,6 +433,8 @@ void mark_page_accessed(struct page *page)
* this list is never rotated or maintained, so marking an
* evictable page accessed has no effect.
*/
+ } else if (lru_gen_enabled()) {
+ page_inc_usage(page);
} else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
@@ -468,6 +470,10 @@ void lru_cache_add(struct page *page)
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (lru_gen_enabled() && !PageActive(page) && !PageUnevictable(page) &&
+ task_in_user_fault() && !(current->flags & PF_MEMALLOC))
+ SetPageActive(page);
+
get_page(page);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
@@ -569,7 +575,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageActive(page) && !PageUnevictable(page)) {
+ if ((PageActive(page) || lru_gen_enabled()) && !PageUnevictable(page)) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
@@ -684,7 +690,8 @@ void deactivate_file_page(struct page *page)
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ if (PageLRU(page) && (PageActive(page) || lru_gen_enabled()) &&
+ !PageUnevictable(page)) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1a406475054a..1778715462b5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
- mem_cgroup_swapout(page, swap);
+
+ /* get a shadow entry before page_memcg() is cleared */
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
+ mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
@@ -2827,6 +2829,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
}
+/******************************************************************************
+ * refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop modeled after the PID controller. Currently supports the
+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
+ * added if necessary. The setpoint (SP) is the desired position; the process
+ * variable (PV) is the measured position. The error is the difference between
+ * the SP and the PV. A positive error results in a positive control output
+ * correction, which, in our case, is to allow eviction.
+ *
+ * The P term is the current refault rate refaulted/(evicted+activated), which
+ * has a weight of 1. The I term is the arithmetic mean of the last N refault
+ * rates, weighted by geometric series 1/2, 1/4, ..., 1/(1<<N).
+ *
+ * Our goal is to make sure upper tiers have refault rates similar to the base
+ * tier's. That is, we try to be fair to all tiers by maintaining similar
+ * refault rates across them.
+ */
+struct controller_pos {
+ unsigned long refaulted;
+ unsigned long total;
+ int gain;
+};
+
+static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec,
+ int type, int tier, int gain)
+{
+ struct lrugen *lrugen = &lruvec->evictable;
+ int hist = hist_from_seq_or_gen(lrugen->min_seq[type]);
+
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ pos->total = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ pos->total += lrugen->activated[hist][type][tier - 1];
+ pos->gain = gain;
+}
+
+static void reset_controller_pos(struct lruvec *lruvec, int gen, int type)
+{
+ int tier;
+ int hist = hist_from_seq_or_gen(gen);
+ struct lrugen *lrugen = &lruvec->evictable;
+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (!carryover && NR_STAT_GENS == 1)
+ return;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ if (carryover) {
+ unsigned long sum;
+
+ sum = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+ sum = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ sum += lrugen->activated[hist][type][tier - 1];
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+
+ if (NR_STAT_GENS > 1)
+ continue;
+ }
+
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+ if (tier)
+ WRITE_ONCE(lrugen->activated[hist][type][tier - 1], 0);
+ }
+}
+
+static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv)
+{
+ /*
+ * Allow eviction if the PV has a limited number of refaulted pages or a
+ * lower refault rate than the SP.
+ */
+ return pv->refaulted < SWAP_CLUSTER_MAX ||
+ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
+ sp->refaulted * max(pv->total, 1UL) * pv->gain;
+}
+
/******************************************************************************
* state change
******************************************************************************/
diff --git a/mm/workingset.c b/mm/workingset.c
index 2286ddc809b1..5e1afe9b6790 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da
return val >> MEM_CGROUP_ID_SHIFT;
}
+#ifdef CONFIG_LRU_GEN
+
+#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT
+#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations"
+#endif
+
+static void page_set_usage(struct page *page, int usage)
+{
+ unsigned long old_flags, new_flags;
+
+ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
+
+ if (!usage)
+ return;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+ new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS |
+ ((usage - 1UL) << LRU_USAGE_PGOFF);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
+/* Return a token to be stored in the shadow entry of a page being evicted. */
+static void *lru_gen_eviction(struct page *page)
+{
+ int hist, tier;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ int type = page_is_file_lru(page);
+ int usage = page_tier_usage(page);
+ struct mem_cgroup *memcg = page_memcg(page);
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_USAGE_SHIFT) | usage;
+
+ hist = hist_from_seq_or_gen(min_seq);
+ tier = lru_tier_from_usage(usage);
+ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token);
+}
+
+/* Account a refaulted page based on the token stored in its shadow entry. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+ int hist, tier, usage;
+ int memcg_id;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ int type = page_is_file_lru(page);
+
+ token = unpack_shadow(shadow, &memcg_id, &pgdat);
+ if (page_pgdat(page) != pgdat)
+ return;
+
+ rcu_read_lock();
+ memcg = page_memcg_rcu(page);
+ if (mem_cgroup_id(memcg) != memcg_id)
+ goto unlock;
+
+ usage = token & (BIT(LRU_USAGE_SHIFT) - 1);
+ token >>= LRU_USAGE_SHIFT;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT)))
+ goto unlock;
+
+ page_set_usage(page, usage);
+
+ hist = hist_from_seq_or_gen(min_seq);
+ tier = lru_tier_from_usage(usage);
+ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
+ if (tier)
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct page *page)
+{
+ return NULL;
+}
+
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
@@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (lru_gen_enabled())
+ return lru_gen_eviction(page);
+
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
@@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow)
bool workingset;
int memcgid;
+ if (lru_gen_enabled()) {
+ lru_gen_refault(page, shadow);
+ return;
+ }
+
eviction = unpack_shadow(shadow, &memcgid, &pgdat);
rcu_read_lock();
--
2.32.0.402.g57bb445576-goog