blob: ed16f6a1c14ae944d07c82bf255491791eb387bd [file] [log] [blame]
From 462cd4b89d57ee78d15caf25c1f3bb3f1f6e902a Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:12:33 -0700
Subject: [PATCH] BACKPORT: FROMLIST: mm: multigenerational lru: groundwork
For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in
lrugen->max_seq for both anon and file types as they are aged on an
equal footing. The oldest generation numbers are stored in
lrugen->min_seq[2] separately for anon and file types as clean file
pages can be evicted regardless of may_swap or may_writepage. These
three variables are monotonically increasing. Generation numbers are
truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
page->flags. The sliding window technique is used to prevent truncated
generation numbers from overlapping. Each truncated generation number
is an index to
lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. Evictable
pages are added to the per-zone lists indexed by lrugen->max_seq or
lrugen->min_seq[2] (modulo MAX_NR_GENS), depending on their types.
Each generation is then divided into multiple tiers. Tiers represent
levels of usage from file descriptors only. Pages accessed N times via
file descriptors belong to tier order_base_2(N). Each generation
contains at most MAX_NR_TIERS tiers, and they require additional
MAX_NR_TIERS-2 bits in page->flags. In contrast to moving across
generations which requires the lru lock for the list operations,
moving across tiers only involves an atomic operation on page->flags
and therefore has a negligible cost. A feedback loop modeled after the
PID controller monitors the refault rates across all tiers and decides
when to activate pages from which tiers in the reclaim path.
The framework comprises two conceptually independent components: the
aging and the eviction, which can be invoked separately from user
space for the purpose of working set estimation and proactive reclaim.
The aging produces young generations. Given an lruvec, the aging scans
page tables for referenced pages of this lruvec. Upon finding one, the
aging updates its generation number to max_seq. After each round of
scan, the aging increments max_seq. The aging is due when both of
min_seq[2] reaches max_seq-1, assuming both anon and file types are
reclaimable.
The eviction consumes old generations. Given an lruvec, the eviction
scans the pages on the per-zone lists indexed by either of min_seq[2].
It tries to select a type based on the values of min_seq[2] and
swappiness. During a scan, the eviction sorts pages according to their
new generation numbers, if the aging has found them referenced. When
it finds all the per-zone lists of a selected type are empty, the
eviction increments min_seq[2] indexed by this selected type.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
(am from https://lore.kernel.org/patchwork/patch/1432182/)
BUG=b:123039911
TEST=Built
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987921
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: Sonny Rao <sonnyrao@chromium.org>
Reviewed-by: Yu Zhao <yuzhao@chromium.org>
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
---
fs/fuse/dev.c | 3 +-
include/linux/mm.h | 2 +
include/linux/mm_inline.h | 194 +++++++++++++++++++
include/linux/mmzone.h | 106 +++++++++++
include/linux/page-flags-layout.h | 15 +-
include/linux/page-flags.h | 4 +-
kernel/bounds.c | 6 +
mm/huge_memory.c | 3 +-
mm/mm_init.c | 6 +-
mm/mmzone.c | 2 +
mm/swapfile.c | 4 +
mm/vmscan.c | 307 ++++++++++++++++++++++++++++++
12 files changed, 643 insertions(+), 9 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page)
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_USAGE_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1096,6 +1096,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_USAGE_PGOFF (LRU_GEN_PGOFF - LRU_USAGE_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -79,11 +79,199 @@ static __always_inline enum lru_list page_lru(struct page *page)
return lru;
}
+#ifdef CONFIG_LRU_GEN
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
+
+static inline bool lru_gen_enabled(void)
+{
+ return static_branch_likely(&lru_gen_static_key);
+}
+#else
+DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
+
+static inline bool lru_gen_enabled(void)
+{
+ return static_branch_unlikely(&lru_gen_static_key);
+}
+#endif
+
+/* We track at most MAX_NR_GENS generations using the sliding window technique. */
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+/* Return a proper index regardless whether we keep a full history of stats. */
+static inline int hist_from_seq_or_gen(int seq_or_gen)
+{
+ return seq_or_gen % NR_STAT_GENS;
+}
+
+/* The youngest and the second youngest generations are counted as active. */
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq);
+
+ VM_BUG_ON(!max_seq);
+ VM_BUG_ON(gen >= MAX_NR_GENS);
+
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+/* Update the sizes of the multigenerational lru lists. */
+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
+ int old_gen, int new_gen)
+{
+ int type = page_is_file_lru(page);
+ int zone = page_zonenum(page);
+ int delta = thp_nr_pages(page);
+ enum lru_list lru = type * LRU_FILE;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ lockdep_assert_held(&lruvec->lru_lock);
+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_BUG_ON(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
+ lrugen->sizes[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
+ lrugen->sizes[new_gen][type][zone] + delta);
+
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+
+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+ update_lru_size(lruvec, lru, zone, -delta);
+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+ }
+
+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
+}
+
+/* Add a page to one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_addition(struct page *page, struct lruvec *lruvec, bool front)
+{
+ int gen;
+ unsigned long old_flags, new_flags;
+ int type = page_is_file_lru(page);
+ int zone = page_zonenum(page);
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ if (PageUnevictable(page) || !lrugen->enabled[type])
+ return false;
+ /*
+ * If a page is being faulted in, add it to the youngest generation.
+ * try_walk_mm_list() may look at the size of the youngest generation to
+ * determine if the aging is due.
+ *
+ * If a page can't be evicted immediately, i.e., an anon page not in
+ * swap cache, a dirty file page under reclaim, or a page rejected by
+ * evict_pages() due to races, dirty buffer heads, etc., add it to the
+ * second oldest generation.
+ *
+ * If a page could be evicted immediately, i.e., a clean file page, add
+ * it to the oldest generation.
+ */
+ if (PageActive(page))
+ gen = lru_gen_from_seq(lrugen->max_seq);
+ else if ((!type && !PageSwapCache(page)) ||
+ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))) ||
+ (!PageReferenced(page) && PageWorkingset(page)))
+ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
+ else
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+ VM_BUG_ON_PAGE(old_flags & LRU_GEN_MASK, page);
+
+ new_flags = (old_flags & ~(LRU_GEN_MASK | BIT(PG_active))) |
+ ((gen + 1UL) << LRU_GEN_PGOFF);
+ /* see the comment in evict_pages() */
+ if (!(old_flags & BIT(PG_referenced)))
+ new_flags &= ~(LRU_USAGE_MASK | LRU_TIER_FLAGS);
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+ lru_gen_update_size(page, lruvec, -1, gen);
+ if (front)
+ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
+ else
+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+
+ return true;
+}
+
+/* Delete a page from one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
+{
+ int gen;
+ unsigned long old_flags, new_flags;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+ if (!(old_flags & LRU_GEN_MASK))
+ return false;
+
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+
+ gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ new_flags = old_flags & ~LRU_GEN_MASK;
+ /* mark page active accordingly */
+ if (lru_gen_is_active(lruvec, gen))
+ new_flags |= BIT(PG_active);
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+ lru_gen_update_size(page, lruvec, gen, -1);
+ list_del(&page->lru);
+
+ return true;
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_addition(struct page *page, struct lruvec *lruvec, bool front)
+{
+ return false;
+}
+
+static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec)
{
enum lru_list lru = page_lru(page);
+ if (lru_gen_addition(page, lruvec, true))
+ return;
+
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
}
@@ -93,6 +281,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
{
enum lru_list lru = page_lru(page);
+ if (lru_gen_addition(page, lruvec, false))
+ return;
+
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add_tail(&page->lru, &lruvec->lists[lru]);
}
@@ -100,6 +291,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec)
{
+ if (lru_gen_deletion(page, lruvec))
+ return;
+
list_del(&page->lru);
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -294,6 +294,108 @@ enum lruvec_flags {
*/
};
+struct lruvec;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+/*
+ * For each lruvec, evictable pages are divided into multiple generations. The
+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
+ * monotonically increasing. The sliding window technique is used to track at
+ * most MAX_NR_GENS and at least MIN_NR_GENS generations. An offset within the
+ * window, AKA gen, indexes an array of per-type and per-zone lists for the
+ * corresponding generation. The counter in page->flags stores gen+1 while a
+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
+ */
+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+
+/*
+ * Each generation is then divided into multiple tiers. Tiers represent levels
+ * of usage from file descriptors, i.e., mark_page_accessed(). In contrast to
+ * moving across generations which requires the lru lock, moving across tiers
+ * only involves an atomic operation on page->flags and therefore has a
+ * negligible cost.
+ *
+ * The purposes of tiers are to:
+ * 1) estimate whether pages accessed multiple times via file descriptors are
+ * more active than pages accessed only via page tables by separating the two
+ * access types into upper tiers and the base tier and comparing refault rates
+ * across tiers.
+ * 2) improve buffered io performance by deferring activations of pages
+ * accessed multiple times until the eviction. That is activations happen in
+ * the reclaim path, not the access path.
+ *
+ * Pages accessed N times via file descriptors belong to tier order_base_2(N).
+ * The base tier uses the following page flag:
+ * !PageReferenced() -- readahead pages
+ * PageReferenced() -- single-access pages
+ * All upper tiers use the following page flags:
+ * PageReferenced() && PageWorkingset() -- multi-access pages
+ * in addition to the bits storing N-2 accesses. Therefore, we can support one
+ * upper tier without using additional bits in page->flags.
+ *
+ * Note that
+ * 1) PageWorkingset() is always set for upper tiers because we want to
+ * maintain the existing psi behavior.
+ * 2) !PageReferenced() && PageWorkingset() is not a valid tier. See the
+ * comment in evict_pages().
+ *
+ * Pages from the base tier are evicted regardless of its refault rate. Pages
+ * from upper tiers will be moved to the next generation, if their refault rates
+ * are higher than that of the base tier.
+ */
+#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN)
+#define LRU_TIER_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
+#define LRU_USAGE_SHIFT (CONFIG_TIERS_PER_GEN - 1)
+
+/* Whether to keep historical stats for each generation. */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_STAT_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+#else
+#define NR_STAT_GENS 1U
+#endif
+
+struct lrugen {
+ /* the aging increments the max generation number */
+ unsigned long max_seq;
+ /* the eviction increments the min generation numbers */
+ unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
+ /* the multigenerational lru lists */
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the sizes of the multigenerational lru lists in pages */
+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* to determine which type and its tiers to evict */
+ atomic_long_t evicted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ atomic_long_t refaulted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* the base tier won't be activated */
+ unsigned long activated[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
+ /* arithmetic mean weighted by geometric series 1/2, 1/4, ... */
+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multigenerational lru is enabled */
+ bool enabled[ANON_AND_FILE];
+};
+
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_set_state(bool enable, bool main, bool swap);
+
+#else /* CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+}
+
+static inline void lru_gen_set_state(bool enable, bool main, bool swap)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -311,6 +413,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ /* unevictable pages are on LRU_UNEVICTABLE */
+ struct lrugen evictable;
+#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -24,6 +24,14 @@
#error ZONES_SHIFT "Too many zones configured"
#endif
+#ifdef CONFIG_LRU_GEN
+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
+#define LRU_USAGE_WIDTH (CONFIG_TIERS_PER_GEN - 2)
+#else
+#define LRU_GEN_WIDTH 0
+#define LRU_USAGE_WIDTH 0
+#endif
+
#define ZONES_WIDTH ZONES_SHIFT
#ifdef CONFIG_SPARSEMEM
@@ -55,7 +63,8 @@
#define SECTIONS_WIDTH 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+LRU_USAGE_WIDTH+NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,7 +98,7 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
+#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+LRU_USAGE_WIDTH+NODES_WIDTH+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \
<= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
@@ -100,7 +109,7 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
+#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+LRU_USAGE_WIDTH+NODES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \
> BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -822,7 +822,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -833,7 +833,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_USAGE_MASK)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
diff --git a/kernel/bounds.c b/kernel/bounds.c
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,12 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ /* bits needed to represent internal values stored in page->flags */
+ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
+ /* bits needed to represent normalized values for external uses */
+ DEFINE(LRU_GEN_SHIFT, order_base_2(CONFIG_NR_LRU_GENS));
+#endif
/* End of constants */
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_USAGE_MASK));
/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
diff --git a/mm/mm_init.c b/mm/mm_init.c
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_USAGE_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_USAGE_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
diff --git a/mm/mmzone.c b/mm/mmzone.c
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
+
+ lru_gen_init_lruvec(lruvec);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
diff --git a/mm/swapfile.c b/mm/swapfile.c
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2696,6 +2696,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
err = 0;
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
+ /* stop tracking anon if the multigenerational lru is turned off */
+ lru_gen_set_state(false, false, true);
out_dput:
if (victim)
@@ -3374,6 +3376,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
+ /* start tracking anon if the multigenerational lru is turned on */
+ lru_gen_set_state(true, false, true);
error = 0;
goto out;
diff --git a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
+#include <linux/memory.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -2911,6 +2912,312 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return can_demote(pgdat->node_id, sc);
}
+#ifdef CONFIG_LRU_GEN
+
+/*
+ * After pages are faulted in, the aging must scan them twice before the
+ * eviction can consider them. The first scan clears the accessed bit set during
+ * initial faults. And the second scan makes sure they haven't been used since
+ * the first scan.
+ */
+#define MIN_NR_GENS 2
+
+#define MAX_BATCH_SIZE 8192
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define DEFINE_MAX_SEQ() \
+ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq)
+
+#define DEFINE_MIN_SEQ() \
+ unsigned long min_seq[ANON_AND_FILE] = { \
+ READ_ONCE(lruvec->evictable.min_seq[0]), \
+ READ_ONCE(lruvec->evictable.min_seq[1]), \
+ }
+
+#define for_each_type_zone(type, zone) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static int page_lru_gen(struct page *page)
+{
+ return ((page->flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static int get_lo_wmark(unsigned long max_seq, unsigned long *min_seq, int swappiness)
+{
+ return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1;
+}
+
+static int get_hi_wmark(unsigned long max_seq, unsigned long *min_seq, int swappiness)
+{
+ return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1;
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+ lockdep_assert_held(&lruvec->lru_lock);
+
+ return get_nr_gens(lruvec, 0) >= MIN_NR_GENS &&
+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS &&
+ get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
+ get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
+#else
+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
+#endif
+
+static DEFINE_MUTEX(lru_gen_state_mutex);
+static int lru_gen_nr_swapfiles __read_mostly;
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ enum lru_list lru;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ for_each_evictable_lru(lru) {
+ type = is_file_lru(lru);
+
+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+
+ VM_WARN_ON_ONCE(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
+ }
+
+ return true;
+}
+
+static bool fill_lru_gen_lists(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int batch_size = 0;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ if (!lruvec->evictable.enabled[type])
+ continue;
+
+ while (!list_empty(head)) {
+ bool success;
+ struct page *page = lru_to_page(head);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageActive(page) != active, page);
+ VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page);
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+
+ prefetchw_prev_lru_page(page, head, flags);
+
+ del_page_from_lru_list(page, lruvec);
+ success = lru_gen_addition(page, lruvec, true);
+ VM_BUG_ON(!success);
+
+ if (++batch_size == MAX_BATCH_SIZE)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_lru_gen_lists(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int batch_size = 0;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
+
+ if (lruvec->evictable.enabled[type])
+ continue;
+
+ while (!list_empty(head)) {
+ bool success;
+ struct page *page = lru_to_page(head);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+ prefetchw_prev_lru_page(page, head, flags);
+
+ success = lru_gen_deletion(page, lruvec);
+ VM_BUG_ON(!success);
+ add_page_to_lru_list(page, lruvec);
+
+ if (++batch_size == MAX_BATCH_SIZE)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * For file page tracking, we enable/disable it according to the main switch.
+ * For anon page tracking, we only enabled it when the main switch is on and
+ * there is at least one swapfile; we disable it when there are no swapfiles
+ * regardless of the value of the main switch. Otherwise, we will eventually
+ * reach the max size of the sliding window and have to call inc_min_seq(),
+ * which brings an unnecessary overhead.
+ */
+void lru_gen_set_state(bool enable, bool main, bool swap)
+{
+ struct mem_cgroup *memcg;
+
+ mem_hotplug_begin();
+ mutex_lock(&lru_gen_state_mutex);
+ cgroup_lock();
+
+ main = main && enable != lru_gen_enabled();
+ swap = swap && !(enable ? lru_gen_nr_swapfiles++ : --lru_gen_nr_swapfiles);
+ swap = swap && lru_gen_enabled();
+ if (!main && !swap)
+ goto unlock;
+
+ if (main) {
+ if (enable)
+ static_branch_enable(&lru_gen_static_key);
+ else
+ static_branch_disable(&lru_gen_static_key);
+ }
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_BUG_ON(!seq_is_valid(lruvec));
+ VM_BUG_ON(!state_is_valid(lruvec));
+
+ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles);
+ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled());
+
+ while (!(enable ? fill_lru_gen_lists(lruvec) :
+ drain_lru_gen_lists(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ cgroup_unlock();
+ mutex_unlock(&lru_gen_state_mutex);
+ mem_hotplug_done();
+}
+
+static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct mem_cgroup *memcg;
+ struct memory_notify *mnb = arg;
+ int nid = mnb->status_change_nid;
+
+ if (action != MEM_GOING_ONLINE || nid == NUMA_NO_NODE)
+ return NOTIFY_DONE;
+
+ mutex_lock(&lru_gen_state_mutex);
+ cgroup_lock();
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ VM_BUG_ON(!seq_is_valid(lruvec));
+ VM_BUG_ON(!state_is_valid(lruvec));
+
+ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles);
+ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled());
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ cgroup_unlock();
+ mutex_unlock(&lru_gen_state_mutex);
+
+ return NOTIFY_DONE;
+}
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+ int i;
+ int gen, type, zone;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+ lrugen->enabled[1] = lru_gen_enabled();
+
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ if (hotplug_memory_notifier(lru_gen_online_mem, 0))
+ pr_err("lru_gen: failed to subscribe hotplug notifications\n");
+
+ return 0;
+};
+/*
+ * We want to run as early as possible because debug code may call mm_alloc()
+ * and mmput(). Our only dependency mm_kobj is initialized one stage earlier.
+ */
+arch_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
--
2.33.0.464.g1972c5931b-goog