| From 462cd4b89d57ee78d15caf25c1f3bb3f1f6e902a Mon Sep 17 00:00:00 2001 |
| From: Yu Zhao <yuzhao@google.com> |
| Date: Mon, 25 Jan 2021 21:12:33 -0700 |
| Subject: [PATCH] BACKPORT: FROMLIST: mm: multigenerational lru: groundwork |
| |
| For each lruvec, evictable pages are divided into multiple |
| generations. The youngest generation number is stored in |
| lrugen->max_seq for both anon and file types as they are aged on an |
| equal footing. The oldest generation numbers are stored in |
| lrugen->min_seq[2] separately for anon and file types as clean file |
| pages can be evicted regardless of may_swap or may_writepage. These |
| three variables are monotonically increasing. Generation numbers are |
| truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into |
| page->flags. The sliding window technique is used to prevent truncated |
| generation numbers from overlapping. Each truncated generation number |
is an index into
| lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. Evictable |
| pages are added to the per-zone lists indexed by lrugen->max_seq or |
| lrugen->min_seq[2] (modulo MAX_NR_GENS), depending on their types. |
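
For illustration, a minimal userspace sketch of this encoding (not part
of this patch; the constants assume CONFIG_NR_LRU_GENS=4 and pick an
arbitrary bit offset, whereas the real offset depends on the config):

  #include <assert.h>

  #define MAX_NR_GENS   4UL                  /* CONFIG_NR_LRU_GENS */
  #define LRU_GEN_WIDTH 3                    /* order_base_2(MAX_NR_GENS + 1) */
  #define LRU_GEN_PGOFF (64 - LRU_GEN_WIDTH)
  #define LRU_GEN_MASK  (((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

  /* the sliding window truncates a sequence number into a list index */
  static unsigned long lru_gen_from_seq(unsigned long seq)
  {
          return seq % MAX_NR_GENS;
  }

  int main(void)
  {
          unsigned long flags = 0;                 /* stands in for page->flags */
          unsigned long gen = lru_gen_from_seq(5); /* e.g., max_seq == 5 */

          /* gen+1 is stored so that 0 means "not on a multigenerational list" */
          flags = (flags & ~LRU_GEN_MASK) | ((gen + 1UL) << LRU_GEN_PGOFF);
          assert(((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1 == gen);
          return 0;
  }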
| |
| Each generation is then divided into multiple tiers. Tiers represent |
| levels of usage from file descriptors only. Pages accessed N times via |
| file descriptors belong to tier order_base_2(N). Each generation |
contains at most MAX_NR_TIERS tiers, and they require MAX_NR_TIERS-2
additional bits in page->flags. In contrast to moving across
| generations which requires the lru lock for the list operations, |
| moving across tiers only involves an atomic operation on page->flags |
| and therefore has a negligible cost. A feedback loop modeled after the |
PID controller monitors the refault rates across all tiers and decides
which tiers to activate pages from in the reclaim path.
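
As a sketch of just the tier math (the helper name and the saturation
at MAX_NR_TIERS-1 are illustrative assumptions, not taken from this
patch, which derives the tier with order_base_2() from a usage counter
in page->flags):

  #define MAX_NR_TIERS 4U    /* CONFIG_TIERS_PER_GEN, an assumed default */

  /* pages accessed N times via file descriptors belong to tier order_base_2(N) */
  static unsigned int tier_from_accesses(unsigned int n)
  {
          unsigned int tier = 0;

          /* order_base_2(): 1->0, 2->1, 3..4->2, 5..8->3, ... */
          while ((1U << tier) < n)
                  tier++;
          return tier < MAX_NR_TIERS ? tier : MAX_NR_TIERS - 1;
  }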
| |
| The framework comprises two conceptually independent components: the |
| aging and the eviction, which can be invoked separately from user |
| space for the purpose of working set estimation and proactive reclaim. |
| |
| The aging produces young generations. Given an lruvec, the aging scans |
page tables for referenced pages of this lruvec. Upon finding one, the
aging updates that page's generation number to max_seq. After each round
of scan, the aging increments max_seq. The aging is due when both values
in min_seq[2] reach max_seq-1, assuming both anon and file types are
reclaimable.
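
Equivalently, a simplified check (the helper name is made up for this
sketch, and both types are assumed reclaimable):

  #include <stdbool.h>

  /* the aging is due once neither type has a generation to spare */
  static bool aging_is_due(unsigned long max_seq, const unsigned long *min_seq)
  {
          return min_seq[0] + 1 == max_seq && min_seq[1] + 1 == max_seq;
  }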
| |
| The eviction consumes old generations. Given an lruvec, the eviction |
scans the pages on the per-zone lists indexed by either value in
min_seq[2]. It tries to select a type based on the values of min_seq[2]
and swappiness. During a scan, the eviction sorts pages into the lists
matching their new generation numbers if the aging has found them
referenced. When it finds that all the per-zone lists of the selected
type are empty, the eviction increments the entry in min_seq[2] indexed
by that type.
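
A sketch of that last step, with the kernel list primitives stubbed out
and the helper name made up (the real eviction path is added to
mm/vmscan.c by later patches in this series):

  #define MAX_NR_GENS   4
  #define ANON_AND_FILE 2
  #define MAX_NR_ZONES  5

  /* minimal stand-ins for the kernel types; illustrative only */
  struct list_head { struct list_head *prev, *next; };

  struct lrugen {
          unsigned long min_seq[ANON_AND_FILE];
          struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
  };

  /* slide the oldest end of the window forward once a type runs empty */
  static void try_inc_min_seq(struct lrugen *lrugen, int type)
  {
          int zone;
          int gen = lrugen->min_seq[type] % MAX_NR_GENS;

          for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                  struct list_head *head = &lrugen->lists[gen][type][zone];

                  if (head->next != head)    /* i.e., !list_empty(head) */
                          return;
          }
          lrugen->min_seq[type]++;
  }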
| |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> |
| (am from https://lore.kernel.org/patchwork/patch/1432182/) |
| |
| BUG=b:123039911 |
| TEST=Built |
| |
| Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7 |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987921 |
| Tested-by: Yu Zhao <yuzhao@chromium.org> |
| Reviewed-by: Sonny Rao <sonnyrao@chromium.org> |
| Reviewed-by: Yu Zhao <yuzhao@chromium.org> |
| Commit-Queue: Yu Zhao <yuzhao@chromium.org> |
| --- |
| fs/fuse/dev.c | 3 +- |
| include/linux/mm.h | 2 + |
| include/linux/mm_inline.h | 194 +++++++++++++++++++ |
| include/linux/mmzone.h | 106 +++++++++++ |
| include/linux/page-flags-layout.h | 15 +- |
| include/linux/page-flags.h | 4 +- |
| kernel/bounds.c | 6 + |
| mm/huge_memory.c | 3 +- |
| mm/mm_init.c | 6 +- |
| mm/mmzone.c | 2 + |
| mm/swapfile.c | 4 + |
| mm/vmscan.c | 307 ++++++++++++++++++++++++++++++ |
| 12 files changed, 643 insertions(+), 9 deletions(-) |
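
(Bit-budget note: assuming the defaults CONFIG_NR_LRU_GENS=4 and
CONFIG_TIERS_PER_GEN=4, page->flags grows by order_base_2(4+1) = 3 bits
for the generation counter plus 4-2 = 2 bits for the usage counter,
i.e., five new bits, on top of the reused PG_referenced and
PG_workingset bits that encode the first two tiers.)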
| |
| diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c |
| --- a/fs/fuse/dev.c |
| +++ b/fs/fuse/dev.c |
| @@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page) |
| 1 << PG_active | |
| 1 << PG_workingset | |
| 1 << PG_reclaim | |
| - 1 << PG_waiters))) { |
| + 1 << PG_waiters | |
| + LRU_GEN_MASK | LRU_USAGE_MASK))) { |
| dump_page(page, "fuse: trying to steal weird page"); |
| return 1; |
| } |
| diff --git a/include/linux/mm.h b/include/linux/mm.h |
| --- a/include/linux/mm.h |
| +++ b/include/linux/mm.h |
| @@ -1096,6 +1096,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); |
| #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) |
| #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) |
| #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) |
| +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) |
| +#define LRU_USAGE_PGOFF (LRU_GEN_PGOFF - LRU_USAGE_WIDTH) |
| |
| /* |
| * Define the bit shifts to access each section. For non-existent |
| diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h |
| --- a/include/linux/mm_inline.h |
| +++ b/include/linux/mm_inline.h |
| @@ -79,11 +79,199 @@ static __always_inline enum lru_list page_lru(struct page *page) |
| return lru; |
| } |
| |
| +#ifdef CONFIG_LRU_GEN |
| + |
| +#ifdef CONFIG_LRU_GEN_ENABLED |
| +DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); |
| + |
| +static inline bool lru_gen_enabled(void) |
| +{ |
| + return static_branch_likely(&lru_gen_static_key); |
| +} |
| +#else |
| +DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); |
| + |
| +static inline bool lru_gen_enabled(void) |
| +{ |
| + return static_branch_unlikely(&lru_gen_static_key); |
| +} |
| +#endif |
| + |
| +/* We track at most MAX_NR_GENS generations using the sliding window technique. */ |
| +static inline int lru_gen_from_seq(unsigned long seq) |
| +{ |
| + return seq % MAX_NR_GENS; |
| +} |
| + |
+/* Return a proper index regardless of whether we keep a full history of stats. */
| +static inline int hist_from_seq_or_gen(int seq_or_gen) |
| +{ |
| + return seq_or_gen % NR_STAT_GENS; |
| +} |
| + |
| +/* The youngest and the second youngest generations are counted as active. */ |
| +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) |
| +{ |
| + unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq); |
| + |
| + VM_BUG_ON(!max_seq); |
| + VM_BUG_ON(gen >= MAX_NR_GENS); |
| + |
| + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); |
| +} |
| + |
| +/* Update the sizes of the multigenerational lru lists. */ |
| +static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, |
| + int old_gen, int new_gen) |
| +{ |
| + int type = page_is_file_lru(page); |
| + int zone = page_zonenum(page); |
| + int delta = thp_nr_pages(page); |
| + enum lru_list lru = type * LRU_FILE; |
| + struct lrugen *lrugen = &lruvec->evictable; |
| + |
| + lockdep_assert_held(&lruvec->lru_lock); |
| + VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); |
| + VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); |
| + VM_BUG_ON(old_gen == -1 && new_gen == -1); |
| + |
| + if (old_gen >= 0) |
| + WRITE_ONCE(lrugen->sizes[old_gen][type][zone], |
| + lrugen->sizes[old_gen][type][zone] - delta); |
| + if (new_gen >= 0) |
| + WRITE_ONCE(lrugen->sizes[new_gen][type][zone], |
| + lrugen->sizes[new_gen][type][zone] + delta); |
| + |
| + if (old_gen < 0) { |
| + if (lru_gen_is_active(lruvec, new_gen)) |
| + lru += LRU_ACTIVE; |
| + update_lru_size(lruvec, lru, zone, delta); |
| + return; |
| + } |
| + |
| + if (new_gen < 0) { |
| + if (lru_gen_is_active(lruvec, old_gen)) |
| + lru += LRU_ACTIVE; |
| + update_lru_size(lruvec, lru, zone, -delta); |
| + return; |
| + } |
| + |
| + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { |
| + update_lru_size(lruvec, lru, zone, -delta); |
| + update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); |
| + } |
| + |
| + VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); |
| +} |
| + |
| +/* Add a page to one of the multigenerational lru lists. Return true on success. */ |
| +static inline bool lru_gen_addition(struct page *page, struct lruvec *lruvec, bool front) |
| +{ |
| + int gen; |
| + unsigned long old_flags, new_flags; |
| + int type = page_is_file_lru(page); |
| + int zone = page_zonenum(page); |
| + struct lrugen *lrugen = &lruvec->evictable; |
| + |
| + if (PageUnevictable(page) || !lrugen->enabled[type]) |
| + return false; |
| + /* |
| + * If a page is being faulted in, add it to the youngest generation. |
| + * try_walk_mm_list() may look at the size of the youngest generation to |
| + * determine if the aging is due. |
| + * |
| + * If a page can't be evicted immediately, i.e., an anon page not in |
| + * swap cache, a dirty file page under reclaim, or a page rejected by |
| + * evict_pages() due to races, dirty buffer heads, etc., add it to the |
| + * second oldest generation. |
| + * |
| + * If a page could be evicted immediately, i.e., a clean file page, add |
| + * it to the oldest generation. |
| + */ |
| + if (PageActive(page)) |
| + gen = lru_gen_from_seq(lrugen->max_seq); |
| + else if ((!type && !PageSwapCache(page)) || |
| + (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))) || |
| + (!PageReferenced(page) && PageWorkingset(page))) |
| + gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); |
| + else |
| + gen = lru_gen_from_seq(lrugen->min_seq[type]); |
| + |
| + do { |
| + old_flags = READ_ONCE(page->flags); |
| + VM_BUG_ON_PAGE(old_flags & LRU_GEN_MASK, page); |
| + |
| + new_flags = (old_flags & ~(LRU_GEN_MASK | BIT(PG_active))) | |
| + ((gen + 1UL) << LRU_GEN_PGOFF); |
| + /* see the comment in evict_pages() */ |
| + if (!(old_flags & BIT(PG_referenced))) |
| + new_flags &= ~(LRU_USAGE_MASK | LRU_TIER_FLAGS); |
| + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); |
| + |
| + lru_gen_update_size(page, lruvec, -1, gen); |
| + if (front) |
| + list_add(&page->lru, &lrugen->lists[gen][type][zone]); |
| + else |
| + list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); |
| + |
| + return true; |
| +} |
| + |
| +/* Delete a page from one of the multigenerational lru lists. Return true on success. */ |
| +static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec) |
| +{ |
| + int gen; |
| + unsigned long old_flags, new_flags; |
| + |
| + do { |
| + old_flags = READ_ONCE(page->flags); |
| + if (!(old_flags & LRU_GEN_MASK)) |
| + return false; |
| + |
| + VM_BUG_ON_PAGE(PageActive(page), page); |
| + VM_BUG_ON_PAGE(PageUnevictable(page), page); |
| + |
| + gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; |
| + |
| + new_flags = old_flags & ~LRU_GEN_MASK; |
| + /* mark page active accordingly */ |
| + if (lru_gen_is_active(lruvec, gen)) |
| + new_flags |= BIT(PG_active); |
| + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); |
| + |
| + lru_gen_update_size(page, lruvec, gen, -1); |
| + list_del(&page->lru); |
| + |
| + return true; |
| +} |
| + |
| +#else /* CONFIG_LRU_GEN */ |
| + |
| +static inline bool lru_gen_enabled(void) |
| +{ |
| + return false; |
| +} |
| + |
| +static inline bool lru_gen_addition(struct page *page, struct lruvec *lruvec, bool front) |
| +{ |
| + return false; |
| +} |
| + |
| +static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec) |
| +{ |
| + return false; |
| +} |
| + |
| +#endif /* CONFIG_LRU_GEN */ |
| + |
| static __always_inline void add_page_to_lru_list(struct page *page, |
| struct lruvec *lruvec) |
| { |
| enum lru_list lru = page_lru(page); |
| |
| + if (lru_gen_addition(page, lruvec, true)) |
| + return; |
| + |
| update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); |
| list_add(&page->lru, &lruvec->lists[lru]); |
| } |
| @@ -93,6 +281,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, |
| { |
| enum lru_list lru = page_lru(page); |
| |
| + if (lru_gen_addition(page, lruvec, false)) |
| + return; |
| + |
| update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); |
| list_add_tail(&page->lru, &lruvec->lists[lru]); |
| } |
| @@ -100,6 +291,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, |
| static __always_inline void del_page_from_lru_list(struct page *page, |
| struct lruvec *lruvec) |
| { |
| + if (lru_gen_deletion(page, lruvec)) |
| + return; |
| + |
| list_del(&page->lru); |
| update_lru_size(lruvec, page_lru(page), page_zonenum(page), |
| -thp_nr_pages(page)); |
| diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h |
| --- a/include/linux/mmzone.h |
| +++ b/include/linux/mmzone.h |
| @@ -294,6 +294,108 @@ enum lruvec_flags { |
| */ |
| }; |
| |
| +struct lruvec; |
| + |
| +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) |
| +#define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF) |
| + |
| +#ifdef CONFIG_LRU_GEN |
| + |
| +/* |
| + * For each lruvec, evictable pages are divided into multiple generations. The |
| + * youngest and the oldest generation numbers, AKA max_seq and min_seq, are |
| + * monotonically increasing. The sliding window technique is used to track at |
| + * most MAX_NR_GENS and at least MIN_NR_GENS generations. An offset within the |
| + * window, AKA gen, indexes an array of per-type and per-zone lists for the |
| + * corresponding generation. The counter in page->flags stores gen+1 while a |
| + * page is on one of the multigenerational lru lists. Otherwise, it stores 0. |
| + */ |
| +#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) |
| + |
| +/* |
| + * Each generation is then divided into multiple tiers. Tiers represent levels |
| + * of usage from file descriptors, i.e., mark_page_accessed(). In contrast to |
| + * moving across generations which requires the lru lock, moving across tiers |
| + * only involves an atomic operation on page->flags and therefore has a |
| + * negligible cost. |
| + * |
| + * The purposes of tiers are to: |
+ * 1) estimate whether pages accessed multiple times via file descriptors are
+ * more active than pages accessed only via page tables, by separating the two
+ * access types into upper tiers and the base tier and then comparing refault
+ * rates across tiers.
| + * 2) improve buffered io performance by deferring activations of pages |
+ * accessed multiple times until the eviction. That is, activations happen in
| + * the reclaim path, not the access path. |
| + * |
| + * Pages accessed N times via file descriptors belong to tier order_base_2(N). |
| + * The base tier uses the following page flag: |
| + * !PageReferenced() -- readahead pages |
| + * PageReferenced() -- single-access pages |
| + * All upper tiers use the following page flags: |
| + * PageReferenced() && PageWorkingset() -- multi-access pages |
| + * in addition to the bits storing N-2 accesses. Therefore, we can support one |
| + * upper tier without using additional bits in page->flags. |
| + * |
| + * Note that |
| + * 1) PageWorkingset() is always set for upper tiers because we want to |
| + * maintain the existing psi behavior. |
| + * 2) !PageReferenced() && PageWorkingset() is not a valid tier. See the |
| + * comment in evict_pages(). |
| + * |
+ * Pages from the base tier are evicted regardless of their refault rate. Pages
+ * from upper tiers will be moved to the next generation if their refault rates
+ * are higher than that of the base tier.
| + */ |
| +#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) |
| +#define LRU_TIER_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) |
| +#define LRU_USAGE_SHIFT (CONFIG_TIERS_PER_GEN - 1) |
| + |
| +/* Whether to keep historical stats for each generation. */ |
| +#ifdef CONFIG_LRU_GEN_STATS |
| +#define NR_STAT_GENS ((unsigned int)CONFIG_NR_LRU_GENS) |
| +#else |
| +#define NR_STAT_GENS 1U |
| +#endif |
| + |
| +struct lrugen { |
| + /* the aging increments the max generation number */ |
| + unsigned long max_seq; |
| + /* the eviction increments the min generation numbers */ |
| + unsigned long min_seq[ANON_AND_FILE]; |
| + /* the birth time of each generation in jiffies */ |
| + unsigned long timestamps[MAX_NR_GENS]; |
| + /* the multigenerational lru lists */ |
| + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; |
| + /* the sizes of the multigenerational lru lists in pages */ |
| + unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; |
| + /* to determine which type and its tiers to evict */ |
| + atomic_long_t evicted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS]; |
| + atomic_long_t refaulted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS]; |
| + /* the base tier won't be activated */ |
| + unsigned long activated[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; |
| + /* arithmetic mean weighted by geometric series 1/2, 1/4, ... */ |
| + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; |
| + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; |
| + /* whether the multigenerational lru is enabled */ |
| + bool enabled[ANON_AND_FILE]; |
| +}; |
| + |
| +void lru_gen_init_lruvec(struct lruvec *lruvec); |
| +void lru_gen_set_state(bool enable, bool main, bool swap); |
| + |
| +#else /* CONFIG_LRU_GEN */ |
| + |
| +static inline void lru_gen_init_lruvec(struct lruvec *lruvec) |
| +{ |
| +} |
| + |
| +static inline void lru_gen_set_state(bool enable, bool main, bool swap) |
| +{ |
| +} |
| + |
| +#endif /* CONFIG_LRU_GEN */ |
| + |
| struct lruvec { |
| struct list_head lists[NR_LRU_LISTS]; |
| /* per lruvec lru_lock for memcg */ |
| @@ -311,6 +413,10 @@ struct lruvec { |
| unsigned long refaults[ANON_AND_FILE]; |
| /* Various lruvec state flags (enum lruvec_flags) */ |
| unsigned long flags; |
| +#ifdef CONFIG_LRU_GEN |
| + /* unevictable pages are on LRU_UNEVICTABLE */ |
| + struct lrugen evictable; |
| +#endif |
| #ifdef CONFIG_MEMCG |
| struct pglist_data *pgdat; |
| #endif |
| diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h |
| --- a/include/linux/page-flags-layout.h |
| +++ b/include/linux/page-flags-layout.h |
| @@ -24,6 +24,14 @@ |
| #error ZONES_SHIFT "Too many zones configured" |
| #endif |
| |
| +#ifdef CONFIG_LRU_GEN |
| +/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ |
| +#define LRU_USAGE_WIDTH (CONFIG_TIERS_PER_GEN - 2) |
| +#else |
| +#define LRU_GEN_WIDTH 0 |
| +#define LRU_USAGE_WIDTH 0 |
| +#endif |
| + |
| #define ZONES_WIDTH ZONES_SHIFT |
| |
| #ifdef CONFIG_SPARSEMEM |
| @@ -55,7 +63,8 @@ |
| #define SECTIONS_WIDTH 0 |
| #endif |
| |
| -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS |
| +#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+LRU_USAGE_WIDTH+NODES_SHIFT \ |
| + <= BITS_PER_LONG - NR_PAGEFLAGS |
| #define NODES_WIDTH NODES_SHIFT |
| #elif defined(CONFIG_SPARSEMEM_VMEMMAP) |
| #error "Vmemmap: No space for nodes field in page flags" |
| @@ -89,7 +98,7 @@ |
| #define LAST_CPUPID_SHIFT 0 |
| #endif |
| |
| -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ |
| +#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+LRU_USAGE_WIDTH+NODES_WIDTH+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ |
| <= BITS_PER_LONG - NR_PAGEFLAGS |
| #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT |
| #else |
| @@ -100,7 +109,7 @@ |
| #define LAST_CPUPID_NOT_IN_PAGE_FLAGS |
| #endif |
| |
| -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ |
| +#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+LRU_USAGE_WIDTH+NODES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \ |
| > BITS_PER_LONG - NR_PAGEFLAGS |
| #error "Not enough bits in page flags" |
| #endif |
| diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h |
| --- a/include/linux/page-flags.h |
| +++ b/include/linux/page-flags.h |
| @@ -822,7 +822,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) |
| 1UL << PG_private | 1UL << PG_private_2 | \ |
| 1UL << PG_writeback | 1UL << PG_reserved | \ |
| 1UL << PG_slab | 1UL << PG_active | \ |
| - 1UL << PG_unevictable | __PG_MLOCKED) |
| + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) |
| |
| /* |
| * Flags checked when a page is prepped for return by the page allocator. |
| @@ -833,7 +833,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) |
| * alloc-free cycle to prevent from reusing the page. |
| */ |
| #define PAGE_FLAGS_CHECK_AT_PREP \ |
| - (PAGEFLAGS_MASK & ~__PG_HWPOISON) |
| + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_USAGE_MASK) |
| |
| #define PAGE_FLAGS_PRIVATE \ |
| (1UL << PG_private | 1UL << PG_private_2) |
| diff --git a/kernel/bounds.c b/kernel/bounds.c |
| --- a/kernel/bounds.c |
| +++ b/kernel/bounds.c |
| @@ -22,6 +22,12 @@ int main(void) |
| DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); |
| #endif |
| DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); |
| +#ifdef CONFIG_LRU_GEN |
| + /* bits needed to represent internal values stored in page->flags */ |
| + DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); |
| + /* bits needed to represent normalized values for external uses */ |
| + DEFINE(LRU_GEN_SHIFT, order_base_2(CONFIG_NR_LRU_GENS)); |
| +#endif |
| /* End of constants */ |
| |
| return 0; |
| diff --git a/mm/huge_memory.c b/mm/huge_memory.c |
| --- a/mm/huge_memory.c |
| +++ b/mm/huge_memory.c |
| @@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struct page *head, int tail, |
| #ifdef CONFIG_64BIT |
| (1L << PG_arch_2) | |
| #endif |
| - (1L << PG_dirty))); |
| + (1L << PG_dirty) | |
| + LRU_GEN_MASK | LRU_USAGE_MASK)); |
| |
| /* ->mapping in first tail page is compound_mapcount */ |
| VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, |
| diff --git a/mm/mm_init.c b/mm/mm_init.c |
| --- a/mm/mm_init.c |
| +++ b/mm/mm_init.c |
| @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) |
| |
| shift = 8 * sizeof(unsigned long); |
| width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH |
| - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; |
| + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_USAGE_WIDTH; |
| mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", |
| - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", |
| + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", |
| SECTIONS_WIDTH, |
| NODES_WIDTH, |
| ZONES_WIDTH, |
| LAST_CPUPID_WIDTH, |
| KASAN_TAG_WIDTH, |
| + LRU_GEN_WIDTH, |
| + LRU_USAGE_WIDTH, |
| NR_PAGEFLAGS); |
| mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", |
| "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", |
| diff --git a/mm/mmzone.c b/mm/mmzone.c |
| --- a/mm/mmzone.c |
| +++ b/mm/mmzone.c |
| @@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec) |
| |
| for_each_lru(lru) |
| INIT_LIST_HEAD(&lruvec->lists[lru]); |
| + |
| + lru_gen_init_lruvec(lruvec); |
| } |
| |
| #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) |
| diff --git a/mm/swapfile.c b/mm/swapfile.c |
| --- a/mm/swapfile.c |
| +++ b/mm/swapfile.c |
| @@ -2696,6 +2696,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
| err = 0; |
| atomic_inc(&proc_poll_event); |
| wake_up_interruptible(&proc_poll_wait); |
| + /* stop tracking anon if the multigenerational lru is turned off */ |
| + lru_gen_set_state(false, false, true); |
| |
| out_dput: |
| if (victim) |
| @@ -3374,6 +3376,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
| mutex_unlock(&swapon_mutex); |
| atomic_inc(&proc_poll_event); |
| wake_up_interruptible(&proc_poll_wait); |
| + /* start tracking anon if the multigenerational lru is turned on */ |
| + lru_gen_set_state(true, false, true); |
| |
| error = 0; |
| goto out; |
| diff --git a/mm/vmscan.c b/mm/vmscan.c |
| --- a/mm/vmscan.c |
| +++ b/mm/vmscan.c |
| @@ -50,6 +50,7 @@ |
| #include <linux/printk.h> |
| #include <linux/dax.h> |
| #include <linux/psi.h> |
| +#include <linux/memory.h> |
| |
| #include <asm/tlbflush.h> |
| #include <asm/div64.h> |
| @@ -2911,6 +2912,312 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, |
| return can_demote(pgdat->node_id, sc); |
| } |
| |
| +#ifdef CONFIG_LRU_GEN |
| + |
| +/* |
+ * After pages are faulted in, the aging must scan them twice before the
+ * eviction can consider them. The first scan clears the accessed bit set during
+ * the initial faults, and the second scan makes sure they haven't been used
+ * since the first scan.
| + */ |
| +#define MIN_NR_GENS 2 |
| + |
| +#define MAX_BATCH_SIZE 8192 |
| + |
| +/****************************************************************************** |
| + * shorthand helpers |
| + ******************************************************************************/ |
| + |
| +#define DEFINE_MAX_SEQ() \ |
| + unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq) |
| + |
| +#define DEFINE_MIN_SEQ() \ |
| + unsigned long min_seq[ANON_AND_FILE] = { \ |
| + READ_ONCE(lruvec->evictable.min_seq[0]), \ |
| + READ_ONCE(lruvec->evictable.min_seq[1]), \ |
| + } |
| + |
| +#define for_each_type_zone(type, zone) \ |
| + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ |
| + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) |
| + |
| +#define for_each_gen_type_zone(gen, type, zone) \ |
| + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ |
| + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ |
| + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) |
| + |
| +static int page_lru_gen(struct page *page) |
| +{ |
| + return ((page->flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; |
| +} |
| + |
| +static int get_lo_wmark(unsigned long max_seq, unsigned long *min_seq, int swappiness) |
| +{ |
| + return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1; |
| +} |
| + |
| +static int get_hi_wmark(unsigned long max_seq, unsigned long *min_seq, int swappiness) |
| +{ |
| + return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1; |
| +} |
| + |
| +static int get_nr_gens(struct lruvec *lruvec, int type) |
| +{ |
| + return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; |
| +} |
| + |
| +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) |
| +{ |
| + lockdep_assert_held(&lruvec->lru_lock); |
| + |
| + return get_nr_gens(lruvec, 0) >= MIN_NR_GENS && |
| + get_nr_gens(lruvec, 0) <= MAX_NR_GENS && |
| + get_nr_gens(lruvec, 1) >= MIN_NR_GENS && |
| + get_nr_gens(lruvec, 1) <= MAX_NR_GENS; |
| +} |
| + |
| +/****************************************************************************** |
| + * state change |
| + ******************************************************************************/ |
| + |
| +#ifdef CONFIG_LRU_GEN_ENABLED |
| +DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); |
| +#else |
| +DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); |
| +#endif |
| + |
| +static DEFINE_MUTEX(lru_gen_state_mutex); |
| +static int lru_gen_nr_swapfiles __read_mostly; |
| + |
| +static bool __maybe_unused state_is_valid(struct lruvec *lruvec) |
| +{ |
| + int gen, type, zone; |
| + enum lru_list lru; |
| + struct lrugen *lrugen = &lruvec->evictable; |
| + |
| + for_each_evictable_lru(lru) { |
| + type = is_file_lru(lru); |
| + |
| + if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) |
| + return false; |
| + } |
| + |
| + for_each_gen_type_zone(gen, type, zone) { |
| + if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) |
| + return false; |
| + |
| + VM_WARN_ON_ONCE(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); |
| + } |
| + |
| + return true; |
| +} |
| + |
| +static bool fill_lru_gen_lists(struct lruvec *lruvec) |
| +{ |
| + enum lru_list lru; |
| + int batch_size = 0; |
| + |
| + for_each_evictable_lru(lru) { |
| + int type = is_file_lru(lru); |
| + bool active = is_active_lru(lru); |
| + struct list_head *head = &lruvec->lists[lru]; |
| + |
| + if (!lruvec->evictable.enabled[type]) |
| + continue; |
| + |
| + while (!list_empty(head)) { |
| + bool success; |
| + struct page *page = lru_to_page(head); |
| + |
| + VM_BUG_ON_PAGE(PageTail(page), page); |
| + VM_BUG_ON_PAGE(PageUnevictable(page), page); |
| + VM_BUG_ON_PAGE(PageActive(page) != active, page); |
| + VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page); |
| + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); |
| + |
| + prefetchw_prev_lru_page(page, head, flags); |
| + |
| + del_page_from_lru_list(page, lruvec); |
| + success = lru_gen_addition(page, lruvec, true); |
| + VM_BUG_ON(!success); |
| + |
| + if (++batch_size == MAX_BATCH_SIZE) |
| + return false; |
| + } |
| + } |
| + |
| + return true; |
| +} |
| + |
| +static bool drain_lru_gen_lists(struct lruvec *lruvec) |
| +{ |
| + int gen, type, zone; |
| + int batch_size = 0; |
| + |
| + for_each_gen_type_zone(gen, type, zone) { |
| + struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; |
| + |
| + if (lruvec->evictable.enabled[type]) |
| + continue; |
| + |
| + while (!list_empty(head)) { |
| + bool success; |
| + struct page *page = lru_to_page(head); |
| + |
| + VM_BUG_ON_PAGE(PageTail(page), page); |
| + VM_BUG_ON_PAGE(PageUnevictable(page), page); |
| + VM_BUG_ON_PAGE(PageActive(page), page); |
| + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); |
| + VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); |
| + |
| + prefetchw_prev_lru_page(page, head, flags); |
| + |
| + success = lru_gen_deletion(page, lruvec); |
| + VM_BUG_ON(!success); |
| + add_page_to_lru_list(page, lruvec); |
| + |
| + if (++batch_size == MAX_BATCH_SIZE) |
| + return false; |
| + } |
| + } |
| + |
| + return true; |
| +} |
| + |
| +/* |
| + * For file page tracking, we enable/disable it according to the main switch. |
+ * For anon page tracking, we only enable it when the main switch is on and
+ * there is at least one swapfile; we disable it when there are no swapfiles
+ * regardless of the value of the main switch. Otherwise, we will eventually
+ * reach the max size of the sliding window and have to call inc_min_seq(),
+ * which brings unnecessary overhead.
| + */ |
| +void lru_gen_set_state(bool enable, bool main, bool swap) |
| +{ |
| + struct mem_cgroup *memcg; |
| + |
| + mem_hotplug_begin(); |
| + mutex_lock(&lru_gen_state_mutex); |
| + cgroup_lock(); |
| + |
| + main = main && enable != lru_gen_enabled(); |
| + swap = swap && !(enable ? lru_gen_nr_swapfiles++ : --lru_gen_nr_swapfiles); |
| + swap = swap && lru_gen_enabled(); |
| + if (!main && !swap) |
| + goto unlock; |
| + |
| + if (main) { |
| + if (enable) |
| + static_branch_enable(&lru_gen_static_key); |
| + else |
| + static_branch_disable(&lru_gen_static_key); |
| + } |
| + |
| + memcg = mem_cgroup_iter(NULL, NULL, NULL); |
| + do { |
| + int nid; |
| + |
| + for_each_node_state(nid, N_MEMORY) { |
| + struct pglist_data *pgdat = NODE_DATA(nid); |
| + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); |
| + struct lrugen *lrugen = &lruvec->evictable; |
| + |
| + spin_lock_irq(&lruvec->lru_lock); |
| + |
| + VM_BUG_ON(!seq_is_valid(lruvec)); |
| + VM_BUG_ON(!state_is_valid(lruvec)); |
| + |
| + WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles); |
| + WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled()); |
| + |
| + while (!(enable ? fill_lru_gen_lists(lruvec) : |
| + drain_lru_gen_lists(lruvec))) { |
| + spin_unlock_irq(&lruvec->lru_lock); |
| + cond_resched(); |
| + spin_lock_irq(&lruvec->lru_lock); |
| + } |
| + |
| + spin_unlock_irq(&lruvec->lru_lock); |
| + } |
| + |
| + cond_resched(); |
| + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); |
| +unlock: |
| + cgroup_unlock(); |
| + mutex_unlock(&lru_gen_state_mutex); |
| + mem_hotplug_done(); |
| +} |
| + |
| +static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *self, |
| + unsigned long action, void *arg) |
| +{ |
| + struct mem_cgroup *memcg; |
| + struct memory_notify *mnb = arg; |
| + int nid = mnb->status_change_nid; |
| + |
| + if (action != MEM_GOING_ONLINE || nid == NUMA_NO_NODE) |
| + return NOTIFY_DONE; |
| + |
| + mutex_lock(&lru_gen_state_mutex); |
| + cgroup_lock(); |
| + |
| + memcg = mem_cgroup_iter(NULL, NULL, NULL); |
| + do { |
| + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); |
| + struct lrugen *lrugen = &lruvec->evictable; |
| + |
| + VM_BUG_ON(!seq_is_valid(lruvec)); |
| + VM_BUG_ON(!state_is_valid(lruvec)); |
| + |
| + WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles); |
| + WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled()); |
| + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); |
| + |
| + cgroup_unlock(); |
| + mutex_unlock(&lru_gen_state_mutex); |
| + |
| + return NOTIFY_DONE; |
| +} |
| + |
| +/****************************************************************************** |
| + * initialization |
| + ******************************************************************************/ |
| + |
| +void lru_gen_init_lruvec(struct lruvec *lruvec) |
| +{ |
| + int i; |
| + int gen, type, zone; |
| + struct lrugen *lrugen = &lruvec->evictable; |
| + |
| + lrugen->max_seq = MIN_NR_GENS + 1; |
| + lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; |
| + lrugen->enabled[1] = lru_gen_enabled(); |
| + |
| + for (i = 0; i <= MIN_NR_GENS + 1; i++) |
| + lrugen->timestamps[i] = jiffies; |
| + |
| + for_each_gen_type_zone(gen, type, zone) |
| + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); |
| +} |
| + |
| +static int __init init_lru_gen(void) |
| +{ |
| + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); |
| + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); |
| + |
| + if (hotplug_memory_notifier(lru_gen_online_mem, 0)) |
| + pr_err("lru_gen: failed to subscribe hotplug notifications\n"); |
| + |
| + return 0; |
| +}; |
| +/* |
| + * We want to run as early as possible because debug code may call mm_alloc() |
+ * and mmput(). Our only dependency, mm_kobj, is initialized one stage earlier.
| + */ |
| +arch_initcall(init_lru_gen); |
| + |
| +#endif /* CONFIG_LRU_GEN */ |
| + |
| static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) |
| { |
| unsigned long nr[NR_LRU_LISTS]; |
| -- |
| 2.33.0.464.g1972c5931b-goog |
| |