| From 9f88d02010f55c3a480f26c61213ddf95e892f57 Mon Sep 17 00:00:00 2001 |
| From: Yu Zhao <yuzhao@google.com> |
| Date: Mon, 5 Apr 2021 04:17:41 -0600 |
| Subject: [PATCH] FROMLIST: mm: multigenerational lru: mm_struct list |
| |
| In order to scan page tables, we add infrastructure to maintain |
| either a system-wide mm_struct list or per-memcg mm_struct lists, and |
| to track whether an mm_struct is being used or has been used since |
| the last scan. |
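| |
| In short (simplified from the hunks below): the context switch hooks |
| mark the outgoing mm_struct as used, and the page table walker skips |
| any mm_struct that has not been marked since its last pass on a node: |
| |
|   /* context switch: the outgoing mm_struct was just used */ |
|   nodes_setall(old->lrugen.nodes); |
| |
|   /* walker: skip mm_structs not scheduled since the last scan */ |
|   if (!lru_gen_mm_is_active(mm) && !node_isset(nid, mm->lrugen.nodes)) |
|           return true; |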
| |
| Multiple threads can concurrently work on the same mm_struct list, and |
| each of them will be given a different mm_struct belonging to a |
| process that has been scheduled since the last scan. |
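| |
| For illustration only (this loop is not part of this patch), a worker |
| is expected to drain the list roughly as follows, where "args" holds |
| the target memcg, node and max_seq, and walk_mm() stands in for the |
| page table walk built on top of this infrastructure: |
| |
|   struct mm_struct *mm = NULL; |
|   bool last; |
| |
|   do { |
|           /* hands out the next eligible mm_struct, or NULL when done */ |
|           last = get_next_mm(args, &mm); |
|           if (mm) |
|                   walk_mm(args, mm); |
|           cond_resched(); |
|   } while (mm); |
| |
|   /* "last" tells the final worker that this round of the walk is over */ |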
| |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> |
| (am from https://lore.kernel.org/patchwork/patch/1432184/) |
| |
| BUG=b:123039911 |
| TEST=Built |
| |
| Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8 |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2987923 |
| Tested-by: Yu Zhao <yuzhao@chromium.org> |
| Reviewed-by: Sean Paul <seanpaul@chromium.org> |
| Reviewed-by: Sonny Rao <sonnyrao@chromium.org> |
| Reviewed-by: Yu Zhao <yuzhao@chromium.org> |
| Commit-Queue: Yu Zhao <yuzhao@chromium.org> |
| --- |
| fs/exec.c | 2 + |
| include/linux/memcontrol.h | 6 + |
| include/linux/mm_types.h | 107 ++++++++++++ |
| kernel/exit.c | 1 + |
| kernel/fork.c | 10 ++ |
| kernel/kthread.c | 1 + |
| kernel/sched/core.c | 2 + |
| mm/memcontrol.c | 28 ++++ |
| mm/vmscan.c | 324 +++++++++++++++++++++++++++++++++++++ |
| 9 files changed, 481 insertions(+) |
| |
| diff --git a/fs/exec.c b/fs/exec.c |
| index 38f63451b928..7ead083bcb39 100644 |
| --- a/fs/exec.c |
| +++ b/fs/exec.c |
| @@ -1005,6 +1005,7 @@ static int exec_mmap(struct mm_struct *mm) |
| active_mm = tsk->active_mm; |
| tsk->active_mm = mm; |
| tsk->mm = mm; |
| + lru_gen_add_mm(mm); |
| /* |
| * This prevents preemption while active_mm is being loaded and |
| * it and mm are being updated, which could cause problems for |
| @@ -1015,6 +1016,7 @@ static int exec_mmap(struct mm_struct *mm) |
| if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) |
| local_irq_enable(); |
| activate_mm(active_mm, mm); |
| + lru_gen_switch_mm(active_mm, mm); |
| if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) |
| local_irq_enable(); |
| tsk->mm->vmacache_seqnum = 0; |
| diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h |
| index 1b9705b10457..0211c397d104 100644 |
| --- a/include/linux/memcontrol.h |
| +++ b/include/linux/memcontrol.h |
| @@ -197,6 +197,8 @@ struct memcg_padding { |
| #define MEMCG_PADDING(name) |
| #endif |
| |
| +struct lru_gen_mm_list; |
| + |
| /* |
| * Remember four most recent foreign writebacks with dirty pages in this |
| * cgroup. Inode sharing is expected to be uncommon and, even if we miss |
| @@ -349,6 +351,10 @@ struct mem_cgroup { |
| struct deferred_split deferred_split_queue; |
| #endif |
| |
| +#ifdef CONFIG_LRU_GEN |
| + struct lru_gen_mm_list *mm_list; |
| +#endif |
| + |
| struct mem_cgroup_per_node *nodeinfo[]; |
| }; |
| |
| diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h |
| index 52bbd2b7cb46..d9a2ba150ce8 100644 |
| --- a/include/linux/mm_types.h |
| +++ b/include/linux/mm_types.h |
| @@ -15,6 +15,8 @@ |
| #include <linux/page-flags-layout.h> |
| #include <linux/workqueue.h> |
| #include <linux/seqlock.h> |
| +#include <linux/nodemask.h> |
| +#include <linux/mmdebug.h> |
| |
| #include <asm/mmu.h> |
| |
| @@ -571,6 +573,22 @@ struct mm_struct { |
| |
| #ifdef CONFIG_IOMMU_SUPPORT |
| u32 pasid; |
| +#endif |
| +#ifdef CONFIG_LRU_GEN |
| + struct { |
| + /* the node of a global or per-memcg mm_struct list */ |
| + struct list_head list; |
| +#ifdef CONFIG_MEMCG |
| + /* points to the memcg of the owner task above */ |
| + struct mem_cgroup *memcg; |
| +#endif |
| + /* whether this mm_struct has been used since the last walk */ |
| + nodemask_t nodes; |
| +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
| + /* the number of CPUs using this mm_struct */ |
| + atomic_t nr_cpus; |
| +#endif |
| + } lrugen; |
| #endif |
| } __randomize_layout; |
| |
| @@ -598,6 +616,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) |
| return (struct cpumask *)&mm->cpu_bitmap; |
| } |
| |
| +#ifdef CONFIG_LRU_GEN |
| + |
| +void lru_gen_init_mm(struct mm_struct *mm); |
| +void lru_gen_add_mm(struct mm_struct *mm); |
| +void lru_gen_del_mm(struct mm_struct *mm); |
| +#ifdef CONFIG_MEMCG |
| +int lru_gen_alloc_mm_list(struct mem_cgroup *memcg); |
| +void lru_gen_free_mm_list(struct mem_cgroup *memcg); |
| +void lru_gen_migrate_mm(struct mm_struct *mm); |
| +#endif |
| + |
| +/* Track the usage of each mm_struct so that we can skip inactive ones. */ |
| +static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) |
| +{ |
| + /* exclude init_mm, efi_mm, etc. */ |
| + if (!core_kernel_data((unsigned long)old)) { |
| + VM_BUG_ON(old == &init_mm); |
| + |
| + nodes_setall(old->lrugen.nodes); |
| +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
| + atomic_dec(&old->lrugen.nr_cpus); |
| + VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old); |
| +#endif |
| + } else |
| + VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) || |
| + READ_ONCE(old->lrugen.list.next), old); |
| + |
| + if (!core_kernel_data((unsigned long)new)) { |
| + VM_BUG_ON(new == &init_mm); |
| + |
| +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
| + atomic_inc(&new->lrugen.nr_cpus); |
| + VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new); |
| +#endif |
| + } else |
| + VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) || |
| + READ_ONCE(new->lrugen.list.next), new); |
| +} |
| + |
| +/* Return whether this mm_struct is being used on any CPUs. */ |
| +static inline bool lru_gen_mm_is_active(struct mm_struct *mm) |
| +{ |
| +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
| + return !cpumask_empty(mm_cpumask(mm)); |
| +#else |
| + return atomic_read(&mm->lrugen.nr_cpus); |
| +#endif |
| +} |
| + |
| +#else /* CONFIG_LRU_GEN */ |
| + |
| +static inline void lru_gen_init_mm(struct mm_struct *mm) |
| +{ |
| +} |
| + |
| +static inline void lru_gen_add_mm(struct mm_struct *mm) |
| +{ |
| +} |
| + |
| +static inline void lru_gen_del_mm(struct mm_struct *mm) |
| +{ |
| +} |
| + |
| +#ifdef CONFIG_MEMCG |
| +static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg) |
| +{ |
| + return 0; |
| +} |
| + |
| +static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg) |
| +{ |
| +} |
| + |
| +static inline void lru_gen_migrate_mm(struct mm_struct *mm) |
| +{ |
| +} |
| +#endif |
| + |
| +static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) |
| +{ |
| +} |
| + |
| +static inline bool lru_gen_mm_is_active(struct mm_struct *mm) |
| +{ |
| + return false; |
| +} |
| + |
| +#endif /* CONFIG_LRU_GEN */ |
| + |
| struct mmu_gather; |
| extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); |
| extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); |
| diff --git a/kernel/exit.c b/kernel/exit.c |
| index 9a89e7f36acb..c24d5ffae792 100644 |
| --- a/kernel/exit.c |
| +++ b/kernel/exit.c |
| @@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm) |
| goto retry; |
| } |
| WRITE_ONCE(mm->owner, c); |
| + lru_gen_migrate_mm(mm); |
| task_unlock(c); |
| put_task_struct(c); |
| } |
| diff --git a/kernel/fork.c b/kernel/fork.c |
| index bc94b2cc5995..e5f5dd5ac584 100644 |
| --- a/kernel/fork.c |
| +++ b/kernel/fork.c |
| @@ -669,6 +669,7 @@ static void check_mm(struct mm_struct *mm) |
| #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
| VM_BUG_ON_MM(mm->pmd_huge_pte, mm); |
| #endif |
| + VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm); |
| } |
| |
| #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) |
| @@ -1066,6 +1067,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, |
| goto fail_nocontext; |
| |
| mm->user_ns = get_user_ns(user_ns); |
| + lru_gen_init_mm(mm); |
| return mm; |
| |
| fail_nocontext: |
| @@ -1108,6 +1110,7 @@ static inline void __mmput(struct mm_struct *mm) |
| } |
| if (mm->binfmt) |
| module_put(mm->binfmt->module); |
| + lru_gen_del_mm(mm); |
| mmdrop(mm); |
| } |
| |
| @@ -2530,6 +2533,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) |
| get_task_struct(p); |
| } |
| |
| + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { |
| + /* lock the task to synchronize with memcg migration */ |
| + task_lock(p); |
| + lru_gen_add_mm(p->mm); |
| + task_unlock(p); |
| + } |
| + |
| wake_up_new_task(p); |
| |
| /* forking complete and child started to run, tell ptracer */ |
| diff --git a/kernel/kthread.c b/kernel/kthread.c |
| index 5b37a8567168..fd827fdad26b 100644 |
| --- a/kernel/kthread.c |
| +++ b/kernel/kthread.c |
| @@ -1361,6 +1361,7 @@ void kthread_use_mm(struct mm_struct *mm) |
| tsk->mm = mm; |
| membarrier_update_current_mm(mm); |
| switch_mm_irqs_off(active_mm, mm, tsk); |
| + lru_gen_switch_mm(active_mm, mm); |
| local_irq_enable(); |
| task_unlock(tsk); |
| #ifdef finish_arch_post_lock_switch |
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c |
| index 429b0e74cc2d..d73b8d2dfd44 100644 |
| --- a/kernel/sched/core.c |
| +++ b/kernel/sched/core.c |
| @@ -4670,6 +4670,7 @@ context_switch(struct rq *rq, struct task_struct *prev, |
| * finish_task_switch()'s mmdrop(). |
| */ |
| switch_mm_irqs_off(prev->active_mm, next->mm, next); |
| + lru_gen_switch_mm(prev->active_mm, next->mm); |
| |
| if (!prev->mm) { // from kernel |
| /* will mmdrop() in finish_task_switch(). */ |
| @@ -8419,6 +8420,7 @@ void idle_task_exit(void) |
| |
| if (mm != &init_mm) { |
| switch_mm(mm, &init_mm, current); |
| + lru_gen_switch_mm(mm, &init_mm); |
| finish_arch_post_lock_switch(); |
| } |
| |
| diff --git a/mm/memcontrol.c b/mm/memcontrol.c |
| index d36723fd9ed7..d74e89442009 100644 |
| --- a/mm/memcontrol.c |
| +++ b/mm/memcontrol.c |
| @@ -5169,6 +5169,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) |
| for_each_node(node) |
| free_mem_cgroup_per_node_info(memcg, node); |
| free_percpu(memcg->vmstats_percpu); |
| + lru_gen_free_mm_list(memcg); |
| kfree(memcg); |
| } |
| |
| @@ -5218,6 +5219,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) |
| if (alloc_mem_cgroup_per_node_info(memcg, node)) |
| goto fail; |
| |
| + if (lru_gen_alloc_mm_list(memcg)) |
| + goto fail; |
| + |
| if (memcg_wb_domain_init(memcg, GFP_KERNEL)) |
| goto fail; |
| |
| @@ -6179,6 +6183,29 @@ static void mem_cgroup_move_task(void) |
| } |
| #endif |
| |
| +#ifdef CONFIG_LRU_GEN |
| +static void mem_cgroup_attach(struct cgroup_taskset *tset) |
| +{ |
| + struct cgroup_subsys_state *css; |
| + struct task_struct *task = NULL; |
| + |
| + cgroup_taskset_for_each_leader(task, css, tset) |
| + ; |
| + |
| + if (!task) |
| + return; |
| + |
| + task_lock(task); |
| + if (task->mm && task->mm->owner == task) |
| + lru_gen_migrate_mm(task->mm); |
| + task_unlock(task); |
| +} |
| +#else |
| +static void mem_cgroup_attach(struct cgroup_taskset *tset) |
| +{ |
| +} |
| +#endif |
| + |
| static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) |
| { |
| if (value == PAGE_COUNTER_MAX) |
| @@ -6520,6 +6547,7 @@ struct cgroup_subsys memory_cgrp_subsys = { |
| .css_reset = mem_cgroup_css_reset, |
| .css_rstat_flush = mem_cgroup_css_rstat_flush, |
| .can_attach = mem_cgroup_can_attach, |
| + .attach = mem_cgroup_attach, |
| .cancel_attach = mem_cgroup_cancel_attach, |
| .post_attach = mem_cgroup_move_task, |
| .dfl_cftypes = memory_files, |
| diff --git a/mm/vmscan.c b/mm/vmscan.c |
| index 1778715462b5..e714d0e4f2ff 100644 |
| --- a/mm/vmscan.c |
| +++ b/mm/vmscan.c |
| @@ -2916,6 +2916,323 @@ static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv) |
| sp->refaulted * max(pv->total, 1UL) * pv->gain; |
| } |
| |
| +/****************************************************************************** |
| + * mm_struct list |
| + ******************************************************************************/ |
| + |
| +enum { |
| + MM_SCHED_ACTIVE, /* running processes */ |
| + MM_SCHED_INACTIVE, /* sleeping processes */ |
| + MM_LOCK_CONTENTION, /* lock contentions */ |
| + MM_VMA_INTERVAL, /* VMAs within the range of each PUD/PMD/PTE */ |
| + MM_LEAF_OTHER_NODE, /* entries not from the node under reclaim */ |
| + MM_LEAF_OTHER_MEMCG, /* entries not from the memcg under reclaim */ |
| + MM_LEAF_OLD, /* old entries */ |
| + MM_LEAF_YOUNG, /* young entries */ |
| + MM_LEAF_DIRTY, /* dirty entries */ |
| + MM_LEAF_HOLE, /* non-present entries */ |
| + MM_NONLEAF_OLD, /* old non-leaf PMD entries */ |
| + MM_NONLEAF_YOUNG, /* young non-leaf PMD entries */ |
| + NR_MM_STATS |
| +}; |
| + |
| +/* mnemonic codes for the stats above */ |
| +#define MM_STAT_CODES "aicvnmoydhlu" |
| + |
| +struct lru_gen_mm_list { |
| + /* the head of a global or per-memcg mm_struct list */ |
| + struct list_head head; |
| + /* protects the list */ |
| + spinlock_t lock; |
| + struct { |
| + /* set to max_seq after each round of walk */ |
| + unsigned long cur_seq; |
| + /* the next mm on the list to walk */ |
| + struct list_head *iter; |
| + /* to wait for the last worker to finish */ |
| + struct wait_queue_head wait; |
| + /* the number of concurrent workers */ |
| + int nr_workers; |
| + /* stats for debugging */ |
| + unsigned long stats[NR_STAT_GENS][NR_MM_STATS]; |
| + } nodes[0]; |
| +}; |
| + |
| +static struct lru_gen_mm_list *global_mm_list; |
| + |
| +static struct lru_gen_mm_list *alloc_mm_list(void) |
| +{ |
| + int nid; |
| + struct lru_gen_mm_list *mm_list; |
| + |
| + mm_list = kzalloc(struct_size(mm_list, nodes, nr_node_ids), GFP_KERNEL); |
| + if (!mm_list) |
| + return NULL; |
| + |
| + INIT_LIST_HEAD(&mm_list->head); |
| + spin_lock_init(&mm_list->lock); |
| + |
| + for_each_node(nid) { |
| + mm_list->nodes[nid].cur_seq = MIN_NR_GENS; |
| + mm_list->nodes[nid].iter = &mm_list->head; |
| + init_waitqueue_head(&mm_list->nodes[nid].wait); |
| + } |
| + |
| + return mm_list; |
| +} |
| + |
| +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) |
| +{ |
| +#ifdef CONFIG_MEMCG |
| + if (!mem_cgroup_disabled()) |
| + return memcg ? memcg->mm_list : root_mem_cgroup->mm_list; |
| +#endif |
| + VM_BUG_ON(memcg); |
| + |
| + return global_mm_list; |
| +} |
| + |
| +void lru_gen_init_mm(struct mm_struct *mm) |
| +{ |
| + INIT_LIST_HEAD(&mm->lrugen.list); |
| +#ifdef CONFIG_MEMCG |
| + mm->lrugen.memcg = NULL; |
| +#endif |
| +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
| + atomic_set(&mm->lrugen.nr_cpus, 0); |
| +#endif |
| + nodes_clear(mm->lrugen.nodes); |
| +} |
| + |
| +void lru_gen_add_mm(struct mm_struct *mm) |
| +{ |
| + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); |
| + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); |
| + |
| + VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); |
| +#ifdef CONFIG_MEMCG |
| + VM_BUG_ON_MM(mm->lrugen.memcg, mm); |
| + WRITE_ONCE(mm->lrugen.memcg, memcg); |
| +#endif |
| + spin_lock(&mm_list->lock); |
| + list_add_tail(&mm->lrugen.list, &mm_list->head); |
| + spin_unlock(&mm_list->lock); |
| +} |
| + |
| +void lru_gen_del_mm(struct mm_struct *mm) |
| +{ |
| + int nid; |
| +#ifdef CONFIG_MEMCG |
| + struct lru_gen_mm_list *mm_list = get_mm_list(mm->lrugen.memcg); |
| +#else |
| + struct lru_gen_mm_list *mm_list = get_mm_list(NULL); |
| +#endif |
| + |
| + spin_lock(&mm_list->lock); |
| + |
| + for_each_node(nid) { |
| + if (mm_list->nodes[nid].iter != &mm->lrugen.list) |
| + continue; |
| + |
| + mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; |
| + if (mm_list->nodes[nid].iter == &mm_list->head) |
| + WRITE_ONCE(mm_list->nodes[nid].cur_seq, |
| + mm_list->nodes[nid].cur_seq + 1); |
| + } |
| + |
| + list_del_init(&mm->lrugen.list); |
| + |
| + spin_unlock(&mm_list->lock); |
| + |
| +#ifdef CONFIG_MEMCG |
| + mem_cgroup_put(mm->lrugen.memcg); |
| + WRITE_ONCE(mm->lrugen.memcg, NULL); |
| +#endif |
| +} |
| + |
| +#ifdef CONFIG_MEMCG |
| +int lru_gen_alloc_mm_list(struct mem_cgroup *memcg) |
| +{ |
| + if (mem_cgroup_disabled()) |
| + return 0; |
| + |
| + memcg->mm_list = alloc_mm_list(); |
| + |
| + return memcg->mm_list ? 0 : -ENOMEM; |
| +} |
| + |
| +void lru_gen_free_mm_list(struct mem_cgroup *memcg) |
| +{ |
| + kfree(memcg->mm_list); |
| + memcg->mm_list = NULL; |
| +} |
| + |
| +void lru_gen_migrate_mm(struct mm_struct *mm) |
| +{ |
| + struct mem_cgroup *memcg; |
| + |
| + lockdep_assert_held(&mm->owner->alloc_lock); |
| + |
| + if (mem_cgroup_disabled()) |
| + return; |
| + |
| + rcu_read_lock(); |
| + memcg = mem_cgroup_from_task(mm->owner); |
| + rcu_read_unlock(); |
| + if (memcg == mm->lrugen.memcg) |
| + return; |
| + |
| + VM_BUG_ON_MM(!mm->lrugen.memcg, mm); |
| + VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); |
| + |
| + lru_gen_del_mm(mm); |
| + lru_gen_add_mm(mm); |
| +} |
| + |
| +static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg) |
| +{ |
| + return READ_ONCE(mm->lrugen.memcg) != memcg; |
| +} |
| +#else |
| +static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg) |
| +{ |
| + return false; |
| +} |
| +#endif |
| + |
| +struct mm_walk_args { |
| + struct mem_cgroup *memcg; |
| + unsigned long max_seq; |
| + unsigned long start_pfn; |
| + unsigned long end_pfn; |
| + unsigned long next_addr; |
| + int node_id; |
| + int swappiness; |
| + int batch_size; |
| + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; |
| + int mm_stats[NR_MM_STATS]; |
| + unsigned long bitmap[0]; |
| +}; |
| + |
| +static int size_of_mm_walk_args(void) |
| +{ |
| + int size = sizeof(struct mm_walk_args); |
| + |
| + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) || |
| + IS_ENABLED(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)) |
| + size += sizeof(unsigned long) * BITS_TO_LONGS(PTRS_PER_PMD); |
| + |
| + return size; |
| +} |
| + |
| +static void reset_mm_stats(struct lru_gen_mm_list *mm_list, bool last, |
| + struct mm_walk_args *args) |
| +{ |
| + int i; |
| + int nid = args->node_id; |
| + int hist = hist_from_seq_or_gen(args->max_seq); |
| + |
| + lockdep_assert_held(&mm_list->lock); |
| + |
| + for (i = 0; i < NR_MM_STATS; i++) { |
| + WRITE_ONCE(mm_list->nodes[nid].stats[hist][i], |
| + mm_list->nodes[nid].stats[hist][i] + args->mm_stats[i]); |
| + args->mm_stats[i] = 0; |
| + } |
| + |
| + if (!last || NR_STAT_GENS == 1) |
| + return; |
| + |
| + hist = hist_from_seq_or_gen(args->max_seq + 1); |
| + for (i = 0; i < NR_MM_STATS; i++) |
| + WRITE_ONCE(mm_list->nodes[nid].stats[hist][i], 0); |
| +} |
| + |
| +static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) |
| +{ |
| + int type; |
| + unsigned long size = 0; |
| + |
| + if (!lru_gen_mm_is_active(mm) && !node_isset(args->node_id, mm->lrugen.nodes)) |
| + return true; |
| + |
| + if (mm_is_oom_victim(mm)) |
| + return true; |
| + |
| + for (type = !args->swappiness; type < ANON_AND_FILE; type++) { |
| + size += type ? get_mm_counter(mm, MM_FILEPAGES) : |
| + get_mm_counter(mm, MM_ANONPAGES) + |
| + get_mm_counter(mm, MM_SHMEMPAGES); |
| + } |
| + |
| + /* leave the legwork to the rmap if mappings are too sparse */ |
| + if (size < max(SWAP_CLUSTER_MAX, mm_pgtables_bytes(mm) / PAGE_SIZE)) |
| + return true; |
| + |
| + return !mmget_not_zero(mm); |
| +} |
| + |
| +/* To support multiple workers that concurrently walk an mm_struct list. */ |
| +static bool get_next_mm(struct mm_walk_args *args, struct mm_struct **iter) |
| +{ |
| + bool last = true; |
| + struct mm_struct *mm = NULL; |
| + int nid = args->node_id; |
| + struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); |
| + |
| + if (*iter) |
| + mmput_async(*iter); |
| + else if (args->max_seq <= READ_ONCE(mm_list->nodes[nid].cur_seq)) |
| + return false; |
| + |
| + spin_lock(&mm_list->lock); |
| + |
| + VM_BUG_ON(args->max_seq > mm_list->nodes[nid].cur_seq + 1); |
| + VM_BUG_ON(*iter && args->max_seq < mm_list->nodes[nid].cur_seq); |
| + VM_BUG_ON(*iter && !mm_list->nodes[nid].nr_workers); |
| + |
| + if (args->max_seq <= mm_list->nodes[nid].cur_seq) { |
| + last = *iter; |
| + goto done; |
| + } |
| + |
| + if (mm_list->nodes[nid].iter == &mm_list->head) { |
| + VM_BUG_ON(*iter || mm_list->nodes[nid].nr_workers); |
| + mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; |
| + } |
| + |
| + while (!mm && mm_list->nodes[nid].iter != &mm_list->head) { |
| + mm = list_entry(mm_list->nodes[nid].iter, struct mm_struct, lrugen.list); |
| + mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; |
| + if (should_skip_mm(mm, args)) |
| + mm = NULL; |
| + |
| + args->mm_stats[mm ? MM_SCHED_ACTIVE : MM_SCHED_INACTIVE]++; |
| + } |
| + |
| + if (mm_list->nodes[nid].iter == &mm_list->head) |
| + WRITE_ONCE(mm_list->nodes[nid].cur_seq, |
| + mm_list->nodes[nid].cur_seq + 1); |
| +done: |
| + if (*iter && !mm) |
| + mm_list->nodes[nid].nr_workers--; |
| + if (!*iter && mm) |
| + mm_list->nodes[nid].nr_workers++; |
| + |
| + last = last && !mm_list->nodes[nid].nr_workers && |
| + mm_list->nodes[nid].iter == &mm_list->head; |
| + |
| + reset_mm_stats(mm_list, last, args); |
| + |
| + spin_unlock(&mm_list->lock); |
| + |
| + *iter = mm; |
| + if (mm) |
| + node_clear(nid, mm->lrugen.nodes); |
| + |
| + return last; |
| +} |
| + |
| /****************************************************************************** |
| * state change |
| ******************************************************************************/ |
| @@ -3144,6 +3461,13 @@ static int __init init_lru_gen(void) |
| { |
| BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); |
| BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); |
| + BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); |
| + |
| + if (mem_cgroup_disabled()) { |
| + global_mm_list = alloc_mm_list(); |
| + if (WARN_ON_ONCE(!global_mm_list)) |
| + return -ENOMEM; |
| + } |
| |
| if (hotplug_memory_notifier(lru_gen_online_mem, 0)) |
| pr_err("lru_gen: failed to subscribe hotplug notifications\n"); |
| -- |
| 2.32.0.402.g57bb445576-goog |
| |