| From 940411fdf169501b442397853e470ba66cc3aa00 Mon Sep 17 00:00:00 2001 |
| From: Brian Geffon <bgeffon@chromium.org> |
| Date: Tue, 17 Aug 2021 17:41:38 +0000 |
| Subject: [PATCH] CHROMIUM: mm: per-process reclaim |
| |
| These days, there are many platforms available in the embedded market, |
| and they are smarter than the kernel, which has only limited information |
| about the working set. They therefore want to be involved in memory |
| management more heavily, as with Android's low-memory killer and ashmem, |
| or the many recent low-memory notifiers. |
| |
| One simple scenario for such userspace intelligence is a platform that |
| manages tasks as foreground and background: it is better to reclaim a |
| background task's pages for the sake of end-user *responsiveness*, even |
| if those pages are frequently referenced. |
| |
| This patch adds a new knob, "reclaim", under /proc/<pid>/ so that a task |
| manager can reclaim the pages of any target process at any time. It |
| gives the platform another way to use memory efficiently. |
| |
| It can avoid killing processes to get free memory, which is a terrible |
| experience: imagine losing your best-ever game score because the game |
| was killed while you switched to answer a phone call. |
| |
| Reclaim file-backed pages only: |
| echo file > /proc/PID/reclaim |
| Reclaim anonymous pages only: |
| echo anon > /proc/PID/reclaim |
| Reclaim shmem pages only: |
| echo shmem > /proc/PID/reclaim |
| Reclaim all pages: |
| echo all > /proc/PID/reclaim |
| Note: for historical reasons, "all" means file and anon only; it |
| does not include shmem. |
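| |
| As an illustration only (not part of this patch), a userspace task |
| manager could drive the knob from C roughly as follows; the helper |
| name and the minimal error handling here are hypothetical: |
| |
| #include <fcntl.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| /* Sketch: write one of "file", "anon", "shmem" or "all" to the knob. */ |
| static int reclaim_pid(pid_t pid, const char *type) |
| { |
| 	char path[64]; |
| 	ssize_t ret; |
| 	int fd; |
| |
| 	snprintf(path, sizeof(path), "/proc/%d/reclaim", (int)pid); |
| 	fd = open(path, O_WRONLY); |
| 	if (fd < 0) |
| 		return -1; |
| 	ret = write(fd, type, strlen(type)); |
| 	close(fd); |
| 	return ret < 0 ? -1 : 0; |
| } |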
| |
| [Squashes in 75472663, 13a4a5c] |
| |
| BUG=b:195001087 |
| TEST=built 5.10 for x86_64 and tested on eve |
| |
| Signed-off-by: Brian Geffon <bgeffon@chromium.org> |
| Change-Id: I8967866f06ac866f8f5291c585c172756e5a180b |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/3100666 |
| Reviewed-by: Joel Fernandes <joelaf@google.com> |
| --- |
| fs/proc/base.c | 3 + |
| fs/proc/internal.h | 1 + |
| fs/proc/task_mmu.c | 337 +++++++++++++++++++++++++++++++++++++++++++ |
| include/linux/rmap.h | 4 + |
| mm/Kconfig | 11 ++ |
| mm/swap.c | 1 + |
| 6 files changed, 357 insertions(+) |
| |
| diff --git a/fs/proc/base.c b/fs/proc/base.c |
| index 926f07ce11b7aa7c0ab91ac50faf1f3a85f7b370..b4efbb9576b1ee70bd3fca7eed581eb33ddfd284 100644 |
| --- a/fs/proc/base.c |
| +++ b/fs/proc/base.c |
| @@ -3286,6 +3286,9 @@ static const struct pid_entry tgid_base_stuff[] = { |
| REG("mounts", S_IRUGO, proc_mounts_operations), |
| REG("mountinfo", S_IRUGO, proc_mountinfo_operations), |
| REG("mountstats", S_IRUSR, proc_mountstats_operations), |
| +#ifdef CONFIG_PROCESS_RECLAIM |
| + REG("reclaim", S_IWUGO, proc_reclaim_operations), |
| +#endif |
| #ifdef CONFIG_PROC_PAGE_MONITOR |
| REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
| REG("smaps", S_IRUGO, proc_pid_smaps_operations), |
| diff --git a/fs/proc/internal.h b/fs/proc/internal.h |
| index b701d0207edf098814a70400850e3675aee448f1..ec95cc8da708a69fe4d61fcf307479b97afe1ac1 100644 |
| --- a/fs/proc/internal.h |
| +++ b/fs/proc/internal.h |
| @@ -216,6 +216,7 @@ struct pde_opener { |
| extern const struct inode_operations proc_link_inode_operations; |
| extern const struct inode_operations proc_pid_link_inode_operations; |
| extern const struct super_operations proc_sops; |
| +extern const struct file_operations proc_reclaim_operations; |
| |
| void proc_init_kmemcache(void); |
| void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); |
| diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c |
| index 8a74cdcc9af00f7217f8a5ab3700537b5af6913d..b317bfd43fdeb1b61c346bc4d2ef6b485bffe884 100644 |
| --- a/fs/proc/task_mmu.c |
| +++ b/fs/proc/task_mmu.c |
| @@ -19,6 +19,7 @@ |
| #include <linux/shmem_fs.h> |
| #include <linux/uaccess.h> |
| #include <linux/pkeys.h> |
| +#include <linux/mm_inline.h> |
| |
| #include <asm/elf.h> |
| #include <asm/tlb.h> |
| @@ -1763,6 +1764,342 @@ const struct file_operations proc_pagemap_operations = { |
| }; |
| #endif /* CONFIG_PROC_PAGE_MONITOR */ |
| |
| +#ifdef CONFIG_PROCESS_RECLAIM |
| +enum reclaim_type { |
| + RECLAIM_FILE = 1, |
| + RECLAIM_ANON, |
| + RECLAIM_ALL, |
| + /* |
| +	 * For safety and backward compatibility, shmem reclaim mode |
| +	 * is only possible by directly writing 'shmem'; 'all' does |
| +	 * not include shmem. |
| + */ |
| + RECLAIM_SHMEM, |
| +}; |
| + |
| +struct walk_data { |
| + enum reclaim_type type; |
| +}; |
| + |
| +static int deactivate_pte_range(pmd_t *pmd, unsigned long addr, |
| + unsigned long end, struct mm_walk *walk) |
| +{ |
| + pte_t *orig_pte, *pte, ptent; |
| + spinlock_t *ptl; |
| + struct page *page; |
| + struct vm_area_struct *vma = walk->vma; |
| + struct mm_struct *mm = vma->vm_mm; |
| + unsigned long next = pmd_addr_end(addr, end); |
| + |
| + ptl = pmd_trans_huge_lock(pmd, vma); |
| + if (ptl) { |
| + if (!pmd_present(*pmd)) |
| + goto huge_unlock; |
| + |
| + if (is_huge_zero_pmd(*pmd)) |
| + goto huge_unlock; |
| + |
| + page = pmd_page(*pmd); |
| + if (page_mapcount(page) > 1) |
| + goto huge_unlock; |
| + |
| + if (next - addr != HPAGE_PMD_SIZE) { |
| + int err; |
| + |
| + get_page(page); |
| + spin_unlock(ptl); |
| + lock_page(page); |
| + err = split_huge_page(page); |
| + unlock_page(page); |
| + put_page(page); |
| + if (!err) |
| + goto regular_page; |
| + return 0; |
| + } |
| + |
| + pmdp_test_and_clear_young(vma, addr, pmd); |
| + deactivate_page(page); |
| +huge_unlock: |
| + spin_unlock(ptl); |
| + return 0; |
| + } |
| + |
| + if (pmd_trans_unstable(pmd)) |
| + return 0; |
| + |
| +regular_page: |
| + orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| + for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) { |
| + ptent = *pte; |
| + |
| + if (!pte_present(ptent)) |
| + continue; |
| + |
| + page = vm_normal_page(vma, addr, ptent); |
| + if (!page) |
| + continue; |
| + |
| + if (PageTransCompound(page)) { |
| + if (page_mapcount(page) != 1) |
| + break; |
| + get_page(page); |
| + if (!trylock_page(page)) { |
| + put_page(page); |
| + break; |
| + } |
| + pte_unmap_unlock(orig_pte, ptl); |
| + if (split_huge_page(page)) { |
| + unlock_page(page); |
| + put_page(page); |
| + pte_offset_map_lock(mm, pmd, addr, &ptl); |
| + break; |
| + } |
| + unlock_page(page); |
| + put_page(page); |
| + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
| + pte--; |
| + addr -= PAGE_SIZE; |
| + continue; |
| + } |
| + |
| + VM_BUG_ON_PAGE(PageTransCompound(page), page); |
| + |
| + if (page_mapcount(page) > 1) |
| + continue; |
| + |
| + ptep_test_and_clear_young(vma, addr, pte); |
| + deactivate_page(page); |
| + } |
| + pte_unmap_unlock(orig_pte, ptl); |
| + cond_resched(); |
| + return 0; |
| +} |
| + |
| + |
| +static int reclaim_pte_range(pmd_t *pmd, unsigned long addr, |
| + unsigned long end, struct mm_walk *walk) |
| +{ |
| + pte_t *orig_pte, *pte, ptent; |
| + spinlock_t *ptl; |
| + LIST_HEAD(page_list); |
| + struct page *page; |
| + int isolated = 0; |
| + struct vm_area_struct *vma = walk->vma; |
| +	struct walk_data *data = walk->private; |
| + enum reclaim_type type = 0; |
| + struct mm_struct *mm = vma->vm_mm; |
| + unsigned long next = pmd_addr_end(addr, end); |
| + |
| + if (data) |
| + type = data->type; |
| + |
| + ptl = pmd_trans_huge_lock(pmd, vma); |
| + if (ptl) { |
| + if (!pmd_present(*pmd)) |
| + goto huge_unlock; |
| + |
| + if (is_huge_zero_pmd(*pmd)) |
| + goto huge_unlock; |
| + |
| + page = pmd_page(*pmd); |
| + if (type != RECLAIM_SHMEM && page_mapcount(page) > 1) |
| + goto huge_unlock; |
| + |
| + if (next - addr != HPAGE_PMD_SIZE) { |
| + int err; |
| + |
| + get_page(page); |
| + spin_unlock(ptl); |
| + lock_page(page); |
| + err = split_huge_page(page); |
| + unlock_page(page); |
| + put_page(page); |
| + if (!err) |
| + goto regular_page; |
| + return 0; |
| + } |
| + |
| + if (isolate_lru_page(page)) |
| + goto huge_unlock; |
| + |
| + /* Clear all the references to make sure it gets reclaimed */ |
| + pmdp_test_and_clear_young(vma, addr, pmd); |
| + ClearPageReferenced(page); |
| + test_and_clear_page_young(page); |
| + list_add(&page->lru, &page_list); |
| +huge_unlock: |
| + spin_unlock(ptl); |
| + reclaim_pages(&page_list); |
| + return 0; |
| + } |
| + |
| + if (pmd_trans_unstable(pmd)) |
| + return 0; |
| + |
| +regular_page: |
| + orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| + for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) { |
| + ptent = *pte; |
| + if (!pte_present(ptent)) |
| + continue; |
| + |
| + page = vm_normal_page(vma, addr, ptent); |
| + if (!page) |
| + continue; |
| + |
| + if (PageTransCompound(page)) { |
| + if (type != RECLAIM_SHMEM && page_mapcount(page) != 1) |
| + break; |
| + get_page(page); |
| + if (!trylock_page(page)) { |
| + put_page(page); |
| + break; |
| + } |
| + pte_unmap_unlock(orig_pte, ptl); |
| + |
| + if (split_huge_page(page)) { |
| + unlock_page(page); |
| + put_page(page); |
| + pte_offset_map_lock(mm, pmd, addr, &ptl); |
| + break; |
| + } |
| + unlock_page(page); |
| + put_page(page); |
| + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
| + pte--; |
| + addr -= PAGE_SIZE; |
| + continue; |
| + } |
| + |
| + VM_BUG_ON_PAGE(PageTransCompound(page), page); |
| + |
| + if (!PageLRU(page)) |
| + continue; |
| + |
| + if (type != RECLAIM_SHMEM && page_mapcount(page) > 1) |
| + continue; |
| + |
| + if (isolate_lru_page(page)) |
| + continue; |
| + |
| + isolated++; |
| + list_add(&page->lru, &page_list); |
| + /* Clear all the references to make sure it gets reclaimed */ |
| + ptep_test_and_clear_young(vma, addr, pte); |
| + ClearPageReferenced(page); |
| + test_and_clear_page_young(page); |
| + if (isolated >= SWAP_CLUSTER_MAX) { |
| + pte_unmap_unlock(orig_pte, ptl); |
| + reclaim_pages(&page_list); |
| + isolated = 0; |
| + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| + orig_pte = pte; |
| + } |
| + } |
| + |
| + pte_unmap_unlock(orig_pte, ptl); |
| + reclaim_pages(&page_list); |
| + |
| + cond_resched(); |
| + return 0; |
| +} |
| + |
| +static ssize_t reclaim_write(struct file *file, const char __user *buf, |
| + size_t count, loff_t *ppos) |
| +{ |
| + struct task_struct *task; |
| + char buffer[PROC_NUMBUF]; |
| + struct mm_struct *mm; |
| + struct vm_area_struct *vma; |
| + enum reclaim_type type; |
| + char *type_buf; |
| + |
| + memset(buffer, 0, sizeof(buffer)); |
| + if (count > sizeof(buffer) - 1) |
| + count = sizeof(buffer) - 1; |
| + |
| + if (copy_from_user(buffer, buf, count)) |
| + return -EFAULT; |
| + |
| + type_buf = strstrip(buffer); |
| + if (!strcmp(type_buf, "file")) |
| + type = RECLAIM_FILE; |
| + else if (!strcmp(type_buf, "anon")) |
| + type = RECLAIM_ANON; |
| +#ifdef CONFIG_SHMEM |
| + else if (!strcmp(type_buf, "shmem")) |
| + type = RECLAIM_SHMEM; |
| +#endif |
| + else if (!strcmp(type_buf, "all")) |
| + type = RECLAIM_ALL; |
| + else |
| + return -EINVAL; |
| + |
| + task = get_proc_task(file->f_path.dentry->d_inode); |
| + if (!task) |
| + return -ESRCH; |
| + |
| + mm = get_task_mm(task); |
| + if (mm) { |
| + struct mm_walk_ops reclaim_walk = { |
| + .pmd_entry = reclaim_pte_range, |
| + }; |
| + |
| + struct walk_data reclaim_data = { |
| + .type = type, |
| + }; |
| + |
| + mmap_read_lock(mm); |
| + for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| + if (is_vm_hugetlb_page(vma)) |
| + continue; |
| + |
| + if (vma->vm_flags & VM_LOCKED) |
| + continue; |
| + |
| + if (type == RECLAIM_ANON && !vma_is_anonymous(vma)) |
| + continue; |
| + if ((type == RECLAIM_FILE || type == RECLAIM_SHMEM) |
| + && vma_is_anonymous(vma)) { |
| + continue; |
| + } |
| + |
| + if (vma_is_anonymous(vma) || shmem_file(vma->vm_file)) { |
| + if (get_nr_swap_pages() <= 0 || |
| + get_mm_counter(mm, MM_ANONPAGES) == 0) { |
| + if (type == RECLAIM_ALL) |
| + continue; |
| + else |
| + break; |
| + } |
| + |
| + if (shmem_file(vma->vm_file) && type != RECLAIM_SHMEM) { |
| + continue; |
| + } |
| + |
| + reclaim_walk.pmd_entry = reclaim_pte_range; |
| + } else { |
| + reclaim_walk.pmd_entry = deactivate_pte_range; |
| + } |
| + |
| + walk_page_range(mm, vma->vm_start, vma->vm_end, |
| +				&reclaim_walk, &reclaim_data); |
| + } |
| + flush_tlb_mm(mm); |
| + mmap_read_unlock(mm); |
| + mmput(mm); |
| + } |
| + put_task_struct(task); |
| + |
| + return count; |
| +} |
| + |
| +const struct file_operations proc_reclaim_operations = { |
| + .write = reclaim_write, |
| + .llseek = noop_llseek, |
| +}; |
| +#endif |
| + |
| #ifdef CONFIG_NUMA |
| |
| struct numa_maps { |
| diff --git a/include/linux/rmap.h b/include/linux/rmap.h |
| index bd3504d11b15590f0e41232e234eebc1149baad4..b5e06da7e323d2d363110dafad14659f8e38454f 100644 |
| --- a/include/linux/rmap.h |
| +++ b/include/linux/rmap.h |
| @@ -14,6 +14,10 @@ |
| #include <linux/pagemap.h> |
| #include <linux/memremap.h> |
| |
| +extern int isolate_lru_page(struct page *page); |
| +extern void putback_lru_page(struct page *page); |
| +extern unsigned long reclaim_pages(struct list_head *page_list); |
| + |
| /* |
| * The anon_vma heads a list of private "related" vmas, to scan if |
| * an anonymous page pointing to this anon_vma needs to be unmapped: |
| diff --git a/mm/Kconfig b/mm/Kconfig |
| index 14fb4f54df3af0d3cb9d632b525d518bce4bfa57..d5aae867e99d993893e8a1eaec1dcf19079752bd 100644 |
| --- a/mm/Kconfig |
| +++ b/mm/Kconfig |
| @@ -590,6 +590,17 @@ config PAGE_REPORTING |
| those pages to another entity, such as a hypervisor, so that the |
| memory can be freed within the host for other uses. |
| |
| +config PROCESS_RECLAIM |
| + bool "Enable process reclaim" |
| + depends on PROC_FS && MMU |
| + help |
| +	  Allows reclaiming a process's pages via /proc/<pid>/reclaim. |
| + |
| +	  (echo file > /proc/PID/reclaim) reclaims file-backed pages only. |
| +	  (echo anon > /proc/PID/reclaim) reclaims anonymous pages only. |
| +	  (echo shmem > /proc/PID/reclaim) reclaims shmem pages only. |
| +	  (echo all > /proc/PID/reclaim) reclaims file and anon pages. |
| +	  Any other value is ignored. |
| # |
| # support for page migration |
| # |
| diff --git a/mm/swap.c b/mm/swap.c |
| index 955930f41d20c6d491bb3cf0347086443f16b4c6..711aa04043f433cb413325874dca3da39a943630 100644 |
| --- a/mm/swap.c |
| +++ b/mm/swap.c |
| @@ -624,6 +624,7 @@ static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) |
| lruvec_del_folio(lruvec, folio); |
| folio_clear_active(folio); |
| folio_clear_referenced(folio); |
| + folio_test_clear_young(folio); |
| lruvec_add_folio(lruvec, folio); |
| |
| __count_vm_events(PGDEACTIVATE, nr_pages); |
| -- |
| 2.38.1.584.g0f3c55d4c2-goog |
| |