From 940411fdf169501b442397853e470ba66cc3aa00 Mon Sep 17 00:00:00 2001
From: Brian Geffon <bgeffon@chromium.org>
Date: Tue, 17 Aug 2021 17:41:38 +0000
Subject: [PATCH] CHROMIUM: mm: per-process reclaim
These days there are many platforms in the embedded market that are smarter than
the kernel, which has only limited information about the working set, so they
want to be involved in memory management more heavily, as Android's low memory
killer, ashmem, and the many recent low-memory notifiers show.
One simple scenario for this userspace intelligence is a platform that manages
tasks as foreground and background: for the end user's *responsiveness* it is
better to reclaim a background task's pages even if they are frequently
referenced.
This patch adds a new knob, "reclaim", under /proc/<pid>/ so a task manager can
reclaim from any target process at any time, giving the platform another way to
use memory efficiently.
It avoids killing processes just to get free memory, which is a really terrible
experience; I once lost my best game score ever because I switched to a phone
call while playing.
Reclaim file-backed pages only.
echo file > /proc/PID/reclaim
Reclaim anonymous pages only.
echo anon > /proc/PID/reclaim
Reclaim shmem pages.
echo shmem > /proc/PID/reclaim
Reclaim all pages.
echo all > /proc/PID/reclaim
Note: for historical reasons, "all" reclaims file-backed and anonymous pages
only; it does not include shmem.
[Squashes in 75472663, 13a4a5c]
BUG=b:195001087
TEST=build 5.10 for x86_64, test on eve
Signed-off-by: Brian Geffon <bgeffon@chromium.org>
Change-Id: I8967866f06ac866f8f5291c585c172756e5a180b
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/3100666
Reviewed-by: Joel Fernandes <joelaf@google.com>
---
fs/proc/base.c | 3 +
fs/proc/internal.h | 1 +
fs/proc/task_mmu.c | 337 +++++++++++++++++++++++++++++++++++++++++++
include/linux/rmap.h | 4 +
mm/Kconfig | 11 ++
mm/swap.c | 1 +
6 files changed, 357 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 926f07ce11b7aa7c0ab91ac50faf1f3a85f7b370..b4efbb9576b1ee70bd3fca7eed581eb33ddfd284 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3286,6 +3286,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("mounts", S_IRUGO, proc_mounts_operations),
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
REG("mountstats", S_IRUSR, proc_mountstats_operations),
+#ifdef CONFIG_PROCESS_RECLAIM
+ REG("reclaim", S_IWUGO, proc_reclaim_operations),
+#endif
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index b701d0207edf098814a70400850e3675aee448f1..ec95cc8da708a69fe4d61fcf307479b97afe1ac1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -216,6 +216,7 @@ struct pde_opener {
extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;
+extern const struct file_operations proc_reclaim_operations;
void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8a74cdcc9af00f7217f8a5ab3700537b5af6913d..b317bfd43fdeb1b61c346bc4d2ef6b485bffe884 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,7 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/mm_inline.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -1763,6 +1764,342 @@ const struct file_operations proc_pagemap_operations = {
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
+#ifdef CONFIG_PROCESS_RECLAIM
+enum reclaim_type {
+ RECLAIM_FILE = 1,
+ RECLAIM_ANON,
+ RECLAIM_ALL,
+ /*
+ * For safety and backwards compatibility, shmem reclaim mode
+ * is only possible by explicitly writing 'shmem'; 'all' does
+ * not include shmem.
+ */
+ RECLAIM_SHMEM,
+};
+
+struct walk_data {
+ enum reclaim_type type;
+};
+
+static int deactivate_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (!pmd_present(*pmd))
+ goto huge_unlock;
+
+ if (is_huge_zero_pmd(*pmd))
+ goto huge_unlock;
+
+ page = pmd_page(*pmd);
+ if (page_mapcount(page) > 1)
+ goto huge_unlock;
+
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ pmdp_test_and_clear_young(vma, addr, pmd);
+ deactivate_page(page);
+huge_unlock:
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+regular_page:
+ orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (page_mapcount(page) > 1)
+ continue;
+
+ ptep_test_and_clear_young(vma, addr, pte);
+ deactivate_page(page);
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+ return 0;
+}
+
+
+static int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ LIST_HEAD(page_list);
+ struct page *page;
+ int isolated = 0;
+ struct vm_area_struct *vma = walk->vma;
+ struct walk_data *data = (struct walk_data *)walk->private;
+ enum reclaim_type type = 0;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ if (data)
+ type = data->type;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (!pmd_present(*pmd))
+ goto huge_unlock;
+
+ if (is_huge_zero_pmd(*pmd))
+ goto huge_unlock;
+
+ page = pmd_page(*pmd);
+ if (type != RECLAIM_SHMEM && page_mapcount(page) > 1)
+ goto huge_unlock;
+
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (isolate_lru_page(page))
+ goto huge_unlock;
+
+ /* Clear all the references to make sure it gets reclaimed */
+ pmdp_test_and_clear_young(vma, addr, pmd);
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ list_add(&page->lru, &page_list);
+huge_unlock:
+ spin_unlock(ptl);
+ reclaim_pages(&page_list);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+regular_page:
+ orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageTransCompound(page)) {
+ if (type != RECLAIM_SHMEM && page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (!PageLRU(page))
+ continue;
+
+ if (type != RECLAIM_SHMEM && page_mapcount(page) > 1)
+ continue;
+
+ if (isolate_lru_page(page))
+ continue;
+
+ isolated++;
+ list_add(&page->lru, &page_list);
+ /* Clear all the references to make sure it gets reclaimed */
+ ptep_test_and_clear_young(vma, addr, pte);
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (isolated >= SWAP_CLUSTER_MAX) {
+ pte_unmap_unlock(orig_pte, ptl);
+ reclaim_pages(&page_list);
+ isolated = 0;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ orig_pte = pte;
+ }
+ }
+
+ pte_unmap_unlock(orig_pte, ptl);
+ reclaim_pages(&page_list);
+
+ cond_resched();
+ return 0;
+}
+
+static ssize_t reclaim_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ char buffer[PROC_NUMBUF];
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ enum reclaim_type type;
+ char *type_buf;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+
+ type_buf = strstrip(buffer);
+ if (!strcmp(type_buf, "file"))
+ type = RECLAIM_FILE;
+ else if (!strcmp(type_buf, "anon"))
+ type = RECLAIM_ANON;
+#ifdef CONFIG_SHMEM
+ else if (!strcmp(type_buf, "shmem"))
+ type = RECLAIM_SHMEM;
+#endif
+ else if (!strcmp(type_buf, "all"))
+ type = RECLAIM_ALL;
+ else
+ return -EINVAL;
+
+ task = get_proc_task(file->f_path.dentry->d_inode);
+ if (!task)
+ return -ESRCH;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ struct mm_walk_ops reclaim_walk = {
+ .pmd_entry = reclaim_pte_range,
+ };
+
+ struct walk_data reclaim_data = {
+ .type = type,
+ };
+
+ mmap_read_lock(mm);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ if (vma->vm_flags & VM_LOCKED)
+ continue;
+
+ if (type == RECLAIM_ANON && !vma_is_anonymous(vma))
+ continue;
+ if ((type == RECLAIM_FILE || type == RECLAIM_SHMEM)
+ && vma_is_anonymous(vma)) {
+ continue;
+ }
+
+ if (vma_is_anonymous(vma) || shmem_file(vma->vm_file)) {
+ if (get_nr_swap_pages() <= 0 ||
+ get_mm_counter(mm, MM_ANONPAGES) == 0) {
+ if (type == RECLAIM_ALL)
+ continue;
+ else
+ break;
+ }
+
+ if (shmem_file(vma->vm_file) && type != RECLAIM_SHMEM) {
+ continue;
+ }
+
+ reclaim_walk.pmd_entry = reclaim_pte_range;
+ } else {
+ reclaim_walk.pmd_entry = deactivate_pte_range;
+ }
+
+ walk_page_range(mm, vma->vm_start, vma->vm_end,
+ &reclaim_walk, (void *)&reclaim_data);
+ }
+ flush_tlb_mm(mm);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
+ put_task_struct(task);
+
+ return count;
+}
+
+const struct file_operations proc_reclaim_operations = {
+ .write = reclaim_write,
+ .llseek = noop_llseek,
+};
+#endif
+
#ifdef CONFIG_NUMA
struct numa_maps {
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bd3504d11b15590f0e41232e234eebc1149baad4..b5e06da7e323d2d363110dafad14659f8e38454f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -14,6 +14,10 @@
#include <linux/pagemap.h>
#include <linux/memremap.h>
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+extern unsigned long reclaim_pages(struct list_head *page_list);
+
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
diff --git a/mm/Kconfig b/mm/Kconfig
index 14fb4f54df3af0d3cb9d632b525d518bce4bfa57..d5aae867e99d993893e8a1eaec1dcf19079752bd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -590,6 +590,17 @@ config PAGE_REPORTING
those pages to another entity, such as a hypervisor, so that the
memory can be freed within the host for other uses.
+config PROCESS_RECLAIM
+ bool "Enable process reclaim"
+ depends on PROC_FS && MMU
+ help
+ This allows reclaiming a process's pages via /proc/PID/reclaim.
+
+ (echo file > /proc/PID/reclaim) reclaims file-backed pages only.
+ (echo anon > /proc/PID/reclaim) reclaims anonymous pages only.
+ (echo all > /proc/PID/reclaim) reclaims all pages.
+
+ Writing any other value returns -EINVAL.
#
# support for page migration
#
diff --git a/mm/swap.c b/mm/swap.c
index 955930f41d20c6d491bb3cf0347086443f16b4c6..711aa04043f433cb413325874dca3da39a943630 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -624,6 +624,7 @@ static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
lruvec_del_folio(lruvec, folio);
folio_clear_active(folio);
folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
--
2.38.1.584.g0f3c55d4c2-goog