From 6fb477ad0cac2d6cd816cb8b6c6edcd402ed4e33 Mon Sep 17 00:00:00 2001
From: Brian Geffon <bgeffon@chromium.org>
Date: Tue, 17 Aug 2021 17:41:38 +0000
Subject: [PATCH] CHROMIUM: mm: per-process reclaim

These days, many platforms in the embedded market know far more about
their working set than the kernel, which has only very limited
information, so they want to take a heavier role in memory management,
as Android's low-memory killer and ashmem or the many recent
low-memory notifiers already do.

One simple scenario that illustrates userspace's advantage: a platform
that manages tasks as foreground and background can improve end-user
*responsiveness* by reclaiming a background task's pages, even when
those pages are frequently referenced.

This patch adds a new knob, /proc/<pid>/reclaim, so a task manager can
reclaim the pages of any target process at any time. It gives the
platform another method for using memory efficiently.

It also avoids killing a process just to free memory, which makes for
a terrible user experience, such as losing your best-ever game score
because the game was killed while you took a phone call.
Reclaim file-backed pages only:
  echo file > /proc/PID/reclaim

Reclaim anonymous pages only:
  echo anon > /proc/PID/reclaim

Reclaim shmem pages only:
  echo shmem > /proc/PID/reclaim

Reclaim all pages:
  echo all > /proc/PID/reclaim

Note: for historical reasons, "all" reclaims file and anon pages only;
it does not include shmem.
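
As an illustration only (not part of this patch), a task manager could
drive the knob with a small C helper like the hypothetical one below;
the helper name and error handling are assumptions:

  #include <stdio.h>

  /*
   * Hypothetical helper, illustration only: write a reclaim mode
   * ("file", "anon", "shmem" or "all") to /proc/<pid>/reclaim.
   * Assumes CONFIG_PROCESS_RECLAIM=y and permission to write the knob.
   */
  static int reclaim_pid(int pid, const char *mode)
  {
          char path[32];
          FILE *f;

          snprintf(path, sizeof(path), "/proc/%d/reclaim", pid);
          f = fopen(path, "w");
          if (!f)
                  return -1;
          fputs(mode, f);
          return fclose(f); /* 0 on success, EOF on error */
  }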
[Squashes in 75472663, 13a4a5c]
BUG=b:195001087
TEST=build 5.10 for x86_64, test on eve
Signed-off-by: Brian Geffon <bgeffon@chromium.org>
Change-Id: I8967866f06ac866f8f5291c585c172756e5a180b
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/3100666
Reviewed-by: Joel Fernandes <joelaf@google.com>
[rebase61(tzungbi):
Squashed:
CHROMIUM: mm: Check pmd_trans_unstable() after splitting huge page in per-process reclaim.
]
Signed-off-by: Tzung-Bi Shih <tzungbi@chromium.org>
---
fs/proc/base.c | 3 +
fs/proc/internal.h | 1 +
fs/proc/task_mmu.c | 337 +++++++++++++++++++++++++++++++++++++++++++
include/linux/rmap.h | 4 +
mm/Kconfig | 11 ++
mm/swap.c | 1 +
6 files changed, 357 insertions(+)
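
For orientation before the diff: reclaim_write() parses the requested
mode, takes the target's mm, and walks each VMA with one of two
pmd_entry callbacks. Anonymous and shmem VMAs go through
reclaim_pte_range(), which isolates pages and hands them to
reclaim_pages(); other file-backed VMAs go through
deactivate_pte_range(), which only clears the referenced/young bits
and deactivates the pages so regular reclaim picks them up. A
condensed sketch of that dispatch (names match the patch; locking and
the VMA filters are elided):

  struct mm_walk_ops reclaim_walk = { };

  if (vma_is_anonymous(vma) || shmem_file(vma->vm_file))
          reclaim_walk.pmd_entry = reclaim_pte_range;    /* isolate + reclaim */
  else
          reclaim_walk.pmd_entry = deactivate_pte_range; /* deactivate only */

  walk_page_range(mm, vma->vm_start, vma->vm_end, &reclaim_walk, &data);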
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd31e3b6bf77cc84d6e05a223e989bc3ea3bb6f7..4369f16933367a4284b96cee122ff1fbbc088755 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3287,6 +3287,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("mounts", S_IRUGO, proc_mounts_operations),
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
REG("mountstats", S_IRUSR, proc_mountstats_operations),
+#ifdef CONFIG_PROCESS_RECLAIM
+ REG("reclaim", S_IWUGO, proc_reclaim_operations),
+#endif
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9a8f32f21ff569d0dc40e1d7c31b63b9aea293bc..3cb57fd7138ae6aed290eb0003b1ce61c4f937c4 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -216,6 +216,7 @@ struct pde_opener {
extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;
+extern const struct file_operations proc_reclaim_operations;
void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ef2eb12906da88c6fe3a227e82598020f0badc44..ccaf1d255c29c208bcbd5ff8d40c7b6dc9bc95d2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -20,6 +20,7 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/mm_inline.h>
#include <linux/minmax.h>
#include <linux/overflow.h>
@@ -2492,6 +2493,342 @@ const struct file_operations proc_pagemap_operations = {
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
+#ifdef CONFIG_PROCESS_RECLAIM
+enum reclaim_type {
+ RECLAIM_FILE = 1,
+ RECLAIM_ANON,
+ RECLAIM_ALL,
+ /*
+ * For safety and backward compatibility, shmem reclaim is only
+ * possible by writing 'shmem' explicitly; 'all' does not
+ * include shmem.
+ */
+ RECLAIM_SHMEM,
+};
+
+struct walk_data {
+ enum reclaim_type type;
+};
+
+static int deactivate_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (!pmd_present(*pmd))
+ goto huge_unlock;
+
+ if (is_huge_zero_pmd(*pmd))
+ goto huge_unlock;
+
+ page = pmd_page(*pmd);
+ if (page_mapcount(page) > 1)
+ goto huge_unlock;
+
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ pmdp_test_and_clear_young(vma, addr, pmd);
+ deactivate_page(page);
+huge_unlock:
+ spin_unlock(ptl);
+ return 0;
+ }
+
+regular_page:
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (page_mapcount(page) > 1)
+ continue;
+
+ ptep_test_and_clear_young(vma, addr, pte);
+ deactivate_page(page);
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+ return 0;
+}
+
+
+static int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ LIST_HEAD(page_list);
+ struct page *page;
+ int isolated = 0;
+ struct vm_area_struct *vma = walk->vma;
+ struct walk_data *data = walk->private;
+ enum reclaim_type type = 0;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ if (data)
+ type = data->type;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (!pmd_present(*pmd))
+ goto huge_unlock;
+
+ if (is_huge_zero_pmd(*pmd))
+ goto huge_unlock;
+
+ page = pmd_page(*pmd);
+ if (type != RECLAIM_SHMEM && page_mapcount(page) > 1)
+ goto huge_unlock;
+
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (isolate_lru_page(page))
+ goto huge_unlock;
+
+ /* Clear all the references to make sure it gets reclaimed */
+ pmdp_test_and_clear_young(vma, addr, pmd);
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ list_add(&page->lru, &page_list);
+huge_unlock:
+ spin_unlock(ptl);
+ reclaim_pages(&page_list);
+ return 0;
+ }
+
+regular_page:
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageTransCompound(page)) {
+ if (type != RECLAIM_SHMEM && page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (!PageLRU(page))
+ continue;
+
+ if (type != RECLAIM_SHMEM && page_mapcount(page) > 1)
+ continue;
+
+ if (isolate_lru_page(page))
+ continue;
+
+ isolated++;
+ list_add(&page->lru, &page_list);
+ /* Clear all the references to make sure it gets reclaimed */
+ ptep_test_and_clear_young(vma, addr, pte);
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (isolated >= SWAP_CLUSTER_MAX) {
+ pte_unmap_unlock(orig_pte, ptl);
+ reclaim_pages(&page_list);
+ isolated = 0;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ orig_pte = pte;
+ }
+ }
+
+ pte_unmap_unlock(orig_pte, ptl);
+ reclaim_pages(&page_list);
+
+ cond_resched();
+ return 0;
+}
+
+static ssize_t reclaim_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ char buffer[PROC_NUMBUF];
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ enum reclaim_type type;
+ char *type_buf;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+
+ type_buf = strstrip(buffer);
+ if (!strcmp(type_buf, "file"))
+ type = RECLAIM_FILE;
+ else if (!strcmp(type_buf, "anon"))
+ type = RECLAIM_ANON;
+#ifdef CONFIG_SHMEM
+ else if (!strcmp(type_buf, "shmem"))
+ type = RECLAIM_SHMEM;
+#endif
+ else if (!strcmp(type_buf, "all"))
+ type = RECLAIM_ALL;
+ else
+ return -EINVAL;
+
+ task = get_proc_task(file->f_path.dentry->d_inode);
+ if (!task)
+ return -ESRCH;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ struct mm_walk_ops reclaim_walk = {
+ .pmd_entry = reclaim_pte_range,
+ };
+
+ struct walk_data reclaim_data = {
+ .type = type,
+ };
+
+ mmap_read_lock(mm);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ if (vma->vm_flags & VM_LOCKED)
+ continue;
+
+ if (type == RECLAIM_ANON && !vma_is_anonymous(vma))
+ continue;
+ if ((type == RECLAIM_FILE || type == RECLAIM_SHMEM) &&
+     vma_is_anonymous(vma)) {
+ continue;
+ }
+
+ if (vma_is_anonymous(vma) || shmem_file(vma->vm_file)) {
+ if (get_nr_swap_pages() <= 0 ||
+ get_mm_counter(mm, MM_ANONPAGES) == 0) {
+ if (type == RECLAIM_ALL)
+ continue;
+ else
+ break;
+ }
+
+ if (shmem_file(vma->vm_file) && type != RECLAIM_SHMEM) {
+ continue;
+ }
+
+ reclaim_walk.pmd_entry = reclaim_pte_range;
+ } else {
+ reclaim_walk.pmd_entry = deactivate_pte_range;
+ }
+
+ walk_page_range(mm, vma->vm_start, vma->vm_end,
+ &reclaim_walk, &reclaim_data);
+ }
+ flush_tlb_mm(mm);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
+ put_task_struct(task);
+
+ return count;
+}
+
+const struct file_operations proc_reclaim_operations = {
+ .write = reclaim_write,
+ .llseek = noop_llseek,
+};
+#endif
+
#ifdef CONFIG_NUMA
struct numa_maps {
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b26fe858fd444ce3fe38a0b9147b822086c009b9..b65bbbd57cf046134f61e2d3424f4fb4a4afcb4f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -14,6 +14,10 @@
#include <linux/pagemap.h>
#include <linux/memremap.h>
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+extern unsigned long reclaim_pages(struct list_head *page_list);
+
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
diff --git a/mm/Kconfig b/mm/Kconfig
index 3cdb068ae3f60fa406c8a679c2c9e50f64dc0eab..f32e71bcb05b48f393003b726034844a91c9ac72 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -668,6 +668,17 @@ config PAGE_REPORTING
those pages to another entity, such as a hypervisor, so that the
memory can be freed within the host for other uses.
+config PROCESS_RECLAIM
+ bool "Enable process reclaim"
+ depends on PROC_FS && MMU
+ help
+ Allows reclaiming the pages of a process via /proc/<pid>/reclaim.
+
+ (echo file > /proc/PID/reclaim) reclaims file-backed pages only.
+ (echo anon > /proc/PID/reclaim) reclaims anonymous pages only.
+ (echo all > /proc/PID/reclaim) reclaims all pages.
+
+ Any other value is rejected with -EINVAL.
#
# support for page migration
#
diff --git a/mm/swap.c b/mm/swap.c
index cd8f0150ba3aa8cde8828d2760f34516a605fb1d..fcf8b941fea1095399711fabe83bd990587c388b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -607,6 +607,7 @@ static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
lruvec_del_folio(lruvec, folio);
folio_clear_active(folio);
folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
--
2.43.0.rc2.451.g8631bc7472-goog