| From 50800e618c8d185fffd7ddb70c16e932cd59ff78 Mon Sep 17 00:00:00 2001 |
| From: Yu Zhao <yuzhao@google.com> |
| Date: Sat, 1 Apr 2023 14:14:19 -0600 |
| Subject: [PATCH] BACKPORT: FROMLIST: mm: multi-gen LRU: use |
| mmu_notifier_test_clear_young() |
| |
| An existing selftest can quickly demonstrate the effectiveness of this |
| patch. On a generic workstation equipped with 64 CPUs and 256GB DRAM: |
| |
| $ sudo max_guest_memory_test -c 64 -m 256 -s 256 |
| |
| MGLRU      run2 |
| --------------- |
| Before    ~600s |
| After      ~50s |
| Off       ~250s |
| |
| kswapd (MGLRU before) |
|   100.00% balance_pgdat |
|     100.00% shrink_node |
|       100.00% shrink_one |
|         99.97% try_to_shrink_lruvec |
|           99.06% evict_folios |
|             97.41% shrink_folio_list |
|               31.33% folio_referenced |
|                 31.06% rmap_walk_file |
|                   30.89% folio_referenced_one |
|                     20.83% __mmu_notifier_clear_flush_young |
|                       20.54% kvm_mmu_notifier_clear_flush_young |
| =>                      19.34% _raw_write_lock |
| |
| kswapd (MGLRU after) |
|   100.00% balance_pgdat |
|     100.00% shrink_node |
|       100.00% shrink_one |
|         99.97% try_to_shrink_lruvec |
|           99.51% evict_folios |
|             71.70% shrink_folio_list |
|               7.08% folio_referenced |
|                 6.78% rmap_walk_file |
|                   6.72% folio_referenced_one |
|                     5.60% lru_gen_look_around |
| =>                    1.53% __mmu_notifier_test_clear_young |
| |
| kswapd (MGLRU off) |
|   100.00% balance_pgdat |
|     100.00% shrink_node |
|       99.92% shrink_lruvec |
|         69.95% shrink_folio_list |
|           19.35% folio_referenced |
|             18.37% rmap_walk_file |
|               17.88% folio_referenced_one |
|                 13.20% __mmu_notifier_clear_flush_young |
|                   11.64% kvm_mmu_notifier_clear_flush_young |
| =>                  9.93% _raw_write_lock |
|         26.23% shrink_active_list |
|           25.50% folio_referenced |
|             25.35% rmap_walk_file |
|               25.28% folio_referenced_one |
|                 23.87% __mmu_notifier_clear_flush_young |
|                   23.69% kvm_mmu_notifier_clear_flush_young |
| =>                  18.98% _raw_write_lock |
| |
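| The profiles above also show where the time goes: before this patch, |
| folio_referenced() harvests the accessed bit one page at a time through |
| mmu_notifier_clear_flush_young(), and kvm_mmu_notifier_clear_flush_young() |
| spends most of its cycles in _raw_write_lock; after this patch, the PTE-walk |
| and look-around paths batch the harvest through |
| mmu_notifier_test_clear_young() (introduced by the rest of this series), |
| which covers up to MIN_LRU_BATCH pages per call and returns per-page results |
| in a bitmap. The sketch below is a minimal user-space model of that batching |
| idea, illustrative only: PAGE_SIZE, BATCH and stub_bulk_test_young() are |
| stand-ins, not the kernel or mmu_notifier API. |
| |
| /* |
|  * Minimal user-space model of the batching idea (illustrative only: this |
|  * is not kernel code, and stub_bulk_test_young() is a stand-in, not the |
|  * mmu_notifier API). One bulk query fills a bitmap of accessed bits for a |
|  * window of up to BATCH pages; per-page checks inside that window are then |
|  * plain bitmap lookups, so the expensive call happens once per window |
|  * instead of once per page. |
|  */ |
| #include <stdbool.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| |
| #define PAGE_SIZE 4096ULL |
| #define BATCH     64ULL /* models MIN_LRU_BATCH (BITS_PER_LONG upstream) */ |
| |
| static int bulk_calls; |
| |
| /* Stand-in for the batched notifier: pretend every 3rd page is young. */ |
| static void stub_bulk_test_young(uint64_t start, uint64_t pages, |
|                                  uint64_t *bitmap) |
| { |
|         bulk_calls++; |
|         *bitmap = 0; |
|         for (uint64_t i = 0; i < pages; i++) |
|                 if ((start / PAGE_SIZE + i) % 3 == 0) |
|                         *bitmap |= 1ULL << i; |
| } |
| |
| static uint64_t win_start, win_pages; /* current bitmap window */ |
| |
| static bool page_is_young(uint64_t addr, uint64_t end, uint64_t *bitmap) |
| { |
|         if (addr >= win_start + win_pages * PAGE_SIZE) { /* refill window */ |
|                 win_start = addr; |
|                 win_pages = (end - addr) / PAGE_SIZE; |
|                 if (win_pages > BATCH) |
|                         win_pages = BATCH; |
|                 stub_bulk_test_young(win_start, win_pages, bitmap); |
|         } |
|         return *bitmap & (1ULL << ((addr - win_start) / PAGE_SIZE)); |
| } |
| |
| int main(void) |
| { |
|         uint64_t bitmap = 0, end = 256 * PAGE_SIZE; |
|         int young = 0; |
| |
|         for (uint64_t addr = 0; addr != end; addr += PAGE_SIZE) |
|                 young += page_is_young(addr, end, &bitmap); |
| |
|         /* 256 pages are covered by 4 bulk calls, not 256 per-page calls. */ |
|         printf("%d young pages, %d bulk calls\n", young, bulk_calls); |
|         return 0; |
| } |
| |
| With this patch applied, availability of the SPTE walk should also be |
| visible as an extra capability bit in /sys/kernel/mm/lru_gen/enabled |
| (presumably 0x0008, given LRU_GEN_SPTE_WALK's position in the enum below); |
| per the last hunk, the bit is only reported when |
| kvm_arch_has_test_clear_young() is true and the capability is enabled. |
| |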
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| (am from https://patchwork.kernel.org/patch/13144315/) |
| |
| BUG=b:266976439 |
| UPSTREAM-TASK=b:266738578 |
| TEST=ran crostini.VimCompile |
| |
| Change-Id: I2d6c83ccad765d903d4c2d2e48ded7bd114942c8 |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392744 |
| Commit-Queue: Yu Zhao <yuzhao@chromium.org> |
| Tested-by: Yu Zhao <yuzhao@chromium.org> |
| Reviewed-by: T.J. Alumbaugh <talumbau@google.com> |
| --- |
| include/linux/mmzone.h | 6 +- |
| mm/rmap.c | 9 +-- |
| mm/vmscan.c | 141 +++++++++++++++++++++++++++++++++++++---- |
| 3 files changed, 135 insertions(+), 21 deletions(-) |
| |
| diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h |
| index 4106fbc5b4b3248858899e4e88ab3b248ea6de0b..24d3ec2111c6ec95f74b879d39a4f1941bf80f91 100644 |
| --- a/include/linux/mmzone.h |
| +++ b/include/linux/mmzone.h |
| @@ -387,6 +387,7 @@ enum { |
| LRU_GEN_CORE, |
| LRU_GEN_MM_WALK, |
| LRU_GEN_NONLEAF_YOUNG, |
| + LRU_GEN_SPTE_WALK, |
| NR_LRU_GEN_CAPS |
| }; |
| |
| @@ -489,7 +490,7 @@ struct lru_gen_mm_walk { |
| }; |
| |
| void lru_gen_init_lruvec(struct lruvec *lruvec); |
| -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); |
| +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); |
| |
| #ifdef CONFIG_MEMCG |
| |
| @@ -577,8 +578,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) |
| { |
| } |
| |
| -static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) |
| +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) |
| { |
| + return false; |
| } |
| |
| #ifdef CONFIG_MEMCG |
| diff --git a/mm/rmap.c b/mm/rmap.c |
| index ec7f8e6c9e483a6ff768272d4e89221044974bee..171418864b38af9367c6b2d7f4f3c14b91788811 100644 |
| --- a/mm/rmap.c |
| +++ b/mm/rmap.c |
| @@ -820,13 +820,10 @@ static bool folio_referenced_one(struct folio *folio, |
| return false; /* To break the loop */ |
| } |
| |
| - if (pvmw.pte) { |
| - if (lru_gen_enabled() && |
| - pte_young(ptep_get(pvmw.pte))) { |
| - lru_gen_look_around(&pvmw); |
| + if (lru_gen_enabled() && pvmw.pte) { |
| + if (lru_gen_look_around(&pvmw)) |
| referenced++; |
| - } |
| - |
| + } else if (pvmw.pte) { |
| if (ptep_clear_flush_young_notify(vma, address, |
| pvmw.pte)) |
| referenced++; |
| diff --git a/mm/vmscan.c b/mm/vmscan.c |
| index a4d0fc7cec7efa7ca6da609bda4db888ce16b560..400d15ad8da9028200153e9ae476f819f5a7b7e3 100644 |
| --- a/mm/vmscan.c |
| +++ b/mm/vmscan.c |
| @@ -57,6 +57,10 @@ |
| #include <linux/khugepaged.h> |
| #include <linux/rculist_nulls.h> |
| #include <linux/random.h> |
| +#include <linux/mmu_notifier.h> |
| +#ifdef CONFIG_KVM |
| +#include <linux/kvm_host.h> |
| +#endif |
| |
| #include <asm/tlbflush.h> |
| #include <asm/div64.h> |
| @@ -4047,6 +4051,59 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, |
| return folio; |
| } |
| |
| +#ifndef kvm_arch_has_test_clear_young |
| +#define kvm_arch_has_test_clear_young() 0 |
| +#endif |
| + |
| +static bool test_spte_young(struct mm_struct *mm, unsigned long addr, unsigned long end, |
| + unsigned long *bitmap, unsigned long *last) |
| +{ |
| + if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK)) |
| + return false; |
| + |
| + if (*last > addr) |
| + goto done; |
| + |
| + *last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ? |
| + addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1; |
| + bitmap_zero(bitmap, MIN_LRU_BATCH); |
| + |
| + mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap); |
| +done: |
| + return test_bit((*last - addr) / PAGE_SIZE, bitmap); |
| +} |
| + |
| +static void clear_spte_young(struct mm_struct *mm, unsigned long addr, |
| + unsigned long *bitmap, unsigned long *last) |
| +{ |
| + int i; |
| + unsigned long start, end = *last + 1; |
| + |
| + if (addr + PAGE_SIZE != end) |
| + return; |
| + |
| + i = find_last_bit(bitmap, MIN_LRU_BATCH); |
| + if (i == MIN_LRU_BATCH) |
| + return; |
| + |
| + start = end - (i + 1) * PAGE_SIZE; |
| + |
| + i = find_first_bit(bitmap, MIN_LRU_BATCH); |
| + |
| + end -= i * PAGE_SIZE; |
| + |
| + mmu_notifier_test_clear_young(mm, start, end, false, bitmap); |
| +} |
| + |
| +static void skip_spte_young(struct mm_struct *mm, unsigned long addr, |
| + unsigned long *bitmap, unsigned long *last) |
| +{ |
| + if (*last > addr) |
| + __clear_bit((*last - addr) / PAGE_SIZE, bitmap); |
| + |
| + clear_spte_young(mm, addr, bitmap, last); |
| +} |
| + |
| static bool suitable_to_scan(int total, int young) |
| { |
| int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); |
| @@ -4062,6 +4119,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, |
| pte_t *pte; |
| spinlock_t *ptl; |
| unsigned long addr; |
| + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; |
| + unsigned long last = 0; |
| int total = 0; |
| int young = 0; |
| struct lru_gen_mm_walk *walk = args->private; |
| @@ -4080,6 +4139,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, |
| arch_enter_lazy_mmu_mode(); |
| restart: |
| for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { |
| + bool success; |
| unsigned long pfn; |
| struct folio *folio; |
| pte_t ptent = ptep_get(pte + i); |
| @@ -4088,20 +4148,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, |
| walk->mm_stats[MM_LEAF_TOTAL]++; |
| |
| pfn = get_pte_pfn(ptent, args->vma, addr); |
| - if (pfn == -1) |
| + if (pfn == -1) { |
| + skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); |
| continue; |
| + } |
| |
| - if (!pte_young(ptent)) { |
| + success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last); |
| + if (!success && !pte_young(ptent)) { |
| + skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); |
| walk->mm_stats[MM_LEAF_OLD]++; |
| continue; |
| } |
| |
| folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); |
| - if (!folio) |
| + if (!folio) { |
| + skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); |
| continue; |
| + } |
| |
| - if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) |
| - VM_WARN_ON_ONCE(true); |
| + clear_spte_young(args->vma->vm_mm, addr, bitmap, &last); |
| + if (pte_young(ptent)) |
| + ptep_test_and_clear_young(args->vma, addr, pte + i); |
| |
| young++; |
| walk->mm_stats[MM_LEAF_YOUNG]++; |
| @@ -4693,6 +4760,32 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) |
| } |
| } |
| |
| +static bool should_look_around(struct vm_area_struct *vma, unsigned long addr, |
| + pte_t *pte, int *young) |
| +{ |
| + unsigned long old = true; |
| + |
| + if (!get_cap(LRU_GEN_SPTE_WALK)) { |
| + old = !pte_young(*pte); |
| + *young = ptep_clear_flush_young_notify(vma, addr, pte); |
| + |
| + return !old; |
| + } |
| + |
| + *young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE, true, &old); |
| + |
| + if (!old) |
| + *young = true; |
| + |
| + if (pte_young(*pte)) { |
| + ptep_test_and_clear_young(vma, addr, pte); |
| + *young = true; |
| + return true; |
| + } |
| + |
| + return !old && get_cap(LRU_GEN_SPTE_WALK); |
| +} |
| + |
| /****************************************************************************** |
| * rmap/PT walk feedback |
| ******************************************************************************/ |
| @@ -4704,12 +4797,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) |
| * the PTE table to the Bloom filter. This forms a feedback loop between the |
| * eviction and the aging. |
| */ |
| -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) |
| +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) |
| { |
| int i; |
| unsigned long start; |
| unsigned long end; |
| struct lru_gen_mm_walk *walk; |
| + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; |
| + unsigned long last = 0; |
| int young = 0; |
| pte_t *pte = pvmw->pte; |
| unsigned long addr = pvmw->address; |
| @@ -4724,8 +4819,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) |
| lockdep_assert_held(pvmw->ptl); |
| VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); |
| |
| + if (!should_look_around(pvmw->vma, addr, pte, &young)) |
| + return young; |
| + |
| if (spin_is_contended(pvmw->ptl)) |
| - return; |
| + return young; |
| |
| /* avoid taking the LRU lock under the PTL when possible */ |
| walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; |
| @@ -4733,6 +4831,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) |
| start = max(addr & PMD_MASK, pvmw->vma->vm_start); |
| end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; |
| |
| + if (end - start == PAGE_SIZE) |
| + return young; |
| + |
| if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { |
| if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) |
| end = start + MIN_LRU_BATCH * PAGE_SIZE; |
| @@ -4746,29 +4847,38 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) |
| |
| /* folio_update_gen() requires stable folio_memcg() */ |
| if (!mem_cgroup_trylock_pages(memcg)) |
| - return; |
| + return young; |
| |
| arch_enter_lazy_mmu_mode(); |
| |
| pte -= (addr - start) / PAGE_SIZE; |
| |
| for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { |
| + bool success; |
| unsigned long pfn; |
| pte_t ptent = ptep_get(pte + i); |
| |
| pfn = get_pte_pfn(ptent, pvmw->vma, addr); |
| - if (pfn == -1) |
| + if (pfn == -1) { |
| + skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); |
| continue; |
| + } |
| |
| - if (!pte_young(ptent)) |
| + success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last); |
| + if (!success && !pte_young(ptent)) { |
| + skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); |
| continue; |
| + } |
| |
| folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); |
| - if (!folio) |
| + if (!folio) { |
| + skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); |
| continue; |
| + } |
| |
| - if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) |
| - VM_WARN_ON_ONCE(true); |
| + clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); |
| + if (pte_young(ptent)) |
| + ptep_test_and_clear_young(pvmw->vma, addr, pte + i); |
| |
| young++; |
| |
| @@ -4798,6 +4908,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) |
| /* feedback from rmap walkers to page table walkers */ |
| if (suitable_to_scan(i, young)) |
| update_bloom_filter(lruvec, max_seq, pvmw->pmd); |
| + |
| + return young; |
| } |
| |
| /****************************************************************************** |
| @@ -5838,6 +5950,9 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c |
| if (should_clear_pmd_young()) |
| caps |= BIT(LRU_GEN_NONLEAF_YOUNG); |
| |
| + if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK)) |
| + caps |= BIT(LRU_GEN_SPTE_WALK); |
| + |
| return sysfs_emit(buf, "0x%04x\n", caps); |
| } |
| |
| -- |
| 2.34.1 |
| |