blob: 76866fa723d695474558fe45ced4bc7b1faa0d9e [file] [log] [blame]
From 73605f135057541c4fe3e7374085f2e1cc4094ab Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 1 Apr 2023 13:56:51 -0600
Subject: [PATCH] BACKPORT: FROMLIST: kvm/arm64: add
kvm_arch_test_clear_young()
This patch adds kvm_arch_test_clear_young() for the vast majority of
VMs that are not pKVM and run on hardware that sets the accessed bit
in KVM PTEs.
It relies on two techniques, RCU and cmpxchg, to safely test and clear
the accessed bit without taking the MMU lock. The former protects KVM
page tables from being freed while the latter clears the accessed bit
atomically against both the hardware and other software page table
walkers.
Signed-off-by: Yu Zhao <yuzhao@google.com>
(am from https://patchwork.kernel.org/patch/13144313/)
BUG=b:266976439
UPSTREAM-TASK=b:266738578
TEST=built
Change-Id: I8fb581d47cd57581c5bf6fa90c15a1e836b190bd
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392743
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: T.J. Alumbaugh <talumbau@google.com>
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
---
arch/arm64/include/asm/kvm_host.h | 7 +++
arch/arm64/include/asm/kvm_pgtable.h | 55 +++++++++++++++++++
arch/arm64/kvm/arm.c | 1 +
arch/arm64/kvm/hyp/pgtable.c | 58 +++-----------------
arch/arm64/kvm/mmu.c | 80 ++++++++++++++++++++++++++++
5 files changed, 149 insertions(+), 52 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 38cf5aac3e377a212e30ee8ba851264c7c54cf84..533665df5c73b61ac24160fa06526ca1a5ec13d3 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1155,4 +1155,11 @@ static inline void kvm_hyp_reserve(void) { }
void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);
+/* hardware AF and non-pKVM only; see the generic kvm_arch_has_test_clear_young() */
+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+ return IS_ENABLED(CONFIG_KVM) && cpu_has_hw_af() && !is_protected_kvm_enabled();
+}
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index d3e354bb8351d7aa9762155e18cb49cffee938e0..55dd75ee8b742114aeee709ef48f76c1685d3f4d 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -44,11 +44,59 @@ typedef u64 kvm_pte_t;
#define KVM_PHYS_INVALID (-1ULL)
+#define KVM_PTE_TYPE			BIT(1)
+#define KVM_PTE_TYPE_BLOCK		0
+#define KVM_PTE_TYPE_PAGE		1
+#define KVM_PTE_TYPE_TABLE		1
+
+#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
+/* the stage-1 AP values below depend on whether ARM64_KVM_HVHE is in use */
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO	({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW	({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)
+/* the mask starts at bit 50, which is KVM_PTE_LEAF_ATTR_HI_S1_GP in hyp/pgtable.c */
+#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 50)
+
+#define KVM_PTE_LEAF_ATTR_HI_SW		GENMASK(58, 55)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
+					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
+					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
+#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
+#define KVM_MAX_OWNER_ID		1
+
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
return pte & KVM_PTE_VALID;
}
+/* Is @pte a valid table entry when found at @level? */
+static inline bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+	/* the last level never holds table entries, and neither does an invalid pte */
+	if (level == KVM_PGTABLE_MAX_LEVELS - 1 || !kvm_pte_valid(pte))
+		return false;
+
+	/* KVM_PTE_TYPE (bit 1) reads as KVM_PTE_TYPE_TABLE for a table entry */
+	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
{
u64 pa = pte & KVM_PTE_ADDR_MASK;
@@ -128,6 +176,7 @@ static inline bool kvm_is_block_size_supported(u64 size)
* @put_page: Decrement the refcount on a page. When the
* refcount reaches 0 the page is automatically
* freed.
+ * @put_page_rcu: RCU variant of the above.
* @page_count: Return the refcount of a page.
* @phys_to_virt: Convert a physical address into a virtual
* address mapped in the current context.
@@ -145,6 +194,7 @@ struct kvm_pgtable_mm_ops {
void (*free_unlinked_table)(void *addr, u32 level);
void (*get_page)(void *addr);
void (*put_page)(void *addr);
+ void (*put_page_rcu)(void *addr);
int (*page_count)(void *addr);
void* (*phys_to_virt)(phys_addr_t phys);
phys_addr_t (*virt_to_phys)(void *addr);
@@ -219,6 +269,11 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
* @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries
* without Cache maintenance
* operations required.
+ * kvm_arch_test_clear_young() is a special case. It relies on two
+ * techniques, RCU and cmpxchg, to safely test and clear the accessed
+ * bit without taking the MMU lock. The former protects KVM page tables
+ * from being freed while the latter clears the accessed bit atomically
+ * against both the hardware and other software page table walkers.
*/
enum kvm_pgtable_walk_flags {
KVM_PGTABLE_WALK_LEAF = BIT(0),
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 4866b3f7b4ea3847d885e00cfac47a4d7abf9da3..96765da57115ec2d7ebebc1fca5f4e6b63ee9036 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -197,6 +197,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
*/
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_free_stage2_pgd(&kvm->arch.mmu);
bitmap_free(kvm->arch.pmu_filter);
free_cpumask_var(kvm->arch.supported_cpus);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index f155b8c9e98c7fbf1298f4ecf64c6826c76fdb23..13502d89f1adebc5e2bebd8e72297be151c6eeb2 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -12,47 +12,8 @@
#include <asm/stage2_pgtable.h>
-#define KVM_PTE_TYPE BIT(1)
-#define KVM_PTE_TYPE_BLOCK 0
-#define KVM_PTE_TYPE_PAGE 1
-#define KVM_PTE_TYPE_TABLE 1
-
-#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
-
-#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \
- ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \
- ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
-#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50)
-
-#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
-
-#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
-
#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
-#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
- KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
- KVM_PTE_LEAF_ATTR_HI_S2_XN)
-
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
-#define KVM_MAX_OWNER_ID 1
-
/*
* Used to indicate a pte for which a 'break-before-make' sequence is in
* progress.
@@ -124,17 +85,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
-static bool kvm_pte_table(kvm_pte_t pte, u32 level)
-{
- if (level == KVM_PGTABLE_MAX_LEVELS - 1)
- return false;
-
- if (!kvm_pte_valid(pte))
- return false;
-
- return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
-}
-
static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
@@ -1124,8 +1074,12 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
kvm_granule_size(ctx->level));
- if (childp)
- mm_ops->put_page(childp);
+ if (childp) {
+ if (mm_ops->put_page_rcu)
+ mm_ops->put_page_rcu(childp);
+ else
+ mm_ops->put_page(childp);
+ }
return 0;
}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a50e8bc217cd8004da7713d10d31fdc013f35336..d0203a44f6b9aa84c92171a4c253b14ab777e431 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -260,6 +260,21 @@ static int kvm_host_page_count(void *addr)
return page_count(virt_to_page(addr));
}
+static void kvm_s2_rcu_put_page(struct rcu_head *head)
+{
+	put_page(container_of(head, struct page, rcu_head));
+}
+
+static void kvm_s2_put_page_rcu(void *addr)
+{
+	struct page *p = virt_to_page(addr);
+
+	/* the deferred put will drop the last reference: unaccount the page now */
+	if (page_count(p) == 1)
+		kvm_account_pgtable_pages(addr, -1);
+	call_rcu(&p->rcu_head, kvm_s2_rcu_put_page);
+}
+
static phys_addr_t kvm_host_pa(void *addr)
{
return __pa(addr);
@@ -846,6 +861,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.free_unlinked_table = stage2_free_unlinked_table,
.get_page = kvm_host_get_page,
.put_page = kvm_s2_put_page,
+ .put_page_rcu = kvm_s2_put_page_rcu,
.page_count = kvm_host_page_count,
.phys_to_virt = kvm_host_va,
.virt_to_phys = kvm_host_pa,
@@ -1846,6 +1862,70 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
size, false);
}
+struct test_clear_young_arg {
+ struct kvm_gfn_range *range; /* GFN range being walked; used for sanity checks */
+ gfn_t lsb_gfn; /* GFN that corresponds to bit 0 of @bitmap */
+ unsigned long *bitmap; /* one bit per GFN, at index lsb_gfn - gfn */
+};
+
+/*
+ * Leaf visitor for kvm_arch_test_clear_young(): clear the accessed bit of a
+ * young PTE with cmpxchg, so that the update is atomic against both the
+ * hardware and other software page table walkers.
+ */
+static int stage2_test_clear_young(const struct kvm_pgtable_visit_ctx *ctx,
+				   enum kvm_pgtable_walk_flags flags)
+{
+	gfn_t gfn = ctx->addr / PAGE_SIZE;
+	kvm_pte_t old = READ_ONCE(*ctx->ptep);
+	kvm_pte_t new = old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
+	struct test_clear_young_arg *arg = ctx->arg;
+
+	VM_WARN_ON_ONCE(!page_count(virt_to_page(ctx->ptep)));
+	VM_WARN_ON_ONCE(gfn < arg->range->start || gfn >= arg->range->end);
+
+	/* nothing to do for table entries, invalid entries or entries already old */
+	if (kvm_pte_table(old, ctx->level) || !kvm_pte_valid(new) || new == old)
+		return 0;
+
+	/* see the comments on the generic kvm_arch_has_test_clear_young() */
+	if (__test_and_change_bit(arg->lsb_gfn - gfn, arg->bitmap))
+		cmpxchg64(ctx->ptep, old, new);
+
+	return 0;
+}
+
+/* Walk the stage-2 leaf entries of @range under RCU, without the MMU lock. */
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range,
+			       gfn_t lsb_gfn, unsigned long *bitmap)
+{
+	struct test_clear_young_arg young = {
+		.range		= range,
+		.lsb_gfn	= lsb_gfn,
+		.bitmap		= bitmap,
+	};
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_test_clear_young,
+		.arg	= &young,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+	};
+	u64 ipa = range->start * PAGE_SIZE;
+	u64 size = range->end * PAGE_SIZE - ipa;
+
+	BUILD_BUG_ON(is_hyp_code());
+
+	if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young()))
+		return false;
+
+	/* RCU keeps the page tables from being freed; see kvm_pgtable_walk_flags */
+	rcu_read_lock();
+	kvm_pgtable_walk(kvm->arch.mmu.pgt, ipa, size, &walker);
+	rcu_read_unlock();
+
+	/* the return value of the walk is not propagated to the caller */
+	return true;
+}
+
phys_addr_t kvm_mmu_get_httbr(void)
{
return __pa(hyp_pgtable->pgd);
--
2.42.0.820.g83a721a137-goog