From 76cef11a1802df88c94fc72c7d784c3ff60c46e1 Mon Sep 17 00:00:00 2001
From: David Stevens <stevensd@chromium.org>
Date: Mon, 21 Jun 2021 13:20:44 +0900
Subject: [PATCH] CHROMIUM: KVM: mmu: introduce new gfn_to_pfn_page functions

Introduce new gfn_to_pfn_page functions that parallel the existing
gfn_to_pfn functions. The new functions are identical except that they
take an additional out parameter that is used to return the struct page
if the hva was resolved by gup. This allows callers to differentiate the
gup and follow_pte cases, which in turn allows callers to only touch the
page refcount when necessitated by gup.

The old gfn_to_pfn functions are deprecated, and all callers should be
migrated to the new gfn_to_pfn_page functions. In the interim, the
gfn_to_pfn functions are reimplemented as wrappers of the corresponding
gfn_to_pfn_page functions. The wrappers take a reference to the pfn's
page that had previously been taken in hva_to_pfn_remapped.
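
As an illustration of the intended usage (not part of this patch: the
kvm_vcpu_gfn_to_pfn_page() signature is the one added here, while the
surrounding fault-handler shape is hypothetical), a caller would only
touch the page refcount when gup resolved the hva:

    struct page *page;
    kvm_pfn_t pfn;

    pfn = kvm_vcpu_gfn_to_pfn_page(vcpu, gfn, &page);
    if (is_error_noslot_pfn(pfn))
            return -EFAULT;

    /* ... install the pfn into the secondary MMU ... */

    /*
     * Only drop a reference if gup actually pinned a page; pfns
     * resolved via follow_pte (page == NULL) were never refcounted.
     */
    if (page)
            put_page(page);
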
Signed-off-by: David Stevens <stevensd@chromium.org>
Change-Id: I9d36226e0af00d6758b3fbeffdce3f43e290486a
[rebase61(tzungbi):
Squashed:
FIXUP: KVM: mmu: introduce new gfn_to_pfn_page functions
FIXUP: KVM: Fix multiple races in gfn=>pfn cache refresh
]
Signed-off-by: Tzung-Bi Shih <tzungbi@chromium.org>
---
include/linux/kvm_host.h | 18 ++++
virt/kvm/kvm_main.c | 200 ++++++++++++++++++++++++++++-----------
virt/kvm/kvm_mm.h | 5 +-
virt/kvm/pfncache.c | 4 +-
4 files changed, 170 insertions(+), 57 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ece90517571dd8b075636da0cde87009cd74d308..6fd9ea3d93f83e858ae742f822fd9d238c432fcf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1197,6 +1197,20 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool interruptible, bool *async,
bool write_fault, bool *writable, hva_t *hva);
+kvm_pfn_t gfn_to_pfn_page(struct kvm *kvm, gfn_t gfn, struct page **page);
+kvm_pfn_t gfn_to_pfn_page_prot(struct kvm *kvm, gfn_t gfn,
+ bool write_fault, bool *writable,
+ struct page **page);
+kvm_pfn_t gfn_to_pfn_page_memslot(const struct kvm_memory_slot *slot,
+ gfn_t gfn, struct page **page);
+kvm_pfn_t gfn_to_pfn_page_memslot_atomic(const struct kvm_memory_slot *slot,
+ gfn_t gfn, struct page **page);
+kvm_pfn_t __gfn_to_pfn_page_memslot(const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool atomic, bool interruptible,
+ bool *async, bool write_fault,
+ bool *writable, hva_t *hva,
+ struct page **page);
+
void kvm_release_pfn_clean(kvm_pfn_t pfn);
void kvm_release_pfn_dirty(kvm_pfn_t pfn);
void kvm_set_pfn_dirty(kvm_pfn_t pfn);
@@ -1276,6 +1290,10 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_page_atomic(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct page **page);
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct page **page);
kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9938786b0d3c19f29711bc9c324b8faa6d18a717..41df82c4f0b0335f6d3b9422de79d0ec0ccedc9d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2599,9 +2599,9 @@ static inline int check_user_page_hwpoison(unsigned long addr)
* only part that runs if we can in atomic context.
*/
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
- bool *writable, kvm_pfn_t *pfn)
+ bool *writable, kvm_pfn_t *pfn,
+ struct page **page)
{
- struct page *page[1];
/*
* Fast pin a writable pfn only if it is a write fault request
@@ -2612,7 +2612,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
return false;
if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
- *pfn = page_to_pfn(page[0]);
+ *pfn = page_to_pfn(*page);
if (writable)
*writable = true;
@@ -2627,7 +2627,8 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
* 1 indicates success, -errno is returned if error is detected.
*/
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
- bool interruptible, bool *writable, kvm_pfn_t *pfn)
+ bool interruptible, bool *writable, kvm_pfn_t *pfn,
+ struct page **page)
{
/*
* When a VCPU accesses a page that is not mapped into the secondary
@@ -2641,7 +2642,6 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
* implicitly honor NUMA hinting faults and don't need this flag.
*/
unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
- struct page *page;
int npages;
might_sleep();
@@ -2656,7 +2656,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
if (interruptible)
flags |= FOLL_INTERRUPTIBLE;
- npages = get_user_pages_unlocked(addr, 1, &page, flags);
+ npages = get_user_pages_unlocked(addr, 1, page, flags);
if (npages != 1)
return npages;
@@ -2666,11 +2666,11 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
*writable = true;
- put_page(page);
- page = wpage;
+ put_page(*page);
+ *page = wpage;
}
}
- *pfn = page_to_pfn(page);
+ *pfn = page_to_pfn(*page);
return npages;
}
@@ -2685,16 +2685,6 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
return true;
}
-static int kvm_try_get_pfn(kvm_pfn_t pfn)
-{
- struct page *page = kvm_pfn_to_refcounted_page(pfn);
-
- if (!page)
- return 1;
-
- return get_page_unless_zero(page);
-}
-
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
unsigned long addr, bool write_fault,
bool *writable, kvm_pfn_t *p_pfn)
@@ -2736,26 +2726,6 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
*writable = pte_write(pte);
pfn = pte_pfn(pte);
- /*
- * Get a reference here because callers of *hva_to_pfn* and
- * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
- * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
- * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
- * simply do nothing for reserved pfns.
- *
- * Whoever called remap_pfn_range is also going to call e.g.
- * unmap_mapping_range before the underlying pages are freed,
- * causing a call to our MMU notifier.
- *
- * Certain IO or PFNMAP mappings can be backed with valid
- * struct pages, but be allocated without refcounting e.g.,
- * tail pages of non-compound higher order allocations, which
- * would then underflow the refcount when the caller does the
- * required put_page. Don't allow those pages here.
- */
- if (!kvm_try_get_pfn(pfn))
- r = -EFAULT;
-
out:
pte_unmap_unlock(ptep, ptl);
*p_pfn = pfn;
@@ -2779,7 +2749,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
* whether the mapping is writable.
*/
kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
- bool *async, bool write_fault, bool *writable)
+ bool *async, bool write_fault, bool *writable,
+ struct page **page)
{
struct vm_area_struct *vma;
kvm_pfn_t pfn;
@@ -2788,14 +2759,14 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
/* we can do it either atomically or asynchronously, not both */
BUG_ON(atomic && async);
- if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
+ if (hva_to_pfn_fast(addr, write_fault, writable, &pfn, page))
return pfn;
if (atomic)
return KVM_PFN_ERR_FAULT;
npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
- writable, &pfn);
+ writable, &pfn, page);
if (npages == 1)
return pfn;
if (npages == -EINTR)
@@ -2829,9 +2800,10 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
return pfn;
}
-kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
- bool atomic, bool interruptible, bool *async,
- bool write_fault, bool *writable, hva_t *hva)
+kvm_pfn_t __gfn_to_pfn_page_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
+ bool atomic, bool interruptible, bool *async,
+ bool write_fault, bool *writable, hva_t *hva,
+ struct page **page)
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
@@ -2857,47 +2829,165 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
}
return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
- writable);
+ writable, page);
+}
+EXPORT_SYMBOL_GPL(__gfn_to_pfn_page_memslot);
+
+kvm_pfn_t gfn_to_pfn_page_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+ bool *writable, struct page **page)
+{
+ return __gfn_to_pfn_page_memslot(gfn_to_memslot(kvm, gfn), gfn, false,
+ false, NULL, write_fault, writable,
+ NULL, page);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_page_prot);
+
+kvm_pfn_t gfn_to_pfn_page_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
+ struct page **page)
+{
+ return __gfn_to_pfn_page_memslot(slot, gfn, false, false, NULL, true,
+ NULL, NULL, page);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_page_memslot);
+
+kvm_pfn_t gfn_to_pfn_page_memslot_atomic(const struct kvm_memory_slot *slot,
+ gfn_t gfn, struct page **page)
+{
+ return __gfn_to_pfn_page_memslot(slot, gfn, true, false, NULL, true, NULL,
+ NULL, page);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_page_memslot_atomic);
+
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_page_atomic(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct page **page)
+{
+ return gfn_to_pfn_page_memslot_atomic(
+ kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, page);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_page_atomic);
+
+kvm_pfn_t gfn_to_pfn_page(struct kvm *kvm, gfn_t gfn, struct page **page)
+{
+ return gfn_to_pfn_page_memslot(gfn_to_memslot(kvm, gfn), gfn, page);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_page);
+
+kvm_pfn_t kvm_vcpu_gfn_to_pfn_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct page **page)
+{
+ return gfn_to_pfn_page_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ gfn, page);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_page);
+
+static kvm_pfn_t kvm_try_get_page_ref(struct page *page, kvm_pfn_t pfn)
+{
+ /* If @page is valid, KVM already has a reference to the pfn/page. */
+ if (page || is_error_pfn(pfn))
+ return pfn;
+
+ /*
+ * If we're here, a pfn resolved by hva_to_pfn_remapped is
+ * going to be returned to something that ultimately calls
+ * kvm_release_pfn_clean, so the refcount needs to be bumped if
+ * the pfn isn't a reserved pfn.
+ *
+ * Whoever called remap_pfn_range is also going to call e.g.
+ * unmap_mapping_range before the underlying pages are freed,
+ * causing a call to our MMU notifier.
+ *
+ * Certain IO or PFNMAP mappings can be backed with valid
+ * struct pages, but be allocated without refcounting e.g.,
+ * tail pages of non-compound higher order allocations, which
+ * would then underflow the refcount when the caller does the
+ * required put_page. Don't allow those pages here.
+ */
+ if (!kvm_pfn_to_refcounted_page(pfn) ||
+ get_page_unless_zero(pfn_to_page(pfn)))
+ return pfn;
+
+ return KVM_PFN_ERR_FAULT;
+}
+
+kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
+ bool atomic, bool interruptible, bool *async,
+ bool write_fault, bool *writable, hva_t *hva)
+{
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = __gfn_to_pfn_page_memslot(slot, gfn, atomic, interruptible,
+ async, write_fault, writable, hva,
+ &page);
+
+ return kvm_try_get_page_ref(page, pfn);
}
EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
- return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
- NULL, write_fault, writable, NULL);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = gfn_to_pfn_page_prot(kvm, gfn, write_fault, writable, &page);
+
+ return kvm_try_get_page_ref(page, pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
- NULL, NULL);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = gfn_to_pfn_page_memslot(slot, gfn, &page);
+
+ return kvm_try_get_page_ref(page, pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
- NULL, NULL);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = gfn_to_pfn_page_memslot_atomic(slot, gfn, &page);
+
+ return kvm_try_get_page_ref(page, pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
{
- return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = kvm_vcpu_gfn_to_pfn_page_atomic(vcpu, gfn, &page);
+
+ return kvm_try_get_page_ref(page, pfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
- return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = gfn_to_pfn_page(kvm, gfn, &page);
+
+ return kvm_try_get_page_ref(page, pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
- return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
+ struct page *page;
+ kvm_pfn_t pfn;
+
+ pfn = kvm_vcpu_gfn_to_pfn_page(vcpu, gfn, &page);
+
+ return kvm_try_get_page_ref(page, pfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 180f1a09e6ba7bc8b25275e6f938a1484c6e5f30..e7c58c6755ffd396fe31b8c62afc39977caa1d82 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -3,6 +3,8 @@
#ifndef __KVM_MM_H__
#define __KVM_MM_H__ 1
+#include <linux/mm_types.h>
+
/*
* Architectures can choose whether to use an rwlock or spinlock
* for the mmu_lock. These macros, for use in common code
@@ -21,7 +23,8 @@
#endif /* KVM_HAVE_MMU_RWLOCK */
kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
- bool *async, bool write_fault, bool *writable);
+ bool *async, bool write_fault, bool *writable,
+ struct page **page);
#ifdef CONFIG_HAVE_KVM_PFNCACHE
void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 2d6aba67783078180bb89e77cfa6ef61239d497d..f15092a55f603cd9359e2d0a0c462e5e384c53c7 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -144,6 +144,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
void *new_khva = NULL;
unsigned long mmu_seq;
+ struct page *page;
lockdep_assert_held(&gpc->refresh_lock);
@@ -183,7 +184,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
}
/* We always request a writeable mapping */
- new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
+ new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL,
+ &page);
if (is_error_noslot_pfn(new_pfn))
goto out_error;
--
2.34.1