blob: 8f8566488ecd59a5889bd8e3df5731636b9cfbe7 [file] [log] [blame]
From 3a6e9395ac8cd10676285a158edbb933b9487689 Mon Sep 17 00:00:00 2001
From: Hikaru Nishida <hikalium@chromium.org>
Date: Wed, 20 Oct 2021 21:04:29 +0900
Subject: [PATCH] BACKPORT: FROMLIST: kvm/x86: virtual suspend time injection:
Implement host side
Add the main logic that adjusts the guest's clocks and notifies the
guest about the suspension.
Adjustment flow:
- Before going into suspend, KVM_REQ_SUSPEND_TIME_ADJ will be
requested for each vcpus through the PM notifier if the suspend time
injection is enabled for the kvm.
- Before the first vmenter after the resume, each vcpu will check
the request and do two kinds of adjustments.
- One is kvm-wide adjustment: kvm-clock will be adjusted to the value
before the suspend.
- Another is per-vcpu adjustment: tsc will be adjusted to the value
before the suspend.
- Those adjustments happen before the vcpu run: so the guest will not
observe the "rewinding" of the clocks.
- After the adjustment is made, the guest will be notified about the
adjustment through HYPERVISOR_CALLBACK_VECTOR IRQ.
- It is guest's responsibility to adjust their CLOCK_BOOTTIME and
the wall clock to reflect the suspend.
This will be done in the later patch.
Signed-off-by: Hikaru Nishida <hikalium@chromium.org>
(am from https://patchwork.kernel.org/patch/12572235/)
(also found at https://lore.kernel.org/r/20211020210348.RFC.v3.4.I9c4e7c844507384b546e6d1ea1a5286996eed908@changeid)
Conflicts:
arch/x86/kvm/Kconfig
arch/x86/kvm/x86.c
include/linux/kvm_host.h
virt/kvm/kvm_main.c
BUG=b:226698497
TEST=export BOARD=octopus-arc-r
TEST=emerge-${BOARD} chromeos-kernel-5_15
TEST=~/trunk/src/scripts/update_kernel.sh --remote=${DUT}
TEST=tast run ${DUT} arc.Suspend.s10c10
Change-Id: I6ea221a885c91564dd515d83bacbb8eac69496df
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/3555536
Tested-by: Hikaru Nishida <hikalium@chromium.org>
Auto-Submit: Hikaru Nishida <hikalium@chromium.org>
Reviewed-by: Suleiman Souhlal <suleiman@chromium.org>
Commit-Queue: Suleiman Souhlal <suleiman@chromium.org>
---
arch/x86/include/asm/kvm_host.h | 2 +
arch/x86/kvm/Kconfig | 13 ++++
arch/x86/kvm/cpuid.c | 4 ++
arch/x86/kvm/x86.c | 101 ++++++++++++++++++++++++++++++++
include/linux/kvm_host.h | 48 +++++++++++++++
virt/kvm/kvm_main.c | 88 ++++++++++++++++++++++++++++
6 files changed, 256 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d23e80a56eb8..143f7f3820ef 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1102,6 +1102,8 @@ struct kvm_arch {
bool pause_in_guest;
bool cstate_in_guest;
+ u64 msr_suspend_time;
+
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index e3cbd7706136..97d3f6474ede 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -129,4 +129,17 @@ config KVM_XEN
config KVM_EXTERNAL_WRITE_TRACKING
bool
+config KVM_VIRT_SUSPEND_TIMING
+ bool "Host support for virtual suspend time injection"
+ depends on KVM=y && HAVE_KVM_PM_NOTIFIER
+ default n
+ help
+ This option makes the host's suspension reflected on the guest's clocks.
+ In other words, guest's CLOCK_MONOTONIC will stop and
+ CLOCK_BOOTTIME keeps running during the host's suspension.
+ This feature will only be effective when both guest and host support
+ this feature. For the guest side, see KVM_VIRT_SUSPEND_TIMING_GUEST.
+
+ If unsure, say N.
+
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b24ca7f4ed7c..42de21c7a7e4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1075,6 +1075,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
(1 << KVM_FEATURE_PV_SCHED_YIELD) |
(1 << KVM_FEATURE_ASYNC_PF_INT);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ entry->eax |= (1 << KVM_FEATURE_HOST_SUSPEND_TIME);
+#endif
+
if (sched_info_on())
entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f2239e799014..6a2ad71f31a9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1456,6 +1456,7 @@ static const u32 emulated_msrs_all[] = {
MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+ MSR_KVM_HOST_SUSPEND_TIME,
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSC_DEADLINE,
@@ -3668,7 +3669,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.msr_kvm_poll_control = data;
break;
+ case MSR_KVM_HOST_SUSPEND_TIME:
+ if (!(data & KVM_MSR_ENABLED))
+ break;
+
+ if (kvm_init_suspend_time_ghc(vcpu->kvm, data & ~1ULL))
+ return 1;
+ vcpu->kvm->arch.msr_suspend_time = data;
+ break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
@@ -4009,6 +4018,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
+ case MSR_KVM_HOST_SUSPEND_TIME:
+ msr_info->data = vcpu->kvm->arch.msr_suspend_time;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -9906,6 +9918,93 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+bool virt_suspend_time_enabled(struct kvm *kvm)
+{
+ return kvm->arch.msr_suspend_time & KVM_MSR_ENABLED;
+}
+
+/*
+ * Do per-vcpu suspend time adjustment (tsc) and
+ * make an interrupt to notify it.
+ */
+static void vcpu_do_suspend_time_adjustment(struct kvm_vcpu *vcpu,
+ u64 total_ns)
+{
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = HYPERVISOR_CALLBACK_VECTOR
+ };
+ u64 last_suspend_duration = 0;
+ s64 adj;
+
+ spin_lock(&vcpu->suspend_time_ns_lock);
+ if (total_ns > vcpu->suspend_time_ns) {
+ last_suspend_duration = total_ns - vcpu->suspend_time_ns;
+ vcpu->suspend_time_ns = total_ns;
+ }
+ spin_unlock(&vcpu->suspend_time_ns_lock);
+
+ if (!last_suspend_duration) {
+ /* It looks like the suspend is not happened yet. Retry. */
+ kvm_make_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu);
+ return;
+ }
+
+ adj = __this_cpu_read(cpu_tsc_khz) *
+ div_u64(last_suspend_duration, 1000000);
+ adjust_tsc_offset_host(vcpu, -adj);
+ /*
+ * This request should be processed before
+ * the first vmenter after resume to avoid
+ * an unadjusted TSC value is observed.
+ */
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ kvm_write_suspend_time(vcpu->kvm);
+ if (!kvm_apic_set_irq(vcpu, &irq, NULL))
+ pr_err("kvm: failed to set suspend time irq\n");
+}
+
+/*
+ * Do kvm-wide suspend time adjustment (kvm-clock).
+ */
+static void kvm_do_suspend_time_adjustment(struct kvm *kvm, u64 total_ns)
+{
+ spin_lock(&kvm->suspend_time_ns_lock);
+ if (total_ns > kvm->suspend_time_ns) {
+ u64 last_suspend_duration = total_ns - kvm->suspend_time_ns;
+ /*
+ * Move the offset of kvm_clock here as if it is stopped
+ * during the suspension.
+ */
+ kvm->arch.kvmclock_offset -= last_suspend_duration;
+
+ /* suspend_time is accumulated per VM. */
+ kvm->suspend_time_ns += last_suspend_duration;
+ /*
+ * This adjustment will be reflected to the struct provided
+ * from the guest via MSR_KVM_HOST_SUSPEND_TIME before
+ * the notification interrupt is injected.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+ }
+ spin_unlock(&kvm->suspend_time_ns_lock);
+}
+
+static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu)
+{
+ u64 total_ns = kvm_total_suspend_time(vcpu->kvm);
+ /* Do kvm-wide adjustment (kvm-clock) */
+ kvm_do_suspend_time_adjustment(vcpu->kvm, total_ns);
+ /* Do per-vcpu adjustment (tsc) */
+ vcpu_do_suspend_time_adjustment(vcpu, total_ns);
+}
+#else
+static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu)
+{
+}
+#endif
+
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -9942,6 +10041,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
}
+ if (kvm_check_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu))
+ kvm_adjust_suspend_time(vcpu);
if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
kvm_mmu_free_obsolete_roots(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3f9b22c4983a..f46e67205ba9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -157,6 +157,7 @@ static inline bool is_error_page(struct page *page)
#define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_UNBLOCK 2
#define KVM_REQ_UNHALT 3
+#define KVM_REQ_SUSPEND_TIME_ADJ 5
#define KVM_REQUEST_ARCH_BASE 8
/*
@@ -350,6 +351,11 @@ struct kvm_vcpu {
} async_pf;
#endif
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ u64 suspend_time_ns;
+ spinlock_t suspend_time_ns_lock;
+#endif
+
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Cpu relax intercept or pause loop exit optimization
@@ -781,6 +787,12 @@ struct kvm {
struct notifier_block pm_notifier;
#endif
char stats_id[KVM_STATS_NAME_SIZE];
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ u64 suspend_time_ns;
+ spinlock_t suspend_time_ns_lock;
+ u64 base_offs_boot_ns;
+ struct gfn_to_hva_cache suspend_time_ghc;
+#endif
};
#define kvm_err(fmt, ...) \
@@ -2230,4 +2242,40 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
/* Max number of entries allowed for each kvm dirty ring */
#define KVM_DIRTY_RING_MAX_ENTRIES 65536
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+bool virt_suspend_time_enabled(struct kvm *kvm);
+void kvm_write_suspend_time(struct kvm *kvm);
+int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa);
+static inline u64 kvm_total_suspend_time(struct kvm *kvm)
+{
+ return ktime_get_offs_boot_ns() - kvm->base_offs_boot_ns;
+}
+
+static inline u64 vcpu_suspend_time_injected(struct kvm_vcpu *vcpu)
+{
+ return vcpu->suspend_time_ns;
+}
+#else
+static inline bool virt_suspend_time_enabled(struct kvm *kvm)
+{
+ return 0;
+}
+static inline void kvm_write_suspend_time(struct kvm *kvm)
+{
+}
+static inline int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa)
+{
+ return 1;
+}
+static inline u64 kvm_total_suspend_time(struct kvm *kvm)
+{
+ return 0;
+}
+
+static inline u64 vcpu_suspend_time_injected(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING */
+
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 457a2b74e4fc..3b79eb8fdb64 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -430,6 +430,11 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->ready = false;
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
vcpu->last_used_slot = NULL;
+
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ vcpu->suspend_time_ns = kvm->suspend_time_ns;
+ spin_lock_init(&vcpu->suspend_time_ns_lock);
+#endif
}
static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -844,12 +849,70 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_suspend_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (!virt_suspend_time_enabled(kvm))
+ return NOTIFY_DONE;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu);
+ mutex_unlock(&kvm->lock);
+
+ return NOTIFY_DONE;
+}
+
+static int kvm_resume_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (!virt_suspend_time_enabled(kvm))
+ return NOTIFY_DONE;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Clear KVM_REQ_SUSPEND_TIME_ADJ if the suspend injection is
+ * not needed (e.g. suspend failure)
+ * The following condition is also true when the adjustment is
+ * already done and it is safe to clear the request again here.
+ */
+ if (kvm_total_suspend_time(kvm) ==
+ vcpu_suspend_time_injected(vcpu))
+ kvm_clear_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu);
+ }
+ mutex_unlock(&kvm->lock);
+
+ return NOTIFY_DONE;
+}
+
+static int kvm_pm_notifier(struct kvm *kvm, unsigned long state)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ return kvm_suspend_notifier(kvm);
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ return kvm_resume_notifier(kvm);
+ }
+
+ return NOTIFY_DONE;
+}
+
static int kvm_pm_notifier_call(struct notifier_block *bl,
unsigned long state,
void *unused)
{
struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
+ if (kvm_pm_notifier(kvm, state) != NOTIFY_DONE)
+ return NOTIFY_BAD;
+
return kvm_arch_pm_notifier(kvm, state);
}
@@ -875,6 +938,26 @@ static void kvm_destroy_pm_notifier(struct kvm *kvm)
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+void kvm_write_suspend_time(struct kvm *kvm)
+{
+ struct kvm_suspend_time st;
+
+ st.suspend_time_ns = kvm->suspend_time_ns;
+ kvm_write_guest_cached(kvm, &kvm->suspend_time_ghc, &st, sizeof(st));
+}
+
+int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa)
+{
+ if (kvm_gfn_to_hva_cache_init(kvm, &kvm->suspend_time_ghc, gpa,
+ sizeof(struct kvm_suspend_time)))
+ return 1;
+
+ kvm_write_suspend_time(kvm);
+ return 0;
+}
+#endif
+
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
if (!memslot->dirty_bitmap)
@@ -1119,6 +1202,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ spin_lock_init(&kvm->suspend_time_ns_lock);
+ kvm->base_offs_boot_ns = ktime_get_offs_boot_ns();
+#endif
+
r = kvm_init_mmu_notifier(kvm);
if (r)
goto out_err_no_mmu_notifier;
--
2.35.0