| From 3a6e9395ac8cd10676285a158edbb933b9487689 Mon Sep 17 00:00:00 2001 |
| From: Hikaru Nishida <hikalium@chromium.org> |
| Date: Wed, 20 Oct 2021 21:04:29 +0900 |
| Subject: [PATCH] BACKPORT: FROMLIST: kvm/x86: virtual suspend time injection: |
| Implement host side |
| |
| Add main logics that adjust the guest's clocks and notify about the |
| suspension to the guest. |
| |
| Adjustment flow: |
| - Before going into suspend, KVM_REQ_SUSPEND_TIME_ADJ will be |
| requested for each vcpu through the PM notifier if the suspend time |
| injection is enabled for the kvm. |
| - Before the first vmenter after the resume, each vcpu will check |
| the request and do two kinds of adjustments. |
| - One is kvm-wide adjustment: kvm-clock will be adjusted to the value |
| before the suspend. |
| - Another is per-vcpu adjustment: tsc will be adjusted to the value |
| before the suspend. |
| - Those adjustments happen before the vcpu run: so the guest will not |
| observe the "rewinding" of the clocks. |
| - After the adjustment is made, the guest will be notified about the |
| adjustment through HYPERVISOR_CALLBACK_VECTOR IRQ. |
| - It is the guest's responsibility to adjust their CLOCK_BOOTTIME and |
| the wall clock to reflect the suspend. |
| This will be done in the later patch. |
| |
| Signed-off-by: Hikaru Nishida <hikalium@chromium.org> |
| (am from https://patchwork.kernel.org/patch/12572235/) |
| (also found at https://lore.kernel.org/r/20211020210348.RFC.v3.4.I9c4e7c844507384b546e6d1ea1a5286996eed908@changeid) |
| |
| Conflicts: |
| arch/x86/kvm/Kconfig |
| arch/x86/kvm/x86.c |
| include/linux/kvm_host.h |
| virt/kvm/kvm_main.c |
| |
| BUG=b:226698497 |
| TEST=export BOARD=octopus-arc-r |
| TEST=emerge-${BOARD} chromeos-kernel-5_15 |
| TEST=~/trunk/src/scripts/update_kernel.sh --remote=${DUT} |
| TEST=tast run ${DUT} arc.Suspend.s10c10 |
| |
| Change-Id: I6ea221a885c91564dd515d83bacbb8eac69496df |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/3555536 |
| Tested-by: Hikaru Nishida <hikalium@chromium.org> |
| Auto-Submit: Hikaru Nishida <hikalium@chromium.org> |
| Reviewed-by: Suleiman Souhlal <suleiman@chromium.org> |
| Commit-Queue: Suleiman Souhlal <suleiman@chromium.org> |
| --- |
| arch/x86/include/asm/kvm_host.h | 2 + |
| arch/x86/kvm/Kconfig | 13 ++++ |
| arch/x86/kvm/cpuid.c | 4 ++ |
| arch/x86/kvm/x86.c | 101 ++++++++++++++++++++++++++++++++ |
| include/linux/kvm_host.h | 48 +++++++++++++++ |
| virt/kvm/kvm_main.c | 88 ++++++++++++++++++++++++++++ |
| 6 files changed, 256 insertions(+) |
| |
| diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h |
| index d23e80a56eb8..143f7f3820ef 100644 |
| --- a/arch/x86/include/asm/kvm_host.h |
| +++ b/arch/x86/include/asm/kvm_host.h |
| @@ -1102,6 +1102,8 @@ struct kvm_arch { |
| bool pause_in_guest; |
| bool cstate_in_guest; |
| |
| + u64 msr_suspend_time; |
| + |
| unsigned long irq_sources_bitmap; |
| s64 kvmclock_offset; |
| |
| diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig |
| index e3cbd7706136..97d3f6474ede 100644 |
| --- a/arch/x86/kvm/Kconfig |
| +++ b/arch/x86/kvm/Kconfig |
| @@ -129,4 +129,17 @@ config KVM_XEN |
| config KVM_EXTERNAL_WRITE_TRACKING |
| bool |
| |
| +config KVM_VIRT_SUSPEND_TIMING |
| + bool "Host support for virtual suspend time injection" |
| + depends on KVM=y && HAVE_KVM_PM_NOTIFIER |
| + default n |
| + help |
| + This option makes the host's suspension reflected on the guest's clocks. |
| + In other words, guest's CLOCK_MONOTONIC will stop and |
| + CLOCK_BOOTTIME keeps running during the host's suspension. |
| + This feature will only be effective when both guest and host support |
| + this feature. For the guest side, see KVM_VIRT_SUSPEND_TIMING_GUEST. |
| + |
| + If unsure, say N. |
| + |
| endif # VIRTUALIZATION |
| diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c |
| index b24ca7f4ed7c..42de21c7a7e4 100644 |
| --- a/arch/x86/kvm/cpuid.c |
| +++ b/arch/x86/kvm/cpuid.c |
| @@ -1075,6 +1075,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) |
| (1 << KVM_FEATURE_PV_SCHED_YIELD) | |
| (1 << KVM_FEATURE_ASYNC_PF_INT); |
| |
| +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING |
| + entry->eax |= (1 << KVM_FEATURE_HOST_SUSPEND_TIME); |
| +#endif |
| + |
| if (sched_info_on()) |
| entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); |
| |
| diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c |
| index f2239e799014..6a2ad71f31a9 100644 |
| --- a/arch/x86/kvm/x86.c |
| +++ b/arch/x86/kvm/x86.c |
| @@ -1456,6 +1456,7 @@ static const u32 emulated_msrs_all[] = { |
| |
| MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, |
| MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, |
| + MSR_KVM_HOST_SUSPEND_TIME, |
| |
| MSR_IA32_TSC_ADJUST, |
| MSR_IA32_TSC_DEADLINE, |
| @@ -3668,7 +3669,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
| |
| vcpu->arch.msr_kvm_poll_control = data; |
| break; |
| + case MSR_KVM_HOST_SUSPEND_TIME: |
| + if (!(data & KVM_MSR_ENABLED)) |
| + break; |
| + |
| + if (kvm_init_suspend_time_ghc(vcpu->kvm, data & ~1ULL)) |
| + return 1; |
| |
| + vcpu->kvm->arch.msr_suspend_time = data; |
| + break; |
| case MSR_IA32_MCG_CTL: |
| case MSR_IA32_MCG_STATUS: |
| case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: |
| @@ -4009,6 +4018,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
| |
| msr_info->data = vcpu->arch.msr_kvm_poll_control; |
| break; |
| + case MSR_KVM_HOST_SUSPEND_TIME: |
| + msr_info->data = vcpu->kvm->arch.msr_suspend_time; |
| + break; |
| case MSR_IA32_P5_MC_ADDR: |
| case MSR_IA32_P5_MC_TYPE: |
| case MSR_IA32_MCG_CAP: |
| @@ -9906,6 +9918,93 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) |
| } |
| EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); |
| |
| +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING |
| +bool virt_suspend_time_enabled(struct kvm *kvm) |
| +{ |
| + return kvm->arch.msr_suspend_time & KVM_MSR_ENABLED; |
| +} |
| + |
| +/* |
| + * Do per-vcpu suspend time adjustment (tsc) and |
| + * make an interrupt to notify it. |
| + */ |
| +static void vcpu_do_suspend_time_adjustment(struct kvm_vcpu *vcpu, |
| + u64 total_ns) |
| +{ |
| + struct kvm_lapic_irq irq = { |
| + .delivery_mode = APIC_DM_FIXED, |
| + .vector = HYPERVISOR_CALLBACK_VECTOR |
| + }; |
| + u64 last_suspend_duration = 0; |
| + s64 adj; |
| + |
| + spin_lock(&vcpu->suspend_time_ns_lock); |
| + if (total_ns > vcpu->suspend_time_ns) { |
| + last_suspend_duration = total_ns - vcpu->suspend_time_ns; |
| + vcpu->suspend_time_ns = total_ns; |
| + } |
| + spin_unlock(&vcpu->suspend_time_ns_lock); |
| + |
| + if (!last_suspend_duration) { |
| + /* It looks like the suspend has not happened yet. Retry. */ |
| + kvm_make_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu); |
| + return; |
| + } |
| + |
| + adj = __this_cpu_read(cpu_tsc_khz) * |
| + div_u64(last_suspend_duration, 1000000); |
| + adjust_tsc_offset_host(vcpu, -adj); |
| + /* |
| + * This request should be processed before |
| + * the first vmenter after resume to avoid |
| + * an unadjusted TSC value being observed. |
| + */ |
| + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); |
| + kvm_write_suspend_time(vcpu->kvm); |
| + if (!kvm_apic_set_irq(vcpu, &irq, NULL)) |
| + pr_err("kvm: failed to set suspend time irq\n"); |
| +} |
| + |
| +/* |
| + * Do kvm-wide suspend time adjustment (kvm-clock). |
| + */ |
| +static void kvm_do_suspend_time_adjustment(struct kvm *kvm, u64 total_ns) |
| +{ |
| + spin_lock(&kvm->suspend_time_ns_lock); |
| + if (total_ns > kvm->suspend_time_ns) { |
| + u64 last_suspend_duration = total_ns - kvm->suspend_time_ns; |
| + /* |
| + * Move the offset of kvm_clock here as if it is stopped |
| + * during the suspension. |
| + */ |
| + kvm->arch.kvmclock_offset -= last_suspend_duration; |
| + |
| + /* suspend_time is accumulated per VM. */ |
| + kvm->suspend_time_ns += last_suspend_duration; |
| + /* |
| + * This adjustment will be reflected to the struct provided |
| + * from the guest via MSR_KVM_HOST_SUSPEND_TIME before |
| + * the notification interrupt is injected. |
| + */ |
| + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); |
| + } |
| + spin_unlock(&kvm->suspend_time_ns_lock); |
| +} |
| + |
| +static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu) |
| +{ |
| + u64 total_ns = kvm_total_suspend_time(vcpu->kvm); |
| + /* Do kvm-wide adjustment (kvm-clock) */ |
| + kvm_do_suspend_time_adjustment(vcpu->kvm, total_ns); |
| + /* Do per-vcpu adjustment (tsc) */ |
| + vcpu_do_suspend_time_adjustment(vcpu, total_ns); |
| +} |
| +#else |
| +static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu) |
| +{ |
| +} |
| +#endif |
| + |
| /* |
| * Called within kvm->srcu read side. |
| * Returns 1 to let vcpu_run() continue the guest execution loop without |
| @@ -9942,6 +10041,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) |
| goto out; |
| } |
| } |
| + if (kvm_check_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu)) |
| + kvm_adjust_suspend_time(vcpu); |
| if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) |
| kvm_mmu_free_obsolete_roots(vcpu); |
| if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) |
| diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h |
| index 3f9b22c4983a..f46e67205ba9 100644 |
| --- a/include/linux/kvm_host.h |
| +++ b/include/linux/kvm_host.h |
| @@ -157,6 +157,7 @@ static inline bool is_error_page(struct page *page) |
| #define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) |
| #define KVM_REQ_UNBLOCK 2 |
| #define KVM_REQ_UNHALT 3 |
| +#define KVM_REQ_SUSPEND_TIME_ADJ 5 |
| #define KVM_REQUEST_ARCH_BASE 8 |
| |
| /* |
| @@ -350,6 +351,11 @@ struct kvm_vcpu { |
| } async_pf; |
| #endif |
| |
| +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING |
| + u64 suspend_time_ns; |
| + spinlock_t suspend_time_ns_lock; |
| +#endif |
| + |
| #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT |
| /* |
| * Cpu relax intercept or pause loop exit optimization |
| @@ -781,6 +787,12 @@ struct kvm { |
| struct notifier_block pm_notifier; |
| #endif |
| char stats_id[KVM_STATS_NAME_SIZE]; |
| +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING |
| + u64 suspend_time_ns; |
| + spinlock_t suspend_time_ns_lock; |
| + u64 base_offs_boot_ns; |
| + struct gfn_to_hva_cache suspend_time_ghc; |
| +#endif |
| }; |
| |
| #define kvm_err(fmt, ...) \ |
| @@ -2230,4 +2242,40 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) |
| /* Max number of entries allowed for each kvm dirty ring */ |
| #define KVM_DIRTY_RING_MAX_ENTRIES 65536 |
| |
| +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING |
| +bool virt_suspend_time_enabled(struct kvm *kvm); |
| +void kvm_write_suspend_time(struct kvm *kvm); |
| +int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa); |
| +static inline u64 kvm_total_suspend_time(struct kvm *kvm) |
| +{ |
| + return ktime_get_offs_boot_ns() - kvm->base_offs_boot_ns; |
| +} |
| + |
| +static inline u64 vcpu_suspend_time_injected(struct kvm_vcpu *vcpu) |
| +{ |
| + return vcpu->suspend_time_ns; |
| +} |
| +#else |
| +static inline bool virt_suspend_time_enabled(struct kvm *kvm) |
| +{ |
| + return 0; |
| +} |
| +static inline void kvm_write_suspend_time(struct kvm *kvm) |
| +{ |
| +} |
| +static inline int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa) |
| +{ |
| + return 1; |
| +} |
| +static inline u64 kvm_total_suspend_time(struct kvm *kvm) |
| +{ |
| + return 0; |
| +} |
| + |
| +static inline u64 vcpu_suspend_time_injected(struct kvm_vcpu *vcpu) |
| +{ |
| + return 0; |
| +} |
| +#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING */ |
| + |
| #endif |
| diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c |
| index 457a2b74e4fc..3b79eb8fdb64 100644 |
| --- a/virt/kvm/kvm_main.c |
| +++ b/virt/kvm/kvm_main.c |
| @@ -430,6 +430,11 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) |
| vcpu->ready = false; |
| preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); |
| vcpu->last_used_slot = NULL; |
| + |
| +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING |
| + vcpu->suspend_time_ns = kvm->suspend_time_ns; |
| + spin_lock_init(&vcpu->suspend_time_ns_lock); |
| +#endif |
| } |
| |
| static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) |
| @@ -844,12 +849,70 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) |
| #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ |
| |
| #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER |
| +static int kvm_suspend_notifier(struct kvm *kvm) |
| +{ |
| + struct kvm_vcpu *vcpu; |
| + int i; |
| + |
| + if (!virt_suspend_time_enabled(kvm)) |
| + return NOTIFY_DONE; |
| + |
| + mutex_lock(&kvm->lock); |
| + kvm_for_each_vcpu(i, vcpu, kvm) |
| + kvm_make_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu); |
| + mutex_unlock(&kvm->lock); |
| + |
| + return NOTIFY_DONE; |
| +} |
| + |
| +static int kvm_resume_notifier(struct kvm *kvm) |
| +{ |
| + struct kvm_vcpu *vcpu; |
| + int i; |
| + |
| + if (!virt_suspend_time_enabled(kvm)) |
| + return NOTIFY_DONE; |
| + |
| + mutex_lock(&kvm->lock); |
| + kvm_for_each_vcpu(i, vcpu, kvm) { |
| + /* |
| + * Clear KVM_REQ_SUSPEND_TIME_ADJ if the suspend injection is |
| + * not needed (e.g. suspend failure) |
| + * The following condition is also true when the adjustment is |
| + * already done and it is safe to clear the request again here. |
| + */ |
| + if (kvm_total_suspend_time(kvm) == |
| + vcpu_suspend_time_injected(vcpu)) |
| + kvm_clear_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu); |
| + } |
| + mutex_unlock(&kvm->lock); |
| + |
| + return NOTIFY_DONE; |
| +} |
| + |
| +static int kvm_pm_notifier(struct kvm *kvm, unsigned long state) |
| +{ |
| + switch (state) { |
| + case PM_HIBERNATION_PREPARE: |
| + case PM_SUSPEND_PREPARE: |
| + return kvm_suspend_notifier(kvm); |
| + case PM_POST_HIBERNATION: |
| + case PM_POST_SUSPEND: |
| + return kvm_resume_notifier(kvm); |
| + } |
| + |
| + return NOTIFY_DONE; |
| +} |
| + |
| static int kvm_pm_notifier_call(struct notifier_block *bl, |
| unsigned long state, |
| void *unused) |
| { |
| struct kvm *kvm = container_of(bl, struct kvm, pm_notifier); |
| |
| + if (kvm_pm_notifier(kvm, state) != NOTIFY_DONE) |
| + return NOTIFY_BAD; |
| + |
| return kvm_arch_pm_notifier(kvm, state); |
| } |
| |
| @@ -875,6 +938,26 @@ static void kvm_destroy_pm_notifier(struct kvm *kvm) |
| } |
| #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ |
| |
| +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING |
| +void kvm_write_suspend_time(struct kvm *kvm) |
| +{ |
| + struct kvm_suspend_time st; |
| + |
| + st.suspend_time_ns = kvm->suspend_time_ns; |
| + kvm_write_guest_cached(kvm, &kvm->suspend_time_ghc, &st, sizeof(st)); |
| +} |
| + |
| +int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa) |
| +{ |
| + if (kvm_gfn_to_hva_cache_init(kvm, &kvm->suspend_time_ghc, gpa, |
| + sizeof(struct kvm_suspend_time))) |
| + return 1; |
| + |
| + kvm_write_suspend_time(kvm); |
| + return 0; |
| +} |
| +#endif |
| + |
| static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) |
| { |
| if (!memslot->dirty_bitmap) |
| @@ -1119,6 +1202,11 @@ static struct kvm *kvm_create_vm(unsigned long type) |
| INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); |
| #endif |
| |
| +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING |
| + spin_lock_init(&kvm->suspend_time_ns_lock); |
| + kvm->base_offs_boot_ns = ktime_get_offs_boot_ns(); |
| +#endif |
| + |
| r = kvm_init_mmu_notifier(kvm); |
| if (r) |
| goto out_err_no_mmu_notifier; |
| -- |
| 2.35.0 |
| |