blob: 8f8566488ecd59a5889bd8e3df5731636b9cfbe7 [file] [log] [blame]
From 3a6e9395ac8cd10676285a158edbb933b9487689 Mon Sep 17 00:00:00 2001
From: Hikaru Nishida <hikalium@chromium.org>
Date: Wed, 20 Oct 2021 21:04:29 +0900
Subject: [PATCH] BACKPORT: FROMLIST: kvm/x86: virtual suspend time injection:
Implement host side
Add the main logic that adjusts the guest's clocks and notifies the
guest about the suspension.
Adjustment flow:
- Before going into suspend, KVM_REQ_SUSPEND_TIME_ADJ will be
requested for each vcpus through the PM notifier if the suspend time
injection is enabled for the kvm.
- Before the first vmenter after the resume, each vcpu will check
the request and do two kinds of adjustments.
- One is kvm-wide adjustment: kvm-clock will be adjusted to the value
before the suspend.
- Another is per-vcpu adjustment: tsc will be adjusted to the value
before the suspend.
- Those adjustments happen before the vcpu run: so the guest will not
observe the "rewinding" of the clocks.
- After the adjustment is made, the guest will be notified about the
adjustment through HYPERVISOR_CALLBACK_VECTOR IRQ.
- It is guest's responsibility to adjust their CLOCK_BOOTTIME and
the wall clock to reflect the suspend.
This will be done in the later patch.
Signed-off-by: Hikaru Nishida <hikalium@chromium.org>
(am from https://patchwork.kernel.org/patch/12572235/)
(also found at https://lore.kernel.org/r/20211020210348.RFC.v3.4.I9c4e7c844507384b546e6d1ea1a5286996eed908@changeid)
Conflicts:
arch/x86/kvm/Kconfig
arch/x86/kvm/x86.c
include/linux/kvm_host.h
virt/kvm/kvm_main.c
BUG=b:226698497
TEST=export BOARD=octopus-arc-r
TEST=emerge-${BOARD} chromeos-kernel-5_15
TEST=~/trunk/src/scripts/update_kernel.sh --remote=${DUT}
TEST=tast run ${DUT} arc.Suspend.s10c10
Change-Id: I6ea221a885c91564dd515d83bacbb8eac69496df
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/3555536
Tested-by: Hikaru Nishida <hikalium@chromium.org>
Auto-Submit: Hikaru Nishida <hikalium@chromium.org>
Reviewed-by: Suleiman Souhlal <suleiman@chromium.org>
Commit-Queue: Suleiman Souhlal <suleiman@chromium.org>
---
arch/x86/include/asm/kvm_host.h | 2 +
arch/x86/kvm/Kconfig | 13 ++++
arch/x86/kvm/cpuid.c | 4 ++
arch/x86/kvm/x86.c | 101 ++++++++++++++++++++++++++++++++
include/linux/kvm_host.h | 48 +++++++++++++++
virt/kvm/kvm_main.c | 88 ++++++++++++++++++++++++++++
6 files changed, 256 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d23e80a56eb8..143f7f3820ef 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1102,6 +1102,8 @@ struct kvm_arch {
bool pause_in_guest;
bool cstate_in_guest;
+ u64 msr_suspend_time;
+
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index e3cbd7706136..97d3f6474ede 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -129,4 +129,17 @@ config KVM_XEN
config KVM_EXTERNAL_WRITE_TRACKING
bool
+config KVM_VIRT_SUSPEND_TIMING
+ bool "Host support for virtual suspend time injection"
+ depends on KVM=y && HAVE_KVM_PM_NOTIFIER
+ default n
+ help
+ This option makes the host's suspension reflected on the guest's clocks.
+ In other words, guest's CLOCK_MONOTONIC will stop and
+ CLOCK_BOOTTIME keeps running during the host's suspension.
+ This feature will only be effective when both guest and host support
+ this feature. For the guest side, see KVM_VIRT_SUSPEND_TIMING_GUEST.
+
+ If unsure, say N.
+
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b24ca7f4ed7c..42de21c7a7e4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1075,6 +1075,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
(1 << KVM_FEATURE_PV_SCHED_YIELD) |
(1 << KVM_FEATURE_ASYNC_PF_INT);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ entry->eax |= (1 << KVM_FEATURE_HOST_SUSPEND_TIME);
+#endif
+
if (sched_info_on())
entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f2239e799014..6a2ad71f31a9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1456,6 +1456,7 @@ static const u32 emulated_msrs_all[] = {
MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+ MSR_KVM_HOST_SUSPEND_TIME,
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSC_DEADLINE,
@@ -3668,7 +3669,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.msr_kvm_poll_control = data;
break;
+ case MSR_KVM_HOST_SUSPEND_TIME:
+ if (!(data & KVM_MSR_ENABLED))
+ break;
+
+ if (kvm_init_suspend_time_ghc(vcpu->kvm, data & ~1ULL))
+ return 1;
+ vcpu->kvm->arch.msr_suspend_time = data;
+ break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
@@ -4009,6 +4018,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
+ case MSR_KVM_HOST_SUSPEND_TIME:
+ msr_info->data = vcpu->kvm->arch.msr_suspend_time;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -9906,6 +9918,93 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+bool virt_suspend_time_enabled(struct kvm *kvm)
+{
+ return kvm->arch.msr_suspend_time & KVM_MSR_ENABLED;
+}
+
+/*
+ * Do per-vcpu suspend time adjustment (tsc) and
+ * make an interrupt to notify it.
+ */
+static void vcpu_do_suspend_time_adjustment(struct kvm_vcpu *vcpu,
+ u64 total_ns)
+{
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = HYPERVISOR_CALLBACK_VECTOR
+ };
+ u64 last_suspend_duration = 0;
+ s64 adj;
+
+ spin_lock(&vcpu->suspend_time_ns_lock);
+ if (total_ns > vcpu->suspend_time_ns) {
+ last_suspend_duration = total_ns - vcpu->suspend_time_ns;
+ vcpu->suspend_time_ns = total_ns;
+ }
+ spin_unlock(&vcpu->suspend_time_ns_lock);
+
+ if (!last_suspend_duration) {
+ /* It looks like the suspend is not happened yet. Retry. */
+ kvm_make_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu);
+ return;
+ }
+
+ adj = __this_cpu_read(cpu_tsc_khz) *
+ div_u64(last_suspend_duration, 1000000);
+ adjust_tsc_offset_host(vcpu, -adj);
+ /*
+ * This request should be processed before
+ * the first vmenter after resume to avoid
+ * an unadjusted TSC value is observed.
+ */
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ kvm_write_suspend_time(vcpu->kvm);
+ if (!kvm_apic_set_irq(vcpu, &irq, NULL))
+ pr_err("kvm: failed to set suspend time irq\n");
+}
+
+/*
+ * Do kvm-wide suspend time adjustment (kvm-clock).
+ */
+static void kvm_do_suspend_time_adjustment(struct kvm *kvm, u64 total_ns)
+{
+ spin_lock(&kvm->suspend_time_ns_lock);
+ if (total_ns > kvm->suspend_time_ns) {
+ u64 last_suspend_duration = total_ns - kvm->suspend_time_ns;
+ /*
+ * Move the offset of kvm_clock here as if it is stopped
+ * during the suspension.
+ */
+ kvm->arch.kvmclock_offset -= last_suspend_duration;
+
+ /* suspend_time is accumulated per VM. */
+ kvm->suspend_time_ns += last_suspend_duration;
+ /*
+ * This adjustment will be reflected to the struct provided
+ * from the guest via MSR_KVM_HOST_SUSPEND_TIME before
+ * the notification interrupt is injected.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+ }
+ spin_unlock(&kvm->suspend_time_ns_lock);
+}
+
+static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu)
+{
+ u64 total_ns = kvm_total_suspend_time(vcpu->kvm);
+ /* Do kvm-wide adjustment (kvm-clock) */
+ kvm_do_suspend_time_adjustment(vcpu->kvm, total_ns);
+ /* Do per-vcpu adjustment (tsc) */
+ vcpu_do_suspend_time_adjustment(vcpu, total_ns);
+}
+#else
+static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu)
+{
+}
+#endif
+
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -9942,6 +10041,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
}
+ if (kvm_check_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu))
+ kvm_adjust_suspend_time(vcpu);
if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
kvm_mmu_free_obsolete_roots(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3f9b22c4983a..f46e67205ba9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -157,6 +157,7 @@ static inline bool is_error_page(struct page *page)
#define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_UNBLOCK 2
#define KVM_REQ_UNHALT 3
+#define KVM_REQ_SUSPEND_TIME_ADJ 5
#define KVM_REQUEST_ARCH_BASE 8
/*
@@ -350,6 +351,11 @@ struct kvm_vcpu {
} async_pf;
#endif
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ u64 suspend_time_ns;
+ spinlock_t suspend_time_ns_lock;
+#endif
+
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Cpu relax intercept or pause loop exit optimization
@@ -781,6 +787,12 @@ struct kvm {
struct notifier_block pm_notifier;
#endif
char stats_id[KVM_STATS_NAME_SIZE];
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ u64 suspend_time_ns;
+ spinlock_t suspend_time_ns_lock;
+ u64 base_offs_boot_ns;
+ struct gfn_to_hva_cache suspend_time_ghc;
+#endif
};
#define kvm_err(fmt, ...) \
@@ -2230,4 +2242,40 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
/* Max number of entries allowed for each kvm dirty ring */
#define KVM_DIRTY_RING_MAX_ENTRIES 65536
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+bool virt_suspend_time_enabled(struct kvm *kvm);
+void kvm_write_suspend_time(struct kvm *kvm);
+int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa);
+static inline u64 kvm_total_suspend_time(struct kvm *kvm)
+{
+ return ktime_get_offs_boot_ns() - kvm->base_offs_boot_ns;
+}
+
+static inline u64 vcpu_suspend_time_injected(struct kvm_vcpu *vcpu)
+{
+ return vcpu->suspend_time_ns;
+}
+#else
+static inline bool virt_suspend_time_enabled(struct kvm *kvm)
+{
+ return 0;
+}
+static inline void kvm_write_suspend_time(struct kvm *kvm)
+{
+}
+static inline int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa)
+{
+ return 1;
+}
+static inline u64 kvm_total_suspend_time(struct kvm *kvm)
+{
+ return 0;
+}
+
+static inline u64 vcpu_suspend_time_injected(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING */
+
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 457a2b74e4fc..3b79eb8fdb64 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -430,6 +430,11 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->ready = false;
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
vcpu->last_used_slot = NULL;
+
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ vcpu->suspend_time_ns = kvm->suspend_time_ns;
+ spin_lock_init(&vcpu->suspend_time_ns_lock);
+#endif
}
static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -844,12 +849,70 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_suspend_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (!virt_suspend_time_enabled(kvm))
+ return NOTIFY_DONE;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu);
+ mutex_unlock(&kvm->lock);
+
+ return NOTIFY_DONE;
+}
+
+static int kvm_resume_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (!virt_suspend_time_enabled(kvm))
+ return NOTIFY_DONE;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Clear KVM_REQ_SUSPEND_TIME_ADJ if the suspend injection is
+ * not needed (e.g. suspend failure)
+ * The following condition is also true when the adjustment is
+ * already done and it is safe to clear the request again here.
+ */
+ if (kvm_total_suspend_time(kvm) ==
+ vcpu_suspend_time_injected(vcpu))
+ kvm_clear_request(KVM_REQ_SUSPEND_TIME_ADJ, vcpu);
+ }
+ mutex_unlock(&kvm->lock);
+
+ return NOTIFY_DONE;
+}
+
+static int kvm_pm_notifier(struct kvm *kvm, unsigned long state)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ return kvm_suspend_notifier(kvm);
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ return kvm_resume_notifier(kvm);
+ }
+
+ return NOTIFY_DONE;
+}
+
static int kvm_pm_notifier_call(struct notifier_block *bl,
unsigned long state,
void *unused)
{
struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
+ if (kvm_pm_notifier(kvm, state) != NOTIFY_DONE)
+ return NOTIFY_BAD;
+
return kvm_arch_pm_notifier(kvm, state);
}
@@ -875,6 +938,26 @@ static void kvm_destroy_pm_notifier(struct kvm *kvm)
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+void kvm_write_suspend_time(struct kvm *kvm)
+{
+ struct kvm_suspend_time st;
+
+ st.suspend_time_ns = kvm->suspend_time_ns;
+ kvm_write_guest_cached(kvm, &kvm->suspend_time_ghc, &st, sizeof(st));
+}
+
+int kvm_init_suspend_time_ghc(struct kvm *kvm, gpa_t gpa)
+{
+ if (kvm_gfn_to_hva_cache_init(kvm, &kvm->suspend_time_ghc, gpa,
+ sizeof(struct kvm_suspend_time)))
+ return 1;
+
+ kvm_write_suspend_time(kvm);
+ return 0;
+}
+#endif
+
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
if (!memslot->dirty_bitmap)
@@ -1119,6 +1202,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ spin_lock_init(&kvm->suspend_time_ns_lock);
+ kvm->base_offs_boot_ns = ktime_get_offs_boot_ns();
+#endif
+
r = kvm_init_mmu_notifier(kvm);
if (r)
goto out_err_no_mmu_notifier;
--
2.35.0