From d2ec76ca3240dee4ea5f02cc3d0321d04e2480a0 Mon Sep 17 00:00:00 2001
From: "Vineeth Pillai (Google)" <vineeth@bitbyteword.org>
Date: Tue, 31 Oct 2023 16:20:58 -0400
Subject: [PATCH] CHROMIUM: kvm: x86: upper bound for boost duration

Unbounded boosting of vcpu tasks can hurt host workloads, so implement a
throttling mechanism that unboosts a vcpu once it has run boosted for more
than a threshold value.

The default threshold for kernel critical section boosting is 3 ms and for
task priority boosting is 500 ms; both are tunable module parameters.
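
An illustrative, self-contained sketch of the accounting done at VMEXIT is
below (plain userspace C with invented names such as boost_state and
account_vmexit; the real implementation is kvm_vcpu_pvsched_update_vmexit()
in the diff, which additionally tracks kernel critical section and task
priority boosts separately):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct boost_state {
	bool boosted;
	bool throttled;
	uint64_t boosted_ns;	/* time run boosted (or throttled) so far */
};

/* Charge one guest run of delta_ns and apply the threshold max_ns. */
static void account_vmexit(struct boost_state *s, uint64_t delta_ns,
			   uint64_t max_ns)
{
	if (!s->boosted && !s->throttled)
		return;

	s->boosted_ns += delta_ns;
	if (s->boosted_ns < max_ns)
		return;

	if (s->boosted) {
		/* Ran boosted past the limit: throttle (revert to CFS). */
		s->boosted = false;
		s->throttled = true;
	} else {
		/* Throttle window has expired: allow boosting again. */
		s->throttled = false;
	}
	s->boosted_ns = 0;
}

int main(void)
{
	struct boost_state s = { .boosted = true };

	/* 400 ms boosted run against a 500 ms limit: still boosted. */
	account_vmexit(&s, 400000000ULL, 500000000ULL);
	printf("boosted=%d throttled=%d\n", s.boosted, s.throttled);

	/* Another 200 ms pushes past the limit: the vcpu gets throttled. */
	account_vmexit(&s, 200000000ULL, 500000000ULL);
	printf("boosted=%d throttled=%d\n", s.boosted, s.throttled);
	return 0;
}

Since the limits are module parameters, they can also be adjusted at
runtime, e.g. via /sys/module/kvm/parameters/pvsched_max_kerncs_us and
pvsched_max_taskprio_us (path assumes kvm is built as a module).
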
UPSTREAM-TASK=b:303645537
BUG=b:262267726
TEST=boot
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
(cherry picked from commit 298c342708bdcb9f22ac9f86805c9d757954d997)
Change-Id: Ife506102d5953d104c79caa3f9eb650bf9481366
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/5425467
Tested-by: Vineeth Pillai <vineethrp@google.com>
Commit-Queue: Vineeth Pillai <vineethrp@google.com>
Reviewed-by: Joel Fernandes <joelaf@google.com>
---
arch/x86/include/asm/kvm_host.h | 59 ++++++++++++++++
arch/x86/kvm/x86.c | 116 ++++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 11 +--
3 files changed, 182 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 45b5aca1276b2fec45bab32312cee33dd4455981..009b3701c71b9b1111bd8527891dbb1ebfaa5271 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -766,6 +766,22 @@ struct vcpu_pv_sched {
*/
int kern_cs_prio;
int kern_cs_policy;
+ struct hrtimer boost_thr_timer;
+ u8 boosted;
+ u8 throttled;
+ /*
+ * nanoseconds since last task prio boost or throttle.
+ */
+ u64 taskprio_ns;
+ /*
+ * nanoseconds since last kernel critical section
+ * boost or throttle.
+ */
+ u64 kerncs_ns;
+ /*
+ * Timestamp for last VMENTRY.
+ */
+ ktime_t vmentry_ts;
u64 msr_val;
struct gfn_to_hva_cache data;
};
@@ -2396,6 +2412,15 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
#define VCPU_KERN_CS_PRIO 8
#define VCPU_KERN_CS_POLICY SCHED_RR
+/*
+ * Vcpu boosted for servicing a kernel critical section.
+ */
+#define KVM_PVSCHED_BOOST_KERNCS 0x1
+/*
+ * Vcpu boosted to match the priority of the task running in the vcpu.
+ */
+#define KVM_PVSCHED_BOOST_TASKPRIO 0x2
+
static inline bool kvm_arch_vcpu_pv_sched_enabled(struct kvm_vcpu_arch *arch)
{
return arch->pv_sched.msr_val;
@@ -2442,11 +2467,45 @@ static inline int kvm_arch_vcpu_normalprio_cmp(struct kvm_vcpu_arch *arch,
return -1;
}
+static inline bool kvm_arch_vcpu_is_throttled(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.throttled;
+}
+
+static inline bool kvm_arch_vcpu_is_boosted(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.normal_prio < arch->pv_sched.default_normal_prio;
+}
+
static inline void kvm_arch_vcpu_set_sched_attr(struct kvm_vcpu_arch *arch,
union vcpu_sched_attr attr)
{
+ u8 boost_type = KVM_PVSCHED_BOOST_TASKPRIO;
+ int normal_prio = __sched_normal_prio(attr);
+
+ /*
+ * If the current priority of the vcpu task is the same as its
+ * previous priority, we need not update arch->pv_sched.
+ */
+ if (normal_prio == arch->pv_sched.normal_prio)
+ return;
+
arch->pv_sched.attr = attr;
arch->pv_sched.normal_prio = __sched_normal_prio(attr);
+
+ if (attr.kern_cs)
+ boost_type = KVM_PVSCHED_BOOST_KERNCS;
+
+ if (kvm_arch_vcpu_is_boosted(arch)) {
+ arch->pv_sched.boosted |= boost_type;
+ if (!attr.kern_cs) {
+ arch->pv_sched.boosted &= ~KVM_PVSCHED_BOOST_KERNCS;
+ arch->pv_sched.kerncs_ns = 0;
+ }
+ } else if (arch->pv_sched.boosted) {
+ arch->pv_sched.boosted = 0;
+ arch->pv_sched.taskprio_ns = arch->pv_sched.kerncs_ns = 0;
+ }
}
static inline void kvm_arch_vcpu_set_default_sched_attr(struct kvm_vcpu_arch *arch,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6dcbe53f6a59d3463b97d1fc82a91454a1608b89..9c95c5ae8bb6f8a45c6c5577ee5b3ab57df884ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -197,6 +197,25 @@ module_param(eager_page_split, bool, 0644);
static bool __read_mostly mitigate_smt_rsb;
module_param(mitigate_smt_rsb, bool, 0444);
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+/*
+ * Maximum allowable boosted time for a guest kernel critical section.
+ */
+unsigned int pvsched_max_kerncs_us = 3000;
+module_param(pvsched_max_kerncs_us, uint, 0644);
+
+/*
+ * Maximum allowable boosted time for a guest task.
+ */
+unsigned int pvsched_max_taskprio_us = 500000;
+module_param(pvsched_max_taskprio_us, uint, 0644);
+
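+/*
+ * The timer handler is intentionally empty: the timer is armed while the
+ * vcpu runs in guest mode, so its expiry alone forces a VMEXIT, after
+ * which kvm_vcpu_pvsched_update_vmexit() re-evaluates the boost budget.
+ */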
+static enum hrtimer_restart boost_throttle_timer_fn(struct hrtimer *data)
+{
+ return HRTIMER_NORESTART;
+}
+#endif
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -10958,6 +10977,86 @@ static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu)
#endif
#ifdef CONFIG_PARAVIRT_SCHED_KVM
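+/*
+ * At VMENTRY, record the entry timestamp and, if the vcpu is currently
+ * boosted or throttled, arm the throttle timer for the remaining budget so
+ * that a long stay in guest mode cannot run past the limit unnoticed.
+ */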
+static inline void kvm_vcpu_pvsched_update_vmenter(struct kvm_vcpu_arch *arch)
+{
+ u64 max_ns = 0;
+ u64 elapsed_ns = 0;
+ ktime_t expire;
+ u64 max_taskprio_ns = pvsched_max_taskprio_us * NSEC_PER_USEC;
+ u64 max_kerncs_ns = pvsched_max_kerncs_us * NSEC_PER_USEC;
+
+ arch->pv_sched.vmentry_ts = ktime_get();
+ if (!arch->pv_sched.boosted && !arch->pv_sched.throttled)
+ return;
+
+ if (arch->pv_sched.boosted) {
+ if (arch->pv_sched.boosted & KVM_PVSCHED_BOOST_KERNCS) {
+ max_ns = max_kerncs_ns;
+ elapsed_ns = arch->pv_sched.kerncs_ns;
+ } else {
+ max_ns = max_taskprio_ns;
+ elapsed_ns = arch->pv_sched.taskprio_ns;
+ }
+ } else if (arch->pv_sched.throttled) {
+ if (arch->pv_sched.throttled & KVM_PVSCHED_BOOST_KERNCS) {
+ max_ns = max_kerncs_ns;
+ elapsed_ns = arch->pv_sched.kerncs_ns;
+ } else {
+ max_ns = max_taskprio_ns;
+ elapsed_ns = arch->pv_sched.taskprio_ns;
+ }
+ }
+ WARN_ON(max_ns <= elapsed_ns);
+ expire = ktime_add_ns(arch->pv_sched.vmentry_ts, max_ns - elapsed_ns);
+ hrtimer_start(&arch->pv_sched.boost_thr_timer, expire, HRTIMER_MODE_ABS_HARD);
+}
+
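+/*
+ * At VMEXIT, charge the time just spent in guest mode against the active
+ * boost (or throttle) window. A boost that has exceeded its limit becomes
+ * a throttle of the same type; an expired throttle is cleared so the vcpu
+ * can be boosted again.
+ */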
+static inline void kvm_vcpu_pvsched_update_vmexit(struct kvm_vcpu_arch *arch)
+{
+ u64 delta;
+ u64 max_ns;
+ u64 elapsed_ns;
+ ktime_t now = ktime_get();
+ u64 max_kerncs_ns = pvsched_max_kerncs_us * NSEC_PER_USEC;
+ u64 max_taskprio_ns = pvsched_max_taskprio_us * NSEC_PER_USEC;
+
+ hrtimer_cancel(&arch->pv_sched.boost_thr_timer);
+
+ delta = ktime_sub_ns(now, arch->pv_sched.vmentry_ts);
+ if (arch->pv_sched.boosted) {
+ u8 thr_type;
+
+ arch->pv_sched.taskprio_ns += delta;
+ if (arch->pv_sched.boosted & KVM_PVSCHED_BOOST_KERNCS) {
+ elapsed_ns = arch->pv_sched.kerncs_ns += delta;
+ max_ns = max_kerncs_ns;
+ thr_type = KVM_PVSCHED_BOOST_KERNCS;
+ } else {
+ elapsed_ns = arch->pv_sched.taskprio_ns;
+ max_ns = max_taskprio_ns;
+ thr_type = KVM_PVSCHED_BOOST_TASKPRIO;
+ }
+ if (elapsed_ns >= max_ns) {
+ arch->pv_sched.throttled = thr_type;
+ arch->pv_sched.boosted = 0;
+ arch->pv_sched.kerncs_ns = arch->pv_sched.taskprio_ns = 0;
+ }
+ } else if (arch->pv_sched.throttled) {
+ if (arch->pv_sched.throttled & KVM_PVSCHED_BOOST_KERNCS) {
+ elapsed_ns = arch->pv_sched.kerncs_ns += delta;
+ max_ns = max_kerncs_ns;
+ } else {
+ elapsed_ns = arch->pv_sched.taskprio_ns += delta;
+ max_ns = max_taskprio_ns;
+ }
+
+ if (elapsed_ns >= max_ns) {
+ arch->pv_sched.throttled = 0;
+ arch->pv_sched.kerncs_ns = arch->pv_sched.taskprio_ns = 0;
+ }
+ }
+}
+
/*
* Update the host area of PV_SCHED with the vcpu task sched parameters
* so that guest can utilize it if needed.
@@ -10978,6 +11077,8 @@ static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_sched_enabled(vcpu))
return;
+ kvm_vcpu_pvsched_update_vmexit(&vcpu->arch);
+
if (kvm_cpu_has_pending_timer(vcpu) || kvm_cpu_has_interrupt(vcpu))
kvm_vcpu_boost(vcpu, PVSCHED_KERNCS_BOOST_IRQ);
else {
@@ -10995,11 +11096,23 @@ static void kvm_vcpu_pv_sched_init(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_set_kerncs_prio(&vcpu->arch, VCPU_KERN_CS_PRIO);
kvm_arch_vcpu_set_kerncs_policy(&vcpu->arch, VCPU_KERN_CS_POLICY);
kvm_arch_vcpu_set_default_sched_attr(&vcpu->arch, kvm_vcpu_get_sched(vcpu));
+
+ hrtimer_init(&vcpu->arch.pv_sched.boost_thr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ vcpu->arch.pv_sched.boost_thr_timer.function = boost_throttle_timer_fn;
+}
+
+static void kvm_vcpu_pv_sched_fini(struct kvm_vcpu *vcpu)
+{
+ hrtimer_cancel(&vcpu->arch.pv_sched.boost_thr_timer);
}
#else
static inline void record_vcpu_pv_sched(struct kvm_vcpu *vcpu) { }
static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu) { }
static inline void kvm_vcpu_pv_sched_init(struct kvm_vcpu *vcpu) { }
+static inline void kvm_vcpu_pv_sched_fini(struct kvm_vcpu *vcpu) { }
+
+static inline void kvm_vcpu_pvsched_update_vmenter(struct kvm_vcpu_arch *arch) { }
+static inline void kvm_vcpu_pvsched_update_vmexit(struct kvm_vcpu_arch *arch) { }
#endif
/*
@@ -11283,6 +11396,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(0, 7);
}
+ kvm_vcpu_pvsched_update_vmenter(&vcpu->arch);
guest_timing_enter_irqoff();
for (;;) {
@@ -12551,6 +12665,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
+
+ kvm_vcpu_pv_sched_fini(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 838dc48a45d54bb25973783881a066488fb7558b..c89bd71a4d9ef2008453ea529be58fe378118632 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4171,13 +4171,15 @@ int kvm_vcpu_set_sched(struct kvm_vcpu *vcpu, union vcpu_sched_attr attr)
struct pid *pid;
struct task_struct *vcpu_task = NULL;
int max_rt_prio = kvm_arch_vcpu_kerncs_prio(&vcpu->arch);
+ union vcpu_sched_attr default_attr = kvm_arch_vcpu_default_sched_attr(&vcpu->arch);
retry_disable:
/*
- * If the feature is disabled, revert to CFS.
+ * If the feature is disabled, or a boost is requested while throttled, revert to CFS.
*/
- if (!kvm_vcpu_sched_enabled(vcpu))
- attr = kvm_arch_vcpu_default_sched_attr(&vcpu->arch);
+ if (!kvm_vcpu_sched_enabled(vcpu) || (kvm_arch_vcpu_is_throttled(&vcpu->arch) &&
+ (attr.kern_cs || kvm_arch_vcpu_normalprio_cmp(&vcpu->arch, attr) < 0)))
+ attr = default_attr;
policy = attr.sched_policy;
rt_prio = attr.rt_priority;
@@ -4256,7 +4258,8 @@ void kvm_vcpu_boost(struct kvm_vcpu *vcpu, enum kerncs_boost_type boost_type)
.kern_cs = boost_type
};
- kvm_vcpu_set_sched(vcpu, attr);
+ if (kvm_vcpu_sched_enabled(vcpu))
+ kvm_vcpu_set_sched(vcpu, attr);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_boost);
#endif
--
2.46.0.rc2.264.g509ed76dc8-goog