From d2ec76ca3240dee4ea5f02cc3d0321d04e2480a0 Mon Sep 17 00:00:00 2001
From: "Vineeth Pillai (Google)" <vineeth@bitbyteword.org>
Date: Tue, 31 Oct 2023 16:20:58 -0400
Subject: [PATCH] CHROMIUM: kvm: x86: upper bound for boost duration

Unbounded boosting of vcpu tasks can hurt host workloads, so implement a
throttling mechanism that unboosts a vcpu once it has run boosted for more
than a threshold value.

The default threshold for kernel critical section boosting is 3 ms and for
task priority boosting is 500 ms; both are tunable module parameters.
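
An illustrative, self-contained sketch of the accounting done at VMEXIT is
below (plain userspace C with invented names such as boost_state and
account_vmexit; the real implementation is kvm_vcpu_pvsched_update_vmexit()
in the diff, which additionally tracks kernel critical section and task
priority boosts separately):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct boost_state {
	bool boosted;
	bool throttled;
	uint64_t boosted_ns;	/* time run boosted (or throttled) so far */
};

/* Charge one guest run of delta_ns and apply the threshold max_ns. */
static void account_vmexit(struct boost_state *s, uint64_t delta_ns,
			   uint64_t max_ns)
{
	if (!s->boosted && !s->throttled)
		return;

	s->boosted_ns += delta_ns;
	if (s->boosted_ns < max_ns)
		return;

	if (s->boosted) {
		/* Ran boosted past the limit: throttle (revert to CFS). */
		s->boosted = false;
		s->throttled = true;
	} else {
		/* Throttle window has expired: allow boosting again. */
		s->throttled = false;
	}
	s->boosted_ns = 0;
}

int main(void)
{
	struct boost_state s = { .boosted = true };

	/* 400 ms boosted run against a 500 ms limit: still boosted. */
	account_vmexit(&s, 400000000ULL, 500000000ULL);
	printf("boosted=%d throttled=%d\n", s.boosted, s.throttled);

	/* Another 200 ms pushes past the limit: the vcpu gets throttled. */
	account_vmexit(&s, 200000000ULL, 500000000ULL);
	printf("boosted=%d throttled=%d\n", s.boosted, s.throttled);
	return 0;
}

Since the limits are module parameters, they can also be adjusted at
runtime, e.g. via /sys/module/kvm/parameters/pvsched_max_kerncs_us and
pvsched_max_taskprio_us (path assumes kvm is built as a module).
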
UPSTREAM-TASK=b:303645537
BUG=b:262267726
TEST=boot
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
(cherry picked from commit 298c342708bdcb9f22ac9f86805c9d757954d997)
Change-Id: Ife506102d5953d104c79caa3f9eb650bf9481366
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/5425467
Tested-by: Vineeth Pillai <vineethrp@google.com>
Commit-Queue: Vineeth Pillai <vineethrp@google.com>
Reviewed-by: Joel Fernandes <joelaf@google.com>
---
arch/x86/include/asm/kvm_host.h | 59 ++++++++++++++++
arch/x86/kvm/x86.c | 116 ++++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 11 +--
3 files changed, 182 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 45b5aca1276b2fec45bab32312cee33dd4455981..009b3701c71b9b1111bd8527891dbb1ebfaa5271 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -766,6 +766,22 @@ struct vcpu_pv_sched {
*/
int kern_cs_prio;
int kern_cs_policy;
+ struct hrtimer boost_thr_timer;
+ u8 boosted;
+ u8 throttled;
+ /*
+ * nanoseconds since last task prio boost or throttle.
+ */
+ u64 taskprio_ns;
+ /*
+ * nanoseconds since last kernel critical section
+ * boost or throttle.
+ */
+ u64 kerncs_ns;
+ /*
+ * Timestamp for last VMENTRY.
+ */
+ ktime_t vmentry_ts;
u64 msr_val;
struct gfn_to_hva_cache data;
};
@@ -2396,6 +2412,15 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
#define VCPU_KERN_CS_PRIO 8
#define VCPU_KERN_CS_POLICY SCHED_RR
+/*
+ * Vcpu boosted for servicing a kernel critical section.
+ */
+#define KVM_PVSCHED_BOOST_KERNCS 0x1
+/*
+ * Vcpu boosted to match the priority of the task running in the vcpu.
+ */
+#define KVM_PVSCHED_BOOST_TASKPRIO 0x2
+
static inline bool kvm_arch_vcpu_pv_sched_enabled(struct kvm_vcpu_arch *arch)
{
return arch->pv_sched.msr_val;
@@ -2442,11 +2467,45 @@ static inline int kvm_arch_vcpu_normalprio_cmp(struct kvm_vcpu_arch *arch,
return -1;
}
+static inline bool kvm_arch_vcpu_is_throttled(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.throttled;
+}
+
+static inline bool kvm_arch_vcpu_is_boosted(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.normal_prio < arch->pv_sched.default_normal_prio;
+}
+
static inline void kvm_arch_vcpu_set_sched_attr(struct kvm_vcpu_arch *arch,
union vcpu_sched_attr attr)
{
+ u8 boost_type = KVM_PVSCHED_BOOST_TASKPRIO;
+ int normal_prio = __sched_normal_prio(attr);
+
+ /*
+ * If the current priority of the vcpu task is the same as its
+ * previous priority, we need not update arch->pv_sched.
+ */
+ if (normal_prio == arch->pv_sched.normal_prio)
+ return;
+
arch->pv_sched.attr = attr;
arch->pv_sched.normal_prio = __sched_normal_prio(attr);
+
+ if (attr.kern_cs)
+ boost_type = KVM_PVSCHED_BOOST_KERNCS;
+
+ if (kvm_arch_vcpu_is_boosted(arch)) {
+ arch->pv_sched.boosted |= boost_type;
+ if (!attr.kern_cs) {
+ arch->pv_sched.boosted &= ~KVM_PVSCHED_BOOST_KERNCS;
+ arch->pv_sched.kerncs_ns = 0;
+ }
+ } else if (arch->pv_sched.boosted) {
+ arch->pv_sched.boosted = 0;
+ arch->pv_sched.taskprio_ns = arch->pv_sched.kerncs_ns = 0;
+ }
}
static inline void kvm_arch_vcpu_set_default_sched_attr(struct kvm_vcpu_arch *arch,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6dcbe53f6a59d3463b97d1fc82a91454a1608b89..9c95c5ae8bb6f8a45c6c5577ee5b3ab57df884ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -197,6 +197,25 @@ module_param(eager_page_split, bool, 0644);
static bool __read_mostly mitigate_smt_rsb;
module_param(mitigate_smt_rsb, bool, 0444);
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+/*
+ * Maximum allowable boosted time for a guest kernel critical section.
+ */
+unsigned int pvsched_max_kerncs_us = 3000;
+module_param(pvsched_max_kerncs_us, uint, 0644);
+
+/*
+ * Maximum allowable boosted time for a guest task.
+ */
+unsigned int pvsched_max_taskprio_us = 500000;
+module_param(pvsched_max_taskprio_us, uint, 0644);
+
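+/*
+ * The timer handler is intentionally empty: the timer is armed while the
+ * vcpu runs in guest mode, so its expiry alone forces a VMEXIT, after
+ * which kvm_vcpu_pvsched_update_vmexit() re-evaluates the boost budget.
+ */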
+static enum hrtimer_restart boost_throttle_timer_fn(struct hrtimer *data)
+{
+ return HRTIMER_NORESTART;
+}
+#endif
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -10958,6 +10977,86 @@ static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu)
#endif
#ifdef CONFIG_PARAVIRT_SCHED_KVM
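+/*
+ * At VMENTRY, record the entry timestamp and, if the vcpu is currently
+ * boosted or throttled, arm the throttle timer for the remaining budget so
+ * that a long stay in guest mode cannot run past the limit unnoticed.
+ */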
+static inline void kvm_vcpu_pvsched_update_vmenter(struct kvm_vcpu_arch *arch)
+{
+ u64 max_ns = 0;
+ u64 elapsed_ns = 0;
+ ktime_t expire;
+ u64 max_taskprio_ns = pvsched_max_taskprio_us * NSEC_PER_USEC;
+ u64 max_kerncs_ns = pvsched_max_kerncs_us * NSEC_PER_USEC;
+
+ arch->pv_sched.vmentry_ts = ktime_get();
+ if (!arch->pv_sched.boosted && !arch->pv_sched.throttled)
+ return;
+
+ if (arch->pv_sched.boosted) {
+ if (arch->pv_sched.boosted & KVM_PVSCHED_BOOST_KERNCS) {
+ max_ns = max_kerncs_ns;
+ elapsed_ns = arch->pv_sched.kerncs_ns;
+ } else {
+ max_ns = max_taskprio_ns;
+ elapsed_ns = arch->pv_sched.taskprio_ns;
+ }
+ } else if (arch->pv_sched.throttled) {
+ if (arch->pv_sched.throttled & KVM_PVSCHED_BOOST_KERNCS) {
+ max_ns = max_kerncs_ns;
+ elapsed_ns = arch->pv_sched.kerncs_ns;
+ } else {
+ max_ns = max_taskprio_ns;
+ elapsed_ns = arch->pv_sched.taskprio_ns;
+ }
+ }
+ WARN_ON(max_ns <= elapsed_ns);
+ expire = ktime_add_ns(arch->pv_sched.vmentry_ts, max_ns - elapsed_ns);
+ hrtimer_start(&arch->pv_sched.boost_thr_timer, expire, HRTIMER_MODE_ABS_HARD);
+}
+
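+/*
+ * At VMEXIT, charge the time just spent in guest mode against the active
+ * boost (or throttle) window. A boost that has exceeded its limit becomes
+ * a throttle of the same type; an expired throttle is cleared so the vcpu
+ * can be boosted again.
+ */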
+static inline void kvm_vcpu_pvsched_update_vmexit(struct kvm_vcpu_arch *arch)
+{
+ u64 delta;
+ u64 max_ns;
+ u64 elapsed_ns;
+ ktime_t now = ktime_get();
+ u64 max_kerncs_ns = pvsched_max_kerncs_us * NSEC_PER_USEC;
+ u64 max_taskprio_ns = pvsched_max_taskprio_us * NSEC_PER_USEC;
+
+ hrtimer_cancel(&arch->pv_sched.boost_thr_timer);
+
+ delta = ktime_sub_ns(now, arch->pv_sched.vmentry_ts);
+ if (arch->pv_sched.boosted) {
+ u8 thr_type;
+
+ arch->pv_sched.taskprio_ns += delta;
+ if (arch->pv_sched.boosted & KVM_PVSCHED_BOOST_KERNCS) {
+ elapsed_ns = arch->pv_sched.kerncs_ns += delta;
+ max_ns = max_kerncs_ns;
+ thr_type = KVM_PVSCHED_BOOST_KERNCS;
+ } else {
+ elapsed_ns = arch->pv_sched.taskprio_ns;
+ max_ns = max_taskprio_ns;
+ thr_type = KVM_PVSCHED_BOOST_TASKPRIO;
+ }
+ if (elapsed_ns >= max_ns) {
+ arch->pv_sched.throttled = thr_type;
+ arch->pv_sched.boosted = 0;
+ arch->pv_sched.kerncs_ns = arch->pv_sched.taskprio_ns = 0;
+ }
+ } else if (arch->pv_sched.throttled) {
+ if (arch->pv_sched.throttled & KVM_PVSCHED_BOOST_KERNCS) {
+ elapsed_ns = arch->pv_sched.kerncs_ns += delta;
+ max_ns = max_kerncs_ns;
+ } else {
+ elapsed_ns = arch->pv_sched.taskprio_ns += delta;
+ max_ns = max_taskprio_ns;
+ }
+
+ if (elapsed_ns >= max_ns) {
+ arch->pv_sched.throttled = 0;
+ arch->pv_sched.kerncs_ns = arch->pv_sched.taskprio_ns = 0;
+ }
+ }
+}
+
/*
* Update the host area of PV_SCHED with the vcpu task sched parameters
* so that guest can utilize it if needed.
@@ -10978,6 +11077,8 @@ static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_sched_enabled(vcpu))
return;
+ kvm_vcpu_pvsched_update_vmexit(&vcpu->arch);
+
if (kvm_cpu_has_pending_timer(vcpu) || kvm_cpu_has_interrupt(vcpu))
kvm_vcpu_boost(vcpu, PVSCHED_KERNCS_BOOST_IRQ);
else {
@@ -10995,11 +11096,23 @@ static void kvm_vcpu_pv_sched_init(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_set_kerncs_prio(&vcpu->arch, VCPU_KERN_CS_PRIO);
kvm_arch_vcpu_set_kerncs_policy(&vcpu->arch, VCPU_KERN_CS_POLICY);
kvm_arch_vcpu_set_default_sched_attr(&vcpu->arch, kvm_vcpu_get_sched(vcpu));
+
+ hrtimer_init(&vcpu->arch.pv_sched.boost_thr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ vcpu->arch.pv_sched.boost_thr_timer.function = boost_throttle_timer_fn;
+}
+
+static void kvm_vcpu_pv_sched_fini(struct kvm_vcpu *vcpu)
+{
+ hrtimer_cancel(&vcpu->arch.pv_sched.boost_thr_timer);
}
#else
static inline void record_vcpu_pv_sched(struct kvm_vcpu *vcpu) { }
static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu) { }
static inline void kvm_vcpu_pv_sched_init(struct kvm_vcpu *vcpu) { }
+static inline void kvm_vcpu_pv_sched_fini(struct kvm_vcpu *vcpu) { }
+
+static inline void kvm_vcpu_pvsched_update_vmenter(struct kvm_vcpu_arch *arch) { }
+static inline void kvm_vcpu_pvsched_update_vmexit(struct kvm_vcpu_arch *arch) { }
#endif
/*
@@ -11283,6 +11396,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(0, 7);
}
+ kvm_vcpu_pvsched_update_vmenter(&vcpu->arch);
guest_timing_enter_irqoff();
for (;;) {
@@ -12551,6 +12665,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
+
+ kvm_vcpu_pv_sched_fini(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 838dc48a45d54bb25973783881a066488fb7558b..c89bd71a4d9ef2008453ea529be58fe378118632 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4171,13 +4171,15 @@ int kvm_vcpu_set_sched(struct kvm_vcpu *vcpu, union vcpu_sched_attr attr)
struct pid *pid;
struct task_struct *vcpu_task = NULL;
int max_rt_prio = kvm_arch_vcpu_kerncs_prio(&vcpu->arch);
+ union vcpu_sched_attr default_attr = kvm_arch_vcpu_default_sched_attr(&vcpu->arch);
retry_disable:
/*
- * If the feature is disabled, revert to CFS.
+ * If the feature is disabled, or a boost is requested while throttled, revert to CFS.
*/
- if (!kvm_vcpu_sched_enabled(vcpu))
- attr = kvm_arch_vcpu_default_sched_attr(&vcpu->arch);
+ if (!kvm_vcpu_sched_enabled(vcpu) || (kvm_arch_vcpu_is_throttled(&vcpu->arch) &&
+ (attr.kern_cs || kvm_arch_vcpu_normalprio_cmp(&vcpu->arch, attr) < 0)))
+ attr = default_attr;
policy = attr.sched_policy;
rt_prio = attr.rt_priority;
@@ -4256,7 +4258,8 @@ void kvm_vcpu_boost(struct kvm_vcpu *vcpu, enum kerncs_boost_type boost_type)
.kern_cs = boost_type
};
- kvm_vcpu_set_sched(vcpu, attr);
+ if (kvm_vcpu_sched_enabled(vcpu))
+ kvm_vcpu_set_sched(vcpu, attr);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_boost);
#endif
--
2.46.0.rc2.264.g509ed76dc8-goog