| From d2ec76ca3240dee4ea5f02cc3d0321d04e2480a0 Mon Sep 17 00:00:00 2001 |
| From: "Vineeth Pillai (Google)" <vineeth@bitbyteword.org> |
| Date: Tue, 31 Oct 2023 16:20:58 -0400 |
| Subject: [PATCH] CHROMIUM: kvm: x86: upper bound for boost duration |
| |
| Unbounded boosting of vcpu tasks can hurt host workloads. So,
| implement a throttling mechanism that unboosts a vcpu once it has run
| boosted for longer than a threshold value. While throttled, boost
| requests revert to the default (CFS) scheduling attributes until the
| vcpu has run for the same threshold duration.
| |
| The default threshold is 3 ms for kernel critical section boosting and
| 500 ms for task priority boosting; both are tunable via module
| parameters.
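| 
| For example, the caps can be adjusted at runtime (illustrative values
| only; paths assume the standard module_param sysfs layout under
| /sys/module/kvm/parameters/):
| 
|   # allow kernel critical section boosting for up to 5 ms
|   echo 5000 > /sys/module/kvm/parameters/pvsched_max_kerncs_us
|   # allow task priority boosting for up to 750 ms
|   echo 750000 > /sys/module/kvm/parameters/pvsched_max_taskprio_us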
| |
| UPSTREAM-TASK=b:303645537 |
| BUG=b:262267726 |
| TEST=boot |
| |
| Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org> |
| Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org> |
| Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org> |
| (cherry picked from commit 298c342708bdcb9f22ac9f86805c9d757954d997) |
| |
| Change-Id: Ife506102d5953d104c79caa3f9eb650bf9481366 |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/5425467 |
| Tested-by: Vineeth Pillai <vineethrp@google.com> |
| Commit-Queue: Vineeth Pillai <vineethrp@google.com> |
| Reviewed-by: Joel Fernandes <joelaf@google.com> |
| --- |
| arch/x86/include/asm/kvm_host.h | 59 ++++++++++++++++ |
| arch/x86/kvm/x86.c | 116 ++++++++++++++++++++++++++++++++ |
| virt/kvm/kvm_main.c | 11 +-- |
| 3 files changed, 182 insertions(+), 4 deletions(-) |
| |
| diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h |
| index 45b5aca1276b2fec45bab32312cee33dd4455981..009b3701c71b9b1111bd8527891dbb1ebfaa5271 100644 |
| --- a/arch/x86/include/asm/kvm_host.h |
| +++ b/arch/x86/include/asm/kvm_host.h |
| @@ -766,6 +766,22 @@ struct vcpu_pv_sched { |
| */ |
| int kern_cs_prio; |
| int kern_cs_policy; |
| + struct hrtimer boost_thr_timer; |
| + u8 boosted; |
| + u8 throttled; |
| + /* |
| + * nanoseconds since last task prio boost or throttle. |
| + */ |
| + u64 taskprio_ns; |
| + /* |
| + * nanoseconds since last kernel critical section |
| + * boost or throttle. |
| + */ |
| + u64 kerncs_ns; |
| + /* |
| + * Timestamp for last VMENTRY. |
| + */ |
| + ktime_t vmentry_ts; |
| u64 msr_val; |
| struct gfn_to_hva_cache data; |
| }; |
| @@ -2396,6 +2412,15 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); |
| #define VCPU_KERN_CS_PRIO 8 |
| #define VCPU_KERN_CS_POLICY SCHED_RR |
| |
| +/* |
| + * Vcpu boosted for servicing kernel critical section. |
| + */ |
| +#define KVM_PVSCHED_BOOST_KERNCS 0x1 |
| +/* |
| + * Vcpu boosted to match priority of running task in the vcpu. |
| + */ |
| +#define KVM_PVSCHED_BOOST_TASKPRIO 0x2 |
| + |
| static inline bool kvm_arch_vcpu_pv_sched_enabled(struct kvm_vcpu_arch *arch) |
| { |
| return arch->pv_sched.msr_val; |
| @@ -2442,11 +2467,45 @@ static inline int kvm_arch_vcpu_normalprio_cmp(struct kvm_vcpu_arch *arch, |
| return -1; |
| } |
| |
| +static inline bool kvm_arch_vcpu_is_throttled(struct kvm_vcpu_arch *arch) |
| +{ |
| + return arch->pv_sched.throttled; |
| +} |
| + |
| +static inline bool kvm_arch_vcpu_is_boosted(struct kvm_vcpu_arch *arch) |
| +{ |
| + return arch->pv_sched.normal_prio < arch->pv_sched.default_normal_prio; |
| +} |
| + |
| static inline void kvm_arch_vcpu_set_sched_attr(struct kvm_vcpu_arch *arch, |
| union vcpu_sched_attr attr) |
| { |
| + u8 boost_type = KVM_PVSCHED_BOOST_TASKPRIO; |
| + int normal_prio = __sched_normal_prio(attr); |
| + |
| + /* |
| + * If current priority of the vcpu task is same as its |
| + * previous priority, we need not update arch->pv_sched. |
| + */ |
| + if (normal_prio == arch->pv_sched.normal_prio) |
| + return; |
| + |
| arch->pv_sched.attr = attr; |
| arch->pv_sched.normal_prio = __sched_normal_prio(attr); |
| + |
| + if (attr.kern_cs) |
| + boost_type = KVM_PVSCHED_BOOST_KERNCS; |
| + |
| + if (kvm_arch_vcpu_is_boosted(arch)) { |
| + arch->pv_sched.boosted |= boost_type; |
| + if (!attr.kern_cs) { |
| + arch->pv_sched.boosted &= ~KVM_PVSCHED_BOOST_KERNCS; |
| + arch->pv_sched.kerncs_ns = 0; |
| + } |
| + } else if (arch->pv_sched.boosted) { |
| + arch->pv_sched.boosted = 0; |
| + arch->pv_sched.taskprio_ns = arch->pv_sched.kerncs_ns = 0; |
| + } |
| } |
| |
| static inline void kvm_arch_vcpu_set_default_sched_attr(struct kvm_vcpu_arch *arch, |
| diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c |
| index 6dcbe53f6a59d3463b97d1fc82a91454a1608b89..9c95c5ae8bb6f8a45c6c5577ee5b3ab57df884ec 100644 |
| --- a/arch/x86/kvm/x86.c |
| +++ b/arch/x86/kvm/x86.c |
| @@ -197,6 +197,25 @@ module_param(eager_page_split, bool, 0644); |
| static bool __read_mostly mitigate_smt_rsb; |
| module_param(mitigate_smt_rsb, bool, 0444); |
| |
| +#ifdef CONFIG_PARAVIRT_SCHED_KVM |
| +/* |
| + * Maximum allowable boosted time for a guest kernel critical section. |
| + */ |
| +unsigned int pvsched_max_kerncs_us = 3000; |
| +module_param(pvsched_max_kerncs_us, uint, 0644); |
| + |
| +/* |
| + * Maximum allowable boosted time for a guest task. |
| + */ |
| +unsigned int pvsched_max_taskprio_us = 500000; |
| +module_param(pvsched_max_taskprio_us, uint, 0644); |
| + |
| +static enum hrtimer_restart boost_throttle_timer_fn(struct hrtimer *data) |
| +{ |
| + return HRTIMER_NORESTART; |
| +} |
| +#endif |
| + |
| /* |
| * Restoring the host value for MSRs that are only consumed when running in |
| * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU |
| @@ -10958,6 +10977,86 @@ static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu) |
| #endif |
| |
| #ifdef CONFIG_PARAVIRT_SCHED_KVM |
| +static inline void kvm_vcpu_pvsched_update_vmenter(struct kvm_vcpu_arch *arch) |
| +{ |
| + u64 max_ns = 0; |
| + u64 elapsed_ns = 0; |
| + ktime_t expire; |
| + u64 max_taskprio_ns = pvsched_max_taskprio_us * NSEC_PER_USEC; |
| + u64 max_kerncs_ns = pvsched_max_kerncs_us * NSEC_PER_USEC; |
| + |
| + arch->pv_sched.vmentry_ts = ktime_get(); |
| + if (!arch->pv_sched.boosted && !arch->pv_sched.throttled) |
| + return; |
| + |
| + if (arch->pv_sched.boosted) { |
| + if (arch->pv_sched.boosted & KVM_PVSCHED_BOOST_KERNCS) { |
| + max_ns = max_kerncs_ns; |
| + elapsed_ns = arch->pv_sched.kerncs_ns; |
| + } else { |
| + max_ns = max_taskprio_ns; |
| + elapsed_ns = arch->pv_sched.taskprio_ns; |
| + } |
| + } else if (arch->pv_sched.throttled) { |
| + if (arch->pv_sched.throttled & KVM_PVSCHED_BOOST_KERNCS) { |
| + max_ns = max_kerncs_ns; |
| + elapsed_ns = arch->pv_sched.kerncs_ns; |
| + } else { |
| + max_ns = max_taskprio_ns; |
| + elapsed_ns = arch->pv_sched.taskprio_ns; |
| + } |
| + } |
| + WARN_ON(max_ns <= elapsed_ns); |
| + expire = ktime_add_ns(arch->pv_sched.vmentry_ts, max_ns - elapsed_ns); |
| + hrtimer_start(&arch->pv_sched.boost_thr_timer, expire, HRTIMER_MODE_ABS_HARD); |
| +} |
| + |
| +static inline void kvm_vcpu_pvsched_update_vmexit(struct kvm_vcpu_arch *arch) |
| +{ |
| + u64 delta; |
| + u64 max_ns; |
| + u64 elapsed_ns; |
| + ktime_t now = ktime_get(); |
| + u64 max_kerncs_ns = pvsched_max_kerncs_us * NSEC_PER_USEC; |
| + u64 max_taskprio_ns = pvsched_max_taskprio_us * NSEC_PER_USEC; |
| + |
| + hrtimer_cancel(&arch->pv_sched.boost_thr_timer); |
| + |
| + delta = ktime_sub_ns(now, arch->pv_sched.vmentry_ts); |
| + if (arch->pv_sched.boosted) { |
| + u8 thr_type; |
| + |
| + arch->pv_sched.taskprio_ns += delta; |
| + if (arch->pv_sched.boosted & KVM_PVSCHED_BOOST_KERNCS) { |
| + elapsed_ns = arch->pv_sched.kerncs_ns += delta; |
| + max_ns = max_kerncs_ns; |
| + thr_type = KVM_PVSCHED_BOOST_KERNCS; |
| + } else { |
| + elapsed_ns = arch->pv_sched.taskprio_ns; |
| + max_ns = max_taskprio_ns; |
| + thr_type = KVM_PVSCHED_BOOST_TASKPRIO; |
| + } |
| + if (elapsed_ns >= max_ns) { |
| + arch->pv_sched.throttled = thr_type; |
| + arch->pv_sched.boosted = 0; |
| + arch->pv_sched.kerncs_ns = arch->pv_sched.taskprio_ns = 0; |
| + } |
| + } else if (arch->pv_sched.throttled) { |
| + if (arch->pv_sched.throttled & KVM_PVSCHED_BOOST_KERNCS) { |
| + elapsed_ns = arch->pv_sched.kerncs_ns += delta; |
| + max_ns = max_kerncs_ns; |
| + } else { |
| + elapsed_ns = arch->pv_sched.taskprio_ns += delta; |
| + max_ns = max_taskprio_ns; |
| + } |
| + |
| + if (elapsed_ns >= max_ns) { |
| + arch->pv_sched.throttled = 0; |
| + arch->pv_sched.kerncs_ns = arch->pv_sched.taskprio_ns = 0; |
| + } |
| + } |
| +} |
| + |
| /* |
| * Update the host area of PV_SCHED with the vcpu task sched parameters |
| * so that guest can utilize it if needed. |
| @@ -10978,6 +11077,8 @@ static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu) |
| if (!kvm_vcpu_sched_enabled(vcpu)) |
| return; |
| |
| + kvm_vcpu_pvsched_update_vmexit(&vcpu->arch); |
| + |
| if (kvm_cpu_has_pending_timer(vcpu) || kvm_cpu_has_interrupt(vcpu)) |
| kvm_vcpu_boost(vcpu, PVSCHED_KERNCS_BOOST_IRQ); |
| else { |
| @@ -10995,11 +11096,23 @@ static void kvm_vcpu_pv_sched_init(struct kvm_vcpu *vcpu) |
| kvm_arch_vcpu_set_kerncs_prio(&vcpu->arch, VCPU_KERN_CS_PRIO); |
| kvm_arch_vcpu_set_kerncs_policy(&vcpu->arch, VCPU_KERN_CS_POLICY); |
| kvm_arch_vcpu_set_default_sched_attr(&vcpu->arch, kvm_vcpu_get_sched(vcpu)); |
| + |
| + hrtimer_init(&vcpu->arch.pv_sched.boost_thr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); |
| + vcpu->arch.pv_sched.boost_thr_timer.function = boost_throttle_timer_fn; |
| +} |
| + |
| +static void kvm_vcpu_pv_sched_fini(struct kvm_vcpu *vcpu) |
| +{ |
| + hrtimer_cancel(&vcpu->arch.pv_sched.boost_thr_timer); |
| } |
| #else |
| static inline void record_vcpu_pv_sched(struct kvm_vcpu *vcpu) { } |
| static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu) { } |
| static inline void kvm_vcpu_pv_sched_init(struct kvm_vcpu *vcpu) { } |
| +static inline void kvm_vcpu_pv_sched_fini(struct kvm_vcpu *vcpu) { } |
| + |
| +static inline void kvm_vcpu_pvsched_update_vmenter(struct kvm_vcpu_arch *arch) { } |
| +static inline void kvm_vcpu_pvsched_update_vmexit(struct kvm_vcpu_arch *arch) { } |
| #endif |
| |
| /* |
| @@ -11283,6 +11396,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) |
| set_debugreg(0, 7); |
| } |
| |
| + kvm_vcpu_pvsched_update_vmenter(&vcpu->arch); |
| guest_timing_enter_irqoff(); |
| |
| for (;;) { |
| @@ -12551,6 +12665,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
| srcu_read_unlock(&vcpu->kvm->srcu, idx); |
| free_page((unsigned long)vcpu->arch.pio_data); |
| kvfree(vcpu->arch.cpuid_entries); |
| + |
| + kvm_vcpu_pv_sched_fini(vcpu); |
| } |
| |
| void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) |
| diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c |
| index 838dc48a45d54bb25973783881a066488fb7558b..c89bd71a4d9ef2008453ea529be58fe378118632 100644 |
| --- a/virt/kvm/kvm_main.c |
| +++ b/virt/kvm/kvm_main.c |
| @@ -4171,13 +4171,15 @@ int kvm_vcpu_set_sched(struct kvm_vcpu *vcpu, union vcpu_sched_attr attr) |
| struct pid *pid; |
| struct task_struct *vcpu_task = NULL; |
| int max_rt_prio = kvm_arch_vcpu_kerncs_prio(&vcpu->arch); |
| + union vcpu_sched_attr default_attr = kvm_arch_vcpu_default_sched_attr(&vcpu->arch); |
| |
| retry_disable: |
| /* |
| - * If the feature is disabled, revert to CFS. |
| + * If the feature is disabled, or a boost request comes when throttled, revert to CFS. |
| */ |
| - if (!kvm_vcpu_sched_enabled(vcpu)) |
| - attr = kvm_arch_vcpu_default_sched_attr(&vcpu->arch); |
| + if (!kvm_vcpu_sched_enabled(vcpu) || (kvm_arch_vcpu_is_throttled(&vcpu->arch) && |
| + (attr.kern_cs || kvm_arch_vcpu_normalprio_cmp(&vcpu->arch, attr) < 0))) |
| + attr = default_attr; |
| |
| policy = attr.sched_policy; |
| rt_prio = attr.rt_priority; |
| @@ -4256,7 +4258,8 @@ void kvm_vcpu_boost(struct kvm_vcpu *vcpu, enum kerncs_boost_type boost_type) |
| .kern_cs = boost_type |
| }; |
| |
| - kvm_vcpu_set_sched(vcpu, attr); |
| + if (kvm_vcpu_sched_enabled(vcpu)) |
| + kvm_vcpu_set_sched(vcpu, attr); |
| } |
| EXPORT_SYMBOL_GPL(kvm_vcpu_boost); |
| #endif |
| -- |
| 2.46.0.rc2.264.g509ed76dc8-goog |
| |