From 1f2267bcc1af48e9debfe2ddcb5088af570225e9 Mon Sep 17 00:00:00 2001
From: "Vineeth Pillai (Google)" <vineeth@bitbyteword.org>
Date: Tue, 7 Nov 2023 21:25:45 -0500
Subject: [PATCH] CHROMIUM: kvm: x86: paravirt sched framework
Implement the basic framework needed for guest/host communication to
facilitate paravirt sched:
- A KVM MSR that the guest uses to provide the GPA of the shared memory
  used for communicating scheduling information between host and guest.
  wrmsr(0) disables the feature; wrmsr(valid_gpa) enables it and uses
  that GPA for further communication (see the guest-side sketch after
  this list).
- A new CPUID feature flag with which the host advertises the feature
  to the guest.
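To make the MSR protocol above concrete, a minimal guest-side sketch
follows. The guest implementation is not part of this patch, so the
per-cpu variable and the helper name below are hypothetical;
MSR_KVM_PV_SCHED, KVM_MSR_ENABLED, KVM_FEATURE_PV_SCHED and
struct pv_sched_data are the ones introduced by this series.

  #include <linux/kvm_para.h>
  #include <linux/percpu.h>
  #include <asm/kvm_para.h>       /* kvm_para_has_feature() */
  #include <asm/msr.h>            /* wrmsrl() */
  #include <asm/pgtable_types.h>  /* slow_virt_to_phys() */

  /* Hypothetical per-cpu shared area, one instance per vcpu. */
  static DEFINE_PER_CPU(struct pv_sched_data, pv_sched) __aligned(64);

  /*
   * Hypothetical registration helper; run on the target CPU
   * (e.g. from a CPU-online callback).
   */
  static void kvm_pv_sched_register(void)
  {
          u64 gpa;

          if (!kvm_para_has_feature(KVM_FEATURE_PV_SCHED))
                  return;

          gpa = slow_virt_to_phys(this_cpu_ptr(&pv_sched));
          /* GPA of the shared area with the enable bit set. */
          wrmsrl(MSR_KVM_PV_SCHED, gpa | KVM_MSR_ENABLED);
  }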
When the guest kernel is about to run a critical or latency-sensitive
workload, it can request the hypervisor to boost the priority of the
vcpu thread. Similarly, the guest kernel can request an unboost when the
vcpu switches back to a normal workload. The guest kernel can also share
the priority attributes of the task it is about to schedule, and the
host can adjust the priority of the vcpu thread accordingly.
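Below is a hedged sketch of how the guest side might publish those
attributes; the hook pv_sched_notify_next_task() and its call site
(e.g. the guest's context-switch path) are hypothetical, and it reuses
the per-cpu pv_sched area from the registration sketch above. The guest
slot, attr[PV_SCHEDATTR_GUEST], is the layout this patch adds to
include/uapi/linux/kvm_para.h.

  #include <linux/sched.h>        /* struct task_struct, task_nice() */

  /*
   * Hypothetical hook: publish the sched attributes of the task about
   * to run so the host can mirror them onto the vcpu thread on its
   * next VMEXIT. Runs with preemption disabled (inside the scheduler).
   */
  static void pv_sched_notify_next_task(struct task_struct *next)
  {
          union vcpu_sched_attr attr = { .pad = 0 };
          union vcpu_sched_attr *slot =
                  &this_cpu_ptr(&pv_sched)->attr[PV_SCHEDATTR_GUEST];

          attr.sched_policy = next->policy;
          if (next->policy == SCHED_FIFO || next->policy == SCHED_RR)
                  attr.rt_priority = next->rt_priority;
          else
                  attr.sched_nice = task_nice(next);

          /* Publish the whole union with a single 64-bit store. */
          WRITE_ONCE(slot->pad, attr.pad);
  }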
When the guest determines that it needs a boost, it need not request a
synchronous boost immediately, since it is already running at that
moment. A synchronous request is detrimental because it incurs a VMEXIT.
Instead, the guest notes down its request in the shared memory, and the
host checks the request on the next VMEXIT and boosts if needed.
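A minimal sketch of such a lazy request is shown below; the helper
names are hypothetical and the per-cpu pv_sched area is the one from
the earlier sketches. Only guest memory is written, so no VMEXIT is
taken at request time; the host picks up the guest slot (including the
kern_cs field) in kvm_vcpu_do_pv_sched() when it next handles a VMEXIT.

  /*
   * Hypothetical helpers: note a kernel critical-section boost request
   * in the shared memory instead of issuing a hypercall.
   */
  static inline void pv_sched_request_boost(enum kerncs_boost_type type)
  {
          union vcpu_sched_attr *slot =
                  &this_cpu_ptr(&pv_sched)->attr[PV_SCHEDATTR_GUEST];

          WRITE_ONCE(slot->kern_cs, type);
  }

  static inline void pv_sched_request_unboost(void)
  {
          union vcpu_sched_attr *slot =
                  &this_cpu_ptr(&pv_sched)->attr[PV_SCHEDATTR_GUEST];

          WRITE_ONCE(slot->kern_cs, 0);
  }

  /*
   * Example: keep a preemption-disabled section boosted.
   *
   *   preempt_disable();
   *   pv_sched_request_boost(PVSCHED_KERNCS_BOOST_PREEMPT_DISABLED);
   *   ... latency-sensitive work ...
   *   pv_sched_request_unboost();
   *   preempt_enable();
   */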
NMIs, IRQs, softirqs and preemption-disabled regions are considered
latency-sensitive critical sections in the kernel and are candidates
for boosting.
UPSTREAM-TASK=b:303645537
BUG=b:262267726
TEST=boot
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
(cherry picked from commit e654656547e9b54e4e69fa9d8f13ecf460c9f444)
Change-Id: I4a63df3d12bec9026fcb7c82f82b4bdf02cbb0b4
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/5425465
Reviewed-by: Joel Fernandes <joelaf@google.com>
Reviewed-by: Masami Hiramatsu <mhiramat@google.com>
Commit-Queue: Vineeth Pillai <vineethrp@google.com>
Tested-by: Vineeth Pillai <vineethrp@google.com>
---
arch/x86/include/asm/kvm_host.h | 133 ++++++++++++++++++++++++
arch/x86/include/uapi/asm/kvm_para.h | 2 +
arch/x86/kvm/Kconfig | 13 +++
arch/x86/kvm/cpuid.c | 2 +
arch/x86/kvm/x86.c | 97 ++++++++++++++++++
include/linux/kvm_host.h | 15 +++
include/uapi/linux/kvm_para.h | 49 +++++++++
virt/kvm/kvm_main.c | 145 +++++++++++++++++++++++++++
8 files changed, 456 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f849f3dfe07bf5c1e4638bbc4f611d3a98810e54..45b5aca1276b2fec45bab32312cee33dd4455981 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,6 +16,7 @@
#include <linux/irq_work.h>
#include <linux/irq.h>
#include <linux/workqueue.h>
+#include <linux/sched/deadline.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -740,6 +741,35 @@ struct kvm_queued_exception {
bool has_payload;
};
+/*
+ * PARAVIRT_SCHED info
+ */
+struct vcpu_pv_sched {
+ /*
+ * Current scheduling attributes for this vcpu.
+ */
+ union vcpu_sched_attr attr;
+ /*
+ * Kernel priority : [-1, 140)
+ * Used for priority comparisons.
+ */
+ int normal_prio;
+ /*
+ * Default scheduling attributes for this vcpu,
+ * when the VM was started.
+ */
+ union vcpu_sched_attr default_attr;
+ int default_normal_prio;
+ /*
+ * Policy and priority for guest kernel critical
+ * sections - nmi, irq, softirq and preemption disabled.
+ */
+ int kern_cs_prio;
+ int kern_cs_policy;
+ u64 msr_val;
+ struct gfn_to_hva_cache data;
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -1033,6 +1063,10 @@ struct kvm_vcpu_arch {
/* Protected Guests */
bool guest_state_protected;
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+ struct vcpu_pv_sched pv_sched;
+#endif
+
/*
* Set when PDPTS were loaded directly by the userspace without
* reading the guest memory
@@ -2354,4 +2388,103 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
*/
#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+/*
+ * Default policy and priority used for boosting
+ * VCPU threads.
+ */
+#define VCPU_KERN_CS_PRIO 8
+#define VCPU_KERN_CS_POLICY SCHED_RR
+
+static inline bool kvm_arch_vcpu_pv_sched_enabled(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.msr_val;
+}
+
+static inline union vcpu_sched_attr kvm_arch_vcpu_default_sched_attr(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.default_attr;
+}
+
+/*
+ * copied from kernel/sched/core.c:__normal_prio()
+ */
+static inline int __sched_normal_prio(union vcpu_sched_attr attr)
+{
+ int prio;
+
+ if (attr.sched_policy == SCHED_DEADLINE)
+ prio = MAX_DL_PRIO - 1;
+ else if (attr.sched_policy == SCHED_FIFO || attr.sched_policy == SCHED_RR)
+ prio = MAX_RT_PRIO - 1 - attr.rt_priority;
+ else
+ prio = NICE_TO_PRIO(attr.sched_nice);
+
+ return prio;
+}
+
+/*
+ * Returns
+ * 0 if the vcpu's prio is equal to the prio specified by attr,
+ * 1 if the vcpu's prio is greater than the prio specified by attr,
+ * -1 if the vcpu's prio is less than the prio specified by attr.
+ */
+static inline int kvm_arch_vcpu_normalprio_cmp(struct kvm_vcpu_arch *arch,
+ union vcpu_sched_attr attr)
+{
+ int normal_prio = __sched_normal_prio(attr);
+
+ if (normal_prio == arch->pv_sched.normal_prio)
+ return 0;
+ else if (normal_prio > arch->pv_sched.normal_prio)
+ return 1;
+ else
+ return -1;
+}
+
+static inline void kvm_arch_vcpu_set_sched_attr(struct kvm_vcpu_arch *arch,
+ union vcpu_sched_attr attr)
+{
+ arch->pv_sched.attr = attr;
+ arch->pv_sched.normal_prio = __sched_normal_prio(attr);
+}
+
+static inline void kvm_arch_vcpu_set_default_sched_attr(struct kvm_vcpu_arch *arch,
+ union vcpu_sched_attr attr)
+{
+ arch->pv_sched.default_attr = attr;
+ arch->pv_sched.default_normal_prio = __sched_normal_prio(attr);
+}
+
+static inline int kvm_arch_vcpu_kerncs_prio(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.kern_cs_prio;
+}
+
+static inline int kvm_arch_vcpu_kerncs_policy(struct kvm_vcpu_arch *arch)
+{
+ return arch->pv_sched.kern_cs_policy;
+}
+
+static inline int kvm_arch_vcpu_set_kerncs_prio(struct kvm_vcpu_arch *arch, int prio)
+{
+ if ((unsigned int)prio > MAX_RT_PRIO)
+ return -EINVAL;
+
+ arch->pv_sched.kern_cs_prio = prio;
+
+ return 0;
+}
+
+static inline int kvm_arch_vcpu_set_kerncs_policy(struct kvm_vcpu_arch *arch, int policy)
+{
+ if (policy != SCHED_FIFO && policy != SCHED_RR)
+ return -EINVAL;
+
+ arch->pv_sched.kern_cs_policy = policy;
+
+ return 0;
+}
+#endif
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 79c12988bc87aaa68c7ef5f4d39cb6fba81b1455..2f51db8f725a5fa88591c56a5db4674416dfd938 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -36,6 +36,7 @@
#define KVM_FEATURE_MSI_EXT_DEST_ID 15
#define KVM_FEATURE_HC_MAP_GPA_RANGE 16
#define KVM_FEATURE_MIGRATION_CONTROL 17
+#define KVM_FEATURE_PV_SCHED 29
#define KVM_FEATURE_HOST_SUSPEND_TIME 30
#define KVM_HINTS_REALTIME 0
@@ -60,6 +61,7 @@
#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07
#define MSR_KVM_MIGRATION_CONTROL 0x4b564d08
#define MSR_KVM_HOST_SUSPEND_TIME 0x4b564d98
+#define MSR_KVM_PV_SCHED 0x4b564da0
struct kvm_steal_time {
__u64 steal;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 22012ad6ff9ace5b14be4bb02c5ddc13a1fa59e8..9eae15d3234112ee5cad97c44b522af50845c66f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -221,4 +221,17 @@ config KVM_VIRT_SUSPEND_TIMING
If unsure, say N.
+config PARAVIRT_SCHED_KVM
+ bool "Enable paravirt scheduling capability for kvm"
+ depends on KVM
+ default n
+ help
+ Paravirtualized scheduling facilitates the exchange of scheduling
+ related information between the host and guest through shared memory,
+ enhancing the efficiency of vCPU thread scheduling by the hypervisor.
+ An illustrative use case involves dynamically boosting the priority of
+ a vCPU thread when the guest is executing a latency-sensitive workload
+ on that specific vCPU.
+ This config enables paravirt scheduling in the kvm hypervisor.
+
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 76b4c4e05e89c28c3f16cbb79cdfbc3cd005df00..40b52f84e11053563965526cb1cc631b665387e9 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1196,6 +1196,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
(1 << KVM_FEATURE_POLL_CONTROL) |
(1 << KVM_FEATURE_PV_SCHED_YIELD) |
(1 << KVM_FEATURE_ASYNC_PF_INT);
+ if (IS_ENABLED(CONFIG_PARAVIRT_SCHED_KVM))
+ entry->eax |= (1 << KVM_FEATURE_PV_SCHED);
#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
entry->eax |= (1 << KVM_FEATURE_HOST_SUSPEND_TIME);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e78f86f258c4b3fd91972632bd2887a8f13ac16d..6dcbe53f6a59d3463b97d1fc82a91454a1608b89 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2184,6 +2184,15 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
ret = EXIT_FASTPATH_REENTER_GUEST;
}
break;
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+ case MSR_KVM_PV_SCHED:
+ data = kvm_read_edx_eax(vcpu);
+ if (data == ULLONG_MAX) {
+ kvm_skip_emulated_instruction(vcpu);
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
+ }
+ break;
+#endif
default:
break;
}
@@ -4035,6 +4044,37 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
break;
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+ case MSR_KVM_PV_SCHED:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED))
+ return 1;
+
+ if (!(data & KVM_MSR_ENABLED))
+ break;
+
+ if (!(data & ~KVM_MSR_ENABLED) && vcpu->arch.pv_sched.msr_val) {
+ /*
+ * Disable the feature
+ */
+ vcpu->arch.pv_sched.msr_val = 0;
+ kvm_vcpu_set_sched(vcpu,
+ kvm_arch_vcpu_default_sched_attr(&vcpu->arch));
+ } else if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.pv_sched.data, data & ~KVM_MSR_ENABLED,
+ sizeof(struct pv_sched_data))) {
+ vcpu->arch.pv_sched.msr_val = data;
+ kvm_arch_vcpu_set_default_sched_attr(&vcpu->arch,
+ kvm_vcpu_get_sched(vcpu));
+ kvm_vcpu_set_sched(vcpu,
+ kvm_arch_vcpu_default_sched_attr(&vcpu->arch));
+ } else {
+ kvm_debug_ratelimited(
+ "kvm:%p, vcpu:%p, msr: %llx, kvm_gfn_to_hva_cache_init failed!\n",
+ vcpu->kvm, vcpu, data & ~KVM_MSR_ENABLED);
+ }
+ break;
+#endif
+
case MSR_KVM_POLL_CONTROL:
if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
return 1;
@@ -4402,6 +4442,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+ case MSR_KVM_PV_SCHED:
+ msr_info->data = vcpu->arch.pv_sched.msr_val;
+ break;
+#endif
case MSR_KVM_POLL_CONTROL:
if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
return 1;
@@ -10912,6 +10957,51 @@ static void kvm_adjust_suspend_time(struct kvm_vcpu *vcpu)
}
#endif
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+/*
+ * Update the host area of PV_SCHED with the vcpu task's sched parameters
+ * so that the guest can utilize them if needed.
+ */
+static void record_vcpu_pv_sched(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_arch_vcpu_pv_sched_enabled(&vcpu->arch))
+ return;
+
+ pagefault_disable();
+ kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.pv_sched.data,
+ &vcpu->arch.pv_sched.attr, PV_SCHEDATTR_HOST_OFFSET, sizeof(union vcpu_sched_attr));
+ pagefault_enable();
+}
+
+static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_sched_enabled(vcpu))
+ return;
+
+ if (kvm_cpu_has_pending_timer(vcpu) || kvm_cpu_has_interrupt(vcpu))
+ kvm_vcpu_boost(vcpu, PVSCHED_KERNCS_BOOST_IRQ);
+ else {
+ union vcpu_sched_attr attr;
+
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.pv_sched.data,
+ &attr, PV_SCHEDATTR_GUEST_OFFSET, sizeof(attr)))
+ return;
+ kvm_vcpu_set_sched(vcpu, attr);
+ }
+}
+
+static void kvm_vcpu_pv_sched_init(struct kvm_vcpu *vcpu)
+{
+ kvm_arch_vcpu_set_kerncs_prio(&vcpu->arch, VCPU_KERN_CS_PRIO);
+ kvm_arch_vcpu_set_kerncs_policy(&vcpu->arch, VCPU_KERN_CS_POLICY);
+ kvm_arch_vcpu_set_default_sched_attr(&vcpu->arch, kvm_vcpu_get_sched(vcpu));
+}
+#else
+static inline void record_vcpu_pv_sched(struct kvm_vcpu *vcpu) { }
+static inline void kvm_vcpu_do_pv_sched(struct kvm_vcpu *vcpu) { }
+static inline void kvm_vcpu_pv_sched_init(struct kvm_vcpu *vcpu) { }
+#endif
+
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -11015,6 +11105,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_VCPU_PV_SCHED, vcpu))
+ record_vcpu_pv_sched(vcpu);
#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
@@ -11285,6 +11377,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
guest_timing_exit_irqoff();
local_irq_enable();
+
+ kvm_vcpu_do_pv_sched(vcpu);
+
preempt_enable();
kvm_vcpu_srcu_read_lock(vcpu);
@@ -12384,6 +12479,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (r)
goto free_guest_fpu;
+ kvm_vcpu_pv_sched_init(vcpu);
+
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 700c886117eccc1369f0cd290779785158c0fbac..79154fa34aa12d2ab04adee7b232c78d8b48f798 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -174,6 +174,7 @@ static inline bool is_error_page(struct page *page)
#define KVM_REQ_UNBLOCK 2
#define KVM_REQ_DIRTY_RING_SOFT_FULL 3
#define KVM_REQ_SUSPEND_TIME_ADJ 5
+#define KVM_REQ_VCPU_PV_SCHED 6
#define KVM_REQUEST_ARCH_BASE 8
/*
@@ -2611,4 +2612,18 @@ static inline u64 vcpu_suspend_time_injected(struct kvm_vcpu *vcpu)
}
#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING */
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+int kvm_vcpu_set_sched(struct kvm_vcpu *vcpu, union vcpu_sched_attr attr);
+union vcpu_sched_attr kvm_vcpu_get_sched(struct kvm_vcpu *vcpu);
+
+static inline bool kvm_vcpu_sched_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_vcpu_pv_sched_enabled(&vcpu->arch);
+}
+
+void kvm_vcpu_boost(struct kvm_vcpu *vcpu, enum kerncs_boost_type boost_type);
+#else
+static inline void kvm_vcpu_boost(struct kvm_vcpu *vcpu, enum kerncs_boost_type boost_type) { }
+#endif
+
#endif
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 960c7e93d1a98a363df4c6e7eeaffee065512f7c..b7c11650ec84f8a740e1111fd460756a98a0a88e 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -2,6 +2,9 @@
#ifndef _UAPI__LINUX_KVM_PARA_H
#define _UAPI__LINUX_KVM_PARA_H
+#include <linux/const.h>
+#include <linux/types.h>
+
/*
* This header file provides a method for making a hypercall to the host
* Architectures should define:
@@ -31,6 +34,52 @@
#define KVM_HC_SCHED_YIELD 11
#define KVM_HC_MAP_GPA_RANGE 12
+enum kerncs_boost_type {
+ PVSCHED_KERNCS_BOOST_PREEMPT_DISABLED = 0x1,
+ PVSCHED_KERNCS_BOOST_IRQ = 0x2,
+ PVSCHED_KERNCS_BOOST_SOFTIRQ = 0x4,
+ PVSCHED_KERNCS_BOOST_IDLE = 0x8,
+ PVSCHED_KERNCS_BOOST_ALL = 0xF,
+};
+
+union vcpu_sched_attr {
+ struct {
+ __u8 enabled;
+ __u8 sched_policy;
+ __s8 sched_nice;
+ __u8 rt_priority;
+ /*
+ * Guest running a kernel critical section:
+ * - nmi, irq, softirq, preemption disabled.
+ */
+ __u8 kern_cs;
+ };
+ __u64 pad;
+};
+
+enum pv_schedattr_type {
+ PV_SCHEDATTR_GUEST = 0,
+ PV_SCHEDATTR_HOST,
+ PV_SCHEDATTR_MAX
+};
+
+/*
+ * Offset of guest area in the PV_SCHED shared memory.
+ */
+#define PV_SCHEDATTR_GUEST_OFFSET 0
+
+/*
+ * Offset of the host area in the PV_SCHED shared memory.
+ */
+#define PV_SCHEDATTR_HOST_OFFSET (sizeof(union vcpu_sched_attr))
+
+/*
+ * PARAVIRT_SCHED info shared between host and guest.
+ */
+struct pv_sched_data {
+ union vcpu_sched_attr attr[PV_SCHEDATTR_MAX];
+};
+
/*
* hypercalls use architecture specific
*/
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 758ece99863c0ef0c65555d9fc68dea790773411..5e086be7a8dc21ea3e4292a8b6133f3295fc5402 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -54,6 +54,9 @@
#include <asm/ioctl.h>
#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
+
#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
@@ -4108,6 +4111,148 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+union vcpu_sched_attr kvm_vcpu_get_sched(struct kvm_vcpu *vcpu)
+{
+ union vcpu_sched_attr attr = { 0 };
+ struct pid *pid;
+ struct task_struct *vcpu_task = NULL;
+
+ rcu_read_lock();
+ pid = rcu_dereference(vcpu->pid);
+ if (pid)
+ vcpu_task = get_pid_task(pid, PIDTYPE_PID);
+ rcu_read_unlock();
+ if (vcpu_task == NULL)
+ return attr;
+
+ attr.sched_policy = vcpu_task->policy;
+ if (vcpu_task->policy == SCHED_RR || vcpu_task->policy == SCHED_FIFO)
+ attr.rt_priority = vcpu_task->rt_priority;
+ else
+ attr.sched_nice = task_nice(vcpu_task);
+
+ put_task_struct(vcpu_task);
+
+ return attr;
+}
+
+/*
+ * Check whether we need to act on the boost/unboost request.
+ * Returns non-zero when the vcpu's current normal priority differs
+ * from the priority implied by the requested policy/prio/nice
+ * (i.e. the scheduling attributes need to be updated), 0 otherwise.
+ */
+static inline bool __needs_set_sched(struct kvm_vcpu *vcpu, int policy, int prio, int nice)
+{
+ union vcpu_sched_attr attr = {
+ .sched_policy = policy,
+ .rt_priority = prio,
+ .sched_nice = nice
+ };
+
+ return kvm_arch_vcpu_normalprio_cmp(&vcpu->arch, attr);
+}
+
+int kvm_vcpu_set_sched(struct kvm_vcpu *vcpu, union vcpu_sched_attr attr)
+{
+ int rt_prio;
+ int nice;
+ int policy;
+ int ret = 0;
+ struct pid *pid;
+ struct task_struct *vcpu_task = NULL;
+ int max_rt_prio = kvm_arch_vcpu_kerncs_prio(&vcpu->arch);
+
+retry_disable:
+ /*
+ * If the feature is disabled, revert to CFS.
+ */
+ if (!kvm_vcpu_sched_enabled(vcpu))
+ attr = kvm_arch_vcpu_default_sched_attr(&vcpu->arch);
+
+ policy = attr.sched_policy;
+ rt_prio = attr.rt_priority;
+ nice = attr.sched_nice;
+ if (attr.kern_cs || policy == SCHED_DEADLINE) {
+ nice = 0;
+ policy = kvm_arch_vcpu_kerncs_policy(&vcpu->arch);
+ rt_prio = max_rt_prio;
+ } else if (policy == SCHED_FIFO || policy == SCHED_RR) {
+ nice = 0;
+ if (rt_prio > max_rt_prio)
+ rt_prio = max_rt_prio;
+ } else {
+ rt_prio = 0;
+ }
+
+ rcu_read_lock();
+ pid = rcu_dereference(vcpu->pid);
+ if (pid)
+ vcpu_task = get_pid_task(pid, PIDTYPE_PID);
+ rcu_read_unlock();
+ if (vcpu_task == NULL)
+ return -KVM_EINVAL;
+
+ /*
+ * This might be called from interrupt context.
+ * Since we do not use rt-mutexes, we can safely call
+ * sched_setattr_pi_nocheck() with pi = false.
+ * NOTE: if we ever use rt-mutexes, this should be modified
+ * to do the boost/unboost from a tasklet instead.
+ */
+ WARN_ON_ONCE(vcpu_task->pi_top_task);
+ if (__needs_set_sched(vcpu, policy, rt_prio, nice)) {
+ struct sched_attr sattr = {
+ .sched_policy = policy,
+ .sched_nice = nice,
+ .sched_priority = rt_prio,
+ };
+ ret = sched_setattr_pi_nocheck(vcpu_task, &sattr, false);
+ }
+
+ /*
+ * values to return to guest.
+ */
+ attr.sched_policy = vcpu_task->policy;
+ if (task_is_realtime(vcpu_task))
+ attr.rt_priority = vcpu_task->rt_priority;
+ else
+ attr.sched_nice = task_nice(vcpu_task);
+
+ put_task_struct(vcpu_task);
+
+ attr.enabled = kvm_vcpu_sched_enabled(vcpu);
+ /*
+ * attr.enabled lets the guest know whether the feature is still enabled.
+ */
+ if (!attr.enabled) {
+ /*
+ * There could be a race where the disable path disabled the feature
+ * but we did the boost without knowing that disable was in progress.
+ * Unboost again.
+ */
+ if (attr.sched_policy != SCHED_NORMAL || attr.sched_nice != 0)
+ goto retry_disable;
+ }
+
+ kvm_arch_vcpu_set_sched_attr(&vcpu->arch, attr);
+ kvm_make_request(KVM_REQ_VCPU_PV_SCHED, vcpu);
+
+ return ret;
+}
+
+void kvm_vcpu_boost(struct kvm_vcpu *vcpu, enum kerncs_boost_type boost_type)
+{
+ union vcpu_sched_attr attr = {
+ .kern_cs = boost_type
+ };
+
+ kvm_vcpu_set_sched(vcpu, attr);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_boost);
+#endif
+
#ifndef CONFIG_S390
/*
* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
--
2.46.0.rc2.264.g509ed76dc8-goog