| From d359148cc6c0be723fb687577eedf8a72d8e6598 Mon Sep 17 00:00:00 2001 |
| From: Peter Zijlstra <peterz@infradead.org> |
| Date: Tue, 17 Nov 2020 18:19:34 -0500 |
| Subject: [PATCH] FROMLIST: sched: Core-wide rq->lock |
| |
| Introduce the basic infrastructure to have a core-wide rq->lock. |
| |
| This relies on rq->__lock being taken in order of increasing CPU |
| number within a core. It is also constrained to SMT8 by lockdep (and |
| to SMT256 by preempt_count). |
| |
| Luckily, SMT8 is the maximum SMT count supported by Linux (MIPS, SPARC |
| and Power are known to have it). |
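| |
| For illustration only (not part of this patch; the caller names below |
| are hypothetical): later patches that tag tasks with a core-sched |
| cookie are expected to pair the refcount helpers added here, roughly: |
| |
|     /* hypothetical follow-up user, e.g. a prctl() handler */ |
|     static int hypothetical_enable_core_sched(struct task_struct *p) |
|     { |
|         sched_core_get();   /* first user flips __sched_core_enabled on */ |
|         /* ... assign a core-sched cookie to @p (later patches) ... */ |
|         return 0; |
|     } |
| |
|     static void hypothetical_disable_core_sched(struct task_struct *p) |
|     { |
|         /* ... clear @p's cookie (later patches) ... */ |
|         sched_core_put();   /* last user switches back to per-rq locks */ |
|     } |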
| |
| BUG=b:152605392 |
| TEST=run power_VideoCall test |
| |
| (am from |
| https://lore.kernel.org/lkml/20210422123308.256677625@infradead.org/) |
| |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Tested-by: Don Hiatt <dhiatt@digitalocean.com> |
| Signed-off-by: Joel Fernandes <joelaf@google.com> |
| Change-Id: Ib26f4510ff9de18147f86b4e216c7abfc3c87831 |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2880780 |
| Reviewed-by: Sonny Rao <sonnyrao@chromium.org> |
| --- |
| kernel/Kconfig.preempt | 5 ++ |
| kernel/sched/core.c | 164 ++++++++++++++++++++++++++++++++++++++++- |
| kernel/sched/sched.h | 57 ++++++++++++++ |
| 3 files changed, 222 insertions(+), 4 deletions(-) |
| |
| diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt |
| index 416017301660..4842bd6845c3 100644 |
| --- a/kernel/Kconfig.preempt |
| +++ b/kernel/Kconfig.preempt |
| @@ -82,6 +82,11 @@ config PREEMPTION |
| bool |
| select PREEMPT_COUNT |
| |
| +config SCHED_CORE |
| + bool "Core Scheduling for SMT" |
| + default y |
| + depends on SCHED_SMT |
| + |
| config PREEMPT_DYNAMIC |
| bool |
| help |
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c |
| index 790dab92fbab..005ea8ad4b72 100644 |
| --- a/kernel/sched/core.c |
| +++ b/kernel/sched/core.c |
| @@ -84,6 +84,108 @@ unsigned int sysctl_sched_rt_period = 1000000; |
| |
| __read_mostly int scheduler_running; |
| |
| +#ifdef CONFIG_SCHED_CORE |
| + |
| +DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); |
| + |
| +/* |
| + * Magic required such that: |
| + * |
| + * raw_spin_rq_lock(rq); |
| + * ... |
| + * raw_spin_rq_unlock(rq); |
| + * |
| + * ends up locking and unlocking the _same_ lock, and all CPUs |
| + * always agree on what rq has what lock. |
| + * |
| + * XXX entirely possible to selectively enable cores, don't bother for now. |
| + */ |
| + |
| +static DEFINE_MUTEX(sched_core_mutex); |
| +static int sched_core_count; |
| +static struct cpumask sched_core_mask; |
| + |
| +static void __sched_core_flip(bool enabled) |
| +{ |
| + int cpu, t, i; |
| + |
| + cpus_read_lock(); |
| + |
| + /* |
| + * Toggle the online cores, one by one. |
| + */ |
| + cpumask_copy(&sched_core_mask, cpu_online_mask); |
| + for_each_cpu(cpu, &sched_core_mask) { |
| + const struct cpumask *smt_mask = cpu_smt_mask(cpu); |
| + |
| + i = 0; |
| + local_irq_disable(); |
| + for_each_cpu(t, smt_mask) { |
| + /* supports up to SMT8 */ |
| + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); |
| + } |
| + |
| + for_each_cpu(t, smt_mask) |
| + cpu_rq(t)->core_enabled = enabled; |
| + |
| + for_each_cpu(t, smt_mask) |
| + raw_spin_unlock(&cpu_rq(t)->__lock); |
| + local_irq_enable(); |
| + |
| + cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); |
| + } |
| + |
| + /* |
| + * Toggle the offline CPUs. |
| + */ |
| + cpumask_copy(&sched_core_mask, cpu_possible_mask); |
| + cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask); |
| + |
| + for_each_cpu(cpu, &sched_core_mask) |
| + cpu_rq(cpu)->core_enabled = enabled; |
| + |
| + cpus_read_unlock(); |
| +} |
| + |
| +static void __sched_core_enable(void) |
| +{ |
| + // XXX verify there are no cookie tasks (yet) |
| + |
| + static_branch_enable(&__sched_core_enabled); |
| + /* |
| + * Ensure all previous instances of raw_spin_rq_*lock() have finished |
| + * and future ones will observe !sched_core_disabled(). |
| + */ |
| + synchronize_rcu(); |
| + __sched_core_flip(true); |
| +} |
| + |
| +static void __sched_core_disable(void) |
| +{ |
| + // XXX verify there are no cookie tasks (left) |
| + |
| + __sched_core_flip(false); |
| + static_branch_disable(&__sched_core_enabled); |
| +} |
| + |
| +void sched_core_get(void) |
| +{ |
| + mutex_lock(&sched_core_mutex); |
| + if (!sched_core_count++) |
| + __sched_core_enable(); |
| + mutex_unlock(&sched_core_mutex); |
| +} |
| + |
| +void sched_core_put(void) |
| +{ |
| + mutex_lock(&sched_core_mutex); |
| + if (!--sched_core_count) |
| + __sched_core_disable(); |
| + mutex_unlock(&sched_core_mutex); |
| +} |
| + |
| +#endif /* CONFIG_SCHED_CORE */ |
| + |
| /* |
| * part of the period that we allow rt tasks to run in us. |
| * default: 0.95s |
| @@ -188,16 +290,23 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass) |
| { |
| raw_spinlock_t *lock; |
| |
| + /* Matches synchronize_rcu() in __sched_core_enable() */ |
| + preempt_disable(); |
| if (sched_core_disabled()) { |
| raw_spin_lock_nested(&rq->__lock, subclass); |
| + /* preempt_count *MUST* be > 1 */ |
| + preempt_enable_no_resched(); |
| return; |
| } |
| |
| for (;;) { |
| lock = rq_lockp(rq); |
| raw_spin_lock_nested(lock, subclass); |
| - if (likely(lock == rq_lockp(rq))) |
| + if (likely(lock == rq_lockp(rq))) { |
| + /* preempt_count *MUST* be > 1 */ |
| + preempt_enable_no_resched(); |
| return; |
| + } |
| raw_spin_unlock(lock); |
| } |
| } |
| @@ -207,14 +316,21 @@ bool raw_spin_rq_trylock(struct rq *rq) |
| raw_spinlock_t *lock; |
| bool ret; |
| |
| - if (sched_core_disabled()) |
| - return raw_spin_trylock(&rq->__lock); |
| + /* Matches synchronize_rcu() in __sched_core_enable() */ |
| + preempt_disable(); |
| + if (sched_core_disabled()) { |
| + ret = raw_spin_trylock(&rq->__lock); |
| + preempt_enable(); |
| + return ret; |
| + } |
| |
| for (;;) { |
| lock = rq_lockp(rq); |
| ret = raw_spin_trylock(lock); |
| - if (!ret || (likely(lock == rq_lockp(rq)))) |
| + if (!ret || (likely(lock == rq_lockp(rq)))) { |
| + preempt_enable(); |
| return ret; |
| + } |
| raw_spin_unlock(lock); |
| } |
| } |
| @@ -5042,6 +5158,40 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
| BUG(); |
| } |
| |
| +#ifdef CONFIG_SCHED_CORE |
| + |
| +static inline void sched_core_cpu_starting(unsigned int cpu) |
| +{ |
| + const struct cpumask *smt_mask = cpu_smt_mask(cpu); |
| + struct rq *rq, *core_rq = NULL; |
| + int i; |
| + |
| + core_rq = cpu_rq(cpu)->core; |
| + |
| + if (!core_rq) { |
| + for_each_cpu(i, smt_mask) { |
| + rq = cpu_rq(i); |
| + if (rq->core && rq->core == rq) |
| + core_rq = rq; |
| + } |
| + |
| + if (!core_rq) |
| + core_rq = cpu_rq(cpu); |
| + |
| + for_each_cpu(i, smt_mask) { |
| + rq = cpu_rq(i); |
| + |
| + WARN_ON_ONCE(rq->core && rq->core != core_rq); |
| + rq->core = core_rq; |
| + } |
| + } |
| +} |
| +#else /* !CONFIG_SCHED_CORE */ |
| + |
| +static inline void sched_core_cpu_starting(unsigned int cpu) {} |
| + |
| +#endif /* CONFIG_SCHED_CORE */ |
| + |
| /* |
| * __schedule() is the main scheduler function. |
| * |
| @@ -8012,6 +8162,7 @@ static void sched_rq_cpu_starting(unsigned int cpu) |
| |
| int sched_cpu_starting(unsigned int cpu) |
| { |
| + sched_core_cpu_starting(cpu); |
| sched_rq_cpu_starting(cpu); |
| sched_tick_start(cpu); |
| return 0; |
| @@ -8296,6 +8447,11 @@ void __init sched_init(void) |
| #endif /* CONFIG_SMP */ |
| hrtick_rq_init(rq); |
| atomic_set(&rq->nr_iowait, 0); |
| + |
| +#ifdef CONFIG_SCHED_CORE |
| + rq->core = NULL; |
| + rq->core_enabled = 0; |
| +#endif |
| } |
| |
| set_load_weight(&init_task, false); |
| diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h |
| index 9e3b90764593..0d28c5bbb80d 100644 |
| --- a/kernel/sched/sched.h |
| +++ b/kernel/sched/sched.h |
| @@ -1070,6 +1070,12 @@ struct rq { |
| struct cpuidle_state *idle_state; |
| #endif |
| |
| +#ifdef CONFIG_SCHED_CORE |
| + /* per rq */ |
| + struct rq *core; |
| + unsigned int core_enabled; |
| +#endif |
| + |
| #ifdef CONFIG_SMP |
| unsigned int nr_pinned; |
| #endif |
| @@ -1113,6 +1119,34 @@ static inline bool is_migration_disabled(struct task_struct *p) |
| #endif |
| } |
| |
| +#ifdef CONFIG_SCHED_CORE |
| + |
| +DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); |
| + |
| +static inline bool sched_core_enabled(struct rq *rq) |
| +{ |
| + return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled; |
| +} |
| + |
| +static inline bool sched_core_disabled(void) |
| +{ |
| + return !static_branch_unlikely(&__sched_core_enabled); |
| +} |
| + |
| +static inline raw_spinlock_t *rq_lockp(struct rq *rq) |
| +{ |
| + if (sched_core_enabled(rq)) |
| + return &rq->core->__lock; |
| + |
| + return &rq->__lock; |
| +} |
| + |
| +#else /* !CONFIG_SCHED_CORE */ |
| + |
| +static inline bool sched_core_enabled(struct rq *rq) |
| +{ |
| + return false; |
| +} |
| static inline bool sched_core_disabled(void) |
| { |
| return true; |
| @@ -1123,6 +1157,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq) |
| return &rq->__lock; |
| } |
| |
| +#endif /* CONFIG_SCHED_CORE */ |
| + |
| static inline void lockdep_assert_rq_held(struct rq *rq) |
| { |
| lockdep_assert_held(rq_lockp(rq)); |
| @@ -2242,6 +2278,27 @@ unsigned long arch_scale_freq_capacity(int cpu) |
| |
| static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) |
| { |
| +#ifdef CONFIG_SCHED_CORE |
| + /* |
| + * In order to not have {0,2},{1,3} turn into an AB-BA, |
| + * order by core-id first and cpu-id second. |
| + * |
| + * Notably: |
| + * |
| + * double_rq_lock(0,3); will take core-0, core-1 lock |
| + * double_rq_lock(1,2); will take core-1, core-0 lock |
| + * |
| + * when only cpu-id is considered. |
| + */ |
| + if (rq1->core->cpu < rq2->core->cpu) |
| + return true; |
| + if (rq1->core->cpu > rq2->core->cpu) |
| + return false; |
| + |
| + /* |
| + * __sched_core_flip() relies on SMT having cpu-id lock order. |
| + */ |
| +#endif |
| return rq1->cpu < rq2->cpu; |
| } |
| |
| -- |
| 2.31.1.818.g46aad6cb9e-goog |
| |