blob: acdcd2208df919123640554cd35c9fa354ebdc45 [file] [log] [blame]
From d359148cc6c0be723fb687577eedf8a72d8e6598 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 17 Nov 2020 18:19:34 -0500
Subject: [PATCH] FROMLIST: sched: Core-wide rq->lock
Introduce the basic infrastructure to have a core wide rq->lock.
This relies on the rq->__lock order being in increasing CPU number
(inside a core). It is also constrained to SMT8 per lockdep (and
SMT256 per preempt_count).
Luckily SMT8 is the max supported SMT count for Linux (Mips, Sparc and
Power are known to have this).
BUG=b:152605392
TEST=run power_VideoCall test
(am from
https://lore.kernel.org/lkml/20210422123308.256677625@infradead.org/)
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Change-Id: Ib26f4510ff9de18147f86b4e216c7abfc3c87831
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2880780
Reviewed-by: Sonny Rao <sonnyrao@chromium.org>
---
kernel/Kconfig.preempt | 5 ++
kernel/sched/core.c | 164 ++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 57 ++++++++++++++
3 files changed, 222 insertions(+), 4 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 416017301660..4842bd6845c3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -82,6 +82,11 @@ config PREEMPTION
bool
select PREEMPT_COUNT
+config SCHED_CORE
+ bool "Core Scheduling for SMT"
+ default y
+ depends on SCHED_SMT
+
config PREEMPT_DYNAMIC
bool
help
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 790dab92fbab..005ea8ad4b72 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,108 @@ unsigned int sysctl_sched_rt_period = 1000000;
__read_mostly int scheduler_running;
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/*
+ * Magic required such that:
+ *
+ * raw_spin_rq_lock(rq);
+ * ...
+ * raw_spin_rq_unlock(rq);
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+
+static DEFINE_MUTEX(sched_core_mutex);
+static int sched_core_count;
+static struct cpumask sched_core_mask;
+
+static void __sched_core_flip(bool enabled)
+{
+ int cpu, t, i;
+
+ cpus_read_lock();
+
+ /*
+ * Toggle the online cores, one by one.
+ */
+ cpumask_copy(&sched_core_mask, cpu_online_mask);
+ for_each_cpu(cpu, &sched_core_mask) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ i = 0;
+ local_irq_disable();
+ for_each_cpu(t, smt_mask) {
+ /* supports up to SMT8 */
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+ }
+
+ for_each_cpu(t, smt_mask)
+ cpu_rq(t)->core_enabled = enabled;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_enable();
+
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
+ }
+
+ /*
+ * Toggle the offline CPUs.
+ */
+ cpumask_copy(&sched_core_mask, cpu_possible_mask);
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
+
+ for_each_cpu(cpu, &sched_core_mask)
+ cpu_rq(cpu)->core_enabled = enabled;
+
+ cpus_read_unlock();
+}
+
+static void __sched_core_enable(void)
+{
+ // XXX verify there are no cookie tasks (yet)
+
+ static_branch_enable(&__sched_core_enabled);
+ /*
+ * Ensure all previous instances of raw_spin_rq_*lock() have finished
+ * and future ones will observe !sched_core_disabled().
+ */
+ synchronize_rcu();
+ __sched_core_flip(true);
+}
+
+static void __sched_core_disable(void)
+{
+ // XXX verify there are no cookie tasks (left)
+
+ __sched_core_flip(false);
+ static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+ mutex_lock(&sched_core_mutex);
+ if (!sched_core_count++)
+ __sched_core_enable();
+ mutex_unlock(&sched_core_mutex);
+}
+
+void sched_core_put(void)
+{
+ mutex_lock(&sched_core_mutex);
+ if (!--sched_core_count)
+ __sched_core_disable();
+ mutex_unlock(&sched_core_mutex);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* part of the period that we allow rt tasks to run in us.
* default: 0.95s
@@ -188,16 +290,23 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
{
raw_spinlock_t *lock;
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
if (sched_core_disabled()) {
raw_spin_lock_nested(&rq->__lock, subclass);
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
return;
}
for (;;) {
lock = rq_lockp(rq);
raw_spin_lock_nested(lock, subclass);
- if (likely(lock == rq_lockp(rq)))
+ if (likely(lock == rq_lockp(rq))) {
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
return;
+ }
raw_spin_unlock(lock);
}
}
@@ -207,14 +316,21 @@ bool raw_spin_rq_trylock(struct rq *rq)
raw_spinlock_t *lock;
bool ret;
- if (sched_core_disabled())
- return raw_spin_trylock(&rq->__lock);
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ ret = raw_spin_trylock(&rq->__lock);
+ preempt_enable();
+ return ret;
+ }
for (;;) {
lock = rq_lockp(rq);
ret = raw_spin_trylock(lock);
- if (!ret || (likely(lock == rq_lockp(rq))))
+ if (!ret || (likely(lock == rq_lockp(rq)))) {
+ preempt_enable();
return ret;
+ }
raw_spin_unlock(lock);
}
}
@@ -5042,6 +5158,40 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
BUG();
}
+#ifdef CONFIG_SCHED_CORE
+
+static inline void sched_core_cpu_starting(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq, *core_rq = NULL;
+ int i;
+
+ core_rq = cpu_rq(cpu)->core;
+
+ if (!core_rq) {
+ for_each_cpu(i, smt_mask) {
+ rq = cpu_rq(i);
+ if (rq->core && rq->core == rq)
+ core_rq = rq;
+ }
+
+ if (!core_rq)
+ core_rq = cpu_rq(cpu);
+
+ for_each_cpu(i, smt_mask) {
+ rq = cpu_rq(i);
+
+ WARN_ON_ONCE(rq->core && rq->core != core_rq);
+ rq->core = core_rq;
+ }
+ }
+}
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* __schedule() is the main scheduler function.
*
@@ -8012,6 +8162,7 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
+ sched_core_cpu_starting(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -8296,6 +8447,11 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+ rq->core = NULL;
+ rq->core_enabled = 0;
+#endif
}
set_load_weight(&init_task, false);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9e3b90764593..0d28c5bbb80d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1070,6 +1070,12 @@ struct rq {
struct cpuidle_state *idle_state;
#endif
+#ifdef CONFIG_SCHED_CORE
+ /* per rq */
+ struct rq *core;
+ unsigned int core_enabled;
+#endif
+
#ifdef CONFIG_SMP
unsigned int nr_pinned;
#endif
@@ -1113,6 +1119,34 @@ static inline bool is_migration_disabled(struct task_struct *p)
#endif
}
+#ifdef CONFIG_SCHED_CORE
+
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return !static_branch_unlikely(&__sched_core_enabled);
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ if (sched_core_enabled(rq))
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return false;
+}
static inline bool sched_core_disabled(void)
{
return true;
@@ -1123,6 +1157,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
return &rq->__lock;
}
+#endif /* CONFIG_SCHED_CORE */
+
static inline void lockdep_assert_rq_held(struct rq *rq)
{
lockdep_assert_held(rq_lockp(rq));
@@ -2242,6 +2278,27 @@ unsigned long arch_scale_freq_capacity(int cpu)
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
{
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * In order to not have {0,2},{1,3} turn into an AB-BA,
+ * order by core-id first and cpu-id second.
+ *
+ * Notably:
+ *
+ * double_rq_lock(0,3); will take core-0, core-1 lock
+ * double_rq_lock(1,2); will take core-1, core-0 lock
+ *
+ * when only cpu-id is considered.
+ */
+ if (rq1->core->cpu < rq2->core->cpu)
+ return true;
+ if (rq1->core->cpu > rq2->core->cpu)
+ return false;
+
+ /*
+ * __sched_core_flip() relies on SMT having cpu-id lock order.
+ */
+#endif
return rq1->cpu < rq2->cpu;
}
--
2.31.1.818.g46aad6cb9e-goog