| From 901f9cb90fba23eecba6e12fda0be563de5f22ea Mon Sep 17 00:00:00 2001 |
| From: Peter Oskolkov <posk@google.com> |
| Date: Wed, 22 Jul 2020 16:45:37 -0700 |
| Subject: [PATCH] FROMLIST: futex/sched: add |
| wake_up_process_prefer_current_cpu, use in FUTEX_SWAP |
| |
| As described in the previous patch in this patchset |
| ("futex: introduce FUTEX_SWAP operation"), it is often |
| beneficial to wake a task and run it on the same CPU |
| where the current going to sleep task is running. |
| |
| Internally at Google, the switchto_switch syscall not only |
| migrates the wakee to the current CPU, but also moves |
| the waker's load stats to the wakee, thus ensuring |
| that the migration to the current CPU does not interfere |
| with load balancing. switchto_switch also does the |
| context switch into the wakee, bypassing schedule(). |
| |
| This patchset does not go that far yet, it simply |
| migrates the wakee to the current CPU and calls schedule(). |
| |
| In follow-up patches I will try to fine-tune the behavior by adjusting |
| load stats and schedule(): our internal switchto_switch |
| is still about 2x faster than FUTEX_SWAP (see numbers below). |
| |
| And now about performance: futex_swap benchmark |
| from the last patch in this patchset produces this typical |
| output: |
| |
| $ ./futex_swap -i 100000 |
| |
| Change-Id: I07c9d747773f4414dd6b388d3da7d1bbdcf64023 |
| |
| ------- running SWAP_WAKE_WAIT ----------- |
| |
| completed 100000 swap and back iterations in 820683263 ns: 4103 ns per swap |
| PASS |
| |
| ------- running SWAP_SWAP ----------- |
| |
| completed 100000 swap and back iterations in 124034476 ns: 620 ns per swap |
| PASS |
| |
| In the above, the first benchmark (SWAP_WAKE_WAIT) calls FUTEX_WAKE, |
| then FUTEX_WAIT; the second benchmark (SWAP_SWAP) calls FUTEX_SWAP. |
| |
| If the benchmark is restricted to a single cpu: |
| |
| $ taskset -c 1 ./futex_swap -i 1000000 |
| |
| The numbers are very similar, as expected (with wake+wait being |
| a bit slower than swap due to two vs one syscalls). |
| |
| Please also note that switchto_switch is about 2x faster than |
| FUTEX_SWAP because it does a context switch to the wakee immediately, |
| bypassing schedule(), so this is one of the options I'll |
| explore in further patches (if/when this initial patchset is |
| accepted). |
| |
| Tested: see the last patch in this patchset. |
| |
| Signed-off-by: Peter Oskolkov <posk@google.com> |
| (am from https://lore.kernel.org/patchwork/patch/1277904/) |
| (also found at https://lore.kernel.org/r/20200722234538.166697-3-posk@posk.io) |
| |
| BUG=b:170967073 |
| TEST=eve-kernelnext and manual verification with selftests |
| |
| Change-Id: Iab840761d4bc624a13015c814f4f602b69bb3941 |
| Signed-off-by: Guenter Roeck <groeck@chromium.org> |
| --- |
| include/linux/sched.h | 1 + |
| kernel/futex.c | 9 ++++----- |
| kernel/sched/core.c | 5 +++++ |
| kernel/sched/fair.c | 3 +++ |
| kernel/sched/sched.h | 1 + |
| 5 files changed, 14 insertions(+), 5 deletions(-) |
| |
| diff --git a/include/linux/sched.h b/include/linux/sched.h |
| index ec8d07d88641..c135cc87bf57 100644 |
| --- a/include/linux/sched.h |
| +++ b/include/linux/sched.h |
| @@ -1807,6 +1807,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr); |
| |
| extern int wake_up_state(struct task_struct *tsk, unsigned int state); |
| extern int wake_up_process(struct task_struct *tsk); |
| +extern int wake_up_process_prefer_current_cpu(struct task_struct *tsk); |
| extern void wake_up_new_task(struct task_struct *tsk); |
| |
| #ifdef CONFIG_SMP |
| diff --git a/kernel/futex.c b/kernel/futex.c |
| index 006188c09afe..1c0be8dc16f9 100644 |
| --- a/kernel/futex.c |
| +++ b/kernel/futex.c |
| @@ -2617,12 +2617,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, |
| */ |
| if (!timeout || timeout->task) { |
| if (next) { |
| - /* |
| - * wake_up_process() below will be replaced |
| - * in the next patch with |
| - * wake_up_process_prefer_current_cpu(). |
| - */ |
| +#ifdef CONFIG_SMP |
| + wake_up_process_prefer_current_cpu(next); |
| +#else |
| wake_up_process(next); |
| +#endif |
| put_task_struct(next); |
| next = NULL; |
| } |
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c |
| index 2d9ff40f4661..94aa5461927a 100644 |
| --- a/kernel/sched/core.c |
| +++ b/kernel/sched/core.c |
| @@ -8397,6 +8397,11 @@ void sched_setnuma(struct task_struct *p, int nid) |
| } |
| #endif /* CONFIG_NUMA_BALANCING */ |
| |
| +int wake_up_process_prefer_current_cpu(struct task_struct *next) |
| +{ |
| + return try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU); |
| +} |
| + |
| #ifdef CONFIG_HOTPLUG_CPU |
| /* |
| * Ensure that the idle task is using init_mm right before its CPU goes |
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c |
| index 44c452072a1b..c0b4b35f7782 100644 |
| --- a/kernel/sched/fair.c |
| +++ b/kernel/sched/fair.c |
| @@ -6862,6 +6862,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) |
| * required for stable ->cpus_allowed |
| */ |
| lockdep_assert_held(&p->pi_lock); |
| + if ((wake_flags & WF_CURRENT_CPU) && cpumask_test_cpu(cpu, p->cpus_ptr)) |
| + return cpu; |
| + |
| if (wake_flags & WF_TTWU) { |
| record_wakee(p); |
| |
| diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h |
| index 14a41a243f7b..a04c588071a7 100644 |
| --- a/kernel/sched/sched.h |
| +++ b/kernel/sched/sched.h |
| @@ -2038,6 +2038,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) |
| #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ |
| #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ |
| #define WF_ON_CPU 0x40 /* Wakee is on_cpu */ |
| +#define WF_CURRENT_CPU 0x80 /* Prefer to move wakee to the current CPU */ |
| |
| #ifdef CONFIG_SMP |
| static_assert(WF_EXEC == SD_BALANCE_EXEC); |
| -- |
| 2.32.0.93.g670b81a890-goog |
| |