| From 7a20bcf14e1efbd6260d28487ce85167d25cc2aa Mon Sep 17 00:00:00 2001 |
| From: Peter Oskolkov <posk@google.com> |
| Date: Wed, 22 Jul 2020 16:45:37 -0700 |
| Subject: [PATCH] FROMLIST: futex/sched: add |
| wake_up_process_prefer_current_cpu, use in FUTEX_SWAP |
| |
| As described in the previous patch in this patchset |
| ("futex: introduce FUTEX_SWAP operation"), it is often |
| beneficial to wake a task and run it on the same CPU |
| where the current going to sleep task is running. |
| |
| Internally at Google, switchto_switch syscall not only |
| migrates the wakee to the current CPU, but also moves |
| the waker's load stats to the wakee, thus ensuring |
| that the migration to the current CPU does not interfere |
| with load balancing. switchto_switch also does the |
| context switch into the wakee, bypassing schedule(). |
| |
| This patchset does not go that far yet, it simply |
| migrates the wakee to the current CPU and calls schedule(). |
| |
| In follow-up patches I will try to fine-tune the behavior by adjusting |
| load stats and schedule(): our internal switchto_switch |
| is still about 2x faster than FUTEX_SWAP (see numbers below). |
| |
| And now about performance: futex_swap benchmark |
| from the last patch in this patchset produces this typical |
| output: |
| |
| $ ./futex_swap -i 100000 |
| |
| Change-Id: I07c9d747773f4414dd6b388d3da7d1bbdcf64023 |
| |
| ------- running SWAP_WAKE_WAIT ----------- |
| |
| completed 100000 swap and back iterations in 820683263 ns: 4103 ns per swap |
| PASS |
| |
| ------- running SWAP_SWAP ----------- |
| |
| completed 100000 swap and back iterations in 124034476 ns: 620 ns per swap |
| PASS |
| |
| In the above, the first benchmark (SWAP_WAKE_WAIT) calls FUTEX_WAKE, |
| then FUTEX_WAIT; the second benchmark (SWAP_SWAP) calls FUTEX_SWAP. |
| |
| If the benchmark is restricted to a single cpu: |
| |
| $ taskset -c 1 ./futex_swap -i 1000000 |
| |
| The numbers are very similar, as expected (with wake+wait being |
| a bit slower than swap due to two vs one syscalls). |
| |
| Please also note that switchto_switch is about 2x faster than |
| FUTEX_SWAP because it does a context switch to the wakee immediately, |
| bypassing schedule(), so this is one of the options I'll |
| explore in further patches (if/when this initial patchset is |
| accepted). |
| |
| Tested: see the last patch in this patchset. |
| |
| Signed-off-by: Peter Oskolkov <posk@google.com> |
| (am from https://lore.kernel.org/patchwork/patch/1277904/) |
| (also found at https://lore.kernel.org/r/20200722234538.166697-3-posk@posk.io) |
| |
| BUG=b:170967073 |
| TEST=eve-kernelnext and manual verification with selftests |
| |
| Change-Id: Iab840761d4bc624a13015c814f4f602b69bb3941 |
| Signed-off-by: Guenter Roeck <groeck@chromium.org> |
| --- |
| include/linux/sched.h | 1 + |
| kernel/sched/core.c | 5 +++++ |
| kernel/sched/fair.c | 3 +++ |
| kernel/sched/sched.h | 1 + |
| 4 files changed, 10 insertions(+) |
| |
| diff --git a/include/linux/sched.h b/include/linux/sched.h |
| --- a/include/linux/sched.h |
| +++ b/include/linux/sched.h |
| @@ -1909,6 +1909,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr); |
| |
| extern int wake_up_state(struct task_struct *tsk, unsigned int state); |
| extern int wake_up_process(struct task_struct *tsk); |
| +extern int wake_up_process_prefer_current_cpu(struct task_struct *tsk); |
| extern void wake_up_new_task(struct task_struct *tsk); |
| |
| #ifdef CONFIG_SMP |
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c |
| --- a/kernel/sched/core.c |
| +++ b/kernel/sched/core.c |
| @@ -8760,6 +8760,11 @@ void sched_setnuma(struct task_struct *p, int nid) |
| } |
| #endif /* CONFIG_NUMA_BALANCING */ |
| |
| +int wake_up_process_prefer_current_cpu(struct task_struct *next) |
| +{ |
| + return try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU); |
| +} |
| + |
| #ifdef CONFIG_HOTPLUG_CPU |
| /* |
| * Ensure that the idle task is using init_mm right before its CPU goes |
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c |
| --- a/kernel/sched/fair.c |
| +++ b/kernel/sched/fair.c |
| @@ -6887,6 +6887,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) |
| * required for stable ->cpus_allowed |
| */ |
| lockdep_assert_held(&p->pi_lock); |
| + if ((wake_flags & WF_CURRENT_CPU) && cpumask_test_cpu(cpu, p->cpus_ptr)) |
| + return cpu; |
| + |
| if (wake_flags & WF_TTWU) { |
| record_wakee(p); |
| |
| diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h |
| --- a/kernel/sched/sched.h |
| +++ b/kernel/sched/sched.h |
| @@ -2031,6 +2031,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) |
| #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ |
| #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ |
| #define WF_ON_CPU 0x40 /* Wakee is on_cpu */ |
| +#define WF_CURRENT_CPU 0x80 /* Prefer to move wakee to the current CPU */ |
| |
| #ifdef CONFIG_SMP |
| static_assert(WF_EXEC == SD_BALANCE_EXEC); |
| -- |
| 2.34.0.rc2.393.gf8c9666880-goog |
| |