| From f85a657f1c745c762f8d6c11a411464364871ea5 Mon Sep 17 00:00:00 2001 |
| From: Peter Oskolkov <posk@google.com> |
| Date: Wed, 22 Jul 2020 16:45:36 -0700 |
| Subject: [PATCH] FROMLIST: futex: introduce FUTEX_SWAP operation |
| |
| As Paul Turner presented at LPC in 2013 ... |
| - pdf: http://pdxplumbers.osuosl.org/2013/ocw//system/presentations/1653/original/LPC%20-%20User%20Threading.pdf |
| - video: https://www.youtube.com/watch?v=KXuZi9aeGTw |
| |
| ... Google has developed an M:N userspace threading subsystem backed |
| by the Google-private SwitchTo Linux kernel API (page 17 in the pdf referenced |
| above). This subsystem provides latency-sensitive services at Google with |
| fine-grained user-space control/scheduling over what is running when, |
| and this subsystem is used widely internally (where it is called schedulers or fibers). |
| |
| This patchset is the first step to open-source this work. As explained |
| in the linked pdf and video, the SwitchTo API has three core operations: wait, |
| resume, and swap (=switch). So this patchset adds a FUTEX_SWAP operation |
| that, in addition to FUTEX_WAIT and FUTEX_WAKE, will provide a foundation |
| on top of which user-space threading libraries can be built. |
| |
| Another common use case for FUTEX_SWAP is RPC-style message passing |
| between tasks: task/thread T1 prepares a message, |
| wakes T2 to work on it, and waits for the results; when T2 is done, it |
| wakes T1 and waits for more work to arrive. Currently the simplest |
| way to implement this is: |
| |
| a. T1: futex-wake T2, futex-wait |
| b. T2: wakes, does what it has been woken to do |
| c. T2: futex-wake T1, futex-wait |
| |
| With FUTEX_SWAP, steps a and c above can each be reduced to a single |
| futex operation that runs 5-10 times faster. |
| |
| Patches in this patchset: |
| |
| Patch 1: (this patch) introduce FUTEX_SWAP futex operation that, |
| internally, does wake + wait. The purpose of this patch is |
| to work out the API. |
| Patch 2: a first rough attempt to make FUTEX_SWAP faster than |
| what wake + wait can do. |
| Patch 3: a selftest that can also be used to benchmark FUTEX_SWAP vs |
| FUTEX_WAKE + FUTEX_WAIT. |
| |
| Tested: see patch 3 in this patchset. |
| |
| Signed-off-by: Peter Oskolkov <posk@google.com> |
| (am from https://lore.kernel.org/patchwork/patch/1277905/) |
| (also found at https://lore.kernel.org/r/20200722234538.166697-2-posk@posk.io) |
| |
| BUG=b:170967073 |
| TEST=eve-kernelnext and manual verification with selftests |
| |
| Change-Id: I0f693816689477a5f7252ff361df4dd96ef4af88 |
| Signed-off-by: Guenter Roeck <groeck@chromium.org> |
| --- |
| include/uapi/linux/futex.h | 2 + |
| kernel/futex/futex.h | 8 +++- |
| kernel/futex/requeue.c | 2 +- |
| kernel/futex/syscalls.c | 7 +++- |
| kernel/futex/waitwake.c | 77 +++++++++++++++++++++++++++++++++----- |
| 5 files changed, 82 insertions(+), 14 deletions(-) |
| |
| diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h |
| index 71a5df8d2689..6ca369509efc 100644 |
| --- a/include/uapi/linux/futex.h |
| +++ b/include/uapi/linux/futex.h |
| @@ -22,6 +22,7 @@ |
| #define FUTEX_WAIT_REQUEUE_PI 11 |
| #define FUTEX_CMP_REQUEUE_PI 12 |
| #define FUTEX_LOCK_PI2 13 |
| +#define FUTEX_SWAP 14 |
| |
| #define FUTEX_PRIVATE_FLAG 128 |
| #define FUTEX_CLOCK_REALTIME 256 |
| @@ -42,6 +43,7 @@ |
| FUTEX_PRIVATE_FLAG) |
| #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ |
| FUTEX_PRIVATE_FLAG) |
| +#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG) |
| |
| /* |
| * Flags to specify the bit length of the futex word for futex2 syscalls. |
| diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h |
| index b5379c0e6d6d..f34befb946c2 100644 |
| --- a/kernel/futex/futex.h |
| +++ b/kernel/futex/futex.h |
| @@ -143,7 +143,8 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2) |
| extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
| struct futex_q *q, struct futex_hash_bucket **hb); |
| extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, |
| - struct hrtimer_sleeper *timeout); |
| + struct hrtimer_sleeper *timeout, |
| + struct task_struct *next); |
| extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); |
| |
| extern int fault_in_user_writeable(u32 __user *uaddr); |
| @@ -265,7 +266,7 @@ extern int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
| u32 *cmpval, int requeue_pi); |
| |
| extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, |
| - ktime_t *abs_time, u32 bitset); |
| + ktime_t *abs_time, u32 bitset, struct task_struct *next); |
| |
| /** |
| * struct futex_vector - Auxiliary struct for futex_waitv() |
| @@ -291,4 +292,7 @@ extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); |
| |
| extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); |
| |
| +extern int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val, |
| + ktime_t *abs_time, u32 __user *uaddr2); |
| + |
| #endif /* _FUTEX_H */ |
| diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c |
| index cba8b1a6a4cc..7f8fcf9a1fc7 100644 |
| --- a/kernel/futex/requeue.c |
| +++ b/kernel/futex/requeue.c |
| @@ -816,7 +816,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
| } |
| |
| /* Queue the futex_q, drop the hb lock, wait for wakeup. */ |
| - futex_wait_queue(hb, &q, to); |
| + futex_wait_queue(hb, &q, to, NULL); |
| |
| switch (futex_requeue_pi_wakeup_sync(&q)) { |
| case Q_REQUEUE_PI_IGNORE: |
| diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c |
| index 086a22d1adb7..effae3505d81 100644 |
| --- a/kernel/futex/syscalls.c |
| +++ b/kernel/futex/syscalls.c |
| @@ -103,7 +103,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
| val3 = FUTEX_BITSET_MATCH_ANY; |
| fallthrough; |
| case FUTEX_WAIT_BITSET: |
| - return futex_wait(uaddr, flags, val, timeout, val3); |
| + return futex_wait(uaddr, flags, val, timeout, val3, NULL); |
| case FUTEX_WAKE: |
| val3 = FUTEX_BITSET_MATCH_ANY; |
| fallthrough; |
| @@ -130,6 +130,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
| uaddr2); |
| case FUTEX_CMP_REQUEUE_PI: |
| return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); |
| + case FUTEX_SWAP: |
| + return futex_swap(uaddr, flags, val, timeout, uaddr2); |
| } |
| return -ENOSYS; |
| } |
| @@ -142,6 +144,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) |
| case FUTEX_LOCK_PI2: |
| case FUTEX_WAIT_BITSET: |
| case FUTEX_WAIT_REQUEUE_PI: |
| + case FUTEX_SWAP: |
| return true; |
| } |
| return false; |
| @@ -154,7 +157,7 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) |
| return -EINVAL; |
| |
| *t = timespec64_to_ktime(*ts); |
| - if (cmd == FUTEX_WAIT) |
| + if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP) |
| *t = ktime_add_safe(ktime_get(), *t); |
| else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) |
| *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); |
| diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c |
| index 4ce0923f1ce3..140c2c205f49 100644 |
| --- a/kernel/futex/waitwake.c |
| +++ b/kernel/futex/waitwake.c |
| @@ -138,15 +138,16 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) |
| } |
| |
| /* |
| - * Wake up waiters matching bitset queued on this futex (uaddr). |
| + * Prepare wake queue matching bitset queued on this futex (uaddr). |
| */ |
| -int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) |
| +static int |
| +prepare_wake_q(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset, |
| + struct wake_q_head *wake_q) |
| { |
| struct futex_hash_bucket *hb; |
| struct futex_q *this, *next; |
| union futex_key key = FUTEX_KEY_INIT; |
| int ret; |
| - DEFINE_WAKE_Q(wake_q); |
| |
| if (!bitset) |
| return -EINVAL; |
| @@ -174,14 +175,28 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) |
| if (!(this->bitset & bitset)) |
| continue; |
| |
| - futex_wake_mark(&wake_q, this); |
| + futex_wake_mark(wake_q, this); |
| if (++ret >= nr_wake) |
| break; |
| } |
| } |
| |
| spin_unlock(&hb->lock); |
| + return ret; |
| +} |
| + |
| +/* |
| + * Wake up waiters matching bitset queued on this futex (uaddr). |
| + */ |
| +int |
| +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) |
| +{ |
| + int ret; |
| + DEFINE_WAKE_Q(wake_q); |
| + |
| + ret = prepare_wake_q(uaddr, flags, nr_wake, bitset, &wake_q); |
| wake_up_q(&wake_q); |
| + |
| return ret; |
| } |
| |
| @@ -324,9 +339,12 @@ static long futex_wait_restart(struct restart_block *restart); |
| * @hb: the futex hash bucket, must be locked by the caller |
| * @q: the futex_q to queue up on |
| * @timeout: the prepared hrtimer_sleeper, or null for no timeout |
| + * @next: if present, wake next and hint to the scheduler that we'd |
| + * prefer to execute it locally. |
| */ |
| void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, |
| - struct hrtimer_sleeper *timeout) |
| + struct hrtimer_sleeper *timeout, |
| + struct task_struct *next) |
| { |
| /* |
| * The task state is guaranteed to be set before another task can |
| @@ -351,10 +369,26 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, |
| * flagged for rescheduling. Only call schedule if there |
| * is no timeout, or if it has yet to expire. |
| */ |
| - if (!timeout || timeout->task) |
| + if (!timeout || timeout->task) { |
| + if (next) { |
| + /* |
| + * wake_up_process() below will be replaced |
| + * in the next patch with |
| + * wake_up_process_prefer_current_cpu(). |
| + */ |
| + wake_up_process(next); |
| + put_task_struct(next); |
| + next = NULL; |
| + } |
| freezable_schedule(); |
| + } |
| } |
| __set_current_state(TASK_RUNNING); |
| + |
| + if (next) { |
| + wake_up_process(next); |
| + put_task_struct(next); |
| + } |
| } |
| |
| /** |
| @@ -629,7 +663,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
| return ret; |
| } |
| |
| -int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) |
| +int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, struct task_struct *next) |
| { |
| struct hrtimer_sleeper timeout, *to; |
| struct restart_block *restart; |
| @@ -653,7 +687,8 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time |
| goto out; |
| |
| /* futex_queue and wait for wakeup, timeout, or a signal. */ |
| - futex_wait_queue(hb, &q, to); |
| + futex_wait_queue(hb, &q, to, next); |
| + next = NULL; |
| |
| /* If we were woken (and unqueued), we succeeded, whatever. */ |
| ret = 0; |
| @@ -684,6 +719,10 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time |
| ret = set_restart_fn(restart, futex_wait_restart); |
| |
| out: |
| + if (next) { |
| + wake_up_process(next); |
| + put_task_struct(next); |
| + } |
| if (to) { |
| hrtimer_cancel(&to->timer); |
| destroy_hrtimer_on_stack(&to->timer); |
| @@ -703,6 +742,26 @@ static long futex_wait_restart(struct restart_block *restart) |
| restart->fn = do_no_restart_syscall; |
| |
| return (long)futex_wait(uaddr, restart->futex.flags, |
| - restart->futex.val, tp, restart->futex.bitset); |
| + restart->futex.val, tp, restart->futex.bitset, NULL); |
| +} |
| + |
| +int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val, |
| + ktime_t *abs_time, u32 __user *uaddr2) |
| +{ |
| + u32 bitset = FUTEX_BITSET_MATCH_ANY; |
| + struct task_struct *next = NULL; |
| + DEFINE_WAKE_Q(wake_q); |
| + int ret; |
| + |
| + ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q); |
| + if (ret < 0) |
| + return ret; |
| + if (!wake_q_empty(&wake_q)) { |
| + /* At most one wakee can be present. Pull it out. */ |
| + next = container_of(wake_q.first, struct task_struct, wake_q); |
| + next->wake_q.next = NULL; |
| + } |
| + |
| + return futex_wait(uaddr, flags, val, abs_time, bitset, next); |
| } |
| |
| -- |
| 2.37.1.595.g718a3a8f04-goog |
| |