blob: 04475abb8caf950930d15d1a6ca65652fee14c62 [file] [log] [blame] [edit]
From f85a657f1c745c762f8d6c11a411464364871ea5 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov <posk@google.com>
Date: Wed, 22 Jul 2020 16:45:36 -0700
Subject: [PATCH] FROMLIST: futex: introduce FUTEX_SWAP operation
As Paul Turner presented at LPC in 2013 ...
- pdf: http://pdxplumbers.osuosl.org/2013/ocw//system/presentations/1653/original/LPC%20-%20User%20Threading.pdf
- video: https://www.youtube.com/watch?v=KXuZi9aeGTw
... Google has developed an M:N userspace threading subsystem backed
by Google-private SwitchTo Linux Kernel API (page 17 in the pdf referenced
above). This subsystem provides latency-sensitive services at Google with
fine-grained user-space control/scheduling over what is running when,
and this subsystem is used widely internally (called schedulers or fibers).
This patchset is the first step to open-source this work. As explained
in the linked pdf and video, SwitchTo API has three core operations: wait,
resume, and swap (=switch). So this patchset adds a FUTEX_SWAP operation
that, in addition to FUTEX_WAIT and FUTEX_WAKE, will provide a foundation
on top of which user-space threading libraries can be built.
Another common use case for FUTEX_SWAP is message passing a-la RPC
between tasks: task/thread T1 prepares a message,
wakes T2 to work on it, and waits for the results; when T2 is done, it
wakes T1 and waits for more work to arrive. Currently the simplest
way to implement this is
a. T1: futex-wake T2, futex-wait
b. T2: wakes, does what it has been woken to do
c. T2: futex-wake T1, futex-wait
With FUTEX_SWAP, steps a and c above can be reduced to one futex operation
that runs 5-10 times faster.
Patches in this patchset:
Patch 1: (this patch) introduce FUTEX_SWAP futex operation that,
internally, does wake + wait. The purpose of this patch is
to work out the API.
Patch 2: a first rough attempt to make FUTEX_SWAP faster than
what wake + wait can do.
Patch 3: a selftest that can also be used to benchmark FUTEX_SWAP vs
FUTEX_WAKE + FUTEX_WAIT.
Tested: see patch 3 in this patchset.
Signed-off-by: Peter Oskolkov <posk@google.com>
(am from https://lore.kernel.org/patchwork/patch/1277905/)
(also found at https://lore.kernel.org/r/20200722234538.166697-2-posk@posk.io)
BUG=b:170967073
TEST=eve-kernelnext and manually verification with selftests
Change-Id: I0f693816689477a5f7252ff361df4dd96ef4af88
Signed-off-by: Guenter Roeck <groeck@chromium.org>
---
include/uapi/linux/futex.h | 2 +
kernel/futex/futex.h | 8 +++-
kernel/futex/requeue.c | 2 +-
kernel/futex/syscalls.c | 7 +++-
kernel/futex/waitwake.c | 77 +++++++++++++++++++++++++++++++++-----
5 files changed, 82 insertions(+), 14 deletions(-)
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 71a5df8d2689..6ca369509efc 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -22,6 +22,7 @@
#define FUTEX_WAIT_REQUEUE_PI 11
#define FUTEX_CMP_REQUEUE_PI 12
#define FUTEX_LOCK_PI2 13
+#define FUTEX_SWAP 14
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
@@ -42,6 +43,7 @@
FUTEX_PRIVATE_FLAG)
#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
FUTEX_PRIVATE_FLAG)
+#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG)
/*
* Flags to specify the bit length of the futex word for futex2 syscalls.
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index b5379c0e6d6d..f34befb946c2 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -143,7 +143,8 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2)
extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb);
extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout);
+ struct hrtimer_sleeper *timeout,
+ struct task_struct *next);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
extern int fault_in_user_writeable(u32 __user *uaddr);
@@ -265,7 +266,7 @@ extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
u32 *cmpval, int requeue_pi);
extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
- ktime_t *abs_time, u32 bitset);
+ ktime_t *abs_time, u32 bitset, struct task_struct *next);
/**
* struct futex_vector - Auxiliary struct for futex_waitv()
@@ -291,4 +292,7 @@ extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
+extern int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 __user *uaddr2);
+
#endif /* _FUTEX_H */
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index cba8b1a6a4cc..7f8fcf9a1fc7 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -816,7 +816,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
}
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue(hb, &q, to);
+ futex_wait_queue(hb, &q, to, NULL);
switch (futex_requeue_pi_wakeup_sync(&q)) {
case Q_REQUEUE_PI_IGNORE:
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 086a22d1adb7..effae3505d81 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -103,7 +103,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough;
case FUTEX_WAIT_BITSET:
- return futex_wait(uaddr, flags, val, timeout, val3);
+ return futex_wait(uaddr, flags, val, timeout, val3, NULL);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough;
@@ -130,6 +130,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
uaddr2);
case FUTEX_CMP_REQUEUE_PI:
return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+ case FUTEX_SWAP:
+ return futex_swap(uaddr, flags, val, timeout, uaddr2);
}
return -ENOSYS;
}
@@ -142,6 +144,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd)
case FUTEX_LOCK_PI2:
case FUTEX_WAIT_BITSET:
case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_SWAP:
return true;
}
return false;
@@ -154,7 +157,7 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
return -EINVAL;
*t = timespec64_to_ktime(*ts);
- if (cmd == FUTEX_WAIT)
+ if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP)
*t = ktime_add_safe(ktime_get(), *t);
else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
*t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 4ce0923f1ce3..140c2c205f49 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -138,15 +138,16 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
}
/*
- * Wake up waiters matching bitset queued on this futex (uaddr).
+ * Prepare wake queue matching bitset queued on this futex (uaddr).
*/
-int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+static int
+prepare_wake_q(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset,
+ struct wake_q_head *wake_q)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
- DEFINE_WAKE_Q(wake_q);
if (!bitset)
return -EINVAL;
@@ -174,14 +175,28 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
- futex_wake_mark(&wake_q, this);
+ futex_wake_mark(wake_q, this);
if (++ret >= nr_wake)
break;
}
}
spin_unlock(&hb->lock);
+ return ret;
+}
+
+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+ int ret;
+ DEFINE_WAKE_Q(wake_q);
+
+ ret = prepare_wake_q(uaddr, flags, nr_wake, bitset, &wake_q);
wake_up_q(&wake_q);
+
return ret;
}
@@ -324,9 +339,12 @@ static long futex_wait_restart(struct restart_block *restart);
* @hb: the futex hash bucket, must be locked by the caller
* @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ * @next: if present, wake next and hint to the scheduler that we'd
+ * prefer to execute it locally.
*/
void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
+ struct hrtimer_sleeper *timeout,
+ struct task_struct *next)
{
/*
* The task state is guaranteed to be set before another task can
@@ -351,10 +369,26 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* flagged for rescheduling. Only call schedule if there
* is no timeout, or if it has yet to expire.
*/
- if (!timeout || timeout->task)
+ if (!timeout || timeout->task) {
+ if (next) {
+ /*
+ * wake_up_process() below will be replaced
+ * in the next patch with
+ * wake_up_process_prefer_current_cpu().
+ */
+ wake_up_process(next);
+ put_task_struct(next);
+ next = NULL;
+ }
freezable_schedule();
+ }
}
__set_current_state(TASK_RUNNING);
+
+ if (next) {
+ wake_up_process(next);
+ put_task_struct(next);
+ }
}
/**
@@ -629,7 +663,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
return ret;
}
-int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, struct task_struct *next)
{
struct hrtimer_sleeper timeout, *to;
struct restart_block *restart;
@@ -653,7 +687,8 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time
goto out;
/* futex_queue and wait for wakeup, timeout, or a signal. */
- futex_wait_queue(hb, &q, to);
+ futex_wait_queue(hb, &q, to, next);
+ next = NULL;
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
@@ -684,6 +719,10 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time
ret = set_restart_fn(restart, futex_wait_restart);
out:
+ if (next) {
+ wake_up_process(next);
+ put_task_struct(next);
+ }
if (to) {
hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
@@ -703,6 +742,26 @@ static long futex_wait_restart(struct restart_block *restart)
restart->fn = do_no_restart_syscall;
return (long)futex_wait(uaddr, restart->futex.flags,
- restart->futex.val, tp, restart->futex.bitset);
+ restart->futex.val, tp, restart->futex.bitset, NULL);
+}
+
+int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 __user *uaddr2)
+{
+ u32 bitset = FUTEX_BITSET_MATCH_ANY;
+ struct task_struct *next = NULL;
+ DEFINE_WAKE_Q(wake_q);
+ int ret;
+
+ ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q);
+ if (ret < 0)
+ return ret;
+ if (!wake_q_empty(&wake_q)) {
+ /* At most one wakee can be present. Pull it out. */
+ next = container_of(wake_q.first, struct task_struct, wake_q);
+ next->wake_q.next = NULL;
+ }
+
+ return futex_wait(uaddr, flags, val, abs_time, bitset, next);
}
--
2.37.1.595.g718a3a8f04-goog