blob: 872750397995e9eb83c4fc6667fee60623bb657f [file] [log] [blame] [edit]
From 886a4ec6d1f8d0ec1eb4d6e1cc9fabb3d8cf8966 Mon Sep 17 00:00:00 2001
From: Brian Geffon <bgeffon@google.com>
Date: Wed, 10 Mar 2021 13:50:23 -0800
Subject: [PATCH] FROMLIST: eventfd: Introduce EFD_ZERO_ON_WAKE
This patch introduces a new flag to eventfd, called EFD_ZERO_ON_WAKE.
This change is primarily introduced for use cases which do not care about
the value stored in the eventfd itself. Such existing use cases require an
additional read syscall to clear the count.
This flag provides the following guarantees:
(1) Writes can never block or return EAGAIN.
This holds because the written value never needs to be stored: the
internal value only ever moves from 0 to 1 and back to 0. Therefore
POLLERR is never a possible outcome, and POLLOUT is always ready. A
poll with POLLOUT or a write will always return immediately,
regardless of EFD_NONBLOCK.
(2) Read / POLLIN result in the internal value being reset to 0.
When EFD_NONBLOCK is set, a read while the internal value is 0
returns EAGAIN immediately, as it always has. Similarly, a read
performed without EFD_NONBLOCK blocks until a write occurs. In both
cases, after the read completes successfully the internal value is
reset to 0. When polling with POLLIN, the internal value is reset to
0 upon return of a POLLIN event.
Signed-off-by: Brian Geffon <bgeffon@google.com>
(am from https://lore.kernel.org/patchwork/patch/1393221/)
(also found at https://lore.kernel.org/r/20210310215023.4129753-1-bgeffon@google.com)
BUG=b:173022729
TEST=manually tested with chrome
Signed-off-by: Brian Geffon <bgeffon@chromium.org>
Change-Id: I81e6daeb52bb8f38f712735c1a3050970addb52c
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2782351
Reviewed-by: Sean Paul <seanpaul@chromium.org>
Reviewed-by: Sonny Rao <sonnyrao@chromium.org>
Reviewed-by: Joel Fernandes <joelaf@google.com>
---
fs/eventfd.c | 37 ++++++++++++++++++++++++++++++++++++-
include/linux/eventfd.h | 8 +++++++-
2 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8aa36cd373516516aeda0409ab1da140063ad6cc..185eb292e4c8fc3f3e8631a85d9c7e360c1ea754 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -175,8 +175,21 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
*/
count = READ_ONCE(ctx->count);
- if (count > 0)
+ if (count > 0) {
+ if ((ctx->flags & EFD_ZERO_ON_WAKE) &&
+ (poll_requested_events(wait) & EPOLLIN)) {
+ /*
+ * We're going to cause a wake on EPOLLIN, we need to zero the count.
+ * We validate that EPOLLIN is a requested event because if the user
+ * did something odd like POLLPRI we wouldn't want to zero the count
+ * if no wake happens.
+ */
+ spin_lock_irq(&ctx->wqh.lock);
+ ctx->count = 0;
+ spin_unlock_irq(&ctx->wqh.lock);
+ }
events |= EPOLLIN;
+ }
if (count == ULLONG_MAX)
events |= EPOLLERR;
if (ULLONG_MAX - 1 > count)
@@ -243,6 +256,9 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
spin_unlock_irq(&ctx->wqh.lock);
return -ERESTARTSYS;
}
+ } else {
+ if (ctx->flags & EFD_ZERO_ON_WAKE)
+ ctx->count = 0;
}
eventfd_ctx_do_read(ctx, &ucnt);
current->in_eventfd = 1;
@@ -271,6 +287,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
return -EINVAL;
spin_lock_irq(&ctx->wqh.lock);
res = -EAGAIN;
+
+ /*
+ * In the case of EFD_ZERO_ON_WAKE the actual count is never needed, for this
+ * reason we only adjust it to set it from 0 to 1 or 1 to 0. This means that
+ * write will never return EWOULDBLOCK or block, because there is always
+ * going to be enough space to write as the amount we will increment could
+ * be at most 1 as it's clamped below. Additionally, we know that POLLERR
+ * cannot be returned when EFD_ZERO_ON_WAKE is used for the same reason.
+ */
+ if (ctx->flags & EFD_ZERO_ON_WAKE)
+ ucnt = (ctx->count == 0) ? 1 : 0;
+
if (ULLONG_MAX - ctx->count > ucnt)
res = sizeof(ucnt);
else if (!(file->f_flags & O_NONBLOCK)) {
@@ -396,9 +424,16 @@ static int do_eventfd(unsigned int count, int flags)
BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+ /* O_NOFOLLOW has been repurposed as EFD_ZERO_ON_WAKE */
+ BUILD_BUG_ON(EFD_ZERO_ON_WAKE != O_NOFOLLOW);
+
if (flags & ~EFD_FLAGS_SET)
return -EINVAL;
+ /* The semaphore semantics would be lost if using EFD_ZERO_ON_WAKE */
+ if ((flags & EFD_ZERO_ON_WAKE) && (flags & EFD_SEMAPHORE))
+ return -EINVAL;
+
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index b9d83652c097abba51ea4f6863d7af375a3e1234..779dab9cef3468a26a96746cbf296e5306ee71e4 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -23,8 +23,14 @@
* from eventfd, in order to leave a free define-space for
* shared O_* flags.
*/
+/*
+ * We intentionally use the value of O_NOFOLLOW for EFD_ZERO_ON_WAKE
+ * because O_NOFOLLOW would have no meaning with an eventfd.
+ */
+#define EFD_ZERO_ON_WAKE O_NOFOLLOW
+
#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
-#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
+#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_ZERO_ON_WAKE)
struct eventfd_ctx;
struct file;
--
2.34.1