| From 886a4ec6d1f8d0ec1eb4d6e1cc9fabb3d8cf8966 Mon Sep 17 00:00:00 2001 |
| From: Brian Geffon <bgeffon@google.com> |
| Date: Wed, 10 Mar 2021 13:50:23 -0800 |
| Subject: [PATCH] FROMLIST: eventfd: Introduce EFD_ZERO_ON_WAKE |
| |
| This patch introduces a new flag to eventfd, called EFD_ZERO_ON_WAKE. |
| This change is primarily introduced for use cases which do not care about |
| the value stored in the eventfd itself. Such existing use cases require an |
| additional read syscall to clear the count. |
| |
| This flag provides the following guarantees: |
| |
| (1) Writes can never block or return EAGAIN. |
| The reason this is true is because we don't actually need to store the |
| value and as a result the internal value is only changed between 0 and |
| 1 and back to 0. Therefore POLLERR and POLLOUT are never possible |
| outcomes. A poll with POLLOUT or a write will always immediately |
| return regardless of EFD_NONBLOCK. |
| |
| (2) Read / POLLIN result in the internal value being reset to 0. |
| When EFD_NONBLOCK is set reads when the internal value is 0 will |
| immediately return with EAGAIN, as it always has. Similiarly, when |
| a read is performed without EFD_NONBLOCK it will block until a write |
| occurs. In both cases after the read completes successfully the |
| internal value is reset to 0. When polling with POLLIN, upon return |
| of a POLLIN event the internal value will be reset to 0. |
| |
| Signed-off-by: Brian Geffon <bgeffon@google.com> |
| (am from https://lore.kernel.org/patchwork/patch/1393221/) |
| (also found at https://lore.kernel.org/r/20210310215023.4129753-1-bgeffon@google.com) |
| |
| BUG=b:173022729 |
| TEST=manually tested with chrome |
| |
| Signed-off-by: Brian Geffon <bgeffon@chromium.org> |
| Change-Id: I81e6daeb52bb8f38f712735c1a3050970addb52c |
| Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/2782351 |
| Reviewed-by: Sean Paul <seanpaul@chromium.org> |
| Reviewed-by: Sonny Rao <sonnyrao@chromium.org> |
| Reviewed-by: Joel Fernandes <joelaf@google.com> |
| --- |
| fs/eventfd.c | 37 ++++++++++++++++++++++++++++++++++++- |
| include/linux/eventfd.h | 8 +++++++- |
| 2 files changed, 43 insertions(+), 2 deletions(-) |
| |
| diff --git a/fs/eventfd.c b/fs/eventfd.c |
| index 8aa36cd373516516aeda0409ab1da140063ad6cc..185eb292e4c8fc3f3e8631a85d9c7e360c1ea754 100644 |
| --- a/fs/eventfd.c |
| +++ b/fs/eventfd.c |
| @@ -175,8 +175,21 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait) |
| */ |
| count = READ_ONCE(ctx->count); |
| |
| - if (count > 0) |
| + if (count > 0) { |
| + if ((ctx->flags & EFD_ZERO_ON_WAKE) && |
| + (poll_requested_events(wait) & EPOLLIN)) { |
| + /* |
| + * We're going to cause a wake on EPOLLIN, we need to zero the count. |
| + * We validate that EPOLLIN is a requested event because if the user |
| + * did something odd like POLLPRI we wouldn't want to zero the count |
| + * if no wake happens. |
| + */ |
| + spin_lock_irq(&ctx->wqh.lock); |
| + ctx->count = 0; |
| + spin_unlock_irq(&ctx->wqh.lock); |
| + } |
| events |= EPOLLIN; |
| + } |
| if (count == ULLONG_MAX) |
| events |= EPOLLERR; |
| if (ULLONG_MAX - 1 > count) |
| @@ -243,6 +256,9 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) |
| spin_unlock_irq(&ctx->wqh.lock); |
| return -ERESTARTSYS; |
| } |
| + } else { |
| + if (ctx->flags & EFD_ZERO_ON_WAKE) |
| + ctx->count = 0; |
| } |
| eventfd_ctx_do_read(ctx, &ucnt); |
| current->in_eventfd = 1; |
| @@ -271,6 +287,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c |
| return -EINVAL; |
| spin_lock_irq(&ctx->wqh.lock); |
| res = -EAGAIN; |
| + |
| + /* |
| + * In the case of EFD_ZERO_ON_WAKE the actual count is never needed, for this |
| + * reason we only adjust it to set it from 0 to 1 or 1 to 0. This means that |
| + * write will never return EWOULDBLOCK or block, because there is always |
| + * going to be enough space to write as the amount we will increment could |
| + * be at most 1 as it's clamped below. Additionally, we know that POLLERR |
| + * cannot be returned when EFD_ZERO_ON_WAKE is used for the same reason. |
| + */ |
| + if (ctx->flags & EFD_ZERO_ON_WAKE) |
| + ucnt = (ctx->count == 0) ? 1 : 0; |
| + |
| if (ULLONG_MAX - ctx->count > ucnt) |
| res = sizeof(ucnt); |
| else if (!(file->f_flags & O_NONBLOCK)) { |
| @@ -396,9 +424,16 @@ static int do_eventfd(unsigned int count, int flags) |
| BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); |
| BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); |
| |
| + /* O_NOFOLLOW has been repurposed as EFD_ZERO_ON_WAKE */ |
| + BUILD_BUG_ON(EFD_ZERO_ON_WAKE != O_NOFOLLOW); |
| + |
| if (flags & ~EFD_FLAGS_SET) |
| return -EINVAL; |
| |
| + /* The semaphore semantics would be lost if using EFD_ZERO_ON_WAKE */ |
| + if ((flags & EFD_ZERO_ON_WAKE) && (flags & EFD_SEMAPHORE)) |
| + return -EINVAL; |
| + |
| ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); |
| if (!ctx) |
| return -ENOMEM; |
| diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h |
| index b9d83652c097abba51ea4f6863d7af375a3e1234..779dab9cef3468a26a96746cbf296e5306ee71e4 100644 |
| --- a/include/linux/eventfd.h |
| +++ b/include/linux/eventfd.h |
| @@ -23,8 +23,14 @@ |
| * from eventfd, in order to leave a free define-space for |
| * shared O_* flags. |
| */ |
| +/* |
| + * We intentionally use the value of O_NOFOLLOW for EFD_ZERO_ON_WAKE |
| + * because O_NOFOLLOW would have no meaning with an eventfd. |
| + */ |
| +#define EFD_ZERO_ON_WAKE O_NOFOLLOW |
| + |
| #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) |
| -#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE) |
| +#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_ZERO_ON_WAKE) |
| |
| struct eventfd_ctx; |
| struct file; |
| -- |
| 2.34.1 |
| |