blob: 63f9820be753b624f5755276d11210c59ae4da4d [file] [log] [blame]
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef NET_SOCKET_SOCKET_APPLE_H_
#define NET_SOCKET_SOCKET_APPLE_H_
#include <sys/socket.h>
#include <sys/types.h>
// This is a workaround for https://crbug.com/40064248. See also: b/283787255,
// FB12198214, and FB19384824.
//
// In this bug, the `sendto` and `write` system calls, and the `send` wrapper
// around `sendto`, appear to return bogus values under certain conditions. This
// has been observed when writing to established IPv6 (AF_INET6) sockets, both
// TCP (SOCK_STREAM) and UDP (SOCK_DGRAM), following certain network
// reconfigurations on the system. It occurs when bringing up a utun-based VPN,
// when data sent via the socket would become subject to the tunnel. The bug
// occurs on macOS 13.3 22E252 (2023-03-27) and later OS versions.
//
// This discussion focuses on `sendto` but the bug is identical for `send`, as
// `send` is a thin C wrapper tail-calling the `sendto` system call; for
// `write`, substitute SYS_write = 4 for SYS_sendto = 133.
//
// ssize_t sendto(int fd, void const* buffer, size_t size, int flags, sockaddr
// const* address, socklen_t address_size)
//
// `size` contains the number of bytes in `buffer` to be sent via the socket
// whose file descriptor is `fd`.
//
// The ssize_t return value (rv) should be:
// - rv = -1 for an error, with errno set appropriately; or
// - 0 ≤ rv ≤ size on success. rv contains the number of bytes accepted by the
// kernel, conveying the portion of `buffer` that was either sent or queued
// for sending. This may be equal to `size` if the entire buffer was
// accepted, or it may be less than `size` if the kernel did not accept the
// entire buffer (a “short write”).
//
// When the bug occurs, `sendto` appears to return successfully (not -1) but
// reports a bogus value in place of the number of bytes accepted.
// - On arm64 (including x86_64-on-arm64 via Rosetta translation), the bogus
// return value is the value of `fd` passed to `sendto`.
// - On x86_64 (without binary translation), the bogus return value is the
// system call class (SYSCALL_CLASS_UNIX = 2) and number (SYS_sendto = 133),
// packed into a single integer: 0x2000085.
//
// The characteristics of the bogus return value impact how easy it is to detect
// the bug’s occurrence with a defensive return value checking technique.
// - rv > size: The bug has unambiguously occurred. This would mean that the
// kernel has accepted more data than was provided in `buffer`, which is
// impossible. This is easy to detect. 0x2000085 (32MB + 133) is almost
// always larger than the buffer size, so these unambiguous returns occur
// frequently when the bug occurs on x86_64.
// - rv ≤ size: This is indistinguishable from a normal successful return. File
// descriptor numbers are normally small, so these ambiguous return values
// appear frequently when the bug occurs on arm64.
//
// The mechanics of the bug are specific to the kernel’s architecture, largely
// following each architecture’s standard function call ABI, with an extra
// register dedicated to selecting the system call, and an architectural flag
// bit used to distinguish successful from error returns.
// - On arm64, the system call number (SYS_sendto = 133) is stored in x16.
// Arguments are presented in w0 (the low 32 bits of x0) = fd, x1 = buffer,
// x2 = size, w3 = flags, x4 = address, and w5 = address_size. x0 is used for
// the return value or error number. cpsr.c (the carry flag of “nzcv”) is set
// for an error return, and clear for a successful return.
// - On x86_64, the system call class and number (0x2000085) is stored in eax
// (the low 32 bits of rax). Arguments are presented in edi = fd, rsi =
// buffer, rdx = size, ecx = flags, r8 = address, and r9d = address_size. rax
// is used for the return value or error number. rflags.cf (the carry flag)
// is set for an error return, and clear for a successful return.
//
// The bug occurs when the EJUSTRETURN path is incorrectly taken in the kernel
// on system call return. xnu source code references are to xnu-11417.121.6,
// which shipped with macOS 15.5 24F74 (2025-05-12):
// - https://github.com/apple-oss-distributions/xnu/blob/xnu-11417.121.6/bsd/dev/arm/systemcalls.c#L504
// - https://github.com/apple-oss-distributions/xnu/blob/xnu-11417.121.6/bsd/dev/i386/systemcalls.c#L411
//
// The EJUSTRETURN path exists at this level primarily for the use of the
// `sigreturn` system call, which returns from a user-space signal handler
// function back to the interrupted user thread via the kernel. `sigreturn`
// restores the interrupted thread context, and has no return to its caller
// proper, so EJUSTRETURN exists to suppress the normal register manipulation
// done during any other system call return.
//
// There are also kernel-internal uses of EJUSTRETURN, but aside from
// `sigreturn`, none should “leak” to system call return. Socket code and other
// networking code in the kernel use EJUSTRETURN internally, at that layer
// generally meaning that no further processing should be performed. The bug is
// caused when one of these internal uses of EJUSTRETURN, xnu bsd/net/pf_ioctl.c
// `pf_inet6_hook`, is not handled properly within the networking layer and
// instead propagates from that layer to become `sendto`’s return, improperly
// “leaking” to the system call return level where it takes on a different
// meaning.
//
// When the bug occurs and the EJUSTRETURN path is taken for a return from
// `sendto`, the user-bound return value (uthread->uu_rval[0]) does not make it
// into the register state to be restored on user return (x0 via ss64->x[0], rax
// via regs->rax), leaving the return value register’s previous contents from
// system call entry intact. x0 will still contain the file descriptor number,
// and rax will still contain the system call selector. The carry flag will have
// been optimistically cleared, so the bug’s occurrence is always observed as a
// successful return in the user program.
//
// To provide more robust detection of even the ambiguous case, this workaround
// leverages the fact that all successful system call returns set a secondary
// return register in addition to the primary return value in x0 and rax. The
// secondary return registers are x1 and rdx, identical to each architecture’s
// ABI for a return of a struct containing 2 integers. For the vast majority of
// system calls (only `fork` and `pipe` are exceptions), the secondary return
// register is cleared. Thus, if the secondary return register is nonzero on
// system call entry, and it remains nonzero on system call return, it can be
// taken as a signal that the bug occurred unambiguously.
//
// By architecture:
// - On arm64, x1 = buffer. x1 can be consulted to detect the bug unambiguously
// as long as buffer != nullptr. If buffer == nullptr and size > 0, the call
// would have resulted in an error return (EFAULT) so the bug would not have
// been observed.
// - On x86_64, rdx = size. rdx can be consulted to detect the bug
// unambiguously as long as size != 0.
//
// For slightly different reasons on each architecture, it is possible for the
// secondary return value mechanism to fail to detect the bug’s occurrence when
// size == 0. The bug cannot occur for a `sendto` on a TCP socket with size ==
// 0, because sending 0 bytes via TCP is a no-op, and an early-return path is
// taken in the kernel without the bogus return value appearing. Thus, for any
// TCP socket, the secondary return value alone provides robust and unambiguous
// detection of the bug’s occurrence.
//
// A 0-byte `sendto` on a UDP socket is valid (it sends or queues a packet with
// no data payload beyond the UDP header), so for a 0-byte `sendto`, the
// secondary return register does not indicate the bug’s occurrence. Leveraging
// the fact that a 0-byte successful `sendto` can only validly return 0, the
// primary return value being nonzero can provide unambiguous detection of the
// bug’s occurrence on x86_64. On arm64, there’s a small amount of potential
// confusion in that w0 = fd, and 0 is valid as a file descriptor. Fortunately,
// as a file descriptor, 0 is STDIN_FILENO, and this code is not likely to
// manipulate a socket on the standard input stream, and even less likely to
// `sendto` via an input stream (although this is by convention, not strict
// requirement).
//
// The workaround is implemented in a bug-detecting wrapper around `sendto`.
// Substitute the SendtoAndDetectBogusReturnValue wrapper for a call to
// `sendto`, or SendAndDetectBogusReturnValue wrapper for a call to `send`, and
// when the bug occurs and is detected, its return value will be
// kSendBogusReturnValueDetected. In all other respects, the wrappers behave
// identically to `sendto` and `send`.
//
// Assuming that `send` tail-calls `sendto`, and a successful `sendto` returns
// immediately from the system call to its caller without modifying any
// registers, this detection mechanism will be valid. These assumptions hold
// empirically, as well as through a read of all of the source code on both the
// user and kernel sides of the system call boundary, at all relevant OS
// versions. However, there’s no guarantee that they must hold into the future,
// and it’s possible that a future OS version might invalidate these
// assumptions. In that case, this technique of detecting the bug’s occurrence
// via x1 and rdx might be jeopardized. The TODO below covering compile-time
// disabling of the workaround, and a similar one in socket_apple.cc covering
// run-time disabling of the workaround, are intended to guard against this
// possibility as soon as a macOS version with the fix for this bug is
// published.
//
// Note that these assumptions are not valid for an error return from the system
// call, and the secondary return register is not set during an error return
// either, so the bug-detecting wrapper takes care to only attempt detection
// during apparent successful returns.
//
// The assumptions above aren’t valid for certain sanitizers. Under Address
// Sanitizer, some system calls are intercepted via interposition, so
// libclang_rt.asan_osx_dynamic.dylib’s `wrap_sendto`, which does pre- and
// post-processing around a call to `sendto`, will appear when the program
// expects to call `sendto` directly. ASan’s interceptors exist between the
// wrapping that this workaround implements and the underlying system call; the
// basis of implementation for the ones relevant to this workaround is llvm
// compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc. Unaware
// of the normally hidden secondary system call return register, ASan’s
// interceptors may clobber it before this workaround’s wrapper has an
// opportunity to examine it. Address Sanitizer and Thread Sanitizer’s runtime
// libraries both expose `wrap_sendto`, so don’t attempt this workaround in ASan
// or TSan builds.
//
// TODO(mark): In the future, when a version of macOS with a fix for FB19384824
// is published, amend this compile-time guard to add a condition that only
// enables this workaround so long as the minimum OS version being targeted is
// older than the OS version that contains the fix (such as via <Availability.h>
// __MAC_OS_X_VERSION_MIN_REQUIRED and __IPHONE_OS_VERSION_MIN_REQUIRED).
#if !defined(ADDRESS_SANITIZER) && !defined(THREAD_SANITIZER)
#define WORK_AROUND_CRBUG_40064248 1
namespace net {
// In-band sentinel that the `send` and `sendto` wrappers return when the bug's
// occurrence is detected. A successful return is never negative, and -1 is
// already taken by the normal errno-setting error return, so the sentinel has
// to be negative and different from -1. Signaling in-band lets callers swap
// `send` and `sendto` for their wrappers easily, and the wrappers remain
// usable with HANDLE_EINTR as appropriate.
inline constexpr ssize_t kSendBogusReturnValueDetected{-2};
static_assert(kSendBogusReturnValueDetected < 0,
              "sentinel must not be confusable with a successful return");
static_assert(kSendBogusReturnValueDetected != -1,
              "sentinel must not be confusable with an errno-setting error");
// Wrapper to substitute for a call to `sendto`. It takes the same arguments as
// `sendto`, and returns kSendBogusReturnValueDetected when the bug's occurrence
// is detected.
ssize_t SendtoAndDetectBogusReturnValue(int fd,
                                        const void* buffer,
                                        size_t size,
                                        int flags,
                                        const sockaddr* address,
                                        socklen_t address_size);
// Wrapper to substitute for a call to `send`. Because `send` is `sendto` with
// the final two arguments zeroed, this forwards to
// SendtoAndDetectBogusReturnValue with a null address and an address size of 0,
// and returns whatever that call returns.
inline ssize_t SendAndDetectBogusReturnValue(const int fd,
                                             const void* const buffer,
                                             const size_t size,
                                             const int flags) {
  // The two trailing `sendto` arguments that `send` leaves zeroed.
  constexpr const sockaddr* kNoAddress = nullptr;
  constexpr socklen_t kNoAddressSize = 0;
  return SendtoAndDetectBogusReturnValue(fd, buffer, size, flags, kNoAddress,
                                         kNoAddressSize);
}
} // namespace net
#endif // !ADDRESS_SANITIZER && !THREAD_SANITIZER
#endif // NET_SOCKET_SOCKET_APPLE_H_