From d06c596e571962f853df761fb36374d8bda0fcb7 Mon Sep 17 00:00:00 2001 From: SXX Date: Wed, 6 May 2026 18:51:03 +0800 Subject: [PATCH] Fix fd races in dup and fcntl duplication paths Take fd references under the fdtable lock when duplicating descriptors so dup(), dup3(), and fcntl(F_DUPFD*) cannot race with a concurrent close() and retain a freed fd. This also makes fd refcount updates use the fd lock and adds retained-fd helpers for fcntl paths that operate on an existing descriptor after the table lookup. Add a guest e2e regression test that stresses dup, dup2, and fcntl(F_DUPFD_CLOEXEC) concurrently. The same test fails on an unpatched tree and passes with this change. While tightening refcount handling, fix stdio initialization to transfer the original fd reference directly instead of reviving a zero-refcount fd. --- fs/fd.c | 155 +++++++++++++++++++++++++-------- fs/fd.h | 2 + kernel/init.c | 3 +- tests/e2e/fd_race/expected.txt | 1 + tests/e2e/fd_race/fd_race.c | 135 ++++++++++++++++++++++++++++ tests/e2e/fd_race/test.sh | 5 ++ 6 files changed, 264 insertions(+), 37 deletions(-) create mode 100644 tests/e2e/fd_race/expected.txt create mode 100644 tests/e2e/fd_race/fd_race.c create mode 100644 tests/e2e/fd_race/test.sh diff --git a/fs/fd.c b/fs/fd.c index 6bc39f5918..a3e4112ea4 100644 --- a/fs/fd.c +++ b/fs/fd.c @@ -26,13 +26,20 @@ struct fd *fd_create(const struct fd_ops *ops) { } struct fd *fd_retain(struct fd *fd) { + lock(&fd->lock); + assert(fd->refcount > 0); fd->refcount++; + unlock(&fd->lock); return fd; } int fd_close(struct fd *fd) { int err = 0; - if (--fd->refcount == 0) { + lock(&fd->lock); + assert(fd->refcount > 0); + bool destroy = --fd->refcount == 0; + unlock(&fd->lock); + if (destroy) { poll_cleanup_fd(fd); if (fd->ops->close) err = fd->ops->close(fd); @@ -127,7 +134,7 @@ struct fdtable *fdtable_copy(struct fdtable *table) { memcpy(new_table->files, table->files, sizeof(struct fd *) * size); for (fd_t f = 0; f < size; f++) if (new_table->files[f]) - new_table->files[f]->refcount++; + fd_retain(new_table->files[f]); memcpy(new_table->cloexec, table->cloexec, BITS_SIZE(size)); unlock(&table->lock); return new_table; @@ -155,6 +162,19 @@ struct fd *f_get(fd_t f) { return fd; } +struct fd *f_get_retain(fd_t f) { + lock(¤t->files->lock); + struct fd *fd = fdtable_get(current->files, f); + if (fd != NULL) + fd_retain(fd); + unlock(¤t->files->lock); + return fd; +} + +void f_put(struct fd *fd) { + fd_close(fd); +} + static fd_t f_install_start(struct fd *fd, fd_t start) { assert(start >= 0); struct fdtable *table = current->files; @@ -243,27 +263,41 @@ void fdtable_do_cloexec(struct fdtable *table) { dword_t sys_dup(fd_t f) { STRACE("dup(%d)", f); - struct fd *fd = f_get(f); - if (fd == NULL) + struct fdtable *table = current->files; + lock(&table->lock); + struct fd *fd = fdtable_get(table, f); + if (fd == NULL) { + unlock(&table->lock); return _EBADF; - fd->refcount++; - return f_install(fd, 0); + } + fd_retain(fd); + fd_t new_f = f_install_start(fd, 0); + unlock(&table->lock); + return new_f; } dword_t sys_dup3(fd_t f, fd_t new_f, int_t flags) { STRACE("dup3(%d, %d, %d)", f, new_f, flags); struct fdtable *table = current->files; - struct fd *fd = f_get(f); - if (fd == NULL) + lock(&table->lock); + struct fd *fd = fdtable_get(table, f); + if (fd == NULL) { + unlock(&table->lock); return _EBADF; + } int err = fdtable_expand(table, new_f); - if (err < 0) + if (err < 0) { + unlock(&table->lock); return err; + } fd_retain(fd); - f_close(new_f); + fdtable_close(table, new_f); table->files[new_f] = fd; if (flags & O_CLOEXEC_) bit_set(new_f, table->cloexec); + else + bit_clear(new_f, table->cloexec); + unlock(&table->lock); return new_f; } @@ -287,9 +321,7 @@ int fd_setflags(struct fd *fd, int flags) { dword_t sys_fcntl(fd_t f, dword_t cmd, dword_t arg) { struct fdtable *table = current->files; - struct fd *fd = f_get(f); - if (fd == NULL) - return _EBADF; + struct fd *fd; struct flock32_ flock32; struct flock_ flock; fd_t new_f; @@ -297,38 +329,79 @@ dword_t sys_fcntl(fd_t f, dword_t cmd, dword_t arg) { switch (cmd) { case F_DUPFD_: STRACE("fcntl(%d, F_DUPFD, %d)", f, arg); - fd->refcount++; - return f_install_start(fd, arg); + lock(&table->lock); + fd = fdtable_get(table, f); + if (fd == NULL) { + unlock(&table->lock); + return _EBADF; + } + fd_retain(fd); + new_f = f_install_start(fd, arg); + unlock(&table->lock); + return new_f; case F_DUPFD_CLOEXEC_: STRACE("fcntl(%d, F_DUPFD_CLOEXEC, %d)", f, arg); - fd->refcount++; + lock(&table->lock); + fd = fdtable_get(table, f); + if (fd == NULL) { + unlock(&table->lock); + return _EBADF; + } + fd_retain(fd); new_f = f_install_start(fd, arg); - bit_set(new_f, table->cloexec); + if (new_f >= 0) + bit_set(new_f, table->cloexec); + unlock(&table->lock); return new_f; case F_GETFD_: STRACE("fcntl(%d, F_GETFD)", f); - return bit_test(f, table->cloexec); + lock(&table->lock); + if (fdtable_get(table, f) == NULL) { + unlock(&table->lock); + return _EBADF; + } + err = bit_test(f, table->cloexec); + unlock(&table->lock); + return err; case F_SETFD_: STRACE("fcntl(%d, F_SETFD, 0x%x)", f, arg); + lock(&table->lock); + if (fdtable_get(table, f) == NULL) { + unlock(&table->lock); + return _EBADF; + } if (arg & 1) bit_set(f, table->cloexec); else bit_clear(f, table->cloexec); + unlock(&table->lock); return 0; + } + + fd = f_get_retain(f); + if (fd == NULL) + return _EBADF; + + switch (cmd) { + case F_GETFL_: STRACE("fcntl(%d, F_GETFL)", f); - return fd_getflags(fd); + err = fd_getflags(fd); + break; case F_SETFL_: STRACE("fcntl(%d, F_SETFL, %#x)", f, arg); - return fd_setflags(fd, arg); + err = fd_setflags(fd, arg); + break; case F_GETLK_: STRACE("fcntl(%d, F_GETLK, %#x)", f, arg); - if (user_read(arg, &flock32, sizeof(flock32))) - return _EFAULT; + if (user_read(arg, &flock32, sizeof(flock32))) { + err = _EFAULT; + break; + } flock.type = flock32.type; flock.whence = flock32.whence; flock.start = flock32.start; @@ -342,43 +415,55 @@ dword_t sys_fcntl(fd_t f, dword_t cmd, dword_t arg) { flock32.len = flock.len; flock32.pid = flock.pid; if (user_write(arg, &flock32, sizeof(flock32))) - return _EFAULT; + err = _EFAULT; } - return err; + break; case F_GETLK64_: STRACE("fcntl(%d, F_GETLK64, %#x)", f, arg); - if (user_read(arg, &flock, sizeof(flock))) - return _EFAULT; + if (user_read(arg, &flock, sizeof(flock))) { + err = _EFAULT; + break; + } err = fcntl_getlk(fd, &flock); if (err >= 0) if (user_write(arg, &flock, sizeof(flock))) - return _EFAULT; - return err; + err = _EFAULT; + break; case F_SETLK_: case F_SETLKW_: STRACE("fcntl(%d, F_SETLK%*s, %#x)", f, cmd == F_SETLKW_, "W", arg); - if (user_read(arg, &flock32, sizeof(flock32))) - return _EFAULT; + if (user_read(arg, &flock32, sizeof(flock32))) { + err = _EFAULT; + break; + } flock.type = flock32.type; flock.whence = flock32.whence; flock.start = flock32.start; flock.len = flock32.len; flock.pid = flock32.pid; - return fcntl_setlk(fd, &flock, cmd == F_SETLKW64_); + err = fcntl_setlk(fd, &flock, cmd == F_SETLKW64_); + break; case F_SETLK64_: case F_SETLKW64_: STRACE("fcntl(%d, F_SETLK%*s64, %#x)", f, cmd == F_SETLKW_, "W", arg); - if (user_read(arg, &flock, sizeof(flock))) - return _EFAULT; - return fcntl_setlk(fd, &flock, cmd == F_SETLKW_); + if (user_read(arg, &flock, sizeof(flock))) { + err = _EFAULT; + break; + } + err = fcntl_setlk(fd, &flock, cmd == F_SETLKW_); + break; default: STRACE("fcntl(%d, %d)", f, cmd); - return _EINVAL; + err = _EINVAL; + break; } + + f_put(fd); + return err; } dword_t sys_fcntl32(fd_t fd, dword_t cmd, dword_t arg) { diff --git a/fs/fd.h b/fs/fd.h index 4bd1f18353..c4b40eb7c5 100644 --- a/fs/fd.h +++ b/fs/fd.h @@ -184,6 +184,8 @@ void fdtable_do_cloexec(struct fdtable *table); struct fd *fdtable_get(struct fdtable *table, fd_t f); struct fd *f_get(fd_t f); +struct fd *f_get_retain(fd_t f); +void f_put(struct fd *fd); // steals a reference to the fd, gives it to the table on success and destroys it on error // flags is checked for O_CLOEXEC and O_NONBLOCK fd_t f_install(struct fd *fd, int flags); diff --git a/kernel/init.c b/kernel/init.c index 769d769959..8c7e02ab52 100644 --- a/kernel/init.c +++ b/kernel/init.c @@ -138,8 +138,7 @@ int create_stdio(const char *file, int major, int minor) { return err; } - fd->refcount = 0; - current->files->files[0] = fd_retain(fd); + current->files->files[0] = fd; current->files->files[1] = fd_retain(fd); current->files->files[2] = fd_retain(fd); return 0; diff --git a/tests/e2e/fd_race/expected.txt b/tests/e2e/fd_race/expected.txt new file mode 100644 index 0000000000..b5754e2037 --- /dev/null +++ b/tests/e2e/fd_race/expected.txt @@ -0,0 +1 @@ +ok \ No newline at end of file diff --git a/tests/e2e/fd_race/fd_race.c b/tests/e2e/fd_race/fd_race.c new file mode 100644 index 0000000000..3c60d9347c --- /dev/null +++ b/tests/e2e/fd_race/fd_race.c @@ -0,0 +1,135 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +enum { + DUP_THREADS = 4, + DUP_ITERS = 100000, +}; + +static int target_fd = -1; +static pthread_barrier_t start_barrier; +static volatile int failed = 0; +static volatile int active_dupers = DUP_THREADS; +static volatile long ebadf_count = 0; +static const char *target_path = "fd_race.c"; + +static void fail_errno(const char *op) { + perror(op); + failed = 1; +} + +static int check_duplicate(int fd, const char *op) { + if (fd < 0) { + if (errno == EBADF) { + __sync_fetch_and_add(&ebadf_count, 1); + fprintf(stderr, "%s saw EBADF while fd %d was being atomically replaced\n", op, target_fd); + } else { + perror(op); + } + failed = 1; + return -1; + } + if (close(fd) < 0) { + fail_errno("close(duplicate)"); + return -1; + } + return failed ? -1 : 0; +} + +static void *replace_thread(void *arg) { + (void) arg; + pthread_barrier_wait(&start_barrier); + while (!failed && __sync_fetch_and_add(&active_dupers, 0) > 0) { + int temp = open(target_path, O_RDONLY); + if (temp < 0) { + fail_errno("open"); + break; + } + if (dup2(temp, target_fd) < 0) { + int err = errno; + close(temp); + errno = err; + fail_errno("dup2"); + break; + } + if (close(temp) < 0) { + fail_errno("close(temp)"); + break; + } + sched_yield(); + } + return NULL; +} + +static void *dup_thread(void *arg) { + long id = (long) arg; + pthread_barrier_wait(&start_barrier); + for (int i = 0; i < DUP_ITERS && !failed; i++) { + errno = 0; + int duplicate; + const char *op; + if (((i + id) & 1) == 0) { + op = "dup"; + duplicate = dup(target_fd); + } else { + op = "fcntl(F_DUPFD_CLOEXEC)"; + duplicate = fcntl(target_fd, F_DUPFD_CLOEXEC, 32); + } + if (check_duplicate(duplicate, op) < 0) + break; + if ((i & 0xff) == 0) + sched_yield(); + } + __sync_sub_and_fetch(&active_dupers, 1); + return NULL; +} + +int main(void) { + target_fd = open(target_path, O_RDONLY); + if (target_fd < 0) { + perror("open target"); + return 1; + } + + pthread_t replacer; + pthread_t dupers[DUP_THREADS]; + if (pthread_barrier_init(&start_barrier, NULL, DUP_THREADS + 1) != 0) { + perror("pthread_barrier_init"); + close(target_fd); + return 1; + } + + if (pthread_create(&replacer, NULL, replace_thread, NULL) != 0) { + perror("pthread_create replacer"); + close(target_fd); + return 1; + } + for (long i = 0; i < DUP_THREADS; i++) { + if (pthread_create(&dupers[i], NULL, dup_thread, (void *) i) != 0) { + perror("pthread_create duper"); + failed = 1; + __sync_sub_and_fetch(&active_dupers, 1); + break; + } + } + + for (int i = 0; i < DUP_THREADS; i++) + pthread_join(dupers[i], NULL); + pthread_join(replacer, NULL); + pthread_barrier_destroy(&start_barrier); + close(target_fd); + + if (failed) { + fprintf(stderr, "fd race detected, ebadf_count=%ld\n", ebadf_count); + return 1; + } + + printf("ok\n"); + return 0; +} \ No newline at end of file diff --git a/tests/e2e/fd_race/test.sh b/tests/e2e/fd_race/test.sh new file mode 100644 index 0000000000..12a467770d --- /dev/null +++ b/tests/e2e/fd_race/test.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -eu + +gcc -O2 -pthread fd_race.c -o fd_race +./fd_race \ No newline at end of file