From a717ed5dc1717fe7c36c40e80948380d9d760e79 Mon Sep 17 00:00:00 2001 From: Kailun Qin Date: Fri, 26 Jan 2024 02:55:23 -0500 Subject: [PATCH] [LibOS] Add support for timerfd system calls This commit adds support for system calls that create and operate on a timer that delivers timer expiration notifications via a file descriptor, specifically: `timerfd_create()`, `timerfd_settime()` and `timerfd_gettime()`. The timerfd object is associated with a dummy eventfd created on the host to trigger notifications (e.g., in epoll). The object is created inside Gramine, with all its operations resolved entirely inside Gramine (note that the time source in Gramine SGX is still untrusted). The emulation is currently implemented at the level of a single process. All timerfds created in the parent process are marked as invalid in child processes. In multi-process applications, Gramine does not exit immediately after fork; it only exits if the application attempts to use timerfds in the child. Therefore, inter-process timing signals via timerfds are not allowed. LibOS regression tests are also added. 
Signed-off-by: Kailun Qin --- Documentation/devel/features.md | 30 +- libos/include/libos_fs.h | 3 +- libos/include/libos_handle.h | 17 +- libos/include/libos_table.h | 4 + libos/include/libos_utils.h | 8 +- libos/include/linux_abi/time.h | 31 +- libos/src/arch/x86_64/libos_table.c | 6 +- libos/src/fs/libos_fs.c | 1 + libos/src/fs/proc/thread.c | 1 + libos/src/fs/timerfd/fs.c | 146 ++++++++++ libos/src/libos_async.c | 50 ++-- libos/src/libos_parser.c | 104 ++++++- libos/src/meson.build | 2 + libos/src/sys/libos_alarm.c | 13 +- libos/src/sys/libos_epoll.c | 7 + libos/src/sys/libos_exit.c | 4 +- libos/src/sys/libos_ioctl.c | 4 +- libos/src/sys/libos_sleep.c | 1 + libos/src/sys/libos_time.c | 1 + libos/src/sys/libos_timerfd.c | 296 ++++++++++++++++++++ libos/test/ltp/ltp.cfg | 8 +- libos/test/regression/meson.build | 2 + libos/test/regression/test_libos.py | 12 + libos/test/regression/tests.toml | 2 + libos/test/regression/tests_musl.toml | 2 + libos/test/regression/timerfd.c | 388 ++++++++++++++++++++++++++ libos/test/regression/timerfd_fork.c | 67 +++++ 27 files changed, 1158 insertions(+), 52 deletions(-) create mode 100644 libos/src/fs/timerfd/fs.c create mode 100644 libos/src/sys/libos_timerfd.c create mode 100644 libos/test/regression/timerfd.c create mode 100644 libos/test/regression/timerfd_fork.c diff --git a/Documentation/devel/features.md b/Documentation/devel/features.md index 93f84e3b07..e334a624e4 100644 --- a/Documentation/devel/features.md +++ b/Documentation/devel/features.md @@ -1036,7 +1036,7 @@ The below list is generated from the [syscall table of Linux - ☒ `signalfd()` [7](#signals-and-process-state-changes) -- ☒ `timerfd_create()` +- ▣ `timerfd_create()` [20](#sleeps-timers-and-alarms) - ▣ `eventfd()` @@ -1045,10 +1045,10 @@ The below list is generated from the [syscall table of Linux - ▣ `fallocate()` [9a](#file-system-operations) -- ☒ `timerfd_settime()` +- ▣ `timerfd_settime()` [20](#sleeps-timers-and-alarms) -- ☒ `timerfd_gettime()` +- ▣ 
`timerfd_gettime()` [20](#sleeps-timers-and-alarms) - ☑ `accept4()` @@ -2891,9 +2891,23 @@ Gramine implements getting and setting the interval timer: `getitimer()` and `se Gramine implements alarm clocks via `alarm()`. +Gramine implements timers that notify via file descriptors: `timerfd_create()`, `timerfd_settime()` +and `timerfd_gettime()`. The timerfd object is created inside Gramine, and all operations are +resolved entirely inside Gramine (note that the time source in Gramine SGX is still untrusted). Each +timerfd object is associated with a dummy eventfd created on the host. This is purely for triggering +read notifications (e.g., in epoll); timerfd data is verified inside Gramine and is never exposed to +the host. Since the host is used purely for notifications, a malicious host can only induce Denial +of Service (DoS) attacks. `TFD_TIMER_CANCEL_ON_SET` is silently ignored because there are no +"discontinuous changes of time" in Gramine (via e.g., `settimeofday()`). `TFD_IOC_SET_TICKS` is not +supported. + +The emulation is currently implemented at the level of a single process. All timerfds created in the +parent process are marked as invalid in child processes. In multi-process applications, Gramine does +not exit immediately after fork; it only exits if the application attempts to use timerfds in the +child. Therefore, inter-process timing signals via timerfds are not allowed. + Gramine does *not* currently implement the POSIX per-process timer: `timer_create()`, etc. Gramine -also does not currently implement timers that notify via file descriptors. Gramine could implement -these timers in the future, if need arises. +could implement it in the future, if need arises.
Related system calls @@ -2909,9 +2923,9 @@ these timers in the future, if need arises. - ☒ `timer_getoverrun()`: may be implemented in the future - ☒ `timer_delete()`: may be implemented in the future -- ☒ `timerfd_create()`: may be implemented in the future -- ☒ `timerfd_settime()`: may be implemented in the future -- ☒ `timerfd_gettime()`: may be implemented in the future +- ▣ `timerfd_create()`: see the notes above +- ▣ `timerfd_settime()`: see the notes above +- ▣ `timerfd_gettime()`: see the notes above

diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h index 0590a9c8db..b101c041df 100644 --- a/libos/include/libos_fs.h +++ b/libos/include/libos_fs.h @@ -190,7 +190,7 @@ struct libos_fs_ops { int (*poll)(struct libos_handle* hdl, int in_events, int* out_events); /* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed - * ones. Used in e.g. secure eventfd FS to verify if the host is not lying to us. */ + * ones. Used in e.g. secure eventfd and timerfd FS to verify if the host is not lying to us. */ void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events); /* checkpoint/migrate the file system */ @@ -948,6 +948,7 @@ extern struct libos_fs eventfd_builtin_fs; extern struct libos_fs synthetic_builtin_fs; extern struct libos_fs path_builtin_fs; extern struct libos_fs shm_builtin_fs; +extern struct libos_fs timerfd_builtin_fs; struct libos_fs* find_fs(const char* name); diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h index d0920cff06..71331d7fb7 100644 --- a/libos/include/libos_handle.h +++ b/libos/include/libos_handle.h @@ -46,6 +46,7 @@ enum libos_handle_type { /* Special handles: */ TYPE_EPOLL, /* epoll handles, see `libos_epoll.c` */ TYPE_EVENTFD, /* eventfd handles, used by `eventfd` filesystem */ + TYPE_TIMERFD, /* timerfd handles, used by `timerfd` filesystem */ }; struct libos_pipe_handle { @@ -142,6 +143,18 @@ struct libos_eventfd_handle { uint64_t dummy_host_val; }; +struct libos_timerfd_handle { + bool broken_in_child; + + spinlock_t expiration_lock; /* protecting below fields */ + uint64_t num_expirations; + uint64_t dummy_host_val; + + spinlock_t timer_lock; /* protecting below fields */ + uint64_t timeout; /* always an absolute time */ + uint64_t reset; +}; + struct libos_handle { enum libos_handle_type type; bool is_dir; @@ -217,6 +230,8 @@ struct libos_handle { struct libos_epoll_handle epoll; /* TYPE_EPOLL */ struct libos_eventfd_handle eventfd; /* 
TYPE_EVENTFD */ + + struct libos_timerfd_handle timerfd; /* TYPE_TIMERFD */ } info; struct libos_dir_handle dir_info; @@ -232,7 +247,7 @@ struct libos_handle { * `read`, `seek` but not `pread`). This lock should be taken *before* `libos_handle.lock` and * `libos_inode.lock`. Must be used *only* via maybe_lock_pos_handle() and * maybe_unlock_pos_handle(); these functions make sure that the lock is acquired only on those - * handle types that are seekable (e.g. not on eventfds or pipes). */ + * handle types that are seekable (e.g. not on eventfds, timerfds or pipes). */ struct libos_lock pos_lock; }; diff --git a/libos/include/libos_table.h b/libos/include/libos_table.h index e204aaf6de..1b8840ff24 100644 --- a/libos/include/libos_table.h +++ b/libos/include/libos_table.h @@ -220,3 +220,7 @@ long libos_syscall_getrandom(char* buf, size_t count, unsigned int flags); long libos_syscall_mlock2(unsigned long start, size_t len, int flags); long libos_syscall_sysinfo(struct sysinfo* info); long libos_syscall_close_range(unsigned int first, unsigned int last, unsigned int flags); +long libos_syscall_timerfd_create(int clockid, int flags); +long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value, + struct __kernel_itimerspec* ovalue); +long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value); diff --git a/libos/include/libos_utils.h b/libos/include/libos_utils.h index a4298a50ef..bdae8966bc 100644 --- a/libos/include/libos_utils.h +++ b/libos/include/libos_utils.h @@ -52,8 +52,14 @@ void clean_link_map_list(void); int create_pipe(char* name, char* uri, size_t size, PAL_HANDLE* hdl, bool use_vmid_for_name); /* Asynchronous event support */ +enum async_event_type { + ASYNC_EVENT_TYPE_IO = 1, + ASYNC_EVENT_TYPE_ALARM_TIMER = 2, +}; + int init_async_worker(void); -int64_t install_async_event(PAL_HANDLE object, unsigned long time, +int64_t install_async_event(enum async_event_type type, PAL_HANDLE object, + unsigned long 
time_us, bool absolute_time, void (*callback)(IDTYPE caller, void* arg), void* arg); void terminate_async_worker(void); diff --git a/libos/include/linux_abi/time.h b/libos/include/linux_abi/time.h index da848822de..303d184c0b 100644 --- a/libos/include/linux_abi/time.h +++ b/libos/include/linux_abi/time.h @@ -9,11 +9,11 @@ /* These need to be binary-identical with the ones used by Linux. */ // TODO: remove all of these includes and make this header libc-independent. -#include -#include -#include #include +typedef long __kernel_suseconds_t; +typedef long __kernel_time_t; + typedef __kernel_time_t time_t; #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) @@ -37,3 +37,28 @@ struct __kernel_timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ }; + +/* The IDs of the various system clocks (for POSIX.1b interval timers). */ +#define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 1 +#define CLOCK_PROCESS_CPUTIME_ID 2 +#define CLOCK_THREAD_CPUTIME_ID 3 +#define CLOCK_MONOTONIC_RAW 4 +#define CLOCK_REALTIME_COARSE 5 +#define CLOCK_MONOTONIC_COARSE 6 +#define CLOCK_BOOTTIME 7 +#define CLOCK_REALTIME_ALARM 8 +#define CLOCK_BOOTTIME_ALARM 9 + +#define MAX_CLOCKS 16 + +#define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_CANCEL_ON_SET (1 << 1) +#define TFD_CLOEXEC O_CLOEXEC +#define TFD_NONBLOCK O_NONBLOCK + +#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) +/* Flags for timerfd_create. */ +#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS +/* Flags for timerfd_settime. 
*/ +#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) diff --git a/libos/src/arch/x86_64/libos_table.c b/libos/src/arch/x86_64/libos_table.c index 480f81259a..fbe6d3c49b 100644 --- a/libos/src/arch/x86_64/libos_table.c +++ b/libos/src/arch/x86_64/libos_table.c @@ -297,11 +297,11 @@ libos_syscall_t libos_syscall_table[LIBOS_SYSCALL_BOUND] = { [__NR_utimensat] = (libos_syscall_t)0, // libos_syscall_utimensat [__NR_epoll_pwait] = (libos_syscall_t)libos_syscall_epoll_pwait, [__NR_signalfd] = (libos_syscall_t)0, // libos_syscall_signalfd - [__NR_timerfd_create] = (libos_syscall_t)0, // libos_syscall_timerfd_create + [__NR_timerfd_create] = (libos_syscall_t)libos_syscall_timerfd_create, [__NR_eventfd] = (libos_syscall_t)libos_syscall_eventfd, [__NR_fallocate] = (libos_syscall_t)libos_syscall_fallocate, - [__NR_timerfd_settime] = (libos_syscall_t)0, // libos_syscall_timerfd_settime - [__NR_timerfd_gettime] = (libos_syscall_t)0, // libos_syscall_timerfd_gettime + [__NR_timerfd_settime] = (libos_syscall_t)libos_syscall_timerfd_settime, + [__NR_timerfd_gettime] = (libos_syscall_t)libos_syscall_timerfd_gettime, [__NR_accept4] = (libos_syscall_t)libos_syscall_accept4, [__NR_signalfd4] = (libos_syscall_t)0, // libos_syscall_signalfd4 [__NR_eventfd2] = (libos_syscall_t)libos_syscall_eventfd2, diff --git a/libos/src/fs/libos_fs.c b/libos/src/fs/libos_fs.c index 5a29a36d6d..f10aefd74b 100644 --- a/libos/src/fs/libos_fs.c +++ b/libos/src/fs/libos_fs.c @@ -33,6 +33,7 @@ static struct libos_fs* g_builtin_fs[] = { &synthetic_builtin_fs, &path_builtin_fs, &shm_builtin_fs, + &timerfd_builtin_fs, }; static struct libos_lock g_mount_mgr_lock; diff --git a/libos/src/fs/proc/thread.c b/libos/src/fs/proc/thread.c index c3da147c48..ed1fa1a95a 100644 --- a/libos/src/fs/proc/thread.c +++ b/libos/src/fs/proc/thread.c @@ -287,6 +287,7 @@ static char* describe_handle(struct libos_handle* hdl) { case TYPE_EPOLL: str = "epoll:[?]"; break; case TYPE_EVENTFD: str = "eventfd:[?]"; 
break; case TYPE_SHM: str = "shm:[?]"; break; + case TYPE_TIMERFD: str = "timerfd:[?]"; break; default: str = "unknown:[?]"; break; } return strdup(str); diff --git a/libos/src/fs/timerfd/fs.c b/libos/src/fs/timerfd/fs.c new file mode 100644 index 0000000000..502ce37c2b --- /dev/null +++ b/libos/src/fs/timerfd/fs.c @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* + * This file contains code for implementation of "timerfd" filesystem. For more information, see + * `libos/src/sys/libos_timerfd.c`. + */ + +#include "libos_fs.h" +#include "libos_handle.h" +#include "libos_internal.h" +#include "libos_lock.h" +#include "linux_abi/errors.h" +#include "pal.h" + +/* Enforce a restriction that all timerfds created in the parent process are marked as invalid in + * child processes, i.e. inter-process timing signals via timerfds are not allowed. This restriction + * is because LibOS doesn't yet implement sync between timerfd objects. */ +static int timerfd_checkin(struct libos_handle* hdl) { + assert(hdl->type == TYPE_TIMERFD); + hdl->info.timerfd.broken_in_child = true; + return 0; +} + +/* This implementation is the same as `eventfd_dummy_host_read()` in "fs/eventfd/fs.c". */ +static void timerfd_dummy_host_read(struct libos_handle* hdl) { + int ret; + uint64_t buf_dummy_host_val = 0; + size_t dummy_host_val_count = sizeof(buf_dummy_host_val); + do { + ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count, + &buf_dummy_host_val); + } while (ret == PAL_ERROR_INTERRUPTED); + if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) { + /* must not happen in benign case, consider it an attack and panic */ + BUG(); + } +} + +/* This implementation is the same as `eventfd_dummy_host_wait()` in "fs/eventfd/fs.c". 
*/ +static void timerfd_dummy_host_wait(struct libos_handle* hdl) { + pal_wait_flags_t wait_for_events = PAL_WAIT_READ; + pal_wait_flags_t ret_events = 0; + int ret = PalStreamsWaitEvents(1, &hdl->pal_handle, &wait_for_events, &ret_events, NULL); + if (ret < 0 && ret != PAL_ERROR_INTERRUPTED) { + BUG(); + } + (void)ret_events; /* we don't care what events the host returned, we can't trust them anyway */ +} + +static ssize_t timerfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) { + __UNUSED(pos); + assert(hdl->type == TYPE_TIMERFD); + + if (count < sizeof(uint64_t)) + return -EINVAL; + + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. This is " + "disallowed in Gramine."); + return -EIO; + } + + int ret; + spinlock_lock(&hdl->info.timerfd.expiration_lock); + + while (!hdl->info.timerfd.num_expirations) { + if (hdl->flags & O_NONBLOCK) { + ret = -EAGAIN; + goto out; + } + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + timerfd_dummy_host_wait(hdl); + spinlock_lock(&hdl->info.timerfd.expiration_lock); + } + + memcpy(buf, &hdl->info.timerfd.num_expirations, sizeof(uint64_t)); + hdl->info.timerfd.num_expirations = 0; + + /* perform a read (not supposed to block) to clear the event from polling threads */ + if (hdl->info.timerfd.dummy_host_val) { + timerfd_dummy_host_read(hdl); + hdl->info.timerfd.dummy_host_val = 0; + } + + ret = (ssize_t)count; +out: + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false); + return ret; +} + +static void timerfd_post_poll(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events) { + assert(hdl->type == TYPE_TIMERFD); + + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. 
This is " + "disallowed in Gramine."); + *pal_ret_events = PAL_WAIT_ERROR; + return; + } + + if (*pal_ret_events & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP | PAL_WAIT_WRITE)) { + /* impossible: we control timerfd inside the LibOS, and we never raise such conditions */ + BUG(); + } + + spinlock_lock(&hdl->info.timerfd.expiration_lock); + if (*pal_ret_events & PAL_WAIT_READ) { + /* there is data to read: verify if timerfd has number of expirations greater than zero */ + if (!hdl->info.timerfd.num_expirations) { + /* spurious or malicious notification, can legitimately happen if another thread + * consumed this event between this thread's poll wakeup and the post_poll callback; + * we currently choose to return a spurious notification to the user */ + *pal_ret_events &= ~PAL_WAIT_READ; + } + } + spinlock_unlock(&hdl->info.timerfd.expiration_lock); +} + +static int timerfd_close(struct libos_handle* hdl) { + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. 
This is " + "disallowed in Gramine."); + return -EIO; + } + + /* cancel the pending timerfd object */ + return install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle, + /*time_us=*/0, /*absolute_time=*/false, /*callback=*/NULL, + /*arg=*/NULL); +} + +struct libos_fs_ops timerfd_fs_ops = { + .checkin = &timerfd_checkin, + .read = &timerfd_read, + .close = &timerfd_close, + .post_poll = &timerfd_post_poll, +}; + +struct libos_fs timerfd_builtin_fs = { + .name = "timerfd", + .fs_ops = &timerfd_fs_ops, +}; diff --git a/libos/src/libos_async.c b/libos/src/libos_async.c index d05af32691..310518316c 100644 --- a/libos/src/libos_async.c +++ b/libos/src/libos_async.c @@ -15,12 +15,13 @@ DEFINE_LIST(async_event); struct async_event { + enum async_event_type type; IDTYPE caller; /* thread installing this event */ LIST_TYPE(async_event) list; LIST_TYPE(async_event) triggered_list; void (*callback)(IDTYPE caller, void* arg); void* arg; - PAL_HANDLE object; /* handle (async IO) to wait on */ + PAL_HANDLE object; /* handle (async IO or timerfd) to wait on */ uint64_t expire_time_us; /* alarm/timer to wait on */ }; DEFINE_LISTP(async_event); @@ -35,25 +36,26 @@ static struct libos_lock async_worker_lock; /* TODO: use async_worker_thread->pollable_event instead */ static struct libos_pollable_event install_new_event; -/* Threads register async events like alarm(), setitimer(), ioctl(FIOASYNC) - * using this function. These events are enqueued in async_list and delivered - * to async worker thread by triggering install_new_event. When event is - * triggered in async worker thread, the corresponding event's callback with - * arguments `arg` is called. This callback typically sends a signal to the +/* Threads register async events like alarm(), setitimer(), timerfd_settime(), ioctl(FIOASYNC) using + * this function. These events are enqueued in async_list and delivered to async worker thread by + * triggering install_new_event. 
When event is triggered in async worker thread, the corresponding + * event's callback with arguments `arg` is called. This callback typically sends a signal to the * thread which registered the event (saved in `event->caller`). * - * We distinguish between alarm/timer events and async IO events: - * - alarm/timer events set object = NULL and time_us = microseconds - * (time_us = 0 cancels all pending alarms/timers). + * The async event type is specified in `type`. Alarm/timer events and async IO events are currently + * supported: + * - alarm/timer events set time_us = microseconds (time_us = 0 cancels all pending + * alarms/timers). Specifically, when object != NULL, this indicates a timerfd event. * - async IO events set object = handle and time_us = 0. * - * Function returns remaining usecs for alarm/timer events (same as alarm()) - * or 0 for async IO events. On error, it returns a negated error code. + * Function returns remaining usecs for alarm/timer events (same as alarm()) or 0 for async IO + * events. On error, it returns a negated error code. */ -int64_t install_async_event(PAL_HANDLE object, uint64_t time_us, +int64_t install_async_event(enum async_event_type type, PAL_HANDLE object, + uint64_t time_us, bool absolute_time, void (*callback)(IDTYPE caller, void* arg), void* arg) { - /* if event happens on object, time_us must be zero */ - assert(!object || (object && !time_us)); + assert((type == ASYNC_EVENT_TYPE_ALARM_TIMER) || + (type == ASYNC_EVENT_TYPE_IO && (!object || !time_us))); uint64_t now_us = 0; int ret = PalSystemTimeQuery(&now_us); @@ -68,21 +70,22 @@ int64_t install_async_event(PAL_HANDLE object, uint64_t time_us, return -ENOMEM; } + event->type = type; event->callback = callback; event->arg = arg; event->caller = get_cur_tid(); event->object = object; - event->expire_time_us = time_us ? now_us + time_us : 0; + event->expire_time_us = time_us ? (absolute_time ? 
time_us : now_us + time_us) : 0; lock(&async_worker_lock); - if (callback != &cleanup_thread && !object) { - /* This is alarm() or setitimer() emulation, treat both according to - * alarm() syscall semantics: cancel any pending alarm/timer. */ + if (callback != &cleanup_thread && type == ASYNC_EVENT_TYPE_ALARM_TIMER) { + /* This is alarm(), setitimer(), timerfd_settime() emulation, treat all according to alarm() + * syscall semantics: cancel any pending alarm/timer. */ struct async_event* tmp; struct async_event* n; LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { - if (tmp->expire_time_us) { + if (tmp->object == object && tmp->expire_time_us) { /* this is a pending alarm/timer, cancel it and save its expiration time */ if (max_prev_expire_time_us < tmp->expire_time_us) max_prev_expire_time_us = tmp->expire_time_us; @@ -164,7 +167,7 @@ static int libos_async_worker(void* arg) { struct async_event* n; LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { /* repopulate `pals` with IO events and find the next expiring alarm/timer */ - if (tmp->object) { + if (tmp->type == ASYNC_EVENT_TYPE_IO && tmp->object) { if (pals_cnt == pals_max_cnt) { /* grow `pals` to accommodate more objects */ PAL_HANDLE* tmp_pals = malloc(sizeof(*tmp_pals) * (1 + pals_max_cnt * 2)); @@ -200,7 +203,8 @@ static int libos_async_worker(void* arg) { pal_events[pals_cnt + 1] = PAL_WAIT_READ; ret_events[pals_cnt + 1] = 0; pals_cnt++; - } else if (tmp->expire_time_us && tmp->expire_time_us > now_us) { + } else if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER && tmp->expire_time_us && + tmp->expire_time_us > now_us) { if (!next_expire_time_us || next_expire_time_us > tmp->expire_time_us) { /* use time of the next expiring alarm/timer */ next_expire_time_us = tmp->expire_time_us; @@ -252,7 +256,7 @@ static int libos_async_worker(void* arg) { /* check if this event is an IO event found in async_list */ LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { - if (tmp->object == pals[i]) { + if 
(tmp->type == ASYNC_EVENT_TYPE_IO && tmp->object == pals[i]) { log_debug("Async IO event triggered at %lu", now_us); LISTP_ADD_TAIL(tmp, &triggered, triggered_list); break; @@ -282,7 +286,7 @@ static int libos_async_worker(void* arg) { LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &triggered, triggered_list) { LISTP_DEL(tmp, &triggered, triggered_list); tmp->callback(tmp->caller, tmp->arg); - if (!tmp->object) { + if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER) { /* this is a one-off exit-child or alarm/timer event */ free(tmp); } diff --git a/libos/src/libos_parser.c b/libos/src/libos_parser.c index f1f470cc41..641314e476 100644 --- a/libos/src/libos_parser.c +++ b/libos/src/libos_parser.c @@ -21,6 +21,7 @@ #include "linux_abi/sched.h" #include "linux_abi/signals.h" #include "linux_abi/syscalls_nr_arch.h" +#include "linux_abi/time.h" #include "socket_utils.h" static void parse_open_flags(struct print_buf*, va_list*); @@ -38,6 +39,7 @@ static void parse_sigprocmask_how(struct print_buf*, va_list*); static void parse_msync_flags(struct print_buf*, va_list*); static void parse_madvise_behavior(struct print_buf*, va_list* ap); static void parse_timespec(struct print_buf*, va_list*); +static void parse_itimerspec(struct print_buf*, va_list*); static void parse_sockaddr(struct print_buf*, va_list*); static void parse_domain(struct print_buf*, va_list*); static void parse_socktype(struct print_buf*, va_list*); @@ -53,6 +55,9 @@ static void parse_getrandom_flags(struct print_buf*, va_list*); static void parse_epoll_op(struct print_buf*, va_list*); static void parse_epoll_event(struct print_buf* buf, va_list* ap); static void parse_close_range_flags(struct print_buf* buf, va_list* ap); +static void parse_clockid(struct print_buf* buf, va_list* ap); +static void parse_timerfd_create_flags(struct print_buf* buf, va_list* ap); +static void parse_timerfd_settime_flags(struct print_buf* buf, va_list* ap); static void parse_string_arg(struct print_buf*, va_list* ap); static void 
parse_pointer_arg(struct print_buf*, va_list* ap); @@ -522,13 +527,17 @@ struct parser_table { parse_integer_arg, parse_pointer_arg, parse_integer_arg, parse_integer_arg, parse_pointer_arg, parse_pointer_arg}}, [__NR_signalfd] = {.slow = false, .name = "signalfd", .parser = {NULL}}, - [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {NULL}}, + [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {parse_long_arg, + parse_clockid, parse_timerfd_create_flags}}, [__NR_eventfd] = {.slow = false, .name = "eventfd", .parser = {parse_long_arg, parse_integer_arg}}, [__NR_fallocate] = {.slow = false, .name = "fallocate", .parser = {parse_long_arg, parse_integer_arg, parse_integer_arg, parse_long_arg, parse_long_arg}}, - [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {NULL}}, - [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {NULL}}, + [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {parse_long_arg, + parse_integer_arg, parse_timerfd_settime_flags, parse_itimerspec, + parse_itimerspec}}, + [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {parse_long_arg, + parse_integer_arg, parse_itimerspec}}, [__NR_accept4] = {.slow = true, .name = "accept4", .parser = {parse_long_arg, parse_integer_arg, parse_pointer_arg, parse_pointer_arg, parse_integer_arg}}, [__NR_signalfd4] = {.slow = false, .name = "signalfd4", .parser = {NULL}}, @@ -1145,6 +1154,24 @@ static void parse_timespec(struct print_buf* buf, va_list* ap) { buf_printf(buf, "[%ld,%ld]", tv->tv_sec, tv->tv_nsec); } +static void parse_itimerspec(struct print_buf* buf, va_list* ap) { + const struct itimerspec* it = va_arg(*ap, const struct itimerspec*); + + if (!it) { + buf_puts(buf, "NULL"); + return; + } + + if (!is_user_memory_readable((void*)it, sizeof(*it))) { + buf_printf(buf, "(invalid-addr %p)", it); + return; + } + + buf_printf(buf, "intvl:[%ld,%ld] 
val:[%ld,%ld]", + it->it_interval.tv_sec, it->it_interval.tv_nsec, + it->it_value.tv_sec, it->it_value.tv_nsec); +} + static void parse_sockaddr(struct print_buf* buf, va_list* ap) { void* addr = va_arg(*ap, void*); @@ -1621,6 +1648,77 @@ static void parse_close_range_flags(struct print_buf* buf, va_list* ap) { buf_printf(buf, "|0x%x", flags); } +static void parse_clockid(struct print_buf* buf, va_list* ap) { + int clockid = va_arg(*ap, int); + switch (clockid) { + case CLOCK_REALTIME: + buf_puts(buf, "CLOCK_REALTIME"); + break; + case CLOCK_MONOTONIC: + buf_puts(buf, "CLOCK_MONOTONIC"); + break; + case CLOCK_PROCESS_CPUTIME_ID: + buf_puts(buf, "CLOCK_PROCESS_CPUTIME_ID"); + break; + case CLOCK_THREAD_CPUTIME_ID: + buf_puts(buf, "CLOCK_THREAD_CPUTIME_ID"); + break; + case CLOCK_MONOTONIC_RAW: + buf_puts(buf, "CLOCK_MONOTONIC_RAW"); + break; + case CLOCK_REALTIME_COARSE: + buf_puts(buf, "CLOCK_REALTIME_COARSE"); + break; + case CLOCK_MONOTONIC_COARSE: + buf_puts(buf, "CLOCK_MONOTONIC_COARSE"); + break; + case CLOCK_BOOTTIME: + buf_puts(buf, "CLOCK_BOOTTIME"); + break; + case CLOCK_REALTIME_ALARM: + buf_puts(buf, "CLOCK_REALTIME_ALARM"); + break; + case CLOCK_BOOTTIME_ALARM: + buf_puts(buf, "CLOCK_BOOTTIME_ALARM"); + break; + default: + buf_printf(buf, "(unknown: %d)", clockid); + break; + } +} + +static void parse_timerfd_create_flags(struct print_buf* buf, va_list* ap) { + int flags = va_arg(*ap, int); + +#define FLG(n) \ + { #n, n } + const struct flag_table all_flags[] = { + FLG(TFD_NONBLOCK), + FLG(TFD_CLOEXEC), + }; +#undef FLG + + flags = parse_flags(buf, flags, all_flags, ARRAY_SIZE(all_flags)); + if (flags) + buf_printf(buf, "|0x%x", flags); +} + +static void parse_timerfd_settime_flags(struct print_buf* buf, va_list* ap) { + int flags = va_arg(*ap, int); + +#define FLG(n) \ + { #n, n } + const struct flag_table all_flags[] = { + FLG(TFD_TIMER_ABSTIME), + FLG(TFD_TIMER_CANCEL_ON_SET), + }; +#undef FLG + + flags = parse_flags(buf, flags, all_flags, 
ARRAY_SIZE(all_flags)); + if (flags) + buf_printf(buf, "|0x%x", flags); +} + static void parse_string_arg(struct print_buf* buf, va_list* ap) { const char* arg = va_arg(*ap, const char*); if (is_user_string_readable(arg)) { diff --git a/libos/src/meson.build b/libos/src/meson.build index 5a262a8160..2f3bf32b09 100644 --- a/libos/src/meson.build +++ b/libos/src/meson.build @@ -43,6 +43,7 @@ libos_sources = files( 'fs/sys/cpu_info.c', 'fs/sys/fs.c', 'fs/sys/node_info.c', + 'fs/timerfd/fs.c', 'fs/tmpfs/fs.c', 'gramine_hash.c', 'ipc/libos_ipc.c', @@ -101,6 +102,7 @@ libos_sources = files( 'sys/libos_socket.c', 'sys/libos_stat.c', 'sys/libos_time.c', + 'sys/libos_timerfd.c', 'sys/libos_uname.c', 'sys/libos_wait.c', 'sys/libos_wrappers.c', diff --git a/libos/src/sys/libos_alarm.c b/libos/src/sys/libos_alarm.c index 0ccecfce25..ee683fd38e 100644 --- a/libos/src/sys/libos_alarm.c +++ b/libos/src/sys/libos_alarm.c @@ -35,7 +35,8 @@ static void signal_alarm(IDTYPE caller, void* arg) { long libos_syscall_alarm(unsigned int seconds) { uint64_t usecs = 1000000ULL * seconds; - int64_t ret = install_async_event(NULL, usecs, &signal_alarm, NULL); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + usecs, /*absolute_time=*/false, &signal_alarm, /*arg=*/NULL); if (ret < 0) return ret; @@ -66,8 +67,9 @@ static void signal_itimer(IDTYPE caller, void* arg) { spinlock_unlock(&g_real_itimer_lock); if (next_reset) { - int64_t ret = install_async_event(/*object=*/NULL, next_reset, &signal_itimer, - /*arg=*/NULL); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + next_reset, /*absolute_time=*/false, + &signal_itimer, /*arg=*/NULL); if (ret < 0) { log_error( "failed to re-enqueue the next timer event initially set up by 'setitimer()': %s", @@ -113,8 +115,9 @@ long libos_syscall_setitimer(int which, struct __kernel_itimerval* value, : 0; uint64_t current_reset = g_real_itimer.reset; - int64_t install_ret = 
install_async_event(NULL, next_value, &signal_itimer, /*arg=*/NULL); - + int64_t install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + next_value, /*absolute_time=*/false, + &signal_itimer, /*arg=*/NULL); if (install_ret < 0) { spinlock_unlock(&g_real_itimer_lock); return install_ret; diff --git a/libos/src/sys/libos_epoll.c b/libos/src/sys/libos_epoll.c index b3e7a058fa..3694e1a3f5 100644 --- a/libos/src/sys/libos_epoll.c +++ b/libos/src/sys/libos_epoll.c @@ -189,6 +189,12 @@ void maybe_epoll_et_trigger(struct libos_handle* handle, int ret, bool in, bool needs_et = true; } break; + case TYPE_TIMERFD: + /* timerfd in edge-triggered mode will notify only on expiration state changes (i.e., + * if the expiration count is not read, the timerfd remains in an expired state, and no + * notification will be triggered) */ + needs_et = in; + break; default: /* Type unsupported with EPOLLET. */ break; @@ -461,6 +467,7 @@ long libos_syscall_epoll_ctl(int epfd, int op, int fd, struct epoll_event* event case TYPE_PIPE: case TYPE_SOCK: case TYPE_EVENTFD: + case TYPE_TIMERFD: break; default: /* epoll not supported by this type of handle */ diff --git a/libos/src/sys/libos_exit.c b/libos/src/sys/libos_exit.c index fbaee29b7b..afbdb3ce1e 100644 --- a/libos/src/sys/libos_exit.c +++ b/libos/src/sys/libos_exit.c @@ -101,7 +101,9 @@ noreturn void thread_exit(int error_code, int term_signal) { cur_thread->clear_child_tid_pal = 1; /* any non-zero value suffices */ /* We pass this ownership to `cleanup_thread`. */ get_thread(cur_thread); - int64_t ret = install_async_event(NULL, 0, &cleanup_thread, cur_thread); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + /*time_us=*/0, /*absolute_time=*/false, &cleanup_thread, + cur_thread); /* Take the reference to the current thread from the tcb. 
*/
     lock(&cur_thread->lock);
diff --git a/libos/src/sys/libos_ioctl.c b/libos/src/sys/libos_ioctl.c
index 89d5424da9..8bbad6efa4 100644
--- a/libos/src/sys/libos_ioctl.c
+++ b/libos/src/sys/libos_ioctl.c
@@ -104,7 +104,9 @@ long libos_syscall_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) {
             rwlock_write_unlock(&handle_map->lock);
             break;
         case FIOASYNC:
-            ret = install_async_event(hdl->pal_handle, 0, &signal_io, NULL);
+            ret = install_async_event(ASYNC_EVENT_TYPE_IO, hdl->pal_handle,
+                                      /*time_us=*/0, /*absolute_time=*/false, &signal_io,
+                                      /*arg=*/NULL);
             break;
         case FIONREAD: {
             if (!is_user_memory_writable((void*)arg, sizeof(int))) {
diff --git a/libos/src/sys/libos_sleep.c b/libos/src/sys/libos_sleep.c
index 56fc8b12b9..c01af88992 100644
--- a/libos/src/sys/libos_sleep.c
+++ b/libos/src/sys/libos_sleep.c
@@ -11,6 +11,7 @@
 #include "libos_thread.h"
 #include "libos_utils.h"
 #include "linux_abi/errors.h"
+#include "linux_abi/time.h"
 #include "pal.h"
 
 long libos_syscall_pause(void) {
diff --git a/libos/src/sys/libos_time.c b/libos/src/sys/libos_time.c
index cf34c334ee..60316080bc 100644
--- a/libos/src/sys/libos_time.c
+++ b/libos/src/sys/libos_time.c
@@ -8,6 +8,7 @@
 #include "libos_internal.h"
 #include "libos_table.h"
 #include "linux_abi/errors.h"
+#include "linux_abi/time.h"
 #include "pal.h"
 
 long libos_syscall_gettimeofday(struct __kernel_timeval* tv, struct __kernel_timezone* tz) {
diff --git a/libos/src/sys/libos_timerfd.c b/libos/src/sys/libos_timerfd.c
new file mode 100644
index 0000000000..63d0506194
--- /dev/null
+++ b/libos/src/sys/libos_timerfd.c
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ *                    Kailun Qin
+ */
+
+/* Implementation of "timerfd" system calls.
+ *
+ * The timerfd object is created inside the LibOS, and all operations are resolved entirely inside
+ * the LibOS (note that the time source in Gramine SGX is still untrusted). Each timerfd object is
+ * associated with a dummy eventfd created on the host. This is purely for triggering read
+ * notifications (e.g., in epoll); timerfd data is verified inside the LibOS and is never exposed to
+ * the host. Since the host is used purely for notifications, a malicious host can only induce
+ * Denial of Service (DoS) attacks.
+ *
+ * The emulation is currently implemented at the level of a single process. All timerfds created in
+ * the parent process are marked as invalid in child processes. In multi-process applications,
+ * Gramine does not exit immediately after fork; it only exits if the application attempts to use
+ * timerfds in the child. Therefore, inter-process timing signals via timerfds are not allowed.
+ *
+ * The host's eventfd object is "dummy" and used purely for notifications -- to unblock blocking
+ * read/select/poll/epoll system calls. The read notify logic is already hardened, by
+ * double-checking that the timerfd object indeed expired. However, there are three possible attacks
+ * on polling mechanisms (select/poll/epoll):
+ *
+ * a. Malicious host may inject the notification too early: POLLIN when no timer has expired yet.
+ *    This may lead to a synchronization failure of the app. To prevent this, timerfd implements a
+ *    callback `post_poll()` where it verifies that a timer has indeed expired (i.e., that the
+ *    notification is not spurious).
+ * b. Malicious host may inject the notification too late or not send a notification at all.
+ *    This is a Denial of Service (DoS), which we don't care about.
+ * c. Malicious host may inject POLLERR, POLLHUP, POLLRDHUP, POLLNVAL, POLLOUT. This is impossible
+ *    as we control timerfd objects inside the LibOS, and we never raise such conditions. So the
+ *    callback `post_poll()` panics if it detects such a return event.
+ */
+
+#include "libos_checkpoint.h"
+#include "libos_fs.h"
+#include "libos_handle.h"
+#include "libos_internal.h"
+#include "libos_table.h"
+#include "libos_utils.h"
+#include "linux_abi/fs.h"
+#include "linux_abi/time.h"
+#include "linux_eventfd.h"
+#include "pal.h"
+
+/* This implementation is the same as `eventfd_dummy_host_write()` in "fs/eventfd/fs.c". */
+static void timerfd_dummy_host_write(struct libos_handle* hdl) {
+    int ret;
+    uint64_t buf_dummy_host_val = 1;
+    size_t dummy_host_val_count = sizeof(buf_dummy_host_val);
+    do {
+        ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count,
+                             &buf_dummy_host_val);
+    } while (ret == PAL_ERROR_INTERRUPTED);
+    if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) {
+        /* must not happen in benign case, consider it an attack and panic */
+        BUG();
+    }
+}
+
+static int create_timerfd_pal_handle(PAL_HANDLE* out_pal_handle) {
+    PAL_HANDLE hdl = NULL;
+
+    int ret = PalStreamOpen(URI_PREFIX_EVENTFD, PAL_ACCESS_RDWR, /*share_flags=*/0,
+                            PAL_CREATE_IGNORED, /*options=*/0, &hdl);
+    if (ret < 0) {
+        log_error("timerfd: dummy host eventfd creation failure");
+        return pal_to_unix_errno(ret);
+    }
+
+    *out_pal_handle = hdl;
+    return 0;
+}
+
+/* Creates a timerfd handle backed by a dummy host eventfd and installs it as a new fd. */
+long libos_syscall_timerfd_create(int clockid, int flags) {
+    int ret;
+
+    if ((flags & ~TFD_CREATE_FLAGS) ||
+        (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME &&
+         clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME &&
+         clockid != CLOCK_BOOTTIME_ALARM))
+        return -EINVAL;
+
+    if (clockid != CLOCK_REALTIME) {
+        if (FIRST_TIME()) {
+            log_warning("Unsupported clockid in 'timerfd_create()'; replaced by the system-wide "
+                        "real-time clock.");
+        }
+    }
+
+    struct libos_handle* hdl = get_new_handle();
+    if (!hdl)
+        return -ENOMEM;
+
+    hdl->type = TYPE_TIMERFD;
+    hdl->fs = &timerfd_builtin_fs;
+    hdl->flags = O_RDONLY | (flags & TFD_NONBLOCK ? O_NONBLOCK : 0);
+    hdl->acc_mode = MAY_READ;
+
+    hdl->info.timerfd.broken_in_child = false;
+    hdl->info.timerfd.num_expirations = 0;
+    hdl->info.timerfd.dummy_host_val = 0;
+    hdl->info.timerfd.timeout = 0;
+    hdl->info.timerfd.reset = 0;
+
+    ret = create_timerfd_pal_handle(&hdl->pal_handle);
+    if (ret < 0)
+        goto out;
+
+    ret = set_new_fd_handle(hdl, flags & TFD_CLOEXEC ? FD_CLOEXEC : 0, NULL);
+out:
+    put_handle(hdl);
+    return ret;
+}
+
+/* Records one timer expiration on `hdl` and writes to the dummy host eventfd to notify waiters. */
+static void timerfd_update(struct libos_handle* hdl) {
+    spinlock_lock(&hdl->info.timerfd.expiration_lock);
+
+    /* When the expiration count overflows, the read will saturate at UINT64_MAX while the timer
+     * will continue to fire. */
+    if (hdl->info.timerfd.num_expirations < UINT64_MAX)
+        hdl->info.timerfd.num_expirations++;
+
+    hdl->info.timerfd.dummy_host_val++;
+
+    /* perform a write (not supposed to block) to send an event to reading/polling threads */
+    timerfd_dummy_host_write(hdl);
+
+    spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+}
+
+/* Async-event callback: re-installs itself if `reset` is non-zero, then records the expiration. */
+static void callback_itimer(IDTYPE caller, void* arg) {
+    __UNUSED(caller);
+
+    struct libos_handle* hdl = (struct libos_handle*)arg;
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+    hdl->info.timerfd.timeout += hdl->info.timerfd.reset;
+    uint64_t next_reset = hdl->info.timerfd.reset;
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    if (next_reset) {
+        int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          next_reset, /*absolute_time=*/false,
+                                          &callback_itimer, (void*)hdl);
+        if (ret < 0) {
+            log_error(
+                "failed to re-enqueue the next timer event initially set up by "
+                "'timerfd_settime()': %s", unix_strerror(ret));
+            die_or_inf_loop();
+        }
+    }
+
+    timerfd_update(hdl);
+}
+
+/* Arms the timer (or cancels the pending one if `value->it_value` is zero) and, if `ovalue` is
+ * not NULL, stores the previous remaining time and interval there. */
+long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value,
+                                   struct __kernel_itimerspec* ovalue) {
+    int ret;
+
+    struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL);
+    if (!hdl)
+        return -EBADF;
+
+    if (hdl->type != TYPE_TIMERFD) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        /* leave via `out` so that the handle obtained above is put, as on other error paths */
+        ret = -EIO;
+        goto out;
+    }
+
+    if (!is_user_memory_readable(value, sizeof(*value))) {
+        ret = -EFAULT;
+        goto out;
+    }
+    if (ovalue && !is_user_memory_writable(ovalue, sizeof(*ovalue))) {
+        ret = -EFAULT;
+        goto out;
+    }
+
+    /* `TFD_TIMER_CANCEL_ON_SET` is silently ignored because there are no "discontinuous changes of
+     * time" in Gramine (via e.g., `settimeofday()`). */
+
+    if (flags & ~TFD_SETTIME_FLAGS) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    uint64_t setup_time = 0;
+    ret = PalSystemTimeQuery(&setup_time);
+    if (ret < 0) {
+        ret = pal_to_unix_errno(ret);
+        goto out;
+    }
+
+    uint64_t new_timeout = timespec_to_us(&value->it_value);
+    uint64_t new_reset = timespec_to_us(&value->it_interval);
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+
+    uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time
+                               ? hdl->info.timerfd.timeout - setup_time
+                               : 0;
+    uint64_t current_reset = hdl->info.timerfd.reset;
+
+    bool absolute_time = flags & TFD_TIMER_ABSTIME;
+    if (absolute_time) {
+        hdl->info.timerfd.timeout = new_timeout;
+    } else {
+        hdl->info.timerfd.timeout = setup_time + new_timeout;
+    }
+    hdl->info.timerfd.reset = new_reset;
+
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    int64_t install_ret;
+    if (new_timeout) {
+        install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          new_timeout, absolute_time,
+                                          &callback_itimer, (void*)hdl);
+    } else {
+        /* cancel the pending timerfd object */
+        install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          /*time_us=*/0, /*absolute_time=*/false,
+                                          /*callback=*/NULL, /*arg=*/NULL);
+    }
+    if (install_ret < 0) {
+        ret = install_ret;
+        goto out;
+    }
+
+    if (ovalue) {
+        ovalue->it_interval.tv_sec  = current_reset / TIME_US_IN_S;
+        ovalue->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US;
+        ovalue->it_value.tv_sec     = current_timeout / TIME_US_IN_S;
+        ovalue->it_value.tv_nsec    = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US;
+    }
+
+    ret = 0;
+out:
+    put_handle(hdl);
+    return ret;
+}
+
+/* Stores the time remaining until the next expiration and the reset interval in `value`. */
+long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value) {
+    int ret;
+
+    struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL);
+    if (!hdl)
+        return -EBADF;
+
+    if (hdl->type != TYPE_TIMERFD) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        /* leave via `out` so that the handle obtained above is put, as on other error paths */
+        ret = -EIO;
+        goto out;
+    }
+
+    if (!is_user_memory_writable(value, sizeof(*value))) {
+        ret = -EFAULT;
+        goto out;
+    }
+
+    uint64_t setup_time = 0;
+    ret = PalSystemTimeQuery(&setup_time);
+    if (ret < 0) {
+        ret = pal_to_unix_errno(ret);
+        goto out;
+    }
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+    uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time
+                               ? hdl->info.timerfd.timeout - setup_time
+                               : 0;
+    uint64_t current_reset = hdl->info.timerfd.reset;
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    value->it_interval.tv_sec  = current_reset / TIME_US_IN_S;
+    value->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US;
+    value->it_value.tv_sec     = current_timeout / TIME_US_IN_S;
+    value->it_value.tv_nsec    = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US;
+
+    ret = 0;
+out:
+    put_handle(hdl);
+    return ret;
+}
diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg
index 9195858e20..8c358afeeb 100644
--- a/libos/test/ltp/ltp.cfg
+++ b/libos/test/ltp/ltp.cfg
@@ -2436,8 +2436,12 @@ skip = yes
 [timer_settime*]
 skip = yes
 
-# no timerfd
-[timerfd*]
+# clocks other than `CLOCK_REALTIME` are not supported
+[timerfd04]
+skip = yes
+
+# relies on "/proc/sys/kernel/tainted" (see tst_taint.c:tst_taint_check)
+[timerfd_settime02]
 skip = yes
 
 [times03]
diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build
index b6f201b24e..444c9a6996 100644
--- a/libos/test/regression/meson.build
+++ b/libos/test/regression/meson.build
@@ -155,6 +155,8 @@ tests = {
   'tcp_einprogress': {},
   'tcp_ipv6_v6only': {},
   'tcp_msg_peek': {},
+  'timerfd': {},
+  'timerfd_fork': {},
   'udp': {},
   'uid_gid': {},
   'unix': {},
diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py
index 57ab164f74..90b46ef90d 100644
--- a/libos/test/regression/test_libos.py
+++ b/libos/test/regression/test_libos.py
@@ -1084,6 +1084,18 @@ def test_161_rlimit_nofile_4k(self):
         self.assertIn("(after 
setrlimit) opened fd: 4096", stdout) self.assertIn("TEST OK", stdout) + def test_170_timerfd(self): + stdout, _ = self.run_binary(['timerfd'], timeout=120) + self.assertIn("TEST OK", stdout) + + def test_171_timerfd_fork(self): + try: + self.run_binary(['timerfd_fork']) + self.fail('timerfd_fork unexpectedly succeeded') + except subprocess.CalledProcessError as e: + stdout = e.stdout.decode() + self.assertIn('child died', stdout) + class TC_31_Syscall(RegressionTestCase): def test_000_syscall_redirect(self): stdout, _ = self.run_binary(['syscall']) diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml index 23fa2fc5c6..063e5fdaed 100644 --- a/libos/test/regression/tests.toml +++ b/libos/test/regression/tests.toml @@ -134,6 +134,8 @@ manifests = [ "tcp_einprogress", "tcp_ipv6_v6only", "tcp_msg_peek", + "timerfd", + "timerfd_fork", "toml_parsing", "udp", "uid_gid", diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml index 7a3acc3743..20334622a4 100644 --- a/libos/test/regression/tests_musl.toml +++ b/libos/test/regression/tests_musl.toml @@ -135,6 +135,8 @@ manifests = [ "tcp_einprogress", "tcp_ipv6_v6only", "tcp_msg_peek", + "timerfd", + "timerfd_fork", "toml_parsing", "udp", "uid_gid", diff --git a/libos/test/regression/timerfd.c b/libos/test/regression/timerfd.c new file mode 100644 index 0000000000..cd4ee10520 --- /dev/null +++ b/libos/test/regression/timerfd.c @@ -0,0 +1,388 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* + * Single-process test for `timerfd` syscalls (`timerfd_create()`, `timerfd_settime()` and + * `timerfd_gettime()`). + * + * The tests involve cases including reading a blocking/non-blocking timerfd, poll/epoll/select on + * timerfds, setting up a relative/absolute/periodic timerfd and reading a timerfd from multiple + * threads. 
+ */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define EXPECTED_EXPIRATIONS 1 +#define EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT 5 +#define NUM_FDS 2 +#define NUM_THREADS 5 +#define PERIODIC_INTERVAL 1 +#define TIMEOUT_VALUE 2 + +static void set_timerfd_relative(int fd, bool periodic) { + struct itimerspec new_value = { + .it_value.tv_sec = TIMEOUT_VALUE, + .it_interval.tv_sec = periodic ? PERIODIC_INTERVAL : 0, + }; + + CHECK(timerfd_settime(fd, 0, &new_value, NULL)); +} + +static void set_timerfds_relative(int fds[NUM_FDS], bool periodic) { + for (int i = 0; i < NUM_FDS; i++) + set_timerfd_relative(fds[i], periodic); +} + +static void set_timerfd_absolute(int fd, struct timespec* abs_time) { + struct itimerspec new_value; + + /* Set the timer to expire at the absolute time specified */ + new_value.it_value.tv_sec = abs_time->tv_sec; + new_value.it_value.tv_nsec = abs_time->tv_nsec; + new_value.it_interval.tv_sec = 0; + new_value.it_interval.tv_nsec = 0; + + /* Set the timer to absolute time */ + CHECK(timerfd_settime(fd, TFD_TIMER_ABSTIME, &new_value, NULL)); +} + +static void create_timerfds(int fds[NUM_FDS]) { + for (int i = 0; i < NUM_FDS; i++) + fds[i] = CHECK(timerfd_create(CLOCK_REALTIME, 0)); +} + +static void close_timerfds(int fds[NUM_FDS]) { + for (int i = 0; i < NUM_FDS; i++) + CHECK(close(fds[i])); +} + +static void test_select(int fds[NUM_FDS]) { + fd_set rfds; + int max_fd = 0; + int ready_fds = 0; + + for (int i = 0; i < NUM_FDS; i++) { + if (fds[i] > max_fd) + max_fd = fds[i]; + } + + while (ready_fds < NUM_FDS) { + FD_ZERO(&rfds); + for (int i = 0; i < NUM_FDS; i++) { + FD_SET(fds[i], &rfds); + } + + int nfds = select(max_fd + 1, &rfds, NULL, NULL, NULL); + if (nfds <= 0) + err(1, "select on read event failed"); + + for (int i = 0; i < NUM_FDS; i++) { + if (FD_ISSET(fds[i], &rfds)) { + uint64_t expirations; + 
CHECK(read(fds[i], &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "select: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + ready_fds++; + } + } + } + + if (ready_fds != NUM_FDS) + errx(1, "select: unexpected number of ready fds (expected %d, got %d)", + NUM_FDS, ready_fds); +} + +static void test_poll(int fds[NUM_FDS]) { + struct pollfd pfds[NUM_FDS]; + int ready_fds = 0; + + for (int i = 0; i < NUM_FDS; i++) { + pfds[i].fd = fds[i]; + pfds[i].events = POLLIN; + pfds[i].revents = 0; + } + + while (ready_fds < NUM_FDS) { + int nfds = poll(pfds, NUM_FDS, -1); + if (nfds <= 0) + err(1, "poll with POLLIN failed"); + + for (int i = 0; i < NUM_FDS; i++) { + if (pfds[i].revents & POLLIN) { + uint64_t expirations; + CHECK(read(pfds[i].fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "poll: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + ready_fds++; + pfds[i].revents = 0; + } + } + } + + if (ready_fds != NUM_FDS) + errx(1, "poll: unexpected number of ready fds (expected %d, got %d)", + NUM_FDS, ready_fds); +} + +static void test_epoll(int fds[NUM_FDS]) { + int epfd = CHECK(epoll_create1(0)); + + struct epoll_event ev; + ev.events = EPOLLIN; + for (int i = 0; i < NUM_FDS; i++) { + ev.data.fd = fds[i]; + CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fds[i], &ev)); + } + + struct epoll_event events[NUM_FDS]; + int ready_fds = 0; + + while (ready_fds < NUM_FDS) { + int nfds = epoll_wait(epfd, events, NUM_FDS, -1); + if (nfds <= 0) + err(1, "epoll_wait with EPOLLIN failed"); + + for (int i = 0; i < nfds; i++) { + uint64_t expirations; + CHECK(read(events[i].data.fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "epoll_wait: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + ready_fds++; + } + } 
+
+    if (ready_fds != NUM_FDS)
+        errx(1, "epoll_wait: unexpected number of ready fds (expected %d, got %d)",
+             NUM_FDS, ready_fds);
+
+    CHECK(close(epfd));
+}
+
+/* this test expects the timerfd (`fd`) to be a periodic timer */
+static void test_epoll_modes(int fd) {
+    int epfd = CHECK(epoll_create1(0));
+
+    /* level-triggered mode */
+    struct epoll_event ev;
+    ev.events = EPOLLIN;
+    ev.data.fd = fd;
+    CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev));
+
+    struct epoll_event events[1];
+    int nfds = CHECK(epoll_wait(epfd, events, 1, -1));
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds (expected 1, got %d)", nfds);
+
+    /* waiting for another event without reading the expiration count */
+    nfds = CHECK(epoll_wait(epfd, events, 1, /*timeout=*/PERIODIC_INTERVAL * 1000 * 2));
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds in level-triggered mode without reading "
+             "(expected 1, got %d)", nfds);
+
+    /* switch to edge-triggered mode */
+    ev.events = EPOLLIN | EPOLLET;
+    CHECK(epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev));
+
+    nfds = CHECK(epoll_wait(epfd, events, 1, -1));
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds (expected 1, got %d)", nfds);
+
+    /* waiting for another event without reading the expiration count: here, even though the timer
+     * expired at least once, there is no event reported because we're in edge-triggered mode (which
+     * does not "reset" the event since there was no read) */
+    nfds = CHECK(epoll_wait(epfd, events, 1, /*timeout=*/PERIODIC_INTERVAL * 1000 * 2));
+    if (nfds != 0)
+        errx(1, "epoll: unexpected number of fds in edge-triggered mode without reading "
+             "(expected 0, got %d)", nfds);
+
+    CHECK(close(epfd));
+}
+
+static void test_periodic_timer(int fd) {
+    uint64_t expirations;
+    size_t total_expirations = 0;
+
+    while (total_expirations < EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT) {
+        CHECK(read(fd, &expirations, sizeof(expirations)));
+        total_expirations += expirations;
+    }
+}
+
+static void* timerfd_read_thread(void* arg) {
+    int fd = *(int*)arg;
+    uint64_t expirations;
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+    if (expirations == 0)
+        errx(1, "threaded read: unexpected number of expirations");
+    pthread_exit(NULL);
+}
+
+/* a periodic timer is required so that all NUM_THREADS threads have something to read */
+static void test_periodic_timer_threaded_read(int fd) {
+    pthread_t threads[NUM_THREADS];
+    for (int i = 0; i < NUM_THREADS; i++) {
+        CHECK(pthread_create(&threads[i], NULL, timerfd_read_thread, &fd));
+        /* wait for the thread to finish */
+        CHECK(pthread_join(threads[i], NULL));
+    }
+}
+
+static void test_timerfd_gettime(int fd) {
+    struct itimerspec curr_value;
+    CHECK(timerfd_gettime(fd, &curr_value));
+
+    /* the timer should be set to expire close to 2 seconds */
+    if (curr_value.it_value.tv_sec > 2 || curr_value.it_value.tv_sec < 1 ||
+        curr_value.it_value.tv_nsec < 0 || curr_value.it_value.tv_nsec >= 1000000000) {
+        errx(1, "timerfd_gettime: unexpected timer value (expected close to 2.0, got %ld.%09ld)",
+             curr_value.it_value.tv_sec, curr_value.it_value.tv_nsec);
+    }
+}
+
+static void test_disarm_timer(int fd) {
+    struct itimerspec old_value;
+
+    /* immediately disarm the timer and get the old value */
+    struct itimerspec disarm_value = { 0 };
+    CHECK(timerfd_settime(fd, 0, &disarm_value, &old_value));
+
+    /* check that the old value is around 2 seconds */
+    if (old_value.it_value.tv_sec > 2 || old_value.it_value.tv_sec < 1 ||
+        old_value.it_value.tv_nsec < 0 || old_value.it_value.tv_nsec >= 1000000000) {
+        errx(1, "disarm_timer: unexpected old timer value (expected close to 2.0, got %ld.%09ld)",
+             old_value.it_value.tv_sec, old_value.it_value.tv_nsec);
+    }
+
+    /* test poll with a timeout to ensure the timer was disarmed */
+    struct pollfd pfd = {
+        .fd = fd,
+        .events = POLLIN,
+    };
+    int ret = poll(&pfd, 1, /*timeout=*/(TIMEOUT_VALUE + 1) * 1000);
+    if (ret != 0)
+        errx(1, "disarm_timer: poll returned %d, expected 0 
(timeout)", ret); +} + +static void test_absolute_time(int fd) { + struct timespec now; + struct timespec abs_time; + uint64_t expirations; + + /* test timerfd with absolute time set in the future */ + CHECK(clock_gettime(CLOCK_REALTIME, &now)); + abs_time.tv_sec = now.tv_sec + TIMEOUT_VALUE; + abs_time.tv_nsec = now.tv_nsec; + + set_timerfd_absolute(fd, &abs_time); + + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "absolute_time future: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + + expirations = 0; + memset(&now, 0, sizeof(struct timespec)); + memset(&abs_time, 0, sizeof(struct timespec)); + + /* test timerfd with absolute time set in the past */ + CHECK(clock_gettime(CLOCK_REALTIME, &now)); + abs_time.tv_sec = now.tv_sec - TIMEOUT_VALUE; + abs_time.tv_nsec = now.tv_nsec; + + set_timerfd_absolute(fd, &abs_time); + + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "absolute_time past: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } +} + +/* This test must be executed twice: first reading from a non-periodic timerfd in blocking mode to + * capture its expiration, and then switching to non-blocking mode for a second read, which + * immediately returns with EAGAIN because the timer is disarmed and there are no new expiration + * events. 
*/ +static void test_read(int fd, bool non_blocking) { + if (non_blocking) { + CHECK(fcntl(fd, F_SETFL, O_NONBLOCK)); + } + + uint64_t expirations; + int retval = read(fd, &expirations, sizeof(expirations)); + + if (non_blocking) { + if (retval != -1 || errno != EAGAIN) { + errx(1, "non-blocking read: read returned %d, errno %d, expected -1 and EAGAIN", + retval, errno); + } + } else { + CHECK(retval); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "read: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + } +} + +int main(void) { + int fds[NUM_FDS]; + create_timerfds(fds); + + set_timerfds_relative(fds, /*periodic=*/false); + test_select(fds); + + set_timerfds_relative(fds, /*periodic=*/false); + test_poll(fds); + + set_timerfds_relative(fds, /*periodic=*/false); + test_epoll(fds); + + set_timerfd_relative(fds[0], /*periodic=*/true); + test_epoll_modes(fds[0]); + + set_timerfd_relative(fds[0], /*periodic=*/true); + test_periodic_timer(fds[0]); + + set_timerfd_relative(fds[0], /*periodic=*/true); + test_periodic_timer_threaded_read(fds[0]); + + set_timerfd_relative(fds[0], /*periodic=*/false); + test_timerfd_gettime(fds[0]); + + set_timerfd_relative(fds[0], /*periodic=*/false); + test_disarm_timer(fds[0]); + + set_timerfd_relative(fds[0], /*periodic=*/false); + test_read(fds[0], /*non_blocking=*/false); + test_read(fds[0], /*non_blocking=*/true); + + test_absolute_time(fds[1]); + + close_timerfds(fds); + + puts("TEST OK"); + return 0; +} diff --git a/libos/test/regression/timerfd_fork.c b/libos/test/regression/timerfd_fork.c new file mode 100644 index 0000000000..0bf82fc64c --- /dev/null +++ b/libos/test/regression/timerfd_fork.c @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* Multi-process test for `timerfd` syscalls (`timerfd_create()` and `timerfd_settime()`). 
+ * + * Note that timerfd is currently only emulated in a secure single-process mode, so this test does + * not work. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define EXPECTED_EXPIRATIONS 1 +#define TIMEOUT_VALUE 2 + +static void set_timerfd(int fd) { + struct itimerspec new_value = { .it_value.tv_sec = TIMEOUT_VALUE }; + + CHECK(timerfd_settime(fd, 0, &new_value, NULL)); +} + +static void test_multi_process(int fd) { + pid_t pid = CHECK(fork()); + if (pid == 0) { + uint64_t expirations; + /* child: wait on a blocking read for the timer to expire */ + /* Note: Due to the limitation of `timerfd` syscalls being only emulated in a secure + * single-process mode, `read()` will return a negative error code. */ + CHECK(read(fd, &expirations, sizeof(expirations))); + if (expirations != EXPECTED_EXPIRATIONS) { + errx(1, "child process: unexpected number of expirations (expected %d, got %lu)", + EXPECTED_EXPIRATIONS, expirations); + } + exit(0); + } else { + int status = 0; + + /* parent: do nothing and let the child process read the timerfd */ + CHECK(waitpid(pid, &status, 0)); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + errx(1, "child died with status: %#x", status); + } + } +} + +int main(void) { + int fd = CHECK(timerfd_create(CLOCK_REALTIME, 0)); + + set_timerfd(fd); + test_multi_process(fd); + + CHECK(close(fd)); + + puts("TEST OK"); + return 0; +}