diff --git a/Documentation/devel/features.md b/Documentation/devel/features.md index 93f84e3b07..e334a624e4 100644 --- a/Documentation/devel/features.md +++ b/Documentation/devel/features.md @@ -1036,7 +1036,7 @@ The below list is generated from the [syscall table of Linux - ☒ `signalfd()` [7](#signals-and-process-state-changes) -- ☒ `timerfd_create()` +- ▣ `timerfd_create()` [20](#sleeps-timers-and-alarms) - ▣ `eventfd()` @@ -1045,10 +1045,10 @@ The below list is generated from the [syscall table of Linux - ▣ `fallocate()` [9a](#file-system-operations) -- ☒ `timerfd_settime()` +- ▣ `timerfd_settime()` [20](#sleeps-timers-and-alarms) -- ☒ `timerfd_gettime()` +- ▣ `timerfd_gettime()` [20](#sleeps-timers-and-alarms) - ☑ `accept4()` @@ -2891,9 +2891,23 @@ Gramine implements getting and setting the interval timer: `getitimer()` and `se Gramine implements alarm clocks via `alarm()`. +Gramine implements timers that notify via file descriptors: `timerfd_create()`, `timerfd_settime()` +and `timerfd_gettime()`. The timerfd object is created inside Gramine, and all operations are +resolved entirely inside Gramine (note that the time source in Gramine SGX is still untrusted). Each +timerfd object is associated with a dummy eventfd created on the host. This is purely for triggering +read notifications (e.g., in epoll); timerfd data is verified inside Gramine and is never exposed to +the host. Since the host is used purely for notifications, a malicious host can only induce Denial +of Service (DoS) attacks. `TFD_TIMER_CANCEL_ON_SET` is silently ignored because there are no +"discontinuous changes of time" in Gramine (via e.g., `settimeofday()`). `TFD_IOC_SET_TICKS` is not +supported. + +The emulation is currently implemented at the level of a single process. All timerfds created in the +parent process are marked as invalid in child processes. 
In multi-process applications, Gramine does +not exit immediately after fork; it only exits if the application attempts to use timerfds in the +child. Therefore, inter-process timing signals via timerfds are not allowed. + Gramine does *not* currently implement the POSIX per-process timer: `timer_create()`, etc. Gramine -also does not currently implement timers that notify via file descriptors. Gramine could implement -these timers in the future, if need arises. +could implement it in the future, if need arises.
Related system calls @@ -2909,9 +2923,9 @@ these timers in the future, if need arises. - ☒ `timer_getoverrun()`: may be implemented in the future - ☒ `timer_delete()`: may be implemented in the future -- ☒ `timerfd_create()`: may be implemented in the future -- ☒ `timerfd_settime()`: may be implemented in the future -- ☒ `timerfd_gettime()`: may be implemented in the future +- ▣ `timerfd_create()`: see the notes above +- ▣ `timerfd_settime()`: see the notes above +- ▣ `timerfd_gettime()`: see the notes above

diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h index 0590a9c8db..b101c041df 100644 --- a/libos/include/libos_fs.h +++ b/libos/include/libos_fs.h @@ -190,7 +190,7 @@ struct libos_fs_ops { int (*poll)(struct libos_handle* hdl, int in_events, int* out_events); /* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed - * ones. Used in e.g. secure eventfd FS to verify if the host is not lying to us. */ + * ones. Used in e.g. secure eventfd and timerfd FS to verify if the host is not lying to us. */ void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events); /* checkpoint/migrate the file system */ @@ -948,6 +948,7 @@ extern struct libos_fs eventfd_builtin_fs; extern struct libos_fs synthetic_builtin_fs; extern struct libos_fs path_builtin_fs; extern struct libos_fs shm_builtin_fs; +extern struct libos_fs timerfd_builtin_fs; struct libos_fs* find_fs(const char* name); diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h index d0920cff06..71331d7fb7 100644 --- a/libos/include/libos_handle.h +++ b/libos/include/libos_handle.h @@ -46,6 +46,7 @@ enum libos_handle_type { /* Special handles: */ TYPE_EPOLL, /* epoll handles, see `libos_epoll.c` */ TYPE_EVENTFD, /* eventfd handles, used by `eventfd` filesystem */ + TYPE_TIMERFD, /* timerfd handles, used by `timerfd` filesystem */ }; struct libos_pipe_handle { @@ -142,6 +143,18 @@ struct libos_eventfd_handle { uint64_t dummy_host_val; }; +struct libos_timerfd_handle { + bool broken_in_child; + + spinlock_t expiration_lock; /* protecting below fields */ + uint64_t num_expirations; + uint64_t dummy_host_val; + + spinlock_t timer_lock; /* protecting below fields */ + uint64_t timeout; /* always an absolute time */ + uint64_t reset; +}; + struct libos_handle { enum libos_handle_type type; bool is_dir; @@ -217,6 +230,8 @@ struct libos_handle { struct libos_epoll_handle epoll; /* TYPE_EPOLL */ struct libos_eventfd_handle eventfd; /* 
TYPE_EVENTFD */ + + struct libos_timerfd_handle timerfd; /* TYPE_TIMERFD */ } info; struct libos_dir_handle dir_info; @@ -232,7 +247,7 @@ struct libos_handle { * `read`, `seek` but not `pread`). This lock should be taken *before* `libos_handle.lock` and * `libos_inode.lock`. Must be used *only* via maybe_lock_pos_handle() and * maybe_unlock_pos_handle(); these functions make sure that the lock is acquired only on those - * handle types that are seekable (e.g. not on eventfds or pipes). */ + * handle types that are seekable (e.g. not on eventfds, timerfds or pipes). */ struct libos_lock pos_lock; }; diff --git a/libos/include/libos_table.h b/libos/include/libos_table.h index e204aaf6de..1b8840ff24 100644 --- a/libos/include/libos_table.h +++ b/libos/include/libos_table.h @@ -220,3 +220,7 @@ long libos_syscall_getrandom(char* buf, size_t count, unsigned int flags); long libos_syscall_mlock2(unsigned long start, size_t len, int flags); long libos_syscall_sysinfo(struct sysinfo* info); long libos_syscall_close_range(unsigned int first, unsigned int last, unsigned int flags); +long libos_syscall_timerfd_create(int clockid, int flags); +long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value, + struct __kernel_itimerspec* ovalue); +long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value); diff --git a/libos/include/libos_utils.h b/libos/include/libos_utils.h index a4298a50ef..bdae8966bc 100644 --- a/libos/include/libos_utils.h +++ b/libos/include/libos_utils.h @@ -52,8 +52,14 @@ void clean_link_map_list(void); int create_pipe(char* name, char* uri, size_t size, PAL_HANDLE* hdl, bool use_vmid_for_name); /* Asynchronous event support */ +enum async_event_type { + ASYNC_EVENT_TYPE_IO = 1, + ASYNC_EVENT_TYPE_ALARM_TIMER = 2, +}; + int init_async_worker(void); -int64_t install_async_event(PAL_HANDLE object, unsigned long time, +int64_t install_async_event(enum async_event_type type, PAL_HANDLE object, + unsigned long 
time_us, bool absolute_time, void (*callback)(IDTYPE caller, void* arg), void* arg); void terminate_async_worker(void); diff --git a/libos/include/linux_abi/time.h b/libos/include/linux_abi/time.h index da848822de..303d184c0b 100644 --- a/libos/include/linux_abi/time.h +++ b/libos/include/linux_abi/time.h @@ -9,11 +9,11 @@ /* These need to be binary-identical with the ones used by Linux. */ // TODO: remove all of these includes and make this header libc-independent. -#include -#include -#include #include +typedef long __kernel_suseconds_t; +typedef long __kernel_time_t; + typedef __kernel_time_t time_t; #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) @@ -37,3 +37,28 @@ struct __kernel_timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ }; + +/* The IDs of the various system clocks (for POSIX.1b interval timers). */ +#define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 1 +#define CLOCK_PROCESS_CPUTIME_ID 2 +#define CLOCK_THREAD_CPUTIME_ID 3 +#define CLOCK_MONOTONIC_RAW 4 +#define CLOCK_REALTIME_COARSE 5 +#define CLOCK_MONOTONIC_COARSE 6 +#define CLOCK_BOOTTIME 7 +#define CLOCK_REALTIME_ALARM 8 +#define CLOCK_BOOTTIME_ALARM 9 + +#define MAX_CLOCKS 16 + +#define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_CANCEL_ON_SET (1 << 1) +#define TFD_CLOEXEC O_CLOEXEC +#define TFD_NONBLOCK O_NONBLOCK + +#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) +/* Flags for timerfd_create. */ +#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS +/* Flags for timerfd_settime. 
*/ +#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) diff --git a/libos/src/arch/x86_64/libos_table.c b/libos/src/arch/x86_64/libos_table.c index 480f81259a..fbe6d3c49b 100644 --- a/libos/src/arch/x86_64/libos_table.c +++ b/libos/src/arch/x86_64/libos_table.c @@ -297,11 +297,11 @@ libos_syscall_t libos_syscall_table[LIBOS_SYSCALL_BOUND] = { [__NR_utimensat] = (libos_syscall_t)0, // libos_syscall_utimensat [__NR_epoll_pwait] = (libos_syscall_t)libos_syscall_epoll_pwait, [__NR_signalfd] = (libos_syscall_t)0, // libos_syscall_signalfd - [__NR_timerfd_create] = (libos_syscall_t)0, // libos_syscall_timerfd_create + [__NR_timerfd_create] = (libos_syscall_t)libos_syscall_timerfd_create, [__NR_eventfd] = (libos_syscall_t)libos_syscall_eventfd, [__NR_fallocate] = (libos_syscall_t)libos_syscall_fallocate, - [__NR_timerfd_settime] = (libos_syscall_t)0, // libos_syscall_timerfd_settime - [__NR_timerfd_gettime] = (libos_syscall_t)0, // libos_syscall_timerfd_gettime + [__NR_timerfd_settime] = (libos_syscall_t)libos_syscall_timerfd_settime, + [__NR_timerfd_gettime] = (libos_syscall_t)libos_syscall_timerfd_gettime, [__NR_accept4] = (libos_syscall_t)libos_syscall_accept4, [__NR_signalfd4] = (libos_syscall_t)0, // libos_syscall_signalfd4 [__NR_eventfd2] = (libos_syscall_t)libos_syscall_eventfd2, diff --git a/libos/src/fs/libos_fs.c b/libos/src/fs/libos_fs.c index 5a29a36d6d..f10aefd74b 100644 --- a/libos/src/fs/libos_fs.c +++ b/libos/src/fs/libos_fs.c @@ -33,6 +33,7 @@ static struct libos_fs* g_builtin_fs[] = { &synthetic_builtin_fs, &path_builtin_fs, &shm_builtin_fs, + &timerfd_builtin_fs, }; static struct libos_lock g_mount_mgr_lock; diff --git a/libos/src/fs/proc/thread.c b/libos/src/fs/proc/thread.c index c3da147c48..ed1fa1a95a 100644 --- a/libos/src/fs/proc/thread.c +++ b/libos/src/fs/proc/thread.c @@ -287,6 +287,7 @@ static char* describe_handle(struct libos_handle* hdl) { case TYPE_EPOLL: str = "epoll:[?]"; break; case TYPE_EVENTFD: str = "eventfd:[?]"; 
break; case TYPE_SHM: str = "shm:[?]"; break; + case TYPE_TIMERFD: str = "timerfd:[?]"; break; default: str = "unknown:[?]"; break; } return strdup(str); diff --git a/libos/src/fs/timerfd/fs.c b/libos/src/fs/timerfd/fs.c new file mode 100644 index 0000000000..502ce37c2b --- /dev/null +++ b/libos/src/fs/timerfd/fs.c @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Kailun Qin + */ + +/* + * This file contains code for implementation of "timerfd" filesystem. For more information, see + * `libos/src/sys/libos_timerfd.c`. + */ + +#include "libos_fs.h" +#include "libos_handle.h" +#include "libos_internal.h" +#include "libos_lock.h" +#include "linux_abi/errors.h" +#include "pal.h" + +/* Enforce a restriction that all timerfds created in the parent process are marked as invalid in + * child processes, i.e. inter-process timing signals via timerfds are not allowed. This restriction + * is because LibOS doesn't yet implement sync between timerfd objects. */ +static int timerfd_checkin(struct libos_handle* hdl) { + assert(hdl->type == TYPE_TIMERFD); + hdl->info.timerfd.broken_in_child = true; + return 0; +} + +/* This implementation is the same as `eventfd_dummy_host_read()` in "fs/eventfd/fs.c". */ +static void timerfd_dummy_host_read(struct libos_handle* hdl) { + int ret; + uint64_t buf_dummy_host_val = 0; + size_t dummy_host_val_count = sizeof(buf_dummy_host_val); + do { + ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count, + &buf_dummy_host_val); + } while (ret == PAL_ERROR_INTERRUPTED); + if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) { + /* must not happen in benign case, consider it an attack and panic */ + BUG(); + } +} + +/* This implementation is the same as `eventfd_dummy_host_wait()` in "fs/eventfd/fs.c". 
*/ +static void timerfd_dummy_host_wait(struct libos_handle* hdl) { + pal_wait_flags_t wait_for_events = PAL_WAIT_READ; + pal_wait_flags_t ret_events = 0; + int ret = PalStreamsWaitEvents(1, &hdl->pal_handle, &wait_for_events, &ret_events, NULL); + if (ret < 0 && ret != PAL_ERROR_INTERRUPTED) { + BUG(); + } + (void)ret_events; /* we don't care what events the host returned, we can't trust them anyway */ +} + +static ssize_t timerfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) { + __UNUSED(pos); + assert(hdl->type == TYPE_TIMERFD); + + if (count < sizeof(uint64_t)) + return -EINVAL; + + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. This is " + "disallowed in Gramine."); + return -EIO; + } + + int ret; + spinlock_lock(&hdl->info.timerfd.expiration_lock); + + while (!hdl->info.timerfd.num_expirations) { + if (hdl->flags & O_NONBLOCK) { + ret = -EAGAIN; + goto out; + } + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + timerfd_dummy_host_wait(hdl); + spinlock_lock(&hdl->info.timerfd.expiration_lock); + } + + memcpy(buf, &hdl->info.timerfd.num_expirations, sizeof(uint64_t)); + hdl->info.timerfd.num_expirations = 0; + + /* perform a read (not supposed to block) to clear the event from polling threads */ + if (hdl->info.timerfd.dummy_host_val) { + timerfd_dummy_host_read(hdl); + hdl->info.timerfd.dummy_host_val = 0; + } + + ret = sizeof(uint64_t); /* like Linux, report exactly the 8 bytes copied (not `count`) */ +out: + spinlock_unlock(&hdl->info.timerfd.expiration_lock); + maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false); + return ret; +} + +static void timerfd_post_poll(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events) { + assert(hdl->type == TYPE_TIMERFD); + + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. 
This is " + "disallowed in Gramine."); + *pal_ret_events = PAL_WAIT_ERROR; + return; + } + + if (*pal_ret_events & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP | PAL_WAIT_WRITE)) { + /* impossible: we control timerfd inside the LibOS, and we never raise such conditions */ + BUG(); + } + + spinlock_lock(&hdl->info.timerfd.expiration_lock); + if (*pal_ret_events & PAL_WAIT_READ) { + /* there is data to read: verify if timerfd has number of expirations greater than zero */ + if (!hdl->info.timerfd.num_expirations) { + /* spurious or malicious notification, can legitimately happen if another thread + * consumed this event between this thread's poll wakeup and the post_poll callback; + * we currently choose to return a spurious notification to the user */ + *pal_ret_events &= ~PAL_WAIT_READ; + } + } + spinlock_unlock(&hdl->info.timerfd.expiration_lock); +} + +static int timerfd_close(struct libos_handle* hdl) { + if (hdl->info.timerfd.broken_in_child) { + log_warning("Child process tried to access timerfd created by parent process. 
This is " + "disallowed in Gramine."); + return -EIO; + } + + /* cancel the pending timerfd object */ + return install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle, + /*time_us=*/0, /*absolute_time=*/false, /*callback=*/NULL, + /*arg=*/NULL); +} + +struct libos_fs_ops timerfd_fs_ops = { + .checkin = &timerfd_checkin, + .read = &timerfd_read, + .close = &timerfd_close, + .post_poll = &timerfd_post_poll, +}; + +struct libos_fs timerfd_builtin_fs = { + .name = "timerfd", + .fs_ops = &timerfd_fs_ops, +}; diff --git a/libos/src/libos_async.c b/libos/src/libos_async.c index d05af32691..310518316c 100644 --- a/libos/src/libos_async.c +++ b/libos/src/libos_async.c @@ -15,12 +15,13 @@ DEFINE_LIST(async_event); struct async_event { + enum async_event_type type; IDTYPE caller; /* thread installing this event */ LIST_TYPE(async_event) list; LIST_TYPE(async_event) triggered_list; void (*callback)(IDTYPE caller, void* arg); void* arg; - PAL_HANDLE object; /* handle (async IO) to wait on */ + PAL_HANDLE object; /* handle (async IO or timerfd) to wait on */ uint64_t expire_time_us; /* alarm/timer to wait on */ }; DEFINE_LISTP(async_event); @@ -35,25 +36,26 @@ static struct libos_lock async_worker_lock; /* TODO: use async_worker_thread->pollable_event instead */ static struct libos_pollable_event install_new_event; -/* Threads register async events like alarm(), setitimer(), ioctl(FIOASYNC) - * using this function. These events are enqueued in async_list and delivered - * to async worker thread by triggering install_new_event. When event is - * triggered in async worker thread, the corresponding event's callback with - * arguments `arg` is called. This callback typically sends a signal to the +/* Threads register async events like alarm(), setitimer(), timerfd_settime(), ioctl(FIOASYNC) using + * this function. These events are enqueued in async_list and delivered to async worker thread by + * triggering install_new_event. 
When event is triggered in async worker thread, the corresponding + * event's callback with arguments `arg` is called. This callback typically sends a signal to the * thread which registered the event (saved in `event->caller`). * - * We distinguish between alarm/timer events and async IO events: - * - alarm/timer events set object = NULL and time_us = microseconds - * (time_us = 0 cancels all pending alarms/timers). + * The async event type is specified in `type`. Alarm/timer events and async IO events are currently + * supported: + * - alarm/timer events set time_us = microseconds (time_us = 0 cancels all pending + * alarms/timers). Specifically, when object != NULL, this indicates a timerfd event. * - async IO events set object = handle and time_us = 0. * - * Function returns remaining usecs for alarm/timer events (same as alarm()) - * or 0 for async IO events. On error, it returns a negated error code. + * Function returns remaining usecs for alarm/timer events (same as alarm()) or 0 for async IO + * events. On error, it returns a negated error code. */ -int64_t install_async_event(PAL_HANDLE object, uint64_t time_us, +int64_t install_async_event(enum async_event_type type, PAL_HANDLE object, + uint64_t time_us, bool absolute_time, void (*callback)(IDTYPE caller, void* arg), void* arg) { - /* if event happens on object, time_us must be zero */ - assert(!object || (object && !time_us)); + assert((type == ASYNC_EVENT_TYPE_ALARM_TIMER) || + (type == ASYNC_EVENT_TYPE_IO && (!object || !time_us))); uint64_t now_us = 0; int ret = PalSystemTimeQuery(&now_us); @@ -68,21 +70,22 @@ int64_t install_async_event(PAL_HANDLE object, uint64_t time_us, return -ENOMEM; } + event->type = type; event->callback = callback; event->arg = arg; event->caller = get_cur_tid(); event->object = object; - event->expire_time_us = time_us ? now_us + time_us : 0; + event->expire_time_us = time_us ? (absolute_time ? 
time_us : now_us + time_us) : 0; lock(&async_worker_lock); - if (callback != &cleanup_thread && !object) { - /* This is alarm() or setitimer() emulation, treat both according to - * alarm() syscall semantics: cancel any pending alarm/timer. */ + if (callback != &cleanup_thread && type == ASYNC_EVENT_TYPE_ALARM_TIMER) { + /* This is alarm(), setitimer(), timerfd_settime() emulation, treat all according to alarm() + * syscall semantics: cancel any pending alarm/timer. */ struct async_event* tmp; struct async_event* n; LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { - if (tmp->expire_time_us) { + if (tmp->object == object && tmp->expire_time_us) { /* this is a pending alarm/timer, cancel it and save its expiration time */ if (max_prev_expire_time_us < tmp->expire_time_us) max_prev_expire_time_us = tmp->expire_time_us; @@ -164,7 +167,7 @@ static int libos_async_worker(void* arg) { struct async_event* n; LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { /* repopulate `pals` with IO events and find the next expiring alarm/timer */ - if (tmp->object) { + if (tmp->type == ASYNC_EVENT_TYPE_IO && tmp->object) { if (pals_cnt == pals_max_cnt) { /* grow `pals` to accommodate more objects */ PAL_HANDLE* tmp_pals = malloc(sizeof(*tmp_pals) * (1 + pals_max_cnt * 2)); @@ -200,7 +203,8 @@ static int libos_async_worker(void* arg) { pal_events[pals_cnt + 1] = PAL_WAIT_READ; ret_events[pals_cnt + 1] = 0; pals_cnt++; - } else if (tmp->expire_time_us && tmp->expire_time_us > now_us) { + } else if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER && tmp->expire_time_us && + tmp->expire_time_us > now_us) { if (!next_expire_time_us || next_expire_time_us > tmp->expire_time_us) { /* use time of the next expiring alarm/timer */ next_expire_time_us = tmp->expire_time_us; @@ -252,7 +256,7 @@ static int libos_async_worker(void* arg) { /* check if this event is an IO event found in async_list */ LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) { - if (tmp->object == pals[i]) { + if 
(tmp->type == ASYNC_EVENT_TYPE_IO && tmp->object == pals[i]) { log_debug("Async IO event triggered at %lu", now_us); LISTP_ADD_TAIL(tmp, &triggered, triggered_list); break; @@ -282,7 +286,7 @@ static int libos_async_worker(void* arg) { LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &triggered, triggered_list) { LISTP_DEL(tmp, &triggered, triggered_list); tmp->callback(tmp->caller, tmp->arg); - if (!tmp->object) { + if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER) { /* this is a one-off exit-child or alarm/timer event */ free(tmp); } diff --git a/libos/src/libos_parser.c b/libos/src/libos_parser.c index f1f470cc41..641314e476 100644 --- a/libos/src/libos_parser.c +++ b/libos/src/libos_parser.c @@ -21,6 +21,7 @@ #include "linux_abi/sched.h" #include "linux_abi/signals.h" #include "linux_abi/syscalls_nr_arch.h" +#include "linux_abi/time.h" #include "socket_utils.h" static void parse_open_flags(struct print_buf*, va_list*); @@ -38,6 +39,7 @@ static void parse_sigprocmask_how(struct print_buf*, va_list*); static void parse_msync_flags(struct print_buf*, va_list*); static void parse_madvise_behavior(struct print_buf*, va_list* ap); static void parse_timespec(struct print_buf*, va_list*); +static void parse_itimerspec(struct print_buf*, va_list*); static void parse_sockaddr(struct print_buf*, va_list*); static void parse_domain(struct print_buf*, va_list*); static void parse_socktype(struct print_buf*, va_list*); @@ -53,6 +55,9 @@ static void parse_getrandom_flags(struct print_buf*, va_list*); static void parse_epoll_op(struct print_buf*, va_list*); static void parse_epoll_event(struct print_buf* buf, va_list* ap); static void parse_close_range_flags(struct print_buf* buf, va_list* ap); +static void parse_clockid(struct print_buf* buf, va_list* ap); +static void parse_timerfd_create_flags(struct print_buf* buf, va_list* ap); +static void parse_timerfd_settime_flags(struct print_buf* buf, va_list* ap); static void parse_string_arg(struct print_buf*, va_list* ap); static void 
parse_pointer_arg(struct print_buf*, va_list* ap); @@ -522,13 +527,17 @@ struct parser_table { parse_integer_arg, parse_pointer_arg, parse_integer_arg, parse_integer_arg, parse_pointer_arg, parse_pointer_arg}}, [__NR_signalfd] = {.slow = false, .name = "signalfd", .parser = {NULL}}, - [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {NULL}}, + [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {parse_long_arg, + parse_clockid, parse_timerfd_create_flags}}, [__NR_eventfd] = {.slow = false, .name = "eventfd", .parser = {parse_long_arg, parse_integer_arg}}, [__NR_fallocate] = {.slow = false, .name = "fallocate", .parser = {parse_long_arg, parse_integer_arg, parse_integer_arg, parse_long_arg, parse_long_arg}}, - [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {NULL}}, - [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {NULL}}, + [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {parse_long_arg, + parse_integer_arg, parse_timerfd_settime_flags, parse_itimerspec, + parse_itimerspec}}, + [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {parse_long_arg, + parse_integer_arg, parse_itimerspec}}, [__NR_accept4] = {.slow = true, .name = "accept4", .parser = {parse_long_arg, parse_integer_arg, parse_pointer_arg, parse_pointer_arg, parse_integer_arg}}, [__NR_signalfd4] = {.slow = false, .name = "signalfd4", .parser = {NULL}}, @@ -1145,6 +1154,24 @@ static void parse_timespec(struct print_buf* buf, va_list* ap) { buf_printf(buf, "[%ld,%ld]", tv->tv_sec, tv->tv_nsec); } +static void parse_itimerspec(struct print_buf* buf, va_list* ap) { + const struct itimerspec* it = va_arg(*ap, const struct itimerspec*); + + if (!it) { + buf_puts(buf, "NULL"); + return; + } + + if (!is_user_memory_readable((void*)it, sizeof(*it))) { + buf_printf(buf, "(invalid-addr %p)", it); + return; + } + + buf_printf(buf, "intvl:[%ld,%ld] 
val:[%ld,%ld]", + it->it_interval.tv_sec, it->it_interval.tv_nsec, + it->it_value.tv_sec, it->it_value.tv_nsec); +} + static void parse_sockaddr(struct print_buf* buf, va_list* ap) { void* addr = va_arg(*ap, void*); @@ -1621,6 +1648,77 @@ static void parse_close_range_flags(struct print_buf* buf, va_list* ap) { buf_printf(buf, "|0x%x", flags); } +static void parse_clockid(struct print_buf* buf, va_list* ap) { + int clockid = va_arg(*ap, int); + switch (clockid) { + case CLOCK_REALTIME: + buf_puts(buf, "CLOCK_REALTIME"); + break; + case CLOCK_MONOTONIC: + buf_puts(buf, "CLOCK_MONOTONIC"); + break; + case CLOCK_PROCESS_CPUTIME_ID: + buf_puts(buf, "CLOCK_PROCESS_CPUTIME_ID"); + break; + case CLOCK_THREAD_CPUTIME_ID: + buf_puts(buf, "CLOCK_THREAD_CPUTIME_ID"); + break; + case CLOCK_MONOTONIC_RAW: + buf_puts(buf, "CLOCK_MONOTONIC_RAW"); + break; + case CLOCK_REALTIME_COARSE: + buf_puts(buf, "CLOCK_REALTIME_COARSE"); + break; + case CLOCK_MONOTONIC_COARSE: + buf_puts(buf, "CLOCK_MONOTONIC_COARSE"); + break; + case CLOCK_BOOTTIME: + buf_puts(buf, "CLOCK_BOOTTIME"); + break; + case CLOCK_REALTIME_ALARM: + buf_puts(buf, "CLOCK_REALTIME_ALARM"); + break; + case CLOCK_BOOTTIME_ALARM: + buf_puts(buf, "CLOCK_BOOTTIME_ALARM"); + break; + default: + buf_printf(buf, "(unknown: %d)", clockid); + break; + } +} + +static void parse_timerfd_create_flags(struct print_buf* buf, va_list* ap) { + int flags = va_arg(*ap, int); + +#define FLG(n) \ + { #n, n } + const struct flag_table all_flags[] = { + FLG(TFD_NONBLOCK), + FLG(TFD_CLOEXEC), + }; +#undef FLG + + flags = parse_flags(buf, flags, all_flags, ARRAY_SIZE(all_flags)); + if (flags) + buf_printf(buf, "|0x%x", flags); +} + +static void parse_timerfd_settime_flags(struct print_buf* buf, va_list* ap) { + int flags = va_arg(*ap, int); + +#define FLG(n) \ + { #n, n } + const struct flag_table all_flags[] = { + FLG(TFD_TIMER_ABSTIME), + FLG(TFD_TIMER_CANCEL_ON_SET), + }; +#undef FLG + + flags = parse_flags(buf, flags, all_flags, 
ARRAY_SIZE(all_flags)); + if (flags) + buf_printf(buf, "|0x%x", flags); +} + static void parse_string_arg(struct print_buf* buf, va_list* ap) { const char* arg = va_arg(*ap, const char*); if (is_user_string_readable(arg)) { diff --git a/libos/src/meson.build b/libos/src/meson.build index 5a262a8160..2f3bf32b09 100644 --- a/libos/src/meson.build +++ b/libos/src/meson.build @@ -43,6 +43,7 @@ libos_sources = files( 'fs/sys/cpu_info.c', 'fs/sys/fs.c', 'fs/sys/node_info.c', + 'fs/timerfd/fs.c', 'fs/tmpfs/fs.c', 'gramine_hash.c', 'ipc/libos_ipc.c', @@ -101,6 +102,7 @@ libos_sources = files( 'sys/libos_socket.c', 'sys/libos_stat.c', 'sys/libos_time.c', + 'sys/libos_timerfd.c', 'sys/libos_uname.c', 'sys/libos_wait.c', 'sys/libos_wrappers.c', diff --git a/libos/src/sys/libos_alarm.c b/libos/src/sys/libos_alarm.c index 0ccecfce25..ee683fd38e 100644 --- a/libos/src/sys/libos_alarm.c +++ b/libos/src/sys/libos_alarm.c @@ -35,7 +35,8 @@ static void signal_alarm(IDTYPE caller, void* arg) { long libos_syscall_alarm(unsigned int seconds) { uint64_t usecs = 1000000ULL * seconds; - int64_t ret = install_async_event(NULL, usecs, &signal_alarm, NULL); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + usecs, /*absolute_time=*/false, &signal_alarm, /*arg=*/NULL); if (ret < 0) return ret; @@ -66,8 +67,9 @@ static void signal_itimer(IDTYPE caller, void* arg) { spinlock_unlock(&g_real_itimer_lock); if (next_reset) { - int64_t ret = install_async_event(/*object=*/NULL, next_reset, &signal_itimer, - /*arg=*/NULL); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + next_reset, /*absolute_time=*/false, + &signal_itimer, /*arg=*/NULL); if (ret < 0) { log_error( "failed to re-enqueue the next timer event initially set up by 'setitimer()': %s", @@ -113,8 +115,9 @@ long libos_syscall_setitimer(int which, struct __kernel_itimerval* value, : 0; uint64_t current_reset = g_real_itimer.reset; - int64_t install_ret = 
install_async_event(NULL, next_value, &signal_itimer, /*arg=*/NULL); - + int64_t install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + next_value, /*absolute_time=*/false, + &signal_itimer, /*arg=*/NULL); if (install_ret < 0) { spinlock_unlock(&g_real_itimer_lock); return install_ret; diff --git a/libos/src/sys/libos_epoll.c b/libos/src/sys/libos_epoll.c index b3e7a058fa..3694e1a3f5 100644 --- a/libos/src/sys/libos_epoll.c +++ b/libos/src/sys/libos_epoll.c @@ -189,6 +189,12 @@ void maybe_epoll_et_trigger(struct libos_handle* handle, int ret, bool in, bool needs_et = true; } break; + case TYPE_TIMERFD: + /* timerfd in edge-triggered mode will notify only on expiration state changes (i.e., + * if the expiration count is not read, the timerfd remains in an expired state, and no + * notification will be triggered) */ + needs_et = in; + break; default: /* Type unsupported with EPOLLET. */ break; @@ -461,6 +467,7 @@ long libos_syscall_epoll_ctl(int epfd, int op, int fd, struct epoll_event* event case TYPE_PIPE: case TYPE_SOCK: case TYPE_EVENTFD: + case TYPE_TIMERFD: break; default: /* epoll not supported by this type of handle */ diff --git a/libos/src/sys/libos_exit.c b/libos/src/sys/libos_exit.c index fbaee29b7b..afbdb3ce1e 100644 --- a/libos/src/sys/libos_exit.c +++ b/libos/src/sys/libos_exit.c @@ -101,7 +101,9 @@ noreturn void thread_exit(int error_code, int term_signal) { cur_thread->clear_child_tid_pal = 1; /* any non-zero value suffices */ /* We pass this ownership to `cleanup_thread`. */ get_thread(cur_thread); - int64_t ret = install_async_event(NULL, 0, &cleanup_thread, cur_thread); + int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL, + /*time_us=*/0, /*absolute_time=*/false, &cleanup_thread, + cur_thread); /* Take the reference to the current thread from the tcb. 
*/
     lock(&cur_thread->lock);
diff --git a/libos/src/sys/libos_ioctl.c b/libos/src/sys/libos_ioctl.c
index 89d5424da9..8bbad6efa4 100644
--- a/libos/src/sys/libos_ioctl.c
+++ b/libos/src/sys/libos_ioctl.c
@@ -104,7 +104,9 @@ long libos_syscall_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) {
             rwlock_write_unlock(&handle_map->lock);
             break;
         case FIOASYNC:
-            ret = install_async_event(hdl->pal_handle, 0, &signal_io, NULL);
+            ret = install_async_event(ASYNC_EVENT_TYPE_IO, hdl->pal_handle,
+                                      /*time_us=*/0, /*absolute_time=*/false, &signal_io,
+                                      /*arg=*/NULL);
             break;
         case FIONREAD: {
             if (!is_user_memory_writable((void*)arg, sizeof(int))) {
diff --git a/libos/src/sys/libos_sleep.c b/libos/src/sys/libos_sleep.c
index 56fc8b12b9..c01af88992 100644
--- a/libos/src/sys/libos_sleep.c
+++ b/libos/src/sys/libos_sleep.c
@@ -11,6 +11,7 @@
 #include "libos_thread.h"
 #include "libos_utils.h"
 #include "linux_abi/errors.h"
+#include "linux_abi/time.h"
 #include "pal.h"
 
 long libos_syscall_pause(void) {
diff --git a/libos/src/sys/libos_time.c b/libos/src/sys/libos_time.c
index cf34c334ee..60316080bc 100644
--- a/libos/src/sys/libos_time.c
+++ b/libos/src/sys/libos_time.c
@@ -8,6 +8,7 @@
 #include "libos_internal.h"
 #include "libos_table.h"
 #include "linux_abi/errors.h"
+#include "linux_abi/time.h"
 #include "pal.h"
 
 long libos_syscall_gettimeofday(struct __kernel_timeval* tv, struct __kernel_timezone* tz) {
diff --git a/libos/src/sys/libos_timerfd.c b/libos/src/sys/libos_timerfd.c
new file mode 100644
index 0000000000..63d0506194
--- /dev/null
+++ b/libos/src/sys/libos_timerfd.c
@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ *                    Kailun Qin
+ */
+
+/* Implementation of "timerfd" system calls.
+ *
+ * The timerfd object is created inside the LibOS, and all operations are resolved entirely inside
+ * the LibOS (note that the time source in Gramine SGX is still untrusted). Each timerfd object is
+ * associated with a dummy eventfd created on the host. This is purely for triggering read
+ * notifications (e.g., in epoll); timerfd data is verified inside the LibOS and is never exposed to
+ * the host. Since the host is used purely for notifications, a malicious host can only induce
+ * Denial of Service (DoS) attacks.
+ *
+ * The emulation is currently implemented at the level of a single process. All timerfds created in
+ * the parent process are marked as invalid in child processes. In multi-process applications,
+ * Gramine does not exit immediately after fork; it only exits if the application attempts to use
+ * timerfds in the child. Therefore, inter-process timing signals via timerfds are not allowed.
+ *
+ * The host's eventfd object is "dummy" and used purely for notifications -- to unblock blocking
+ * read/select/poll/epoll system calls. The read notify logic is already hardened, by
+ * double-checking that the timerfd object indeed expired. However, there are three possible attacks
+ * on polling mechanisms (select/poll/epoll):
+ *
+ * a. Malicious host may inject the notification too early: POLLIN when no timer expired yet. This
+ *    may lead to a synchronization failure of the app. To prevent this, timerfd implements a
+ *    callback `post_poll()` where it verifies that a timer has indeed expired (i.e., that the
+ *    notification is not spurious).
+ * b. Malicious host may inject the notification too late or not send a notification at all.
+ *    This is a Denial of Service (DoS), which we don't care about.
+ * c. Malicious host may inject POLLERR, POLLHUP, POLLRDHUP, POLLNVAL, POLLOUT. This is impossible
+ *    as we control timerfd objects inside the LibOS, and we never raise such conditions. So the
+ *    callback `post_poll()` panics if it detects such a return event.
+ */
+
+#include "libos_checkpoint.h"
+#include "libos_fs.h"
+#include "libos_handle.h"
+#include "libos_internal.h"
+#include "libos_table.h"
+#include "libos_utils.h"
+#include "linux_abi/fs.h"
+#include "linux_abi/time.h"
+#include "linux_eventfd.h"
+#include "pal.h"
+
+/* This implementation is the same as `eventfd_dummy_host_write()` in "fs/eventfd/fs.c". */
+static void timerfd_dummy_host_write(struct libos_handle* hdl) {
+    int ret;
+    uint64_t buf_dummy_host_val = 1;
+    size_t dummy_host_val_count = sizeof(buf_dummy_host_val);
+    do {
+        ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count,
+                             &buf_dummy_host_val);
+    } while (ret == PAL_ERROR_INTERRUPTED);
+    if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) {
+        /* must not happen in benign case, consider it an attack and panic */
+        BUG();
+    }
+}
+
+/* Opens the dummy host eventfd backing a timerfd and stores its PAL handle in `*out_pal_handle`.
+ * Returns 0 on success and a negative errno value on failure. */
+static int create_timerfd_pal_handle(PAL_HANDLE* out_pal_handle) {
+    PAL_HANDLE hdl = NULL;
+
+    int ret = PalStreamOpen(URI_PREFIX_EVENTFD, PAL_ACCESS_RDWR, /*share_flags=*/0,
+                            PAL_CREATE_IGNORED, /*options=*/0, &hdl);
+    if (ret < 0) {
+        log_error("timerfd: dummy host eventfd creation failure");
+        return pal_to_unix_errno(ret);
+    }
+
+    *out_pal_handle = hdl;
+    return 0;
+}
+
+/* Emulates `timerfd_create()`: validates `flags` and `clockid`, allocates a `TYPE_TIMERFD` handle
+ * backed by a dummy host eventfd and registers it via `set_new_fd_handle()`, whose result is
+ * returned. All accepted clocks are served by the real-time clock (with a one-time warning).
+ * Returns a negative errno value on failure. */
+long libos_syscall_timerfd_create(int clockid, int flags) {
+    int ret;
+
+    if ((flags & ~TFD_CREATE_FLAGS) ||
+        (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME &&
+         clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME &&
+         clockid != CLOCK_BOOTTIME_ALARM))
+        return -EINVAL;
+
+    if (clockid != CLOCK_REALTIME) {
+        if (FIRST_TIME()) {
+            log_warning("Unsupported clockid in 'timerfd_create()'; replaced by the system-wide "
+                        "real-time clock.");
+        }
+    }
+
+    struct libos_handle* hdl = get_new_handle();
+    if (!hdl)
+        return -ENOMEM;
+
+    hdl->type = TYPE_TIMERFD;
+    hdl->fs = &timerfd_builtin_fs;
+    hdl->flags = O_RDONLY | (flags & TFD_NONBLOCK ? O_NONBLOCK : 0);
+    hdl->acc_mode = MAY_READ;
+
+    hdl->info.timerfd.broken_in_child = false;
+    hdl->info.timerfd.num_expirations = 0;
+    hdl->info.timerfd.dummy_host_val = 0;
+    hdl->info.timerfd.timeout = 0;
+    hdl->info.timerfd.reset = 0;
+
+    ret = create_timerfd_pal_handle(&hdl->pal_handle);
+    if (ret < 0)
+        goto out;
+
+    ret = set_new_fd_handle(hdl, flags & TFD_CLOEXEC ? FD_CLOEXEC : 0, NULL);
+out:
+    put_handle(hdl);
+    return ret;
+}
+
+/* Records one expiration of the timerfd `hdl` and writes to its dummy host eventfd to notify
+ * reading/polling threads. */
+static void timerfd_update(struct libos_handle* hdl) {
+    spinlock_lock(&hdl->info.timerfd.expiration_lock);
+
+    /* When the expiration count overflows, the read will saturate at UINT64_MAX while the timer
+     * will continue to fire. */
+    if (hdl->info.timerfd.num_expirations < UINT64_MAX)
+        hdl->info.timerfd.num_expirations++;
+
+    hdl->info.timerfd.dummy_host_val++;
+
+    /* perform a write (not supposed to block) to send an event to reading/polling threads */
+    timerfd_dummy_host_write(hdl);
+
+    spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+}
+
+/* Async-event callback invoked when the timer of the timerfd handle `arg` fires: advances the
+ * stored timeout by the interval, re-arms the timer if the interval is non-zero, and records the
+ * expiration. */
+static void callback_itimer(IDTYPE caller, void* arg) {
+    __UNUSED(caller);
+
+    struct libos_handle* hdl = (struct libos_handle*)arg;
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+    hdl->info.timerfd.timeout += hdl->info.timerfd.reset;
+    uint64_t next_reset = hdl->info.timerfd.reset;
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    if (next_reset) {
+        int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          next_reset, /*absolute_time=*/false,
+                                          &callback_itimer, (void*)hdl);
+        if (ret < 0) {
+            log_error(
+                "failed to re-enqueue the next timer event initially set up by "
+                "'timerfd_settime()': %s", unix_strerror(ret));
+            die_or_inf_loop();
+        }
+    }
+
+    timerfd_update(hdl);
+}
+
+/* Emulates `timerfd_settime()`: arms the timer of `fd` with `value` (relative to now, or absolute
+ * with `TFD_TIMER_ABSTIME`), or cancels it if `value->it_value` is zero. The previous setting is
+ * stored in `ovalue` if it is non-NULL. Returns 0 on success and a negative errno value on
+ * failure. */
+long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value,
+                                   struct __kernel_itimerspec* ovalue) {
+    int ret;
+
+    struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL);
+    if (!hdl)
+        return -EBADF;
+
+    if (hdl->type != TYPE_TIMERFD) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        /* must leave via `out`: returning directly would skip the `put_handle()` below */
+        ret = -EIO;
+        goto out;
+    }
+
+    if (!is_user_memory_readable(value, sizeof(*value))) {
+        ret = -EFAULT;
+        goto out;
+    }
+    if (ovalue && !is_user_memory_writable(ovalue, sizeof(*ovalue))) {
+        ret = -EFAULT;
+        goto out;
+    }
+
+    /* `TFD_TIMER_CANCEL_ON_SET` is silently ignored because there are no "discontinuous changes of
+     * time" in Gramine (via e.g., `settimeofday()`). */
+
+    if (flags & ~TFD_SETTIME_FLAGS) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    uint64_t setup_time = 0;
+    ret = PalSystemTimeQuery(&setup_time);
+    if (ret < 0) {
+        ret = pal_to_unix_errno(ret);
+        goto out;
+    }
+
+    uint64_t new_timeout = timespec_to_us(&value->it_value);
+    uint64_t new_reset = timespec_to_us(&value->it_interval);
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+
+    uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time
+                               ? hdl->info.timerfd.timeout - setup_time
+                               : 0;
+    uint64_t current_reset = hdl->info.timerfd.reset;
+
+    bool absolute_time = flags & TFD_TIMER_ABSTIME;
+    if (absolute_time) {
+        hdl->info.timerfd.timeout = new_timeout;
+    } else {
+        hdl->info.timerfd.timeout = setup_time + new_timeout;
+    }
+    hdl->info.timerfd.reset = new_reset;
+
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    int64_t install_ret;
+    if (new_timeout) {
+        install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          new_timeout, absolute_time,
+                                          &callback_itimer, (void*)hdl);
+    } else {
+        /* cancel the pending timerfd object */
+        install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          /*time_us=*/0, /*absolute_time=*/false,
+                                          /*callback=*/NULL, /*arg=*/NULL);
+    }
+    if (install_ret < 0) {
+        ret = install_ret;
+        goto out;
+    }
+
+    if (ovalue) {
+        ovalue->it_interval.tv_sec = current_reset / TIME_US_IN_S;
+        ovalue->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US;
+        ovalue->it_value.tv_sec = current_timeout / TIME_US_IN_S;
+        ovalue->it_value.tv_nsec = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US;
+    }
+
+    ret = 0;
+out:
+    put_handle(hdl);
+    return ret;
+}
+
+/* Emulates `timerfd_gettime()`: stores the time left until the next expiration and the interval of
+ * the timerfd `fd` in `value`. Returns 0 on success and a negative errno value on failure. */
+long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value) {
+    int ret;
+
+    struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL);
+    if (!hdl)
+        return -EBADF;
+
+    if (hdl->type != TYPE_TIMERFD) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        /* must leave via `out`: returning directly would skip the `put_handle()` below */
+        ret = -EIO;
+        goto out;
+    }
+
+    if (!is_user_memory_writable(value, sizeof(*value))) {
+        ret = -EFAULT;
+        goto out;
+    }
+
+    uint64_t setup_time = 0;
+    ret = PalSystemTimeQuery(&setup_time);
+    if (ret < 0) {
+        ret = pal_to_unix_errno(ret);
+        goto out;
+    }
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+    uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time
+                               ? hdl->info.timerfd.timeout - setup_time
+                               : 0;
+    uint64_t current_reset = hdl->info.timerfd.reset;
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    value->it_interval.tv_sec = current_reset / TIME_US_IN_S;
+    value->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US;
+    value->it_value.tv_sec = current_timeout / TIME_US_IN_S;
+    value->it_value.tv_nsec = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US;
+
+    ret = 0;
+out:
+    put_handle(hdl);
+    return ret;
+}
diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg
index 9195858e20..8c358afeeb 100644
--- a/libos/test/ltp/ltp.cfg
+++ b/libos/test/ltp/ltp.cfg
@@ -2436,8 +2436,12 @@ skip = yes
 [timer_settime*]
 skip = yes
 
-# no timerfd
-[timerfd*]
+# clocks other than `CLOCK_REALTIME` are not supported
+[timerfd04]
+skip = yes
+
+# relies on "/proc/sys/kernel/tainted" (see tst_taint.c:tst_taint_check)
+[timerfd_settime02]
 skip = yes
 
 [times03]
diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build
index b6f201b24e..444c9a6996 100644
--- a/libos/test/regression/meson.build
+++ b/libos/test/regression/meson.build
@@ -155,6 +155,8 @@ tests = {
   'tcp_einprogress': {},
   'tcp_ipv6_v6only': {},
   'tcp_msg_peek': {},
+  'timerfd': {},
+  'timerfd_fork': {},
   'udp': {},
   'uid_gid': {},
   'unix': {},
diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py
index 57ab164f74..90b46ef90d 100644
--- a/libos/test/regression/test_libos.py
+++ b/libos/test/regression/test_libos.py
@@ -1084,6 +1084,18 @@ def test_161_rlimit_nofile_4k(self):
         self.assertIn("(after setrlimit) opened fd: 4096", stdout)
         self.assertIn("TEST OK", stdout)
 
+    def test_170_timerfd(self):
+        stdout, _ = self.run_binary(['timerfd'], timeout=120)
+        self.assertIn("TEST OK", stdout)
+
+    def test_171_timerfd_fork(self):
+        try:
+            self.run_binary(['timerfd_fork'])
+            self.fail('timerfd_fork unexpectedly succeeded')
+        except subprocess.CalledProcessError as e:
+            stdout = e.stdout.decode()
+            self.assertIn('child died', stdout)
+
 class TC_31_Syscall(RegressionTestCase):
     def test_000_syscall_redirect(self):
         stdout, _ = self.run_binary(['syscall'])
diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml
index 23fa2fc5c6..063e5fdaed 100644
--- a/libos/test/regression/tests.toml
+++ b/libos/test/regression/tests.toml
@@ -134,6 +134,8 @@ manifests = [
   "tcp_einprogress",
   "tcp_ipv6_v6only",
   "tcp_msg_peek",
+  "timerfd",
+  "timerfd_fork",
   "toml_parsing",
   "udp",
   "uid_gid",
diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml
index 7a3acc3743..20334622a4 100644
--- a/libos/test/regression/tests_musl.toml
+++ b/libos/test/regression/tests_musl.toml
@@ -135,6 +135,8 @@ manifests = [
   "tcp_einprogress",
   "tcp_ipv6_v6only",
   "tcp_msg_peek",
+  "timerfd",
+  "timerfd_fork",
   "toml_parsing",
   "udp",
   "uid_gid",
diff --git a/libos/test/regression/timerfd.c b/libos/test/regression/timerfd.c
new file mode 100644
index 0000000000..cd4ee10520
--- /dev/null
+++ b/libos/test/regression/timerfd.c
@@ -0,0 +1,388 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ *                    Kailun Qin
+ */
+
+/*
+ * Single-process test for `timerfd` syscalls (`timerfd_create()`, `timerfd_settime()` and
+ * `timerfd_gettime()`).
+ *
+ * The tests involve cases including reading a blocking/non-blocking timerfd, poll/epoll/select on
+ * timerfds, setting up a relative/absolute/periodic timerfd and reading a timerfd from multiple
+ * threads.
+ */
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common.h"
+
+#define EXPECTED_EXPIRATIONS 1
+#define EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT 5
+#define NUM_FDS 2
+#define NUM_THREADS 5
+#define PERIODIC_INTERVAL 1
+#define TIMEOUT_VALUE 2
+
+static void set_timerfd_relative(int fd, bool periodic) {
+    struct itimerspec new_value = {
+        .it_value.tv_sec = TIMEOUT_VALUE,
+        .it_interval.tv_sec = periodic ? PERIODIC_INTERVAL : 0,
+    };
+
+    CHECK(timerfd_settime(fd, 0, &new_value, NULL));
+}
+
+static void set_timerfds_relative(int fds[NUM_FDS], bool periodic) {
+    for (int i = 0; i < NUM_FDS; i++)
+        set_timerfd_relative(fds[i], periodic);
+}
+
+static void set_timerfd_absolute(int fd, struct timespec* abs_time) {
+    struct itimerspec new_value;
+
+    /* One-shot timer (zero interval) expiring at the absolute time specified */
+    new_value.it_value.tv_sec = abs_time->tv_sec;
+    new_value.it_value.tv_nsec = abs_time->tv_nsec;
+    new_value.it_interval.tv_sec = 0;
+    new_value.it_interval.tv_nsec = 0;
+
+    /* `TFD_TIMER_ABSTIME` makes `it_value` an absolute time instead of a relative one */
+    CHECK(timerfd_settime(fd, TFD_TIMER_ABSTIME, &new_value, NULL));
+}
+
+static void create_timerfds(int fds[NUM_FDS]) {
+    for (int i = 0; i < NUM_FDS; i++)
+        fds[i] = CHECK(timerfd_create(CLOCK_REALTIME, 0));
+}
+
+static void close_timerfds(int fds[NUM_FDS]) {
+    for (int i = 0; i < NUM_FDS; i++)
+        CHECK(close(fds[i]));
+}
+
+static void test_select(int fds[NUM_FDS]) {
+    fd_set rfds;
+    int max_fd = 0;
+    int ready_fds = 0;
+
+    for (int i = 0; i < NUM_FDS; i++) {
+        if (fds[i] > max_fd)
+            max_fd = fds[i];
+    }
+
+    while (ready_fds < NUM_FDS) {
+        FD_ZERO(&rfds);
+        for (int i = 0; i < NUM_FDS; i++) {
+            FD_SET(fds[i], &rfds);
+        }
+
+        int nfds = select(max_fd + 1, &rfds, NULL, NULL, NULL);
+        if (nfds <= 0)
+            err(1, "select on read event failed");
+
+        for (int i = 0; i < NUM_FDS; i++) {
+            if (FD_ISSET(fds[i], &rfds)) {
+                uint64_t expirations;
+                CHECK(read(fds[i], &expirations, sizeof(expirations)));
+                if (expirations != EXPECTED_EXPIRATIONS) {
+                    errx(1, "select: unexpected number of expirations (expected %d, got %lu)",
+                         EXPECTED_EXPIRATIONS, expirations);
+                }
+                ready_fds++;
+            }
+        }
+    }
+
+    if (ready_fds != NUM_FDS)
+        errx(1, "select: unexpected number of ready fds (expected %d, got %d)",
+             NUM_FDS, ready_fds);
+}
+
+static void test_poll(int fds[NUM_FDS]) {
+    struct pollfd pfds[NUM_FDS];
+    int ready_fds = 0;
+
+    for (int i = 0; i < NUM_FDS; i++) {
+        pfds[i].fd = fds[i];
+        pfds[i].events = POLLIN;
+        pfds[i].revents = 0;
+    }
+
+    while (ready_fds < NUM_FDS) {
+        int nfds = poll(pfds, NUM_FDS, -1);
+        if (nfds <= 0)
+            err(1, "poll with POLLIN failed");
+
+        for (int i = 0; i < NUM_FDS; i++) {
+            if (pfds[i].revents & POLLIN) {
+                uint64_t expirations;
+                CHECK(read(pfds[i].fd, &expirations, sizeof(expirations)));
+                if (expirations != EXPECTED_EXPIRATIONS) {
+                    errx(1, "poll: unexpected number of expirations (expected %d, got %lu)",
+                         EXPECTED_EXPIRATIONS, expirations);
+                }
+                ready_fds++;
+                pfds[i].revents = 0;
+            }
+        }
+    }
+
+    if (ready_fds != NUM_FDS)
+        errx(1, "poll: unexpected number of ready fds (expected %d, got %d)",
+             NUM_FDS, ready_fds);
+}
+
+static void test_epoll(int fds[NUM_FDS]) {
+    int epfd = CHECK(epoll_create1(0));
+
+    struct epoll_event ev;
+    ev.events = EPOLLIN;
+    for (int i = 0; i < NUM_FDS; i++) {
+        ev.data.fd = fds[i];
+        CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fds[i], &ev));
+    }
+
+    struct epoll_event events[NUM_FDS];
+    int ready_fds = 0;
+
+    while (ready_fds < NUM_FDS) {
+        int nfds = epoll_wait(epfd, events, NUM_FDS, -1);
+        if (nfds <= 0)
+            err(1, "epoll_wait with EPOLLIN failed");
+
+        for (int i = 0; i < nfds; i++) {
+            uint64_t expirations;
+            CHECK(read(events[i].data.fd, &expirations, sizeof(expirations)));
+            if (expirations != EXPECTED_EXPIRATIONS) {
+                errx(1, "epoll_wait: unexpected number of expirations (expected %d, got %lu)",
+                     EXPECTED_EXPIRATIONS, expirations);
+            }
+            ready_fds++;
+        }
+    }
+
+    if (ready_fds != NUM_FDS)
+        errx(1, "epoll_wait: unexpected number of ready fds (expected %d, got %d)",
+             NUM_FDS, ready_fds);
+
+    CHECK(close(epfd));
+}
+
+/* this test expects the timerfd (`fd`) to be a periodic timer */
+static void test_epoll_modes(int fd) {
+    int epfd = CHECK(epoll_create1(0));
+
+    /* level-triggered mode */
+    struct epoll_event ev;
+    ev.events = EPOLLIN;
+    ev.data.fd = fd;
+    CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev));
+
+    struct epoll_event events[1];
+    int nfds = CHECK(epoll_wait(epfd, events, 1, -1));
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds (expected 1, got %d)", nfds);
+
+    /* waiting for another event without reading the expiration count */
+    nfds = CHECK(epoll_wait(epfd, events, 1, /*timeout=*/PERIODIC_INTERVAL * 1000 * 2));
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds in level-triggered mode without reading "
+                "(expected 1, got %d)", nfds);
+
+    /* switch to edge-triggered mode */
+    ev.events = EPOLLIN | EPOLLET;
+    CHECK(epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev));
+
+    nfds = CHECK(epoll_wait(epfd, events, 1, -1));
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds (expected 1, got %d)", nfds);
+
+    /* waiting for another event without reading the expiration count: here, even though the timer
+     * expired at least once, there is no event reported because we're in edge-triggered mode (which
+     * does not "reset" the event since there was no read) */
+    nfds = CHECK(epoll_wait(epfd, events, 1, /*timeout=*/PERIODIC_INTERVAL * 1000 * 2));
+    if (nfds != 0)
+        errx(1, "epoll: unexpected number of fds in edge-triggered mode without reading "
+                "(expected 0, got %d)", nfds);
+
+    CHECK(close(epfd));
+}
+
+static void test_periodic_timer(int fd) {
+    uint64_t expirations;
+    size_t total_expirations = 0;
+
+    while (total_expirations < EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT) {
+        CHECK(read(fd, &expirations, sizeof(expirations)));
+        total_expirations += expirations;
+    }
+}
+
+static void* timerfd_read_thread(void* arg) {
+    int fd = *(int*)arg;
+    uint64_t expirations;
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+    if (expirations == 0)
+        errx(1, "threaded read: unexpected number of expirations");
+    pthread_exit(NULL);
+}
+
+/* a periodic timer is required so that all NUM_THREADS threads have something to read */
+static void test_periodic_timer_threaded_read(int fd) {
+    pthread_t threads[NUM_THREADS];
+    for (int i = 0; i < NUM_THREADS; i++) {
+        CHECK(pthread_create(&threads[i], NULL, timerfd_read_thread, &fd));
+        /* join before starting the next thread: the reads happen one after another */
+        CHECK(pthread_join(threads[i], NULL));
+    }
+}
+
+static void test_timerfd_gettime(int fd) {
+    struct itimerspec curr_value;
+    CHECK(timerfd_gettime(fd, &curr_value));
+
+    /* the timer should be set to expire close to 2 seconds */
+    if (curr_value.it_value.tv_sec > 2 || curr_value.it_value.tv_sec < 1 ||
+        curr_value.it_value.tv_nsec < 0 || curr_value.it_value.tv_nsec >= 1000000000) {
+        errx(1, "timerfd_gettime: unexpected timer value (expected close to 2.0, got %ld.%09ld)",
+             curr_value.it_value.tv_sec, curr_value.it_value.tv_nsec);
+    }
+}
+
+static void test_disarm_timer(int fd) {
+    struct itimerspec old_value;
+
+    /* immediately disarm the timer and get the old value */
+    struct itimerspec disarm_value = { 0 };
+    CHECK(timerfd_settime(fd, 0, &disarm_value, &old_value));
+
+    /* check that the old value is around 2 seconds */
+    if (old_value.it_value.tv_sec > 2 || old_value.it_value.tv_sec < 1 ||
+        old_value.it_value.tv_nsec < 0 || old_value.it_value.tv_nsec >= 1000000000) {
+        errx(1, "disarm_timer: unexpected old timer value (expected close to 2.0, got %ld.%09ld)",
+             old_value.it_value.tv_sec, old_value.it_value.tv_nsec);
+    }
+
+    /* test poll with a timeout to ensure the timer was disarmed */
+    struct pollfd pfd = {
+        .fd = fd,
+        .events = POLLIN,
+    };
+    int ret = poll(&pfd, 1, /*timeout=*/(TIMEOUT_VALUE + 1) * 1000);
+    if (ret != 0)
+        errx(1, "disarm_timer: poll returned %d, expected 0 (timeout)", ret);
+}
+
+static void test_absolute_time(int fd) {
+    struct timespec now;
+    struct timespec abs_time;
+    uint64_t expirations;
+
+    /* test timerfd with absolute time set in the future */
+    CHECK(clock_gettime(CLOCK_REALTIME, &now));
+    abs_time.tv_sec = now.tv_sec + TIMEOUT_VALUE;
+    abs_time.tv_nsec = now.tv_nsec;
+
+    set_timerfd_absolute(fd, &abs_time);
+
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+    if (expirations != EXPECTED_EXPIRATIONS) {
+        errx(1, "absolute_time future: unexpected number of expirations (expected %d, got %lu)",
+             EXPECTED_EXPIRATIONS, expirations);
+    }
+
+    expirations = 0;
+    memset(&now, 0, sizeof(struct timespec));
+    memset(&abs_time, 0, sizeof(struct timespec));
+
+    /* test timerfd with absolute time set in the past */
+    CHECK(clock_gettime(CLOCK_REALTIME, &now));
+    abs_time.tv_sec = now.tv_sec - TIMEOUT_VALUE;
+    abs_time.tv_nsec = now.tv_nsec;
+
+    set_timerfd_absolute(fd, &abs_time);
+
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+    if (expirations != EXPECTED_EXPIRATIONS) {
+        errx(1, "absolute_time past: unexpected number of expirations (expected %d, got %lu)",
+             EXPECTED_EXPIRATIONS, expirations);
+    }
+}
+
+/* This test must be executed twice: first reading from a non-periodic timerfd in blocking mode to
+ * capture its expiration, and then switching to non-blocking mode for a second read, which
+ * immediately returns with EAGAIN because the timer is disarmed and there are no new expiration
+ * events.
*/
+static void test_read(int fd, bool non_blocking) {
+    if (non_blocking) {
+        CHECK(fcntl(fd, F_SETFL, O_NONBLOCK));
+    }
+
+    uint64_t expirations;
+    int retval = read(fd, &expirations, sizeof(expirations));
+
+    if (non_blocking) {
+        if (retval != -1 || errno != EAGAIN) {
+            errx(1, "non-blocking read: read returned %d, errno %d, expected -1 and EAGAIN",
+                 retval, errno);
+        }
+    } else {
+        CHECK(retval);
+        if (expirations != EXPECTED_EXPIRATIONS) {
+            errx(1, "read: unexpected number of expirations (expected %d, got %lu)",
+                 EXPECTED_EXPIRATIONS, expirations);
+        }
+    }
+}
+
+int main(void) {
+    int fds[NUM_FDS];
+    create_timerfds(fds);
+
+    set_timerfds_relative(fds, /*periodic=*/false);
+    test_select(fds);
+
+    set_timerfds_relative(fds, /*periodic=*/false);
+    test_poll(fds);
+
+    set_timerfds_relative(fds, /*periodic=*/false);
+    test_epoll(fds);
+
+    set_timerfd_relative(fds[0], /*periodic=*/true);
+    test_epoll_modes(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/true);
+    test_periodic_timer(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/true);
+    test_periodic_timer_threaded_read(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/false);
+    test_timerfd_gettime(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/false);
+    test_disarm_timer(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/false);
+    test_read(fds[0], /*non_blocking=*/false); /* consumes the single expiration */
+    test_read(fds[0], /*non_blocking=*/true);  /* one-shot timer: nothing left, expects EAGAIN */
+
+    test_absolute_time(fds[1]); /* arms fds[1] itself via `set_timerfd_absolute()` */
+
+    close_timerfds(fds);
+
+    puts("TEST OK");
+    return 0;
+}
diff --git a/libos/test/regression/timerfd_fork.c b/libos/test/regression/timerfd_fork.c
new file mode 100644
index 0000000000..0bf82fc64c
--- /dev/null
+++ b/libos/test/regression/timerfd_fork.c
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ *                    Kailun Qin
+ */
+
+/* Multi-process test for `timerfd` syscalls (`timerfd_create()` and `timerfd_settime()`).
+ *
+ * Note that timerfd is currently only emulated in a secure single-process mode, so this test is
+ * expected to fail: the child's `read()` fails and the parent then reports that the child died.
+ */
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common.h"
+
+#define EXPECTED_EXPIRATIONS 1
+#define TIMEOUT_VALUE 2
+
+static void set_timerfd(int fd) {
+    struct itimerspec new_value = { .it_value.tv_sec = TIMEOUT_VALUE }; /* one-shot, relative */
+
+    CHECK(timerfd_settime(fd, 0, &new_value, NULL));
+}
+
+static void test_multi_process(int fd) {
+    pid_t pid = CHECK(fork());
+    if (pid == 0) {
+        uint64_t expirations;
+        /* child: wait on a blocking read for the timer armed by the parent before the fork */
+        /* Note: Due to the limitation of `timerfd` syscalls being only emulated in a secure
+         * single-process mode, `read()` will return a negative error code. */
+        CHECK(read(fd, &expirations, sizeof(expirations)));
+        if (expirations != EXPECTED_EXPIRATIONS) {
+            errx(1, "child process: unexpected number of expirations (expected %d, got %lu)",
+                 EXPECTED_EXPIRATIONS, expirations);
+        }
+        exit(0);
+    } else {
+        int status = 0;
+
+        /* parent: do nothing and let the child process read the timerfd */
+        CHECK(waitpid(pid, &status, 0));
+        if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+            errx(1, "child died with status: %#x", status); /* expected outcome under Gramine */
+        }
+    }
+}
+
+int main(void) {
+    int fd = CHECK(timerfd_create(CLOCK_REALTIME, 0));
+
+    set_timerfd(fd);
+    test_multi_process(fd);
+
+    CHECK(close(fd));
+
+    puts("TEST OK");
+    return 0;
+}