diff --git a/Documentation/devel/features.md b/Documentation/devel/features.md
index 93f84e3b07..e334a624e4 100644
--- a/Documentation/devel/features.md
+++ b/Documentation/devel/features.md
@@ -1036,7 +1036,7 @@ The below list is generated from the [syscall table of Linux
- ☒ `signalfd()`
[7](#signals-and-process-state-changes)
-- ☒ `timerfd_create()`
+- ▣ `timerfd_create()`
[20](#sleeps-timers-and-alarms)
- ▣ `eventfd()`
@@ -1045,10 +1045,10 @@ The below list is generated from the [syscall table of Linux
- ▣ `fallocate()`
[9a](#file-system-operations)
-- ☒ `timerfd_settime()`
+- ▣ `timerfd_settime()`
[20](#sleeps-timers-and-alarms)
-- ☒ `timerfd_gettime()`
+- ▣ `timerfd_gettime()`
[20](#sleeps-timers-and-alarms)
- ☑ `accept4()`
@@ -2891,9 +2891,23 @@ Gramine implements getting and setting the interval timer: `getitimer()` and `se
Gramine implements alarm clocks via `alarm()`.
+Gramine implements timers that notify via file descriptors: `timerfd_create()`, `timerfd_settime()`
+and `timerfd_gettime()`. The timerfd object is created inside Gramine, and all operations are
+resolved entirely inside Gramine (note that the time source in Gramine SGX is still untrusted). Each
+timerfd object is associated with a dummy eventfd created on the host. This is purely for triggering
+read notifications (e.g., in epoll); timerfd data is verified inside Gramine and is never exposed to
+the host. Since the host is used purely for notifications, a malicious host can only induce Denial
+of Service (DoS) attacks. `TFD_TIMER_CANCEL_ON_SET` is silently ignored because there are no
+"discontinuous changes of time" in Gramine (via e.g., `settimeofday()`). `TFD_IOC_SET_TICKS` is not
+supported.
+
+The emulation is currently implemented at the level of a single process. All timerfds created in the
+parent process are marked as invalid in child processes. In multi-process applications, Gramine does
+not exit immediately after fork; it only exits if the application attempts to use timerfds in the
+child. Therefore, inter-process timing signals via timerfds are not allowed.
+
Gramine does *not* currently implement the POSIX per-process timer: `timer_create()`, etc. Gramine
-also does not currently implement timers that notify via file descriptors. Gramine could implement
-these timers in the future, if need arises.
+could implement it in the future, if need arises.
Related system calls
@@ -2909,9 +2923,9 @@ these timers in the future, if need arises.
- ☒ `timer_getoverrun()`: may be implemented in the future
- ☒ `timer_delete()`: may be implemented in the future
-- ☒ `timerfd_create()`: may be implemented in the future
-- ☒ `timerfd_settime()`: may be implemented in the future
-- ☒ `timerfd_gettime()`: may be implemented in the future
+- ▣ `timerfd_create()`: see the notes above
+- ▣ `timerfd_settime()`: see the notes above
+- ▣ `timerfd_gettime()`: see the notes above
diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h
index 0590a9c8db..b101c041df 100644
--- a/libos/include/libos_fs.h
+++ b/libos/include/libos_fs.h
@@ -190,7 +190,7 @@ struct libos_fs_ops {
int (*poll)(struct libos_handle* hdl, int in_events, int* out_events);
/* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed
- * ones. Used in e.g. secure eventfd FS to verify if the host is not lying to us. */
+ * ones. Used in e.g. secure eventfd and timerfd FS to verify if the host is not lying to us. */
void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events);
/* checkpoint/migrate the file system */
@@ -948,6 +948,7 @@ extern struct libos_fs eventfd_builtin_fs;
extern struct libos_fs synthetic_builtin_fs;
extern struct libos_fs path_builtin_fs;
extern struct libos_fs shm_builtin_fs;
+extern struct libos_fs timerfd_builtin_fs;
struct libos_fs* find_fs(const char* name);
diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h
index d0920cff06..71331d7fb7 100644
--- a/libos/include/libos_handle.h
+++ b/libos/include/libos_handle.h
@@ -46,6 +46,7 @@ enum libos_handle_type {
/* Special handles: */
TYPE_EPOLL, /* epoll handles, see `libos_epoll.c` */
TYPE_EVENTFD, /* eventfd handles, used by `eventfd` filesystem */
+ TYPE_TIMERFD, /* timerfd handles, used by `timerfd` filesystem */
};
struct libos_pipe_handle {
@@ -142,6 +143,18 @@ struct libos_eventfd_handle {
uint64_t dummy_host_val;
};
+struct libos_timerfd_handle {
+ bool broken_in_child; /* set by the FS checkin callback; read/poll/close then fail */
+
+ spinlock_t expiration_lock; /* protecting below fields */
+ uint64_t num_expirations; /* expirations not yet consumed; read() returns and zeroes it */
+ uint64_t dummy_host_val; /* non-zero: read() must drain the dummy host object */
+
+ spinlock_t timer_lock; /* protecting below fields */
+ uint64_t timeout; /* always an absolute time */
+ uint64_t reset; /* NOTE(review): presumably the re-arm interval -- confirm in libos_timerfd.c */
+};
+
struct libos_handle {
enum libos_handle_type type;
bool is_dir;
@@ -217,6 +230,8 @@ struct libos_handle {
struct libos_epoll_handle epoll; /* TYPE_EPOLL */
struct libos_eventfd_handle eventfd; /* TYPE_EVENTFD */
+
+ struct libos_timerfd_handle timerfd; /* TYPE_TIMERFD */
} info;
struct libos_dir_handle dir_info;
@@ -232,7 +247,7 @@ struct libos_handle {
* `read`, `seek` but not `pread`). This lock should be taken *before* `libos_handle.lock` and
* `libos_inode.lock`. Must be used *only* via maybe_lock_pos_handle() and
* maybe_unlock_pos_handle(); these functions make sure that the lock is acquired only on those
- * handle types that are seekable (e.g. not on eventfds or pipes). */
+ * handle types that are seekable (e.g. not on eventfds, timerfds or pipes). */
struct libos_lock pos_lock;
};
diff --git a/libos/include/libos_table.h b/libos/include/libos_table.h
index e204aaf6de..1b8840ff24 100644
--- a/libos/include/libos_table.h
+++ b/libos/include/libos_table.h
@@ -220,3 +220,7 @@ long libos_syscall_getrandom(char* buf, size_t count, unsigned int flags);
long libos_syscall_mlock2(unsigned long start, size_t len, int flags);
long libos_syscall_sysinfo(struct sysinfo* info);
long libos_syscall_close_range(unsigned int first, unsigned int last, unsigned int flags);
+long libos_syscall_timerfd_create(int clockid, int flags);
+long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value,
+ struct __kernel_itimerspec* ovalue);
+long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value);
diff --git a/libos/include/libos_utils.h b/libos/include/libos_utils.h
index a4298a50ef..bdae8966bc 100644
--- a/libos/include/libos_utils.h
+++ b/libos/include/libos_utils.h
@@ -52,8 +52,14 @@ void clean_link_map_list(void);
int create_pipe(char* name, char* uri, size_t size, PAL_HANDLE* hdl, bool use_vmid_for_name);
/* Asynchronous event support */
+enum async_event_type {
+ ASYNC_EVENT_TYPE_IO = 1, /* async IO on a PAL handle, e.g. ioctl(FIOASYNC) */
+ ASYNC_EVENT_TYPE_ALARM_TIMER = 2, /* alarm(), setitimer(), timerfd; also thread cleanup */
+};
+
int init_async_worker(void);
-int64_t install_async_event(PAL_HANDLE object, unsigned long time,
+int64_t install_async_event(enum async_event_type type, PAL_HANDLE object,
+ unsigned long time_us, bool absolute_time,
void (*callback)(IDTYPE caller, void* arg), void* arg);
void terminate_async_worker(void);
diff --git a/libos/include/linux_abi/time.h b/libos/include/linux_abi/time.h
index da848822de..303d184c0b 100644
--- a/libos/include/linux_abi/time.h
+++ b/libos/include/linux_abi/time.h
@@ -9,11 +9,11 @@
/* These need to be binary-identical with the ones used by Linux. */
// TODO: remove all of these includes and make this header libc-independent.
-#include
-#include
-#include
#include
+typedef long __kernel_suseconds_t;
+typedef long __kernel_time_t;
+
typedef __kernel_time_t time_t;
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
@@ -37,3 +37,28 @@ struct __kernel_timezone {
int tz_minuteswest; /* minutes west of Greenwich */
int tz_dsttime; /* type of dst correction */
};
+
+/* The IDs of the various system clocks (for POSIX.1b interval timers). */
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_PROCESS_CPUTIME_ID 2
+#define CLOCK_THREAD_CPUTIME_ID 3
+#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6
+#define CLOCK_BOOTTIME 7
+#define CLOCK_REALTIME_ALARM 8
+#define CLOCK_BOOTTIME_ALARM 9
+
+#define MAX_CLOCKS 16
+
+#define TFD_TIMER_ABSTIME (1 << 0)
+#define TFD_TIMER_CANCEL_ON_SET (1 << 1)
+#define TFD_CLOEXEC O_CLOEXEC
+#define TFD_NONBLOCK O_NONBLOCK
+
+#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK)
+/* Flags for timerfd_create. */
+#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS
+/* Flags for timerfd_settime. */
+#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)
diff --git a/libos/src/arch/x86_64/libos_table.c b/libos/src/arch/x86_64/libos_table.c
index 480f81259a..fbe6d3c49b 100644
--- a/libos/src/arch/x86_64/libos_table.c
+++ b/libos/src/arch/x86_64/libos_table.c
@@ -297,11 +297,11 @@ libos_syscall_t libos_syscall_table[LIBOS_SYSCALL_BOUND] = {
[__NR_utimensat] = (libos_syscall_t)0, // libos_syscall_utimensat
[__NR_epoll_pwait] = (libos_syscall_t)libos_syscall_epoll_pwait,
[__NR_signalfd] = (libos_syscall_t)0, // libos_syscall_signalfd
- [__NR_timerfd_create] = (libos_syscall_t)0, // libos_syscall_timerfd_create
+ [__NR_timerfd_create] = (libos_syscall_t)libos_syscall_timerfd_create,
[__NR_eventfd] = (libos_syscall_t)libos_syscall_eventfd,
[__NR_fallocate] = (libos_syscall_t)libos_syscall_fallocate,
- [__NR_timerfd_settime] = (libos_syscall_t)0, // libos_syscall_timerfd_settime
- [__NR_timerfd_gettime] = (libos_syscall_t)0, // libos_syscall_timerfd_gettime
+ [__NR_timerfd_settime] = (libos_syscall_t)libos_syscall_timerfd_settime,
+ [__NR_timerfd_gettime] = (libos_syscall_t)libos_syscall_timerfd_gettime,
[__NR_accept4] = (libos_syscall_t)libos_syscall_accept4,
[__NR_signalfd4] = (libos_syscall_t)0, // libos_syscall_signalfd4
[__NR_eventfd2] = (libos_syscall_t)libos_syscall_eventfd2,
diff --git a/libos/src/fs/libos_fs.c b/libos/src/fs/libos_fs.c
index 5a29a36d6d..f10aefd74b 100644
--- a/libos/src/fs/libos_fs.c
+++ b/libos/src/fs/libos_fs.c
@@ -33,6 +33,7 @@ static struct libos_fs* g_builtin_fs[] = {
&synthetic_builtin_fs,
&path_builtin_fs,
&shm_builtin_fs,
+ &timerfd_builtin_fs,
};
static struct libos_lock g_mount_mgr_lock;
diff --git a/libos/src/fs/proc/thread.c b/libos/src/fs/proc/thread.c
index c3da147c48..ed1fa1a95a 100644
--- a/libos/src/fs/proc/thread.c
+++ b/libos/src/fs/proc/thread.c
@@ -287,6 +287,7 @@ static char* describe_handle(struct libos_handle* hdl) {
case TYPE_EPOLL: str = "epoll:[?]"; break;
case TYPE_EVENTFD: str = "eventfd:[?]"; break;
case TYPE_SHM: str = "shm:[?]"; break;
+ case TYPE_TIMERFD: str = "timerfd:[?]"; break;
default: str = "unknown:[?]"; break;
}
return strdup(str);
diff --git a/libos/src/fs/timerfd/fs.c b/libos/src/fs/timerfd/fs.c
new file mode 100644
index 0000000000..502ce37c2b
--- /dev/null
+++ b/libos/src/fs/timerfd/fs.c
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ * Kailun Qin
+ */
+
+/*
+ * This file contains code for implementation of "timerfd" filesystem. For more information, see
+ * `libos/src/sys/libos_timerfd.c`.
+ */
+
+#include "libos_fs.h"
+#include "libos_handle.h"
+#include "libos_internal.h"
+#include "libos_lock.h"
+#include "linux_abi/errors.h"
+#include "pal.h"
+
+/* Checkin callback: marks the handle as `broken_in_child` and always returns 0. This enforces a
+ * restriction that all timerfds created in the parent process are invalid in child processes, i.e.
+ * no inter-process timing signals via timerfds, as LibOS doesn't yet implement timerfd sync. */
+static int timerfd_checkin(struct libos_handle* hdl) {
+ assert(hdl->type == TYPE_TIMERFD);
+ hdl->info.timerfd.broken_in_child = true;
+ return 0;
+}
+
+/* Reads the dummy host object once; mirrors `eventfd_dummy_host_read()` in "fs/eventfd/fs.c". */
+static void timerfd_dummy_host_read(struct libos_handle* hdl) {
+    uint64_t host_val = 0;
+    size_t read_size = sizeof(host_val);
+    int ret = PAL_ERROR_INTERRUPTED;
+    while (ret == PAL_ERROR_INTERRUPTED) {
+        /* keep retrying for as long as the read is reported as interrupted */
+        ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &read_size, &host_val);
+    }
+    if (ret < 0 || read_size != sizeof(host_val)) {
+        /* must not happen in benign case, consider it an attack and panic */
+        BUG();
+    }
+}
+
+/* Waits for a read event on the host handle; mirrors `eventfd_dummy_host_wait()` in eventfd FS. */
+static void timerfd_dummy_host_wait(struct libos_handle* hdl) {
+    pal_wait_flags_t requested = PAL_WAIT_READ;
+    pal_wait_flags_t reported = 0;
+    int ret = PalStreamsWaitEvents(1, &hdl->pal_handle, &requested, &reported, NULL);
+    /* an interrupted wait is tolerated; any other failure is treated as fatal */
+    if (ret != PAL_ERROR_INTERRUPTED && ret < 0)
+        BUG();
+    (void)reported; /* we don't care what events the host returned, we can't trust them anyway */
+}
+
+static ssize_t timerfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) {
+    __UNUSED(pos);
+    assert(hdl->type == TYPE_TIMERFD);
+    /* returns sizeof(uint64_t) on success; -EINVAL, -EIO or -EAGAIN on the failures below */
+    if (count < sizeof(uint64_t))
+        return -EINVAL;
+
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        return -EIO;
+    }
+
+    ssize_t ret;
+    spinlock_lock(&hdl->info.timerfd.expiration_lock);
+    /* nothing expired yet: -EAGAIN if non-blocking, otherwise wait for a host notification */
+    while (!hdl->info.timerfd.num_expirations) {
+        if (hdl->flags & O_NONBLOCK) {
+            ret = -EAGAIN;
+            goto out;
+        }
+        spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+        timerfd_dummy_host_wait(hdl);
+        spinlock_lock(&hdl->info.timerfd.expiration_lock);
+    }
+
+    memcpy(buf, &hdl->info.timerfd.num_expirations, sizeof(uint64_t));
+    hdl->info.timerfd.num_expirations = 0;
+
+    /* perform a read (not supposed to block) to clear the event from polling threads */
+    if (hdl->info.timerfd.dummy_host_val) {
+        timerfd_dummy_host_read(hdl);
+        hdl->info.timerfd.dummy_host_val = 0;
+    }
+    /* exactly one 8-byte counter was copied out, so report that size rather than `count` */
+    ret = sizeof(uint64_t);
+out:
+    spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+    maybe_epoll_et_trigger(hdl, (int)ret, /*in=*/true, /*unused was_partial=*/false);
+    return ret;
+}
+
+static void timerfd_post_poll(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events) {
+    /* Filters the host-reported events in `*pal_ret_events` in place; the host is not trusted. */
+    assert(hdl->type == TYPE_TIMERFD);
+    struct libos_timerfd_handle* tfd = &hdl->info.timerfd;
+
+    if (tfd->broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        *pal_ret_events = PAL_WAIT_ERROR;
+        return;
+    }
+
+    /* impossible: we control timerfd inside the LibOS, and we never raise such conditions */
+    const pal_wait_flags_t never_raised = PAL_WAIT_ERROR | PAL_WAIT_HANG_UP | PAL_WAIT_WRITE;
+    if (*pal_ret_events & never_raised)
+        BUG();
+
+    spinlock_lock(&tfd->expiration_lock);
+    if ((*pal_ret_events & PAL_WAIT_READ) && !tfd->num_expirations) {
+        /* host reports data to read, but no expiration is recorded: spurious or malicious
+         * notification, can legitimately happen if another thread consumed this event between
+         * this thread's poll wakeup and the post_poll callback; we currently choose to return a
+         * spurious notification to the user */
+        *pal_ret_events &= ~PAL_WAIT_READ;
+    }
+    spinlock_unlock(&tfd->expiration_lock);
+}
+
+static int timerfd_close(struct libos_handle* hdl) {
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        return -EIO;
+    }
+    /* cancel the pending timerfd object; the returned remaining time (usecs) is not a status */
+    int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                      /*time_us=*/0, /*absolute_time=*/false, /*callback=*/NULL,
+                                      /*arg=*/NULL);
+    return ret < 0 ? (int)ret : 0;
+}
+
+struct libos_fs_ops timerfd_fs_ops = {
+ .checkin = &timerfd_checkin, /* marks the handle as `broken_in_child` */
+ .read = &timerfd_read, /* hands out the expiration count and zeroes it */
+ .close = &timerfd_close, /* cancels the pending timer of this handle */
+ .post_poll = &timerfd_post_poll, /* filters host-reported poll events */
+};
+
+struct libos_fs timerfd_builtin_fs = {
+ .name = "timerfd",
+ .fs_ops = &timerfd_fs_ops,
+};
diff --git a/libos/src/libos_async.c b/libos/src/libos_async.c
index d05af32691..310518316c 100644
--- a/libos/src/libos_async.c
+++ b/libos/src/libos_async.c
@@ -15,12 +15,13 @@
DEFINE_LIST(async_event);
struct async_event {
+ enum async_event_type type;
IDTYPE caller; /* thread installing this event */
LIST_TYPE(async_event) list;
LIST_TYPE(async_event) triggered_list;
void (*callback)(IDTYPE caller, void* arg);
void* arg;
- PAL_HANDLE object; /* handle (async IO) to wait on */
+ PAL_HANDLE object; /* handle (async IO or timerfd) to wait on */
uint64_t expire_time_us; /* alarm/timer to wait on */
};
DEFINE_LISTP(async_event);
@@ -35,25 +36,26 @@ static struct libos_lock async_worker_lock;
/* TODO: use async_worker_thread->pollable_event instead */
static struct libos_pollable_event install_new_event;
-/* Threads register async events like alarm(), setitimer(), ioctl(FIOASYNC)
- * using this function. These events are enqueued in async_list and delivered
- * to async worker thread by triggering install_new_event. When event is
- * triggered in async worker thread, the corresponding event's callback with
- * arguments `arg` is called. This callback typically sends a signal to the
+/* Threads register async events like alarm(), setitimer(), timerfd_settime(), ioctl(FIOASYNC) using
+ * this function. These events are enqueued in async_list and delivered to async worker thread by
+ * triggering install_new_event. When event is triggered in async worker thread, the corresponding
+ * event's callback with arguments `arg` is called. This callback typically sends a signal to the
* thread which registered the event (saved in `event->caller`).
*
- * We distinguish between alarm/timer events and async IO events:
- * - alarm/timer events set object = NULL and time_us = microseconds
- * (time_us = 0 cancels all pending alarms/timers).
+ * The async event type is specified in `type`. Alarm/timer events and async IO events are currently
+ * supported:
+ * - alarm/timer events set time_us = microseconds (time_us = 0 cancels all pending
+ * alarms/timers). Specifically, when object != NULL, this indicates a timerfd event.
* - async IO events set object = handle and time_us = 0.
*
- * Function returns remaining usecs for alarm/timer events (same as alarm())
- * or 0 for async IO events. On error, it returns a negated error code.
+ * Function returns remaining usecs for alarm/timer events (same as alarm()) or 0 for async IO
+ * events. On error, it returns a negated error code.
*/
-int64_t install_async_event(PAL_HANDLE object, uint64_t time_us,
+int64_t install_async_event(enum async_event_type type, PAL_HANDLE object,
+ uint64_t time_us, bool absolute_time,
void (*callback)(IDTYPE caller, void* arg), void* arg) {
- /* if event happens on object, time_us must be zero */
- assert(!object || (object && !time_us));
+ assert((type == ASYNC_EVENT_TYPE_ALARM_TIMER) ||
+ (type == ASYNC_EVENT_TYPE_IO && (!object || !time_us)));
uint64_t now_us = 0;
int ret = PalSystemTimeQuery(&now_us);
@@ -68,21 +70,22 @@ int64_t install_async_event(PAL_HANDLE object, uint64_t time_us,
return -ENOMEM;
}
+ event->type = type;
event->callback = callback;
event->arg = arg;
event->caller = get_cur_tid();
event->object = object;
- event->expire_time_us = time_us ? now_us + time_us : 0;
+ event->expire_time_us = time_us ? (absolute_time ? time_us : now_us + time_us) : 0;
lock(&async_worker_lock);
- if (callback != &cleanup_thread && !object) {
- /* This is alarm() or setitimer() emulation, treat both according to
- * alarm() syscall semantics: cancel any pending alarm/timer. */
+ if (callback != &cleanup_thread && type == ASYNC_EVENT_TYPE_ALARM_TIMER) {
+ /* This is alarm(), setitimer(), timerfd_settime() emulation, treat all according to alarm()
+ * syscall semantics: cancel any pending alarm/timer. */
struct async_event* tmp;
struct async_event* n;
LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) {
- if (tmp->expire_time_us) {
+ if (tmp->object == object && tmp->expire_time_us) {
/* this is a pending alarm/timer, cancel it and save its expiration time */
if (max_prev_expire_time_us < tmp->expire_time_us)
max_prev_expire_time_us = tmp->expire_time_us;
@@ -164,7 +167,7 @@ static int libos_async_worker(void* arg) {
struct async_event* n;
LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) {
/* repopulate `pals` with IO events and find the next expiring alarm/timer */
- if (tmp->object) {
+ if (tmp->type == ASYNC_EVENT_TYPE_IO && tmp->object) {
if (pals_cnt == pals_max_cnt) {
/* grow `pals` to accommodate more objects */
PAL_HANDLE* tmp_pals = malloc(sizeof(*tmp_pals) * (1 + pals_max_cnt * 2));
@@ -200,7 +203,8 @@ static int libos_async_worker(void* arg) {
pal_events[pals_cnt + 1] = PAL_WAIT_READ;
ret_events[pals_cnt + 1] = 0;
pals_cnt++;
- } else if (tmp->expire_time_us && tmp->expire_time_us > now_us) {
+ } else if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER && tmp->expire_time_us &&
+ tmp->expire_time_us > now_us) {
if (!next_expire_time_us || next_expire_time_us > tmp->expire_time_us) {
/* use time of the next expiring alarm/timer */
next_expire_time_us = tmp->expire_time_us;
@@ -252,7 +256,7 @@ static int libos_async_worker(void* arg) {
/* check if this event is an IO event found in async_list */
LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &async_list, list) {
- if (tmp->object == pals[i]) {
+ if (tmp->type == ASYNC_EVENT_TYPE_IO && tmp->object == pals[i]) {
log_debug("Async IO event triggered at %lu", now_us);
LISTP_ADD_TAIL(tmp, &triggered, triggered_list);
break;
@@ -282,7 +286,7 @@ static int libos_async_worker(void* arg) {
LISTP_FOR_EACH_ENTRY_SAFE(tmp, n, &triggered, triggered_list) {
LISTP_DEL(tmp, &triggered, triggered_list);
tmp->callback(tmp->caller, tmp->arg);
- if (!tmp->object) {
+ if (tmp->type == ASYNC_EVENT_TYPE_ALARM_TIMER) {
/* this is a one-off exit-child or alarm/timer event */
free(tmp);
}
diff --git a/libos/src/libos_parser.c b/libos/src/libos_parser.c
index f1f470cc41..641314e476 100644
--- a/libos/src/libos_parser.c
+++ b/libos/src/libos_parser.c
@@ -21,6 +21,7 @@
#include "linux_abi/sched.h"
#include "linux_abi/signals.h"
#include "linux_abi/syscalls_nr_arch.h"
+#include "linux_abi/time.h"
#include "socket_utils.h"
static void parse_open_flags(struct print_buf*, va_list*);
@@ -38,6 +39,7 @@ static void parse_sigprocmask_how(struct print_buf*, va_list*);
static void parse_msync_flags(struct print_buf*, va_list*);
static void parse_madvise_behavior(struct print_buf*, va_list* ap);
static void parse_timespec(struct print_buf*, va_list*);
+static void parse_itimerspec(struct print_buf*, va_list*);
static void parse_sockaddr(struct print_buf*, va_list*);
static void parse_domain(struct print_buf*, va_list*);
static void parse_socktype(struct print_buf*, va_list*);
@@ -53,6 +55,9 @@ static void parse_getrandom_flags(struct print_buf*, va_list*);
static void parse_epoll_op(struct print_buf*, va_list*);
static void parse_epoll_event(struct print_buf* buf, va_list* ap);
static void parse_close_range_flags(struct print_buf* buf, va_list* ap);
+static void parse_clockid(struct print_buf* buf, va_list* ap);
+static void parse_timerfd_create_flags(struct print_buf* buf, va_list* ap);
+static void parse_timerfd_settime_flags(struct print_buf* buf, va_list* ap);
static void parse_string_arg(struct print_buf*, va_list* ap);
static void parse_pointer_arg(struct print_buf*, va_list* ap);
@@ -522,13 +527,17 @@ struct parser_table {
parse_integer_arg, parse_pointer_arg, parse_integer_arg,
parse_integer_arg, parse_pointer_arg, parse_pointer_arg}},
[__NR_signalfd] = {.slow = false, .name = "signalfd", .parser = {NULL}},
- [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {NULL}},
+ [__NR_timerfd_create] = {.slow = false, .name = "timerfd_create", .parser = {parse_long_arg,
+ parse_clockid, parse_timerfd_create_flags}},
[__NR_eventfd] = {.slow = false, .name = "eventfd", .parser = {parse_long_arg,
parse_integer_arg}},
[__NR_fallocate] = {.slow = false, .name = "fallocate", .parser = {parse_long_arg,
parse_integer_arg, parse_integer_arg, parse_long_arg, parse_long_arg}},
- [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {NULL}},
- [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {NULL}},
+ [__NR_timerfd_settime] = {.slow = false, .name = "timerfd_settime", .parser = {parse_long_arg,
+ parse_integer_arg, parse_timerfd_settime_flags, parse_itimerspec,
+ parse_itimerspec}},
+ [__NR_timerfd_gettime] = {.slow = false, .name = "timerfd_gettime", .parser = {parse_long_arg,
+ parse_integer_arg, parse_itimerspec}},
[__NR_accept4] = {.slow = true, .name = "accept4", .parser = {parse_long_arg, parse_integer_arg,
parse_pointer_arg, parse_pointer_arg, parse_integer_arg}},
[__NR_signalfd4] = {.slow = false, .name = "signalfd4", .parser = {NULL}},
@@ -1145,6 +1154,24 @@ static void parse_timespec(struct print_buf* buf, va_list* ap) {
buf_printf(buf, "[%ld,%ld]", tv->tv_sec, tv->tv_nsec);
}
+/* Prints a `struct itimerspec*` syscall argument as "intvl:[sec,nsec] val:[sec,nsec]". */
+static void parse_itimerspec(struct print_buf* buf, va_list* ap) {
+    const struct itimerspec* spec = va_arg(*ap, const struct itimerspec*);
+
+    if (spec == NULL) {
+        buf_puts(buf, "NULL");
+    } else if (!is_user_memory_readable((void*)spec, sizeof(*spec))) {
+        /* never dereference a pointer the application cannot legitimately read */
+        buf_printf(buf, "(invalid-addr %p)", spec);
+    } else {
+        buf_printf(buf, "intvl:[%ld,%ld] val:[%ld,%ld]",
+                   spec->it_interval.tv_sec, spec->it_interval.tv_nsec,
+                   spec->it_value.tv_sec, spec->it_value.tv_nsec);
+    }
+}
+
+
static void parse_sockaddr(struct print_buf* buf, va_list* ap) {
void* addr = va_arg(*ap, void*);
@@ -1621,6 +1648,77 @@ static void parse_close_range_flags(struct print_buf* buf, va_list* ap) {
buf_printf(buf, "|0x%x", flags);
}
+/* Prints the symbolic name of a clock ID syscall argument, or "(unknown: N)" for other values. */
+static void parse_clockid(struct print_buf* buf, va_list* ap) {
+    static const char* const clock_names[] = {
+        [CLOCK_REALTIME]           = "CLOCK_REALTIME",
+        [CLOCK_MONOTONIC]          = "CLOCK_MONOTONIC",
+        [CLOCK_PROCESS_CPUTIME_ID] = "CLOCK_PROCESS_CPUTIME_ID",
+        [CLOCK_THREAD_CPUTIME_ID]  = "CLOCK_THREAD_CPUTIME_ID",
+        [CLOCK_MONOTONIC_RAW]      = "CLOCK_MONOTONIC_RAW",
+        [CLOCK_REALTIME_COARSE]    = "CLOCK_REALTIME_COARSE",
+        [CLOCK_MONOTONIC_COARSE]   = "CLOCK_MONOTONIC_COARSE",
+        [CLOCK_BOOTTIME]           = "CLOCK_BOOTTIME",
+        [CLOCK_REALTIME_ALARM]     = "CLOCK_REALTIME_ALARM",
+        [CLOCK_BOOTTIME_ALARM]     = "CLOCK_BOOTTIME_ALARM",
+    };
+
+    int clockid = va_arg(*ap, int);
+    /* negative IDs, IDs past the table and gaps in the table all count as unknown */
+    if (clockid >= 0 && (size_t)clockid < ARRAY_SIZE(clock_names) && clock_names[clockid]) {
+        buf_puts(buf, clock_names[clockid]);
+        return;
+    }
+    buf_printf(buf, "(unknown: %d)", clockid);
+}
+
+/* Prints the `flags` argument of timerfd_create(); any non-zero remainder returned by
+ * `parse_flags()` is appended in hex. */
+static void parse_timerfd_create_flags(struct print_buf* buf, va_list* ap) {
+    const struct flag_table known_flags[] = {
+        {"TFD_NONBLOCK", TFD_NONBLOCK},
+        {"TFD_CLOEXEC", TFD_CLOEXEC},
+    };
+
+    int remaining = va_arg(*ap, int);
+    remaining = parse_flags(buf, remaining, known_flags, ARRAY_SIZE(known_flags));
+    if (remaining)
+        buf_printf(buf, "|0x%x", remaining);
+}
+
+/* Prints the `flags` argument of timerfd_settime(); any non-zero remainder returned by
+ * `parse_flags()` is appended in hex. */
+static void parse_timerfd_settime_flags(struct print_buf* buf, va_list* ap) {
+    const struct flag_table known_flags[] = {
+        {"TFD_TIMER_ABSTIME", TFD_TIMER_ABSTIME},
+        {"TFD_TIMER_CANCEL_ON_SET", TFD_TIMER_CANCEL_ON_SET},
+    };
+
+    int remaining = va_arg(*ap, int);
+    remaining = parse_flags(buf, remaining, known_flags, ARRAY_SIZE(known_flags));
+    if (remaining)
+        buf_printf(buf, "|0x%x", remaining);
+}
+
static void parse_string_arg(struct print_buf* buf, va_list* ap) {
const char* arg = va_arg(*ap, const char*);
if (is_user_string_readable(arg)) {
diff --git a/libos/src/meson.build b/libos/src/meson.build
index 5a262a8160..2f3bf32b09 100644
--- a/libos/src/meson.build
+++ b/libos/src/meson.build
@@ -43,6 +43,7 @@ libos_sources = files(
'fs/sys/cpu_info.c',
'fs/sys/fs.c',
'fs/sys/node_info.c',
+ 'fs/timerfd/fs.c',
'fs/tmpfs/fs.c',
'gramine_hash.c',
'ipc/libos_ipc.c',
@@ -101,6 +102,7 @@ libos_sources = files(
'sys/libos_socket.c',
'sys/libos_stat.c',
'sys/libos_time.c',
+ 'sys/libos_timerfd.c',
'sys/libos_uname.c',
'sys/libos_wait.c',
'sys/libos_wrappers.c',
diff --git a/libos/src/sys/libos_alarm.c b/libos/src/sys/libos_alarm.c
index 0ccecfce25..ee683fd38e 100644
--- a/libos/src/sys/libos_alarm.c
+++ b/libos/src/sys/libos_alarm.c
@@ -35,7 +35,8 @@ static void signal_alarm(IDTYPE caller, void* arg) {
long libos_syscall_alarm(unsigned int seconds) {
uint64_t usecs = 1000000ULL * seconds;
- int64_t ret = install_async_event(NULL, usecs, &signal_alarm, NULL);
+ int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL,
+ usecs, /*absolute_time=*/false, &signal_alarm, /*arg=*/NULL);
if (ret < 0)
return ret;
@@ -66,8 +67,9 @@ static void signal_itimer(IDTYPE caller, void* arg) {
spinlock_unlock(&g_real_itimer_lock);
if (next_reset) {
- int64_t ret = install_async_event(/*object=*/NULL, next_reset, &signal_itimer,
- /*arg=*/NULL);
+ int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL,
+ next_reset, /*absolute_time=*/false,
+ &signal_itimer, /*arg=*/NULL);
if (ret < 0) {
log_error(
"failed to re-enqueue the next timer event initially set up by 'setitimer()': %s",
@@ -113,8 +115,9 @@ long libos_syscall_setitimer(int which, struct __kernel_itimerval* value,
: 0;
uint64_t current_reset = g_real_itimer.reset;
- int64_t install_ret = install_async_event(NULL, next_value, &signal_itimer, /*arg=*/NULL);
-
+ int64_t install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL,
+ next_value, /*absolute_time=*/false,
+ &signal_itimer, /*arg=*/NULL);
if (install_ret < 0) {
spinlock_unlock(&g_real_itimer_lock);
return install_ret;
diff --git a/libos/src/sys/libos_epoll.c b/libos/src/sys/libos_epoll.c
index b3e7a058fa..3694e1a3f5 100644
--- a/libos/src/sys/libos_epoll.c
+++ b/libos/src/sys/libos_epoll.c
@@ -189,6 +189,12 @@ void maybe_epoll_et_trigger(struct libos_handle* handle, int ret, bool in, bool
needs_et = true;
}
break;
+ case TYPE_TIMERFD:
+ /* timerfd in edge-triggered mode will notify only on expiration state changes (i.e.,
+ * if the expiration count is not read, the timerfd remains in an expired state, and no
+ * notification will be triggered) */
+ needs_et = in;
+ break;
default:
/* Type unsupported with EPOLLET. */
break;
@@ -461,6 +467,7 @@ long libos_syscall_epoll_ctl(int epfd, int op, int fd, struct epoll_event* event
case TYPE_PIPE:
case TYPE_SOCK:
case TYPE_EVENTFD:
+ case TYPE_TIMERFD:
break;
default:
/* epoll not supported by this type of handle */
diff --git a/libos/src/sys/libos_exit.c b/libos/src/sys/libos_exit.c
index fbaee29b7b..afbdb3ce1e 100644
--- a/libos/src/sys/libos_exit.c
+++ b/libos/src/sys/libos_exit.c
@@ -101,7 +101,9 @@ noreturn void thread_exit(int error_code, int term_signal) {
cur_thread->clear_child_tid_pal = 1; /* any non-zero value suffices */
/* We pass this ownership to `cleanup_thread`. */
get_thread(cur_thread);
- int64_t ret = install_async_event(NULL, 0, &cleanup_thread, cur_thread);
+ int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, /*object=*/NULL,
+ /*time_us=*/0, /*absolute_time=*/false, &cleanup_thread,
+ cur_thread);
/* Take the reference to the current thread from the tcb. */
lock(&cur_thread->lock);
diff --git a/libos/src/sys/libos_ioctl.c b/libos/src/sys/libos_ioctl.c
index 89d5424da9..8bbad6efa4 100644
--- a/libos/src/sys/libos_ioctl.c
+++ b/libos/src/sys/libos_ioctl.c
@@ -104,7 +104,9 @@ long libos_syscall_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) {
rwlock_write_unlock(&handle_map->lock);
break;
case FIOASYNC:
- ret = install_async_event(hdl->pal_handle, 0, &signal_io, NULL);
+ ret = install_async_event(ASYNC_EVENT_TYPE_IO, hdl->pal_handle,
+ /*time_us=*/0, /*absolute_time=*/false, &signal_io,
+ /*arg=*/NULL);
break;
case FIONREAD: {
if (!is_user_memory_writable((void*)arg, sizeof(int))) {
diff --git a/libos/src/sys/libos_sleep.c b/libos/src/sys/libos_sleep.c
index 56fc8b12b9..c01af88992 100644
--- a/libos/src/sys/libos_sleep.c
+++ b/libos/src/sys/libos_sleep.c
@@ -11,6 +11,7 @@
#include "libos_thread.h"
#include "libos_utils.h"
#include "linux_abi/errors.h"
+#include "linux_abi/time.h"
#include "pal.h"
long libos_syscall_pause(void) {
diff --git a/libos/src/sys/libos_time.c b/libos/src/sys/libos_time.c
index cf34c334ee..60316080bc 100644
--- a/libos/src/sys/libos_time.c
+++ b/libos/src/sys/libos_time.c
@@ -8,6 +8,7 @@
#include "libos_internal.h"
#include "libos_table.h"
#include "linux_abi/errors.h"
+#include "linux_abi/time.h"
#include "pal.h"
long libos_syscall_gettimeofday(struct __kernel_timeval* tv, struct __kernel_timezone* tz) {
diff --git a/libos/src/sys/libos_timerfd.c b/libos/src/sys/libos_timerfd.c
new file mode 100644
index 0000000000..63d0506194
--- /dev/null
+++ b/libos/src/sys/libos_timerfd.c
@@ -0,0 +1,296 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ * Kailun Qin
+ */
+
+/* Implementation of "timerfd" system calls.
+ *
+ * The timerfd object is created inside the LibOS, and all operations are resolved entirely inside
+ * the LibOS (note that the time source in Gramine SGX is still untrusted). Each timerfd object is
+ * associated with a dummy eventfd created on the host. This is purely for triggering read
+ * notifications (e.g., in epoll); timerfd data is verified inside the LibOS and is never exposed to
+ * the host. Since the host is used purely for notifications, a malicious host can only induce
+ * Denial of Service (DoS) attacks.
+ *
+ * The emulation is currently implemented at the level of a single process. All timerfds created in
+ * the parent process are marked as invalid in child processes. In multi-process applications,
+ * Gramine does not exit immediately after fork; it only exits if the application attempts to use
+ * timerfds in the child. Therefore, inter-process timing signals via timerfds are not allowed.
+ *
+ * The host's eventfd object is "dummy" and used purely for notifications -- to unblock blocking
+ * read/select/poll/epoll system calls. The read notify logic is already hardened, by
+ * double-checking that the timerfd object indeed expired. However, there are three possible attacks
+ * on polling mechanisms (select/poll/epoll):
+ *
+ * a. Malicious host may inject the notification too early: POLLIN when no timer expired yet. This
+ * may lead to a synchronization failure of the app. To prevent this, timerfd implements a
+ *    callback `post_poll()` where it verifies that a timer has indeed expired (i.e., that the
+ * notification is not spurious).
+ * b. Malicious host may inject the notification too late or not send a notification at all.
+ * This is a Denial of Service (DoS), which we don't care about.
+ * c. Malicious host may inject POLLERR, POLLHUP, POLLRDHUP, POLLNVAL, POLLOUT. This is impossible
+ * as we control timerfd objects inside the LibOS, and we never raise such conditions. So the
+ * callback `post_poll()` panics if it detects such a return event.
+ */
+
+#include "libos_checkpoint.h"
+#include "libos_fs.h"
+#include "libos_handle.h"
+#include "libos_internal.h"
+#include "libos_table.h"
+#include "libos_utils.h"
+#include "linux_abi/fs.h"
+#include "linux_abi/time.h"
+#include "linux_eventfd.h"
+#include "pal.h"
+
+/* Same logic as `eventfd_dummy_host_write()` in "fs/eventfd/fs.c": writes a dummy value to the
+ * host eventfd behind `hdl`, retrying while the PAL reports that the call was interrupted. */
+static void timerfd_dummy_host_write(struct libos_handle* hdl) {
+    uint64_t dummy_val = 1;
+    size_t written = sizeof(dummy_val);
+    int ret;
+    while ((ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &written, &dummy_val))
+               == PAL_ERROR_INTERRUPTED) {
+        /* interrupted: simply retry */
+    }
+    /* must not happen in benign case, consider it an attack and panic */
+    if (ret < 0 || written != sizeof(dummy_val))
+        BUG();
+}
+
+/* Opens the dummy host eventfd backing a timerfd. Returns 0 and stores the new PAL handle in
+ * `*out_pal_handle` on success; otherwise logs and returns the converted (UNIX) error code. */
+static int create_timerfd_pal_handle(PAL_HANDLE* out_pal_handle) {
+    PAL_HANDLE eventfd_hdl = NULL;
+    int pal_ret = PalStreamOpen(URI_PREFIX_EVENTFD, PAL_ACCESS_RDWR, /*share_flags=*/0,
+                                PAL_CREATE_IGNORED, /*options=*/0, &eventfd_hdl);
+    if (pal_ret >= 0) {
+        *out_pal_handle = eventfd_hdl;
+        return 0;
+    }
+    log_error("timerfd: dummy host eventfd creation failure");
+    return pal_to_unix_errno(pal_ret);
+}
+
+/* Creates a timerfd handle backed by a dummy host eventfd and installs it into the fd table.
+ * Returns the result of `set_new_fd_handle()`, or a negative error code on failure. */
+long libos_syscall_timerfd_create(int clockid, int flags) {
+    bool known_clock = clockid == CLOCK_MONOTONIC || clockid == CLOCK_REALTIME ||
+                       clockid == CLOCK_REALTIME_ALARM || clockid == CLOCK_BOOTTIME ||
+                       clockid == CLOCK_BOOTTIME_ALARM;
+    if (!known_clock || (flags & ~TFD_CREATE_FLAGS))
+        return -EINVAL;
+
+    /* every accepted clock other than CLOCK_REALTIME is replaced by the real-time clock */
+    if (clockid != CLOCK_REALTIME && FIRST_TIME()) {
+        log_warning("Unsupported clockid in 'timerfd_create()'; replaced by the system-wide "
+                    "real-time clock.");
+    }
+
+    struct libos_handle* new_hdl = get_new_handle();
+    if (!new_hdl)
+        return -ENOMEM;
+
+    /* timerfds can only be read from */
+    new_hdl->type     = TYPE_TIMERFD;
+    new_hdl->fs       = &timerfd_builtin_fs;
+    new_hdl->flags    = O_RDONLY | (flags & TFD_NONBLOCK ? O_NONBLOCK : 0);
+    new_hdl->acc_mode = MAY_READ;
+
+    /* a new timerfd starts with zeroed timer state and no recorded expirations */
+    new_hdl->info.timerfd.broken_in_child = false;
+    new_hdl->info.timerfd.num_expirations = 0;
+    new_hdl->info.timerfd.dummy_host_val  = 0;
+    new_hdl->info.timerfd.timeout         = 0;
+    new_hdl->info.timerfd.reset           = 0;
+
+    int ret = create_timerfd_pal_handle(&new_hdl->pal_handle);
+    if (ret >= 0)
+        ret = set_new_fd_handle(new_hdl, flags & TFD_CLOEXEC ? FD_CLOEXEC : 0, NULL);
+
+    /* `put_handle()` is called on both the success and the failure path */
+    put_handle(new_hdl);
+    return ret;
+}
+
+/* Records one more expiration of the timer behind `hdl` and notifies waiters through the dummy
+ * host eventfd; all of this happens under `expiration_lock`. */
+static void timerfd_update(struct libos_handle* hdl) {
+    spinlock_lock(&hdl->info.timerfd.expiration_lock);
+    /* When the expiration count overflows, the read will saturate at UINT64_MAX while the timer
+     * will continue to fire. */
+    if (hdl->info.timerfd.num_expirations < UINT64_MAX)
+        hdl->info.timerfd.num_expirations++;
+
+    hdl->info.timerfd.dummy_host_val++;
+
+    /* perform a write (not supposed to block) to send an event to reading/polling threads */
+    timerfd_dummy_host_write(hdl);
+    spinlock_unlock(&hdl->info.timerfd.expiration_lock);
+}
+
+/* Async-event callback of an armed timerfd; `arg` is the timerfd's `struct libos_handle`. */
+static void callback_itimer(IDTYPE caller, void* arg) {
+    __UNUSED(caller);
+    struct libos_handle* hdl = (struct libos_handle*)arg;
+    /* move the stored expiration time forward by one interval (adds 0 if `reset` is 0) */
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+    hdl->info.timerfd.timeout += hdl->info.timerfd.reset;
+    uint64_t next_reset = hdl->info.timerfd.reset;
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    if (next_reset) { /* non-zero interval: re-arm `next_reset` us (relative) from now */
+        int64_t ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          next_reset, /*absolute_time=*/false,
+                                          &callback_itimer, (void*)hdl);
+        if (ret < 0) {
+            log_error(
+                "failed to re-enqueue the next timer event initially set up by "
+                "'timerfd_settime()': %s", unix_strerror(ret));
+            die_or_inf_loop();
+        }
+    }
+
+    timerfd_update(hdl);
+}
+
+/* Arms the timerfd `fd` (non-zero `value->it_value`) or cancels its pending timer (zero
+ * `it_value`); if `ovalue` is not NULL, the previous setting is stored there. Once the handle is
+ * obtained, every exit goes through `out` so that `put_handle()` is always called. */
+long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value,
+                                   struct __kernel_itimerspec* ovalue) {
+    int ret;
+
+    struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL);
+    if (!hdl)
+        return -EBADF;
+
+    if (hdl->type != TYPE_TIMERFD) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        ret = -EIO;
+        goto out;
+    }
+
+    if (!is_user_memory_readable(value, sizeof(*value)) ||
+            (ovalue && !is_user_memory_writable(ovalue, sizeof(*ovalue)))) {
+        ret = -EFAULT;
+        goto out;
+    }
+
+    /* `TFD_TIMER_CANCEL_ON_SET` is silently ignored because there are no "discontinuous changes of
+     * time" in Gramine (via e.g., `settimeofday()`). */
+    if (flags & ~TFD_SETTIME_FLAGS) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    uint64_t setup_time = 0;
+    ret = PalSystemTimeQuery(&setup_time);
+    if (ret < 0) {
+        ret = pal_to_unix_errno(ret);
+        goto out;
+    }
+
+    uint64_t new_timeout = timespec_to_us(&value->it_value);
+    uint64_t new_reset = timespec_to_us(&value->it_interval);
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+
+    uint64_t current_timeout = hdl->info.timerfd.timeout > setup_time
+                                   ? hdl->info.timerfd.timeout - setup_time
+                                   : 0;
+    uint64_t current_reset = hdl->info.timerfd.reset;
+
+    bool absolute_time = flags & TFD_TIMER_ABSTIME;
+    if (absolute_time) {
+        hdl->info.timerfd.timeout = new_timeout;
+    } else {
+        hdl->info.timerfd.timeout = setup_time + new_timeout;
+    }
+    hdl->info.timerfd.reset = new_reset;
+
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+
+    int64_t install_ret;
+    if (new_timeout) {
+        install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          new_timeout, absolute_time,
+                                          &callback_itimer, (void*)hdl);
+    } else {
+        /* cancel the pending timerfd object */
+        install_ret = install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
+                                          /*time_us=*/0, /*absolute_time=*/false,
+                                          /*callback=*/NULL, /*arg=*/NULL);
+    }
+    if (install_ret < 0) {
+        ret = install_ret;
+        goto out;
+    }
+
+    if (ovalue) {
+        ovalue->it_interval.tv_sec  = current_reset / TIME_US_IN_S;
+        ovalue->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US;
+        ovalue->it_value.tv_sec     = current_timeout / TIME_US_IN_S;
+        ovalue->it_value.tv_nsec    = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US;
+    }
+
+    ret = 0;
+out:
+    put_handle(hdl);
+    return ret;
+}
+
+/* Reports in `value` the time left until the next expiration and the interval of timerfd `fd`. */
+long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value) {
+    int ret;
+    struct libos_handle* hdl = get_fd_handle(fd, /*fd_flags=*/NULL, /*map=*/NULL);
+    if (!hdl)
+        return -EBADF;
+
+    if (hdl->type != TYPE_TIMERFD) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (hdl->info.timerfd.broken_in_child) {
+        log_warning("Child process tried to access timerfd created by parent process. This is "
+                    "disallowed in Gramine.");
+        ret = -EIO; /* must leave via `out` so that `put_handle()` is called */
+        goto out;
+    }
+
+    if (!is_user_memory_writable(value, sizeof(*value))) {
+        ret = -EFAULT;
+        goto out;
+    }
+
+    uint64_t setup_time = 0;
+    ret = PalSystemTimeQuery(&setup_time);
+    if (ret < 0) {
+        ret = pal_to_unix_errno(ret);
+        goto out;
+    }
+
+    spinlock_lock(&hdl->info.timerfd.timer_lock);
+    uint64_t expiry = hdl->info.timerfd.timeout;
+    uint64_t current_reset = hdl->info.timerfd.reset;
+    spinlock_unlock(&hdl->info.timerfd.timer_lock);
+    uint64_t current_timeout = expiry > setup_time ? expiry - setup_time : 0; /* clamped at 0 */
+
+    value->it_interval.tv_sec  = current_reset / TIME_US_IN_S;
+    value->it_interval.tv_nsec = (current_reset % TIME_US_IN_S) * TIME_NS_IN_US;
+    value->it_value.tv_sec     = current_timeout / TIME_US_IN_S;
+    value->it_value.tv_nsec    = (current_timeout % TIME_US_IN_S) * TIME_NS_IN_US;
+
+    ret = 0;
+out:
+    put_handle(hdl);
+    return ret;
+}
diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg
index 9195858e20..8c358afeeb 100644
--- a/libos/test/ltp/ltp.cfg
+++ b/libos/test/ltp/ltp.cfg
@@ -2436,8 +2436,12 @@ skip = yes
[timer_settime*]
skip = yes
-# no timerfd
-[timerfd*]
+# clocks other than `CLOCK_REALTIME` are not supported
+[timerfd04]
+skip = yes
+
+# relies on "/proc/sys/kernel/tainted" (see tst_taint.c:tst_taint_check)
+[timerfd_settime02]
skip = yes
[times03]
diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build
index b6f201b24e..444c9a6996 100644
--- a/libos/test/regression/meson.build
+++ b/libos/test/regression/meson.build
@@ -155,6 +155,8 @@ tests = {
'tcp_einprogress': {},
'tcp_ipv6_v6only': {},
'tcp_msg_peek': {},
+ 'timerfd': {},
+ 'timerfd_fork': {},
'udp': {},
'uid_gid': {},
'unix': {},
diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py
index 57ab164f74..90b46ef90d 100644
--- a/libos/test/regression/test_libos.py
+++ b/libos/test/regression/test_libos.py
@@ -1084,6 +1084,18 @@ def test_161_rlimit_nofile_4k(self):
self.assertIn("(after setrlimit) opened fd: 4096", stdout)
self.assertIn("TEST OK", stdout)
+    def test_170_timerfd(self):
+        stdout, _ = self.run_binary(['timerfd'], timeout=120)  # test waits on many timers in a row
+        self.assertIn("TEST OK", stdout)
+
+    def test_171_timerfd_fork(self):
+        try:
+            self.run_binary(['timerfd_fork'])  # must fail: child can't use the parent's timerfd
+            self.fail('timerfd_fork unexpectedly succeeded')
+        except subprocess.CalledProcessError as e:
+            stdout = e.stdout.decode()
+            self.assertIn('child died', stdout)  # message emitted by the parent in timerfd_fork.c
+
class TC_31_Syscall(RegressionTestCase):
def test_000_syscall_redirect(self):
stdout, _ = self.run_binary(['syscall'])
diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml
index 23fa2fc5c6..063e5fdaed 100644
--- a/libos/test/regression/tests.toml
+++ b/libos/test/regression/tests.toml
@@ -134,6 +134,8 @@ manifests = [
"tcp_einprogress",
"tcp_ipv6_v6only",
"tcp_msg_peek",
+ "timerfd",
+ "timerfd_fork",
"toml_parsing",
"udp",
"uid_gid",
diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml
index 7a3acc3743..20334622a4 100644
--- a/libos/test/regression/tests_musl.toml
+++ b/libos/test/regression/tests_musl.toml
@@ -135,6 +135,8 @@ manifests = [
"tcp_einprogress",
"tcp_ipv6_v6only",
"tcp_msg_peek",
+ "timerfd",
+ "timerfd_fork",
"toml_parsing",
"udp",
"uid_gid",
diff --git a/libos/test/regression/timerfd.c b/libos/test/regression/timerfd.c
new file mode 100644
index 0000000000..cd4ee10520
--- /dev/null
+++ b/libos/test/regression/timerfd.c
@@ -0,0 +1,388 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ * Kailun Qin
+ */
+
+/*
+ * Single-process test for `timerfd` syscalls (`timerfd_create()`, `timerfd_settime()` and
+ * `timerfd_gettime()`).
+ *
+ * The tests involve cases including reading a blocking/non-blocking timerfd, poll/epoll/select on
+ * timerfds, setting up a relative/absolute/periodic timerfd and reading a timerfd from multiple
+ * threads.
+ */
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common.h"
+
+#define EXPECTED_EXPIRATIONS 1
+#define EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT 5
+#define NUM_FDS 2
+#define NUM_THREADS 5
+#define PERIODIC_INTERVAL 1
+#define TIMEOUT_VALUE 2
+
+/* arms `fd` relative to now; `periodic` additionally sets a PERIODIC_INTERVAL period */
+static void set_timerfd_relative(int fd, bool periodic) {
+    struct itimerspec new_value = {0};
+    new_value.it_value.tv_sec = TIMEOUT_VALUE;
+    if (periodic)
+        new_value.it_interval.tv_sec = PERIODIC_INTERVAL;
+    CHECK(timerfd_settime(fd, 0, &new_value, NULL));
+}
+
+static void set_timerfds_relative(int fds[NUM_FDS], bool periodic) {
+    for (int* fd = fds; fd != fds + NUM_FDS; fd++) /* same relative setting for every timerfd */
+        set_timerfd_relative(*fd, periodic);
+}
+
+/* arms `fd` as a one-shot timer that expires at the absolute time `*abs_time` */
+static void set_timerfd_absolute(int fd, struct timespec* abs_time) {
+    struct itimerspec new_value = {
+        /* expire at the absolute time specified */
+        .it_value = {
+            .tv_sec  = abs_time->tv_sec,
+            .tv_nsec = abs_time->tv_nsec,
+        },
+        .it_interval = { .tv_sec = 0, .tv_nsec = 0 },
+    };
+    CHECK(timerfd_settime(fd, TFD_TIMER_ABSTIME, &new_value, NULL));
+}
+
+static void create_timerfds(int fds[NUM_FDS]) {
+    for (int* fd = fds; fd != fds + NUM_FDS; fd++) /* all timerfds use CLOCK_REALTIME */
+        *fd = CHECK(timerfd_create(CLOCK_REALTIME, 0));
+}
+
+static void close_timerfds(int fds[NUM_FDS]) {
+    for (int i = 0; i < NUM_FDS; i++) /* release every timerfd in `fds` */
+        CHECK(close(fds[i]));
+}
+
+/* select()s on `fds`, reading each ready fd, until NUM_FDS reads reported an expiration */
+static void test_select(int fds[NUM_FDS]) {
+    fd_set rfds;
+    int max_fd = 0;
+    int ready_fds = 0;
+    /* select() is called with the highest-numbered fd plus one as its first argument */
+    for (int i = 0; i < NUM_FDS; i++) {
+        if (fds[i] > max_fd)
+            max_fd = fds[i];
+    }
+    /* `rfds` is rebuilt on every iteration before it is handed to select() again */
+    while (ready_fds < NUM_FDS) {
+        FD_ZERO(&rfds);
+        for (int i = 0; i < NUM_FDS; i++) {
+            FD_SET(fds[i], &rfds);
+        }
+        int nfds = select(max_fd + 1, &rfds, NULL, NULL, NULL);
+        if (nfds <= 0)
+            err(1, "select on read event failed");
+
+        for (int i = 0; i < NUM_FDS; i++) {
+            if (FD_ISSET(fds[i], &rfds)) {
+                uint64_t expirations;
+                CHECK(read(fds[i], &expirations, sizeof(expirations)));
+                if (expirations != EXPECTED_EXPIRATIONS) {
+                    errx(1, "select: unexpected number of expirations (expected %d, got %lu)",
+                         EXPECTED_EXPIRATIONS, expirations);
+                }
+                ready_fds++;
+            }
+        }
+    }
+
+    if (ready_fds != NUM_FDS)
+        errx(1, "select: unexpected number of ready fds (expected %d, got %d)",
+             NUM_FDS, ready_fds);
+}
+
+/* poll()s on `fds`, reading each fd that reports POLLIN, until NUM_FDS reads were done */
+static void test_poll(int fds[NUM_FDS]) {
+    struct pollfd pfds[NUM_FDS];
+    int ready_fds = 0;
+
+    for (int i = 0; i < NUM_FDS; i++) {
+        pfds[i].fd = fds[i];
+        pfds[i].events = POLLIN;
+        pfds[i].revents = 0;
+    }
+    /* a timeout of -1 is passed to poll(); `revents` is cleared after each handled event */
+    while (ready_fds < NUM_FDS) {
+        int nfds = poll(pfds, NUM_FDS, -1);
+        if (nfds <= 0)
+            err(1, "poll with POLLIN failed");
+        for (int i = 0; i < NUM_FDS; i++) {
+            if (pfds[i].revents & POLLIN) {
+                uint64_t expirations;
+                CHECK(read(pfds[i].fd, &expirations, sizeof(expirations)));
+                if (expirations != EXPECTED_EXPIRATIONS) {
+                    errx(1, "poll: unexpected number of expirations (expected %d, got %lu)",
+                         EXPECTED_EXPIRATIONS, expirations);
+                }
+                ready_fds++;
+                pfds[i].revents = 0;
+            }
+        }
+    }
+
+    if (ready_fds != NUM_FDS)
+        errx(1, "poll: unexpected number of ready fds (expected %d, got %d)",
+             NUM_FDS, ready_fds);
+}
+
+/* epoll_wait()s on `fds`, reading every fd that is reported, until NUM_FDS reads were done */
+static void test_epoll(int fds[NUM_FDS]) {
+    int epfd = CHECK(epoll_create1(0));
+    /* register every timerfd for EPOLLIN; the fd itself is stored as the event's user data */
+    struct epoll_event ev;
+    ev.events = EPOLLIN;
+    for (int i = 0; i < NUM_FDS; i++) {
+        ev.data.fd = fds[i];
+        CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fds[i], &ev));
+    }
+
+    struct epoll_event events[NUM_FDS];
+    int ready_fds = 0;
+
+    while (ready_fds < NUM_FDS) {
+        int nfds = epoll_wait(epfd, events, NUM_FDS, -1);
+        if (nfds <= 0)
+            err(1, "epoll_wait with EPOLLIN failed");
+        for (int i = 0; i < nfds; i++) {
+            uint64_t expirations;
+            CHECK(read(events[i].data.fd, &expirations, sizeof(expirations)));
+            if (expirations != EXPECTED_EXPIRATIONS) {
+                errx(1, "epoll_wait: unexpected number of expirations (expected %d, got %lu)",
+                     EXPECTED_EXPIRATIONS, expirations);
+            }
+            ready_fds++;
+        }
+    }
+
+    if (ready_fds != NUM_FDS)
+        errx(1, "epoll_wait: unexpected number of ready fds (expected %d, got %d)",
+             NUM_FDS, ready_fds);
+
+    CHECK(close(epfd));
+}
+
+/* this test expects the timerfd (`fd`) to be a periodic timer */
+static void test_epoll_modes(int fd) {
+    int epfd = CHECK(epoll_create1(0));
+
+    /* level-triggered mode */
+    struct epoll_event ev;
+    ev.events = EPOLLIN;
+    ev.data.fd = fd;
+    CHECK(epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev));
+
+    struct epoll_event events[1];
+    int nfds = CHECK(epoll_wait(epfd, events, 1, -1)); /* signed: printed with %d below */
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds (expected 1, got %d)", nfds);
+
+    /* waiting for another event without reading the expiration count */
+    nfds = CHECK(epoll_wait(epfd, events, 1, /*timeout=*/PERIODIC_INTERVAL * 1000 * 2));
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds in level-triggered mode without reading "
+                "(expected 1, got %d)", nfds);
+
+    /* switch to edge-triggered mode */
+    ev.events = EPOLLIN | EPOLLET;
+    CHECK(epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev));
+
+    nfds = CHECK(epoll_wait(epfd, events, 1, -1));
+    if (nfds != 1)
+        errx(1, "epoll: unexpected number of fds (expected 1, got %d)", nfds);
+
+    /* waiting for another event without reading the expiration count: here, even though the timer
+     * expired at least once, there is no event reported because we're in edge-triggered mode (which
+     * does not "reset" the event since there was no read) */
+    nfds = CHECK(epoll_wait(epfd, events, 1, /*timeout=*/PERIODIC_INTERVAL * 1000 * 2));
+    if (nfds != 0)
+        errx(1, "epoll: unexpected number of fds in edge-triggered mode without reading "
+                "(expected 0, got %d)", nfds);
+
+    CHECK(close(epfd));
+}
+
+/* keeps reading `fd` until EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT expirations were reported */
+static void test_periodic_timer(int fd) {
+    size_t total_expirations = 0;
+    do {
+        uint64_t expirations;
+        CHECK(read(fd, &expirations, sizeof(expirations)));
+        total_expirations += expirations;
+    } while (total_expirations < EXPECTED_PERIODIC_TIMER_EXPIRATION_COUNT);
+}
+
+static void* timerfd_read_thread(void* arg) {
+    int fd = *(int*)arg;
+    uint64_t expirations;
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+    if (expirations == 0) /* not a failed call: errno is meaningless here, so errx() not err() */
+        errx(1, "threaded read: unexpected number of expirations");
+    pthread_exit(NULL);
+}
+
+/* a periodic timer is required so that all NUM_THREADS threads have something to read */
+static void test_periodic_timer_threaded_read(int fd) {
+    for (int i = 0; i < NUM_THREADS; i++) {
+        pthread_t thread; /* pthread_*() return an error number, not -1: CHECK() unusable */
+        int ret = pthread_create(&thread, NULL, timerfd_read_thread, &fd);
+        if (ret || (ret = pthread_join(thread, NULL))) /* join: wait for the thread to finish */
+            errx(1, "pthread_create/pthread_join failed: %s", strerror(ret));
+    }
+}
+
+/* expects the caller to have just armed `fd` with TIMEOUT_VALUE: the time left must be close */
+static void test_timerfd_gettime(int fd) {
+    struct itimerspec curr_value;
+    CHECK(timerfd_gettime(fd, &curr_value));
+    if (curr_value.it_value.tv_sec > TIMEOUT_VALUE ||
+            curr_value.it_value.tv_sec < TIMEOUT_VALUE - 1 ||
+            curr_value.it_value.tv_nsec < 0 || curr_value.it_value.tv_nsec >= 1000000000) {
+        errx(1, "timerfd_gettime: unexpected timer value (expected close to %d.0, got %ld.%09ld)",
+             TIMEOUT_VALUE, curr_value.it_value.tv_sec, curr_value.it_value.tv_nsec);
+    }
+}
+
+/* disarms a timer the caller has just armed with TIMEOUT_VALUE and checks that it never fires */
+static void test_disarm_timer(int fd) {
+    struct itimerspec old_value;
+    /* immediately disarm the timer and get the old value */
+    struct itimerspec disarm_value = { 0 };
+    CHECK(timerfd_settime(fd, 0, &disarm_value, &old_value));
+    /* check that the old value is around TIMEOUT_VALUE seconds */
+    if (old_value.it_value.tv_sec > TIMEOUT_VALUE ||
+            old_value.it_value.tv_sec < TIMEOUT_VALUE - 1 ||
+            old_value.it_value.tv_nsec < 0 || old_value.it_value.tv_nsec >= 1000000000) {
+        errx(1, "disarm_timer: unexpected old timer value (expected close to %d.0, got %ld.%09ld)",
+             TIMEOUT_VALUE, old_value.it_value.tv_sec, old_value.it_value.tv_nsec);
+    }
+
+    /* test poll with a timeout to ensure the timer was disarmed */
+    struct pollfd pfd = {
+        .fd = fd,
+        .events = POLLIN,
+    };
+    int ret = poll(&pfd, 1, /*timeout=*/(TIMEOUT_VALUE + 1) * 1000);
+    if (ret != 0)
+        errx(1, "disarm_timer: poll returned %d, expected 0 (timeout)", ret);
+}
+
+/* arms `fd` with TFD_TIMER_ABSTIME deadlines TIMEOUT_VALUE s in the future and then in the past */
+static void test_absolute_time(int fd) {
+    struct timespec now;
+    struct timespec abs_time;
+    uint64_t expirations;
+
+    /* test timerfd with absolute time set in the future */
+    CHECK(clock_gettime(CLOCK_REALTIME, &now));
+    abs_time.tv_sec = now.tv_sec + TIMEOUT_VALUE;
+    abs_time.tv_nsec = now.tv_nsec;
+    set_timerfd_absolute(fd, &abs_time);
+
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+    if (expirations != EXPECTED_EXPIRATIONS) {
+        errx(1, "absolute_time future: unexpected number of expirations (expected %d, got %lu)",
+             EXPECTED_EXPIRATIONS, expirations);
+    }
+
+    expirations = 0;
+    memset(&now, 0, sizeof(struct timespec));
+    memset(&abs_time, 0, sizeof(struct timespec));
+
+    /* test timerfd with absolute time set in the past */
+    CHECK(clock_gettime(CLOCK_REALTIME, &now));
+    abs_time.tv_sec = now.tv_sec - TIMEOUT_VALUE;
+    abs_time.tv_nsec = now.tv_nsec;
+
+    set_timerfd_absolute(fd, &abs_time);
+    /* the deadline has already passed; the read is still expected to report one expiration */
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+    if (expirations != EXPECTED_EXPIRATIONS) {
+        errx(1, "absolute_time past: unexpected number of expirations (expected %d, got %lu)",
+             EXPECTED_EXPIRATIONS, expirations);
+    }
+}
+
+/* This test must be executed twice: first reading from a non-periodic timerfd in blocking mode to
+ * capture its expiration, and then switching to non-blocking mode for a second read, which
+ * immediately returns with EAGAIN because the timer is disarmed and there are no new expiration
+ * events. */
+static void test_read(int fd, bool non_blocking) {
+    if (non_blocking) {
+        CHECK(fcntl(fd, F_SETFL, O_NONBLOCK));
+    }
+    /* no CHECK() around this read: in non-blocking mode it is expected to fail with EAGAIN */
+    uint64_t expirations;
+    int retval = read(fd, &expirations, sizeof(expirations));
+
+    if (non_blocking) {
+        if (retval != -1 || errno != EAGAIN) {
+            errx(1, "non-blocking read: read returned %d, errno %d, expected -1 and EAGAIN",
+                 retval, errno);
+        }
+    } else {
+        CHECK(retval);
+        if (expirations != EXPECTED_EXPIRATIONS) {
+            errx(1, "read: unexpected number of expirations (expected %d, got %lu)",
+                 EXPECTED_EXPIRATIONS, expirations);
+        }
+    }
+}
+
+/* runs all single-process timerfd tests; the timers are (re-)armed before each test that needs it */
+int main(void) {
+    int fds[NUM_FDS];
+    create_timerfds(fds);
+    /* select/poll/epoll: one-shot timers on all NUM_FDS timerfds */
+    set_timerfds_relative(fds, /*periodic=*/false);
+    test_select(fds);
+
+    set_timerfds_relative(fds, /*periodic=*/false);
+    test_poll(fds);
+
+    set_timerfds_relative(fds, /*periodic=*/false);
+    test_epoll(fds);
+    /* the tests below operate on a single timerfd each */
+    set_timerfd_relative(fds[0], /*periodic=*/true);
+    test_epoll_modes(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/true);
+    test_periodic_timer(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/true);
+    test_periodic_timer_threaded_read(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/false);
+    test_timerfd_gettime(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/false);
+    test_disarm_timer(fds[0]);
+
+    set_timerfd_relative(fds[0], /*periodic=*/false);
+    test_read(fds[0], /*non_blocking=*/false);
+    test_read(fds[0], /*non_blocking=*/true);
+    /* test_absolute_time() arms the timer itself */
+    test_absolute_time(fds[1]);
+
+    close_timerfds(fds);
+    puts("TEST OK");
+    return 0;
+}
diff --git a/libos/test/regression/timerfd_fork.c b/libos/test/regression/timerfd_fork.c
new file mode 100644
index 0000000000..0bf82fc64c
--- /dev/null
+++ b/libos/test/regression/timerfd_fork.c
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2024 Intel Corporation
+ * Kailun Qin
+ */
+
+/* Multi-process test for `timerfd` syscalls (`timerfd_create()` and `timerfd_settime()`).
+ *
+ * Note that timerfd is currently only emulated in a secure single-process mode, so this test is
+ * expected to fail: the child process cannot use the timerfd created by the parent.
+ */
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common.h"
+
+#define EXPECTED_EXPIRATIONS 1
+#define TIMEOUT_VALUE 2
+
+static void set_timerfd(int fd) {
+    struct itimerspec new_value = {0}; /* one-shot: the interval stays zero */
+    new_value.it_value.tv_sec = TIMEOUT_VALUE;
+    CHECK(timerfd_settime(fd, 0, &new_value, NULL));
+}
+
+/* forks with the timer armed: the child tries to read the timerfd, the parent waits for it */
+static void test_multi_process(int fd) {
+    pid_t pid = CHECK(fork());
+    if (pid != 0) {
+        /* parent: do nothing and let the child process read the timerfd */
+        int status = 0;
+        CHECK(waitpid(pid, &status, 0));
+        if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+            errx(1, "child died with status: %#x", status);
+        return;
+    }
+
+    /* child: wait on a blocking read for the timer to expire */
+    /* Note: Due to the limitation of `timerfd` syscalls being only emulated in a secure
+     * single-process mode, `read()` will return a negative error code. */
+    uint64_t expirations;
+    CHECK(read(fd, &expirations, sizeof(expirations)));
+    if (expirations != EXPECTED_EXPIRATIONS) {
+        errx(1, "child process: unexpected number of expirations (expected %d, got %lu)",
+             EXPECTED_EXPIRATIONS, expirations);
+    }
+    exit(0);
+}
+
+/* arms a timerfd and then forks; "TEST OK" is only printed if the child's read succeeded */
+int main(void) {
+    int fd = CHECK(timerfd_create(CLOCK_REALTIME, 0));
+    set_timerfd(fd);
+    test_multi_process(fd);
+
+    CHECK(close(fd));
+
+    puts("TEST OK");
+    return 0;
+}