Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[LibOS] Add support for timerfd system calls #1734

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions Documentation/devel/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -1036,7 +1036,7 @@ The below list is generated from the [syscall table of Linux
- ☒ `signalfd()`
<sup>[7](#signals-and-process-state-changes)</sup>

- `timerfd_create()`
- `timerfd_create()`
<sup>[20](#sleeps-timers-and-alarms)</sup>

- ▣ `eventfd()`
Expand All @@ -1045,10 +1045,10 @@ The below list is generated from the [syscall table of Linux
- ▣ `fallocate()`
<sup>[9a](#file-system-operations)</sup>

- `timerfd_settime()`
- `timerfd_settime()`
<sup>[20](#sleeps-timers-and-alarms)</sup>

- `timerfd_gettime()`
- `timerfd_gettime()`
<sup>[20](#sleeps-timers-and-alarms)</sup>

- ☑ `accept4()`
Expand Down Expand Up @@ -2891,9 +2891,23 @@ Gramine implements getting and setting the interval timer: `getitimer()` and `se

Gramine implements alarm clocks via `alarm()`.

Gramine implements timers that notify via file descriptors: `timerfd_create()`, `timerfd_settime()`
and `timerfd_gettime()`. The timerfd object is created inside Gramine, and all operations are
resolved entirely inside Gramine (note that the time source in Gramine SGX is still untrusted). Each
timerfd object is associated with a dummy eventfd created on the host. This is purely for triggering
read notifications (e.g., in epoll); timerfd data is verified inside Gramine and is never exposed to
the host. Since the host is used purely for notifications, a malicious host can only induce Denial
of Service (DoS) attacks. `TFD_TIMER_CANCEL_ON_SET` is silently ignored because there are no
"discontinuous changes of time" in Gramine (via e.g., `settimeofday()`). `TFD_IOC_SET_TICKS` is not
supported.

The emulation is currently implemented at the level of a single process. All timerfds created in the
parent process are marked as invalid in child processes. In multi-process applications, Gramine does
not exit immediately after fork; it only exits if the application attempts to use timerfds in the
child. Therefore, inter-process timing signals via timerfds are not allowed.

Gramine does *not* currently implement the POSIX per-process timer: `timer_create()`, etc. Gramine
could implement it in the future, if need arises.

<details><summary>Related system calls</summary>

Expand All @@ -2909,9 +2923,9 @@ these timers in the future, if need arises.
- ☒ `timer_getoverrun()`: may be implemented in the future
- ☒ `timer_delete()`: may be implemented in the future

- `timerfd_create()`: may be implemented in the future
- `timerfd_settime()`: may be implemented in the future
- `timerfd_gettime()`: may be implemented in the future
- `timerfd_create()`: see the notes above
- `timerfd_settime()`: see the notes above
- `timerfd_gettime()`: see the notes above

</details><br />

Expand Down
3 changes: 2 additions & 1 deletion libos/include/libos_fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ struct libos_fs_ops {
int (*poll)(struct libos_handle* hdl, int in_events, int* out_events);

/* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed
* ones. Used in e.g. secure eventfd FS to verify if the host is not lying to us. */
* ones. Used in e.g. secure eventfd and timerfd FS to verify if the host is not lying to us. */
void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events);

/* checkpoint/migrate the file system */
Expand Down Expand Up @@ -948,6 +948,7 @@ extern struct libos_fs eventfd_builtin_fs;
extern struct libos_fs synthetic_builtin_fs;
extern struct libos_fs path_builtin_fs;
extern struct libos_fs shm_builtin_fs;
extern struct libos_fs timerfd_builtin_fs;

struct libos_fs* find_fs(const char* name);

Expand Down
17 changes: 16 additions & 1 deletion libos/include/libos_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ enum libos_handle_type {
/* Special handles: */
TYPE_EPOLL, /* epoll handles, see `libos_epoll.c` */
TYPE_EVENTFD, /* eventfd handles, used by `eventfd` filesystem */
TYPE_TIMERFD, /* timerfd handles, used by `timerfd` filesystem */
};

struct libos_pipe_handle {
Expand Down Expand Up @@ -142,6 +143,18 @@ struct libos_eventfd_handle {
uint64_t dummy_host_val;
};

/* State of a timerfd object; stored in `libos_handle.info.timerfd` of TYPE_TIMERFD handles. */
struct libos_timerfd_handle {
/* set by the timerfd FS `checkin` callback; once set, read/poll/close on the handle fail */
bool broken_in_child;

spinlock_t expiration_lock; /* protecting below fields */
/* expirations not yet consumed; a successful read reports this value and resets it to zero */
uint64_t num_expirations;
/* when non-zero, a successful read also drains the dummy host object and zeroes this field */
uint64_t dummy_host_val;

spinlock_t timer_lock; /* protecting below fields */
uint64_t timeout; /* always an absolute time */
/* NOTE(review): presumably the re-arm interval of a periodic timer -- confirm against the
 * timerfd syscall implementation */
uint64_t reset;
};

struct libos_handle {
enum libos_handle_type type;
bool is_dir;
Expand Down Expand Up @@ -217,6 +230,8 @@ struct libos_handle {

struct libos_epoll_handle epoll; /* TYPE_EPOLL */
struct libos_eventfd_handle eventfd; /* TYPE_EVENTFD */

struct libos_timerfd_handle timerfd; /* TYPE_TIMERFD */
} info;

struct libos_dir_handle dir_info;
Expand All @@ -232,7 +247,7 @@ struct libos_handle {
* `read`, `seek` but not `pread`). This lock should be taken *before* `libos_handle.lock` and
* `libos_inode.lock`. Must be used *only* via maybe_lock_pos_handle() and
* maybe_unlock_pos_handle(); these functions make sure that the lock is acquired only on those
* handle types that are seekable (e.g. not on eventfds or pipes). */
* handle types that are seekable (e.g. not on eventfds, timerfds or pipes). */
struct libos_lock pos_lock;
};

Expand Down
4 changes: 4 additions & 0 deletions libos/include/libos_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,7 @@ long libos_syscall_getrandom(char* buf, size_t count, unsigned int flags);
long libos_syscall_mlock2(unsigned long start, size_t len, int flags);
long libos_syscall_sysinfo(struct sysinfo* info);
long libos_syscall_close_range(unsigned int first, unsigned int last, unsigned int flags);
long libos_syscall_timerfd_create(int clockid, int flags);
long libos_syscall_timerfd_settime(int fd, int flags, const struct __kernel_itimerspec* value,
struct __kernel_itimerspec* ovalue);
long libos_syscall_timerfd_gettime(int fd, struct __kernel_itimerspec* value);
8 changes: 7 additions & 1 deletion libos/include/libos_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,14 @@ void clean_link_map_list(void);
int create_pipe(char* name, char* uri, size_t size, PAL_HANDLE* hdl, bool use_vmid_for_name);

/* Asynchronous event support */
/* Kind of asynchronous event, passed as the first argument of `install_async_event()`. */
enum async_event_type {
/* NOTE(review): presumably events that wait on a PAL handle becoming ready -- confirm */
ASYNC_EVENT_TYPE_IO = 1,
/* time-based events; the timerfd FS uses this type when cancelling a pending timer on close */
ASYNC_EVENT_TYPE_ALARM_TIMER = 2,
};

int init_async_worker(void);
int64_t install_async_event(PAL_HANDLE object, unsigned long time,
int64_t install_async_event(enum async_event_type type, PAL_HANDLE object,
unsigned long time_us, bool absolute_time,
void (*callback)(IDTYPE caller, void* arg), void* arg);
void terminate_async_worker(void);

Expand Down
31 changes: 28 additions & 3 deletions libos/include/linux_abi/time.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
/* These need to be binary-identical with the ones used by Linux. */

// TODO: remove all of these includes and make this header libc-independent.
#include <linux/times.h>
#include <linux/timex.h>
#include <linux/utime.h>
#include <linux/version.h>

typedef long __kernel_suseconds_t;
typedef long __kernel_time_t;

typedef __kernel_time_t time_t;

#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
Expand All @@ -37,3 +37,28 @@ struct __kernel_timezone {
int tz_minuteswest; /* minutes west of Greenwich */
int tz_dsttime; /* type of dst correction */
};

/* The IDs of the various system clocks (for POSIX.1b interval timers). */
#define CLOCK_REALTIME 0
#define CLOCK_MONOTONIC 1
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
#define CLOCK_MONOTONIC_RAW 4
#define CLOCK_REALTIME_COARSE 5
#define CLOCK_MONOTONIC_COARSE 6
#define CLOCK_BOOTTIME 7
#define CLOCK_REALTIME_ALARM 8
#define CLOCK_BOOTTIME_ALARM 9

#define MAX_CLOCKS 16

/* Individual timerfd flag bits. */
#define TFD_TIMER_ABSTIME (1 << 0)
#define TFD_TIMER_CANCEL_ON_SET (1 << 1)
/* These two flags reuse the values of the corresponding O_* file status flags. */
#define TFD_CLOEXEC O_CLOEXEC
#define TFD_NONBLOCK O_NONBLOCK

/* Subset of timerfd flags whose values are shared with the O_* flags. */
#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK)
/* Flags for timerfd_create. */
#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS
/* Flags for timerfd_settime. */
#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)
6 changes: 3 additions & 3 deletions libos/src/arch/x86_64/libos_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,11 +297,11 @@ libos_syscall_t libos_syscall_table[LIBOS_SYSCALL_BOUND] = {
[__NR_utimensat] = (libos_syscall_t)0, // libos_syscall_utimensat
[__NR_epoll_pwait] = (libos_syscall_t)libos_syscall_epoll_pwait,
[__NR_signalfd] = (libos_syscall_t)0, // libos_syscall_signalfd
[__NR_timerfd_create] = (libos_syscall_t)0, // libos_syscall_timerfd_create
[__NR_timerfd_create] = (libos_syscall_t)libos_syscall_timerfd_create,
[__NR_eventfd] = (libos_syscall_t)libos_syscall_eventfd,
[__NR_fallocate] = (libos_syscall_t)libos_syscall_fallocate,
[__NR_timerfd_settime] = (libos_syscall_t)0, // libos_syscall_timerfd_settime
[__NR_timerfd_gettime] = (libos_syscall_t)0, // libos_syscall_timerfd_gettime
[__NR_timerfd_settime] = (libos_syscall_t)libos_syscall_timerfd_settime,
[__NR_timerfd_gettime] = (libos_syscall_t)libos_syscall_timerfd_gettime,
[__NR_accept4] = (libos_syscall_t)libos_syscall_accept4,
[__NR_signalfd4] = (libos_syscall_t)0, // libos_syscall_signalfd4
[__NR_eventfd2] = (libos_syscall_t)libos_syscall_eventfd2,
Expand Down
1 change: 1 addition & 0 deletions libos/src/fs/libos_fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ static struct libos_fs* g_builtin_fs[] = {
&synthetic_builtin_fs,
&path_builtin_fs,
&shm_builtin_fs,
&timerfd_builtin_fs,
};

static struct libos_lock g_mount_mgr_lock;
Expand Down
1 change: 1 addition & 0 deletions libos/src/fs/proc/thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ static char* describe_handle(struct libos_handle* hdl) {
case TYPE_EPOLL: str = "epoll:[?]"; break;
case TYPE_EVENTFD: str = "eventfd:[?]"; break;
case TYPE_SHM: str = "shm:[?]"; break;
case TYPE_TIMERFD: str = "timerfd:[?]"; break;
default: str = "unknown:[?]"; break;
}
return strdup(str);
Expand Down
146 changes: 146 additions & 0 deletions libos/src/fs/timerfd/fs.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
/* SPDX-License-Identifier: LGPL-3.0-or-later */
/* Copyright (C) 2024 Intel Corporation
* Kailun Qin <[email protected]>
*/

/*
* This file contains code for implementation of "timerfd" filesystem. For more information, see
* `libos/src/sys/libos_timerfd.c`.
*/

#include "libos_fs.h"
#include "libos_handle.h"
#include "libos_internal.h"
#include "libos_lock.h"
#include "linux_abi/errors.h"
#include "pal.h"

/* Enforce a restriction that all timerfds created in the parent process are marked as invalid in
* child processes, i.e. inter-process timing signals via timerfds are not allowed. This restriction
* is because LibOS doesn't yet implement sync between timerfd objects. */
static int timerfd_checkin(struct libos_handle* hdl) {
assert(hdl->type == TYPE_TIMERFD);
hdl->info.timerfd.broken_in_child = true;
return 0;
}

/* Drain one 8-byte value from the dummy host object backing this timerfd, retrying for as long as
 * the PAL call reports an interruption. Mirrors `eventfd_dummy_host_read()` in "fs/eventfd/fs.c". */
static void timerfd_dummy_host_read(struct libos_handle* hdl) {
    uint64_t host_val = 0;
    size_t host_val_size = sizeof(host_val);
    int ret;

    for (;;) {
        ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &host_val_size, &host_val);
        if (ret != PAL_ERROR_INTERRUPTED)
            break;
    }

    if (ret >= 0 && host_val_size == sizeof(host_val))
        return;

    /* a failed or short read must not happen in benign case, consider it an attack and panic */
    BUG();
}

/* Block until the host signals readability on the dummy host object (or the wait is interrupted).
 * Mirrors `eventfd_dummy_host_wait()` in "fs/eventfd/fs.c". */
static void timerfd_dummy_host_wait(struct libos_handle* hdl) {
    pal_wait_flags_t requested_events = PAL_WAIT_READ;
    pal_wait_flags_t reported_events = 0;

    int ret = PalStreamsWaitEvents(1, &hdl->pal_handle, &requested_events, &reported_events,
                                   NULL);

    /* the events reported by the host are deliberately ignored: they can't be trusted anyway */
    (void)reported_events;

    if (ret >= 0 || ret == PAL_ERROR_INTERRUPTED)
        return;

    BUG();
}

/*
 * Read callback of the timerfd FS: reports the number of expirations accumulated since the last
 * successful read and resets that counter to zero.
 *
 * `buf` must have room for at least one `uint64_t` (`count >= 8`), otherwise -EINVAL is returned.
 * `pos` is unused. If no expiration happened yet, the call fails with -EAGAIN on handles with
 * O_NONBLOCK and otherwise waits on the dummy host object until the counter becomes non-zero.
 * Handles inherited by a child process are unusable and fail with -EIO.
 *
 * On success exactly `sizeof(uint64_t)` bytes are stored in `buf` and that size is returned, which
 * matches Linux (the return value must not be `count`: no more than 8 bytes are ever written).
 */
static ssize_t timerfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) {
    __UNUSED(pos);
    assert(hdl->type == TYPE_TIMERFD);

    if (count < sizeof(uint64_t))
        return -EINVAL;

    if (hdl->info.timerfd.broken_in_child) {
        log_warning("Child process tried to access timerfd created by parent process. This is "
                    "disallowed in Gramine.");
        return -EIO;
    }

    int ret;
    spinlock_lock(&hdl->info.timerfd.expiration_lock);

    while (!hdl->info.timerfd.num_expirations) {
        if (hdl->flags & O_NONBLOCK) {
            ret = -EAGAIN;
            goto out;
        }
        /* the spinlock must not be held while blocking on the host */
        spinlock_unlock(&hdl->info.timerfd.expiration_lock);
        timerfd_dummy_host_wait(hdl);
        spinlock_lock(&hdl->info.timerfd.expiration_lock);
    }

    memcpy(buf, &hdl->info.timerfd.num_expirations, sizeof(uint64_t));
    hdl->info.timerfd.num_expirations = 0;

    /* perform a read (not supposed to block) to clear the event from polling threads */
    if (hdl->info.timerfd.dummy_host_val) {
        timerfd_dummy_host_read(hdl);
        hdl->info.timerfd.dummy_host_val = 0;
    }

    /* only the 8-byte expiration counter was transferred, regardless of the size of `buf` */
    ret = (int)sizeof(uint64_t);
out:
    spinlock_unlock(&hdl->info.timerfd.expiration_lock);
    maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false);
    return ret;
}

/* Post-poll callback of the timerfd FS: validates the events that the host reported for the dummy
 * host object and rewrites `*pal_ret_events` in place so that only verified events remain. */
static void timerfd_post_poll(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events) {
    assert(hdl->type == TYPE_TIMERFD);

    struct libos_timerfd_handle* timerfd = &hdl->info.timerfd;

    if (timerfd->broken_in_child) {
        log_warning("Child process tried to access timerfd created by parent process. This is "
                    "disallowed in Gramine.");
        *pal_ret_events = PAL_WAIT_ERROR;
        return;
    }

    /* timerfd is controlled inside the LibOS, which never raises any of these conditions; seeing
     * one of them is therefore impossible in benign case */
    const pal_wait_flags_t impossible_events = PAL_WAIT_ERROR | PAL_WAIT_HANG_UP | PAL_WAIT_WRITE;
    if (*pal_ret_events & impossible_events) {
        BUG();
    }

    spinlock_lock(&timerfd->expiration_lock);
    if ((*pal_ret_events & PAL_WAIT_READ) && !timerfd->num_expirations) {
        /* The host claims there is data to read but no expiration is recorded: the notification
         * is spurious or malicious. This can legitimately happen if another thread consumed the
         * event between this thread's poll wakeup and this callback. The unverified READ event is
         * dropped from the reported set. */
        *pal_ret_events &= ~PAL_WAIT_READ;
    }
    spinlock_unlock(&timerfd->expiration_lock);
}

static int timerfd_close(struct libos_handle* hdl) {
if (hdl->info.timerfd.broken_in_child) {
log_warning("Child process tried to access timerfd created by parent process. This is "
"disallowed in Gramine.");
return -EIO;
}

/* cancel the pending timerfd object */
return install_async_event(ASYNC_EVENT_TYPE_ALARM_TIMER, hdl->pal_handle,
/*time_us=*/0, /*absolute_time=*/false, /*callback=*/NULL,
/*arg=*/NULL);
}

/* Handle operations implemented by the "timerfd" filesystem. */
struct libos_fs_ops timerfd_fs_ops = {
    .read      = timerfd_read,
    .close     = timerfd_close,
    .checkin   = timerfd_checkin,
    .post_poll = timerfd_post_poll,
};

/* Built-in filesystem descriptor under which the operations above are registered. */
struct libos_fs timerfd_builtin_fs = {
    .fs_ops = &timerfd_fs_ops,
    .name   = "timerfd",
};
Loading