cmake_minimum_required(VERSION 3.12)
# These targets are all C, so declare the project language accordingly.
project(IA2Runtime C)

add_executable(read-pkru
  read_pkru_demo.c
  get_inferior_pkru.c
)

add_executable(track-memory-map
  memory_map.c
  track_memory_map_demo.c
  get_inferior_pkru.c
)

add_executable(seccomp-filter
  seccomp_filter.c
)

add_executable(landlock
  landlock.c
  strv.c
  forbid_paths.c
)

# fix: the original set CXX_STANDARD, which has no effect on C sources;
# the corresponding property for C targets is C_STANDARD.
set_target_properties(
  read-pkru track-memory-map seccomp-filter landlock
PROPERTIES
  C_STANDARD 11
  C_STANDARD_REQUIRED ON
)
/* Is `prefix` a prefix of `path` on '/'-segment boundaries?  "/usr" is a
 * segment prefix of "/usr/lib" and of "/usr" itself, but not of "/usrx";
 * the root "/" is a segment prefix of every path. */
static bool path_has_segment_prefix(const char *path, const char *prefix) {
  /* the root directory is an ancestor of everything */
  if (strcmp(prefix, "/") == 0)
    return true;

  /* advance past the longest common initial run of characters */
  size_t i = 0;
  while (prefix[i] != '\0' && path[i] == prefix[i])
    i++;

  /* a segment prefix must consume all of `prefix` and leave `path` either
   * exhausted or at a segment boundary */
  return prefix[i] == '\0' && (path[i] == '/' || path[i] == '\0');
}
*/ + for (int i = 0; (forbidden_path = ctx.forbidden_paths[i]); i++) { + /* is forbidden_path a descendant of this dir? */ + bool prefix_matches = path_has_segment_prefix(forbidden_path, fpath); + if (prefix_matches) { + contains_forbidden_path = true; + } + is_forbidden_path |= !strcmp(forbidden_path, fpath); + if (is_forbidden_path) { + break; + } + } + + /* if this is a forbidden path, do not allow it or children, just move on */ + if (is_forbidden_path) { + return FTW_SKIP_SUBTREE; + } + + /* do not allow or continue beneath symbolic links */ + if (typeflag == FTW_SL) { + return FTW_SKIP_SUBTREE; + } + + /* if contains forbidden path, allow shallowly and process children */ + if (contains_forbidden_path) { + int ret = allow_path(fpath, ctx.ruleset_fd, true); + if (ret < 0) { + ctx.error = ret; + return FTW_STOP; + } + + return FTW_CONTINUE; + } + + /* allow whole dir or file */ + int ret = allow_path(fpath, ctx.ruleset_fd, false); + if (ret < 0) { + ctx.error = ret; + return FTW_STOP; + } + + /* do not inspect individual children if this is an allowed dir */ + if (typeflag == FTW_D || typeflag == FTW_DNR) { + return FTW_SKIP_SUBTREE; + } else { + return FTW_CONTINUE; + } +} + +/* one-argument wrapper for realpath */ +static char *realpath_alloc(const char *path) { return realpath(path, NULL); } + +int forbid_paths(const char **paths, const int ruleset_fd) { + /* normalize away symbolic links in the forbidden paths. if multiple routes to + * a file were allowed to exist (as permitted by links), we would allow the + * one not mentioned by name as a forbidden path, contrary to our intent. 
*/ + char **real_paths = strvmap(paths, realpath_alloc); + ctx.forbidden_paths = (const char **)real_paths; + ctx.ruleset_fd = ruleset_fd; + ctx.error = 0; + nftw("/", forbid_deep_or_shallow, 512, FTW_PHYS | FTW_ACTIONRETVAL); + strvfree(real_paths); + return ctx.error; +} diff --git a/runtime/forbid_paths.h b/runtime/forbid_paths.h new file mode 100644 index 0000000000..4ca3af765e --- /dev/null +++ b/runtime/forbid_paths.h @@ -0,0 +1,3 @@ +#pragma once + +int forbid_paths(const char **paths, const int ruleset_fd); diff --git a/runtime/get_inferior_pkru.c b/runtime/get_inferior_pkru.c new file mode 100644 index 0000000000..57d8dcd6b9 --- /dev/null +++ b/runtime/get_inferior_pkru.c @@ -0,0 +1,42 @@ +#include +#include +#include +#include +#include +#include + +#include "get_inferior_pkru.h" + +/* this is largely copped from gdb and pared down to just what we need. it would + * be much more complex if we had to deal with the compacted xsave area. */ + +#define X86_XSTATE_PKRU_SIZE 2696 +#define X86_XSTATE_MAX_SIZE 2696 + +/* offset to the location of the PKRU register data structure used by the + * "xsave" instruction */ +static int xsave_pkeys_offset = + 2688 + 0 * 8; /* %pkru (64 bits in XSTATE, 32-bit actually used by + instructions and applications). */ + +bool get_inferior_pkru(pid_t pid, uint32_t *pkru_out) { + char xstateregs[X86_XSTATE_MAX_SIZE]; + struct iovec iov; + + /* Pre-4.14 kernels have a bug (fixed by commit 0852b374173b + "x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on + Intel Skylake CPUs") that sometimes causes the mxcsr location in + xstateregs not to be copied by PTRACE_GETREGSET. Make sure that + the location is at least initialized with a defined value. 
*/ + memset(xstateregs, 0, sizeof(xstateregs)); + iov.iov_base = xstateregs; + iov.iov_len = sizeof(xstateregs); + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_XSTATE, (long)&iov) < + 0) { + perror("could not read xstate registers"); + return false; + } + + memcpy(pkru_out, &xstateregs[xsave_pkeys_offset], sizeof(*pkru_out)); + return true; +} diff --git a/runtime/get_inferior_pkru.h b/runtime/get_inferior_pkru.h new file mode 100644 index 0000000000..7f17c3f6b6 --- /dev/null +++ b/runtime/get_inferior_pkru.h @@ -0,0 +1,5 @@ +#include +#include +#include + +bool get_inferior_pkru(pid_t pid, uint32_t *pkru_out); diff --git a/runtime/landlock.c b/runtime/landlock.c new file mode 100644 index 0000000000..18a9ba4e84 --- /dev/null +++ b/runtime/landlock.c @@ -0,0 +1,116 @@ +#include "landlock.h" +#include "forbid_paths.h" + +int main(const int argc, char *const argv[], char *const *const envp) { + const char *cmd_path; + char *const *cmd_argv; + int ruleset_fd, abi_ver; + __u64 access_fs_rw = ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_WRITE; + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = access_fs_rw, + }; + + if (argc < 2) { + fprintf(stderr, "usage: DENY_PATH=... %s \n\n", basename(argv[0])); + fprintf(stderr, "built against landlock ABI version <= %d\n", + LANDLOCK_ABI_LAST); + return 1; + } + + abi_ver = landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION); + if (abi_ver < 0) { + const int err = errno; + + perror("Failed to check Landlock compatibility"); + switch (err) { + case ENOSYS: + fprintf(stderr, + "Hint: Landlock is not supported by the current kernel. " + "To support it, build the kernel with " + "CONFIG_SECURITY_LANDLOCK=y and prepend " + "\"landlock,\" to the content of CONFIG_LSM.\n"); + break; + case EOPNOTSUPP: + fprintf(stderr, + "Hint: Landlock is currently disabled. 
" + "It can be enabled in the kernel configuration by " + "prepending \"landlock,\" to the content of CONFIG_LSM, " + "or at boot time by setting the same content to the " + "\"lsm\" kernel parameter.\n"); + break; + } + return 1; + } + + switch (abi_ver) { + case 1: + /* + * Removes LANDLOCK_ACCESS_FS_REFER for ABI < 2 + * + * Note: The "refer" operations (file renaming and linking + * across different directories) are always forbidden when using + * Landlock with ABI 1. + * + * If only ABI 1 is available, this sandboxer knowingly forbids + * refer operations. + * + * If a program *needs* to do refer operations after enabling + * Landlock, it can not use Landlock at ABI level 1. To be + * compatible with different kernel versions, such programs + * should then fall back to not restrict themselves at all if + * the running kernel only supports ABI 1. + */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_REFER_OR_0; + __attribute__((fallthrough)); + case 2: + /* Removes LANDLOCK_ACCESS_FS_TRUNCATE for ABI < 3 */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_TRUNCATE_OR_0; + + fprintf(stderr, + "Hint: You should update the running kernel " + "to leverage Landlock features " + "provided by ABI version %d (instead of %d).\n", + LANDLOCK_ABI_LAST, abi_ver); + __attribute__((fallthrough)); + case LANDLOCK_ABI_LAST: + break; + default: + fprintf( + stderr, + "rebuild sandboxer to use features from ABI version %d instead of %d\n", + abi_ver, LANDLOCK_ABI_LAST); + } + access_fs_rw &= ruleset_attr.handled_access_fs; + + ruleset_fd = landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + if (ruleset_fd < 0) { + perror("Failed to create ruleset"); + return 1; + } + const char *path = getenv("DENY_PATH"); + const char *paths[] = {path, NULL}; + if (forbid_paths(paths, ruleset_fd) < 0) { + fprintf(stderr, "Failed to set up path allowlist\n"); + return 1; + } + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + perror("Failed to restrict privileges"); 
+ goto err_close_ruleset; + } + if (landlock_restrict_self(ruleset_fd, 0)) { + perror("Failed to enforce ruleset"); + goto err_close_ruleset; + } + close(ruleset_fd); + + cmd_path = argv[1]; + cmd_argv = argv + 1; + execvpe(cmd_path, cmd_argv, envp); + perror("execvpe"); + return 1; + +err_close_ruleset: + close(ruleset_fd); + return 1; +} diff --git a/runtime/landlock.h b/runtime/landlock.h new file mode 100644 index 0000000000..47585d4b42 --- /dev/null +++ b/runtime/landlock.h @@ -0,0 +1,99 @@ +#pragma once + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "strv.h" + +#ifndef landlock_create_ruleset +static inline int +landlock_create_ruleset(const struct landlock_ruleset_attr *const attr, + const size_t size, const __u32 flags) { + return syscall(__NR_landlock_create_ruleset, attr, size, flags); +} +#endif + +#ifndef landlock_add_rule +static inline int landlock_add_rule(const int ruleset_fd, + const enum landlock_rule_type rule_type, + const void *const rule_attr, + const __u32 flags) { + return syscall(__NR_landlock_add_rule, ruleset_fd, rule_type, rule_attr, + flags); +} +#endif + +#ifndef landlock_restrict_self +static inline int landlock_restrict_self(const int ruleset_fd, + const __u32 flags) { + return syscall(__NR_landlock_restrict_self, ruleset_fd, flags); +} +#endif + +/* clang-format off */ + +#define ACCESS_FS_WITHIN ( \ + LANDLOCK_ACCESS_FS_READ_DIR | \ + LANDLOCK_ACCESS_FS_REMOVE_DIR | \ + LANDLOCK_ACCESS_FS_REMOVE_FILE | \ + LANDLOCK_ACCESS_FS_MAKE_CHAR | \ + LANDLOCK_ACCESS_FS_MAKE_DIR | \ + LANDLOCK_ACCESS_FS_MAKE_REG | \ + LANDLOCK_ACCESS_FS_MAKE_SOCK | \ + LANDLOCK_ACCESS_FS_MAKE_FIFO | \ + LANDLOCK_ACCESS_FS_MAKE_BLOCK | \ + LANDLOCK_ACCESS_FS_MAKE_SYM) + +#define ACCESS_FS_ROUGHLY_READ ( \ + LANDLOCK_ACCESS_FS_EXECUTE | \ + LANDLOCK_ACCESS_FS_READ_FILE | \ + LANDLOCK_ACCESS_FS_READ_DIR) + +#ifdef 
LANDLOCK_ACCESS_FS_REFER +#define LANDLOCK_ACCESS_FS_REFER_OR_0 LANDLOCK_ACCESS_FS_REFER +#else +#define LANDLOCK_ACCESS_FS_REFER_OR_0 0 +#endif + +#ifdef LANDLOCK_ACCESS_FS_TRUNCATE +#define LANDLOCK_ACCESS_FS_TRUNCATE_OR_0 LANDLOCK_ACCESS_FS_TRUNCATE +#else +#define LANDLOCK_ACCESS_FS_TRUNCATE_OR_0 0 +#endif + +#define ACCESS_FS_ROUGHLY_WRITE ( \ + LANDLOCK_ACCESS_FS_WRITE_FILE | \ + LANDLOCK_ACCESS_FS_REMOVE_DIR | \ + LANDLOCK_ACCESS_FS_REMOVE_FILE | \ + LANDLOCK_ACCESS_FS_MAKE_CHAR | \ + LANDLOCK_ACCESS_FS_MAKE_DIR | \ + LANDLOCK_ACCESS_FS_MAKE_REG | \ + LANDLOCK_ACCESS_FS_MAKE_SOCK | \ + LANDLOCK_ACCESS_FS_MAKE_FIFO | \ + LANDLOCK_ACCESS_FS_MAKE_BLOCK | \ + LANDLOCK_ACCESS_FS_MAKE_SYM | \ + LANDLOCK_ACCESS_FS_REFER_OR_0 | \ + LANDLOCK_ACCESS_FS_TRUNCATE_OR_0) + +#define ACCESS_FILE ( \ + LANDLOCK_ACCESS_FS_EXECUTE | \ + LANDLOCK_ACCESS_FS_WRITE_FILE | \ + LANDLOCK_ACCESS_FS_READ_FILE | \ + LANDLOCK_ACCESS_FS_TRUNCATE_OR_0) + +/* clang-format on */ + +#define LANDLOCK_ABI_LAST 3 diff --git a/runtime/mem_region.h b/runtime/mem_region.h new file mode 100644 index 0000000000..dc01e4a737 --- /dev/null +++ b/runtime/mem_region.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +struct range { + size_t start; + size_t len; +}; + +struct mem_region { + struct range range; + unsigned char owner_pkey; +}; + +// maps regions to pkeys +struct memory_map { + struct mem_region *regions; + size_t n_regions; + size_t capacity; +}; diff --git a/runtime/memory_map.c b/runtime/memory_map.c new file mode 100644 index 0000000000..ee75f873cb --- /dev/null +++ b/runtime/memory_map.c @@ -0,0 +1,91 @@ +// #include "mem_map.h" +#include "memory_map.h" +#include +#include + +struct mem_region *add_region(struct memory_map *map, struct range range, + unsigned char owner_pkey) { + printf("adding compartment %d region %08zx+%zd\n", owner_pkey, range.start, + range.len); + + if (map->n_regions == map->capacity) { + map->capacity *= 2; + if (map->capacity == 0) { + map->capacity = 16; + } + 
map->regions = + realloc(map->regions, map->capacity * sizeof(struct mem_region)); + } + + map->n_regions++; + + struct mem_region *region = &map->regions[map->n_regions - 1]; + region->range = range; + region->owner_pkey = owner_pkey; + + return region; +} + +bool ranges_overlap(struct range *a, struct range *b) { + size_t a_end = a->start + a->len; + size_t b_end = b->start + b->len; + return a_end >= b->start && b_end >= a->start; +} + +struct mem_region *find_overlapping_region(struct memory_map *map, + struct range needle) { + for (int i = 0; i < map->n_regions; i++) { + struct mem_region *region = &map->regions[i]; + if (ranges_overlap(®ion->range, &needle)) { + return region; + } + } + return NULL; +} + +struct mem_region *find_region_exact(struct memory_map *map, + struct range needle) { + for (int i = 0; i < map->n_regions; i++) { + struct mem_region *region = &map->regions[i]; + if (region->range.start == needle.start && + region->range.len == needle.len) { + return region; + } + } + return NULL; +} + +bool remove_region(struct memory_map *map, struct range needle) { + struct mem_region *r = find_region_exact(map, needle); + if (r != NULL) { + /* move the last region here */ + *r = map->regions[map->n_regions - 1]; + /* pop the old last region */ + map->n_regions--; + return true; + } + return false; +} + +struct mem_region *find_region_containing_addr(struct memory_map *map, + size_t addr) { + struct range needle = {addr, addr}; + return find_overlapping_region(map, needle); +} + +bool all_overlapping_regions_have_pkey(struct memory_map *map, + struct range needle, + unsigned char pkey) { + for (int i = 0; i < map->n_regions; i++) { + struct mem_region *region = &map->regions[i]; + bool pkeys_differ = region->owner_pkey != pkey; + printf(" pkeys: %d/%d, ranges: %zx+%zd, %zx+%zd\n", region->owner_pkey, + pkey, region->range.start, region->range.len, needle.start, + needle.len); + if (pkeys_differ && ranges_overlap(®ion->range, &needle)) { + printf(" ^ 
counterexample!\n"); + return false; + } + } + return true; +} diff --git a/runtime/memory_map.h b/runtime/memory_map.h new file mode 100644 index 0000000000..7b4293cc57 --- /dev/null +++ b/runtime/memory_map.h @@ -0,0 +1,20 @@ +#pragma once + +#include "mem_region.h" +#include + +struct mem_region *add_region(struct memory_map *map, struct range range, + unsigned char owner_pkey); +struct mem_region *find_overlapping_region(struct memory_map *map, + struct range needle); + +struct mem_region *find_region_exact(struct memory_map *map, + struct range needle); + +bool remove_region(struct memory_map *map, struct range needle); + +struct mem_region *find_region_containing_addr(struct memory_map *map, + size_t addr); + +bool all_overlapping_regions_have_pkey(struct memory_map *map, + struct range needle, unsigned char pkey); \ No newline at end of file diff --git a/runtime/mmap_event.h b/runtime/mmap_event.h new file mode 100644 index 0000000000..e7f7e75b13 --- /dev/null +++ b/runtime/mmap_event.h @@ -0,0 +1,61 @@ +#pragma once + +#include "mem_region.h" + +struct mmap_info { + struct range range; + int prot; + int flags; + int fildes; + unsigned char pkey; +}; + +struct munmap_info { + struct range range; + unsigned char pkey; +}; + +struct mremap_info { + struct range old_range; + struct range new_range; + int flags; + unsigned char pkey; +}; + +struct mprotect_info { + struct range range; + int prot; + unsigned char pkey; +}; + +struct pkey_mprotect_info { + struct range range; + int prot; + unsigned char new_owner_pkey; + unsigned char pkey; +}; + +union event_info { + struct mmap_info mmap; + struct munmap_info munmap; + struct mremap_info mremap; + struct mprotect_info mprotect; + struct pkey_mprotect_info pkey_mprotect; +}; + +enum mmap_event { + EVENT_MMAP, + EVENT_MUNMAP, + EVENT_MREMAP, + EVENT_MPROTECT, + EVENT_PKEY_MPROTECT, + EVENT_NONE, +}; + +static const char *event_names[] = { + "MMAP", "MUNMAP", "MREMAP", "MPROTECT", "PKEY_MPROTECT", "NONE", +}; + 
/* Attach to the given pid, read its PKRU register via PTRACE_GETREGSET, print
 * it, and detach.  Exits 0 on success, 1 on failure. */
int main(int argc, char **argv) {
  if (argc != 2) {
    usage(basename(argv[0]));
    return 1;
  }

  pid_t pid = atoi(argv[1]);
  if (pid == 0) {
    usage(basename(argv[0]));
    return 1;
  }

  if (ptrace(PTRACE_ATTACH, pid, 0, 0) < 0) {
    perror("could not ptrace(PTRACE_ATTACH)");
    /* fix: the original fell through and tried to read registers of a
     * process it never attached to */
    return 1;
  }

  /* wait to get hold of the tracee */
  if (waitpid(pid, NULL, WUNTRACED) < 0) {
    perror("waitpid");
    return 1;
  }

  uint32_t pkru = 0;
  bool ok = get_inferior_pkru(pid, &pkru);
  if (ok) {
    printf("pkru=%08x\n", pkru);
  }

  /* let the inferior run again before we exit */
  ptrace(PTRACE_DETACH, pid, 0, 0);

  /* fix: the original returned `res` directly, i.e. exit status 1 (failure)
   * on success and 0 (success) on failure */
  return ok ? 0 : 1;
}
or 1 (neq) followed by +// a return of the given policy (used in the equal case) +#define BPF_SYSCALL_POLICY(name, policy) \ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_##name, 0, 1), \ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_##policy) + +long syscall(long no, ...); + +int configure_seccomp(void) { + struct sock_filter filter[] = { + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))), + BPF_SYSCALL_POLICY(write, ALLOW), + // this would compare syscall number to write() and allow if it matches + /*BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 0, 1), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),*/ + // allow seccomp() + BPF_SYSCALL_POLICY(seccomp, ALLOW), + // equivalent: + /*BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_seccomp, 0, 1), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),*/ + // compare syscall number to open() and jump to kill insn if different + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_open, 0, 3), + // load argument 1 + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, args[1]))), + // if argument 1 is equal to O_RDONLY, allow + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, O_RDONLY, 0, 1), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL)}; + + struct sock_fprog prog = { + .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])), + .filter = filter, + }; + + printf("forbidding new privs\n"); + // in order to use seccomp() without CAP_SYS_SECCOMP, we must opt out of being + // able to gain privs via exec() of setuid binaries as they would inherit our + // seccomp filters. + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + printf("calling seccomp()\n"); + // we must make two separate calls to seccomp() here because we want to create + // a user notification fd to pass to the supervisor, but we also want to pass + // FLAG_TSYNC, and these two cannot be combined in one call because they + // impose conflicting interpretations on the syscall return value. 
/* Compute the element count (excluding the terminator) of a NULL-terminated
 * string vector. */
size_t strvlen(const char **strv) {
  size_t len = 0;
  while (strv[len] != NULL) {
    len++;
  }
  return len;
}

/* Free a NULL-terminated string vector along with every string it owns.
 * Accepts NULL as a no-op. */
void strvfree(char **strv) {
  if (strv == NULL) {
    return;
  }
  for (size_t i = 0; strv[i] != NULL; i++) {
    free(strv[i]);
  }
  free(strv);
}

/* Map `fn` over each item of a NULL-terminated string vector, returning a
 * newly allocated, NULL-terminated vector of the results (or NULL if the
 * vector itself cannot be allocated).  The caller owns the result and should
 * release it with strvfree(). */
char **strvmap(const char **strv, char *(*fn)(const char *)) {
  size_t len = strvlen(strv);
  /* fix: allocate len + 1 slots — the original omitted the NULL terminator,
   * so strvfree() and every consumer overran the allocation */
  char **out = calloc(len + 1, sizeof(char *));
  if (out == NULL) {
    return NULL;
  }
  for (size_t i = 0; i < len; i++) {
    out[i] = fn(strv[i]);
  }
  out[len] = NULL; /* calloc already zeroed it, but be explicit */
  return out;
}
/* Sentinel returned when a PKRU value does not correspond to any single
 * compartment pkey. */
#define PKEY_INVALID 255
/* PKRU register value for a thread running in compartment `pkey`: all keys'
 * access-disable/write-disable bit pairs set except those of `pkey` and of
 * key 0.  fix: use 3U — `3 << 30` (pkey 15) left-shifts into the sign bit of
 * int, which is undefined behavior; also parenthesize the macro argument. */
#define PKRU(pkey) (~((3U << (2 * (pkey))) | 3U))

/* Map a PKRU register value back to the compartment pkey it represents, or
 * PKEY_INVALID if it matches none of the 16 possible keys.  (Replaces the
 * original 16-arm switch with an equivalent loop.) */
unsigned char pkey_for_pkru(uint32_t pkru) {
  for (unsigned char pkey = 0; pkey < 16; pkey++) {
    if (pkru == PKRU(pkey)) {
      return pkey;
    }
  }
  return PKEY_INVALID;
}
+ +#include + +/* query pid to determine the mmap-relevant event being requested. returns true + * unless something horrible happens */ +bool interpret_syscall(struct user_regs_struct *regs, unsigned char pkey, + enum mmap_event *event, union event_info *event_info) { + /* determine event from syscall # */ + *event = event_from_syscall(regs->orig_rax); + /* dispatch on event and read args from registers. + arg order is: rdi, rsi, rdx, r10, r8, r9 */ + switch (*event) { + case EVENT_MMAP: { + printf("mmap!\n"); + + struct mmap_info *info = &event_info->mmap; + info->range.start = + regs->rdi; /* this will be replaced with the actual addr on return */ + info->range.len = regs->rsi; + info->prot = regs->rdx; + info->flags = regs->r10; + info->fildes = regs->r8; + info->pkey = pkey; + + printf("compartment %d mmap (%08zx, %zd, prot=%d, flags=%x, fd=%d)\n", + info->pkey, info->range.start, info->range.len, info->prot, + info->flags, info->fildes); + break; + } + case EVENT_MUNMAP: { + printf("munmap!\n"); + + struct munmap_info *info = &event_info->munmap; + info->range.start = regs->rdi; + info->range.len = regs->rsi; + info->pkey = pkey; + break; + } + case EVENT_MREMAP: { + printf("mremap!\n"); + + struct mremap_info *info = &event_info->mremap; + info->old_range.start = regs->rdi; + info->old_range.len = regs->rsi; + info->new_range.len = regs->rdx; + info->flags = regs->r10; + + if (info->flags & MREMAP_FIXED) + info->new_range.start = + regs->r8; // accepts a 5th arg if this flag is present + else + info->new_range.start = info->old_range.start; + + info->pkey = pkey; + break; + } + case EVENT_MPROTECT: { + printf("mprotect!\n"); + + struct mprotect_info *info = &event_info->mprotect; + info->range.start = regs->rdi; + info->range.len = regs->rsi; + info->pkey = pkey; + break; + } + case EVENT_PKEY_MPROTECT: { + printf("pkey_mprotect!\n"); + + struct pkey_mprotect_info *info = &event_info->pkey_mprotect; + info->range.start = regs->rdi; + info->range.len = 
regs->rsi; + info->prot = regs->rdx; + info->new_owner_pkey = regs->r10; + info->pkey = pkey; + break; + } + case EVENT_NONE: { + /* when ptracing alone, this may occur; when we are a seccomp helper, this + should not happen */ + /* printf("other; rax=%llu\n", regs->orig_rax); */ + break; + } + } + return true; +} + +void update_event_with_result(struct user_regs_struct *regs, + enum mmap_event event, + union event_info *event_info) { + /* if mremap(MREMAP_MAYMOVE) or regular mmap() sans MAP_FIXED, we need to + find out what addr came back */ + + switch (event) { + case EVENT_MMAP: { + /* read result from registers */ + struct mmap_info *info = &event_info->mmap; + info->range.start = regs->rax; + break; + } + case EVENT_MREMAP: { + /* read result from registers */ + struct mmap_info *info = &event_info->mmap; + info->range.start = regs->rax; + break; + } + default: { + break; + } + } +} + +void return_syscall_eperm(pid_t pid) { + struct user_regs_struct regs = {0}; + if (ptrace(PTRACE_GETREGS, pid, 0, ®s) < 0) { + perror("could not PTRACE_GETREGS"); + return; + } + + /* set to invalid syscall */ + regs.orig_rax = -1; + ptrace(PTRACE_SETREGS, pid, 0, ®s); + fprintf(stderr, "set syscall # to -1\n"); + + /* run syscall until exit */ + ptrace(PTRACE_SYSCALL, pid, 0, 0); + waitpid(pid, NULL, 0); + fprintf(stderr, "continued\n"); + + if (ptrace(PTRACE_GETREGS, pid, 0, ®s) < 0) { + perror("could not PTRACE_GETREGS"); + return; + } + /* return -EPERM */ +#include + regs.rax = -EPERM; + ptrace(PTRACE_SETREGS, pid, 0, ®s); + fprintf(stderr, "wrote -eperm to rax\n"); +} + +void track_memory_map(pid_t pid, struct memory_map *map) { + while (true) { + /* run until the next syscall entry */ + if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) { + perror("could not PTRACE_SYSCALL"); + } + /* wait for the process to get signalled */ + int stat = 0; + int ret = waitpid(pid, &stat, 0); + if (ret < 0) { + perror("waitpid"); + return; + } + if (WIFEXITED(stat)) { + printf("inferior 
exited\n"); + return; + } + if (WIFSIGNALED(stat)) { + printf("inferior killed by signal\n"); + return; + } + + /* read which syscall is being called and its args */ + struct user_regs_struct regs = {0}; + if (ptrace(PTRACE_GETREGS, pid, 0, ®s) < 0) { + perror("could not PTRACE_GETREGS"); + return; + } + + /* if syscall number is -1, finish and kill process */ + if (regs.orig_rax == -1) { + return; + } + + /* read pkru */ + uint32_t pkru = -1; + bool res = get_inferior_pkru(pid, &pkru); + if (!res) { + fprintf(stderr, "could not get pkey\n"); + return; + } + unsigned char pkey = pkey_for_pkru(pkru); + if (pkey == PKEY_INVALID) { + fprintf(stderr, "pkru value %8x does not correspond to any pkey!\n", + pkru); + return; + } + + union event_info event_info = {0}; + enum mmap_event event = EVENT_NONE; + if (!interpret_syscall(®s, pkey, &event, &event_info)) { + fprintf(stderr, "could not interpret syscall!\n"); + } + + if (!is_op_permitted(map, event, &event_info)) { + fprintf(stderr, "forbidden operation requested: %s\n", event_name(event)); + return_syscall_eperm(pid); + continue; + } else { + fprintf(stderr, "operation allowed: %s (syscall %lld)\n", + event_name(event), regs.orig_rax); + } + + /* run the actual syscall until syscall exit so we can read its result */ + if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) { + perror("could not PTRACE_SYSCALL"); + } + waitpid(pid, NULL, 0); + + /* read syscall result from registers */ + if (ptrace(PTRACE_GETREGS, pid, 0, ®s) < 0) { + perror("could not PTRACE_GETREGS"); + return; + } + + /* update event */ + update_event_with_result(®s, event, &event_info); + + /* track effect of syscall on memory map */ + update_memory_map(map, event, &event_info); + } +} + +#include + +int main(int argc, char **argv) { + if (argc != 2) { + usage(basename(argv[0])); + return 1; + } + + pid_t pid = atoi(argv[1]); + if (pid == 0) { + usage(basename(argv[0])); + return 1; + } + + if (ptrace(PTRACE_ATTACH, pid, 0, 0) < 0) { + perror("could not 
PTRACE_ATTACH"); + } else { + /* wait to get hold of the tracee */ + pid_t ret = waitpid(pid, NULL, WUNTRACED); + if (ret < 0) { + perror("waitpid"); + return 1; + } + } + + /* do not let the tracee continue if our process dies */ + ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_EXITKILL); + + struct memory_map map = {0}; + track_memory_map(pid, &map); + ptrace(PTRACE_KILL, pid, 0, 0); + + return 0; +}