diff --git a/Makefile b/Makefile index 337e4dad88c..6c5aa70c890 100644 --- a/Makefile +++ b/Makefile @@ -254,6 +254,8 @@ install: build doc sdk doc-json install -m 644 _build/default/ocaml/networkd/bin/xcp-networkd.1 $(DESTDIR)/usr/share/man/man1/xcp-networkd.1 # wsproxy install -m 755 _build/install/default/bin/wsproxy $(DESTDIR)$(LIBEXECDIR)/wsproxy +# forkexecd + install -m 755 _build/default/ocaml/forkexecd/helper/vfork_helper $(DESTDIR)/usr/libexec/xapi/vfork_helper # dune can install libraries and several other files into the right locations dune install --destdir=$(DESTDIR) --prefix=$(PREFIX) --libdir=$(LIBDIR) --mandir=$(MANDIR) \ xapi-client xapi-schema xapi-consts xapi-cli-protocol xapi-datamodel xapi-types \ diff --git a/ocaml/forkexecd/helper/Makefile b/ocaml/forkexecd/helper/Makefile new file mode 100644 index 00000000000..74fd61ec477 --- /dev/null +++ b/ocaml/forkexecd/helper/Makefile @@ -0,0 +1,35 @@ +## Set some macro but not override environment ones +CFLAGS ?= -O2 -g -Wall -Werror +LDFLAGS ?= + +all:: vfork_helper + +clean:: + rm -f vfork_helper *.o + +%.o: %.c + gcc $(CFLAGS) -c -o $@ $< + +vfork_helper: vfork_helper.o close_from.o syslog.o + gcc $(CFLAGS) $(LDFLAGS) -o $@ $^ -pthread + +close_from.o: close_from.h Makefile +syslog.o: syslog.h Makefile +vfork_helper.o: redirect_algo.h Makefile + +## Fuzzer uses AFL (American Fuzzy Lop). +## +## Use "make fuzz" to build and launch the fuzzer +## +## Use "make show" to look at the first failures (if found). + +fuzz:: + afl-gcc $(CFLAGS) -Wall -Werror -o algo_fuzzer algo_fuzzer.c + rm -rf testcase_dir + mkdir testcase_dir + echo maomaoamaoaoao > testcase_dir/test1 + rm -rf findings_dir/ + afl-fuzz -i testcase_dir -o findings_dir -D -- ./algo_fuzzer + +show:: + cat "$$(ls -1 findings_dir/default/crashes/id* | head -1)" | ./algo_fuzzer diff --git a/ocaml/forkexecd/helper/algo_fuzzer.c b/ocaml/forkexecd/helper/algo_fuzzer.c new file mode 100644 index 00000000000..946332e6839 --- /dev/null +++ b/ocaml/forkexecd/helper/algo_fuzzer.c @@ -0,0 +1,246 @@ + +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#undef NDEBUG +#define DEBUG 1 + +#if DEBUG +#define log(fmt, ...) printf(fmt "\n", ##__VA_ARGS__) +#else +#define log(fmt, ...) do {} while(0) +#endif + +// include as first file to make sure header is self container +#include "redirect_algo.h" + +#include +#include +#include +#include +#include +#include +#include + +static int fake_close(int fd); + +typedef struct { + bool open; + bool cloexec; + char *name; +} fd; + +#define NUM_FDS 4096 +static fd fds[NUM_FDS]; + +static bool +fake_close_fds_from(int fd_from) +{ + for (int fd = fd_from; fd < NUM_FDS; ++fd) + fake_close(fd); + + return true; +} + +#define O_WRONLY 1 +static int +fake_open(const char *fn, int dummy) +{ + for (int i = 0; i < NUM_FDS; ++i) + if (!fds[i].open) { + assert(fds[i].name == NULL); + fds[i].name = strdup(fn); + fds[i].open = true; + fds[i].cloexec = false; + return i; + } + assert(0); + return -1; +} + +static int +fake_close(int fd) +{ + assert(fd >= 0); + assert(fd < NUM_FDS); + if (!fds[fd].open) { + errno = EBADF; + return -1; + } + fds[fd].open = false; + free(fds[fd].name); + fds[fd].name = NULL; + return 0; +} + +static int +fake_dup2(int from, int to) +{ + assert(from >= 0 && from < NUM_FDS); + assert(to >= 0 && to < NUM_FDS); + assert(fds[from].open); + assert(from != to); + free(fds[to].name); + fds[to].open = true; + fds[to].name = strdup(fds[from].name); + fds[to].cloexec = false; + return 0; +} + +static int +fake_fcntl(int fd) +{ + assert(fd >= 0 && fd < NUM_FDS); + assert(fds[fd].open); + fds[fd].cloexec = false; + return 0; +} + +int main(int argc, char **argv) +{ + // Input where a given FD goes?? + // No, not enough, can be duplicated. + // Numbers >4096 in 2 bytes not file descriptor, + // (-1 for standard, skip for normal). + // We should add some random fds. + enum { MAX_FILE_BUF = 2048 }; + uint16_t file_buf[MAX_FILE_BUF]; + size_t read = fread(file_buf, 2, MAX_FILE_BUF, stdin); + if (read < 3) + return 0; + + static const char standard_names[][8] = { + "stdin", "stdout", "stderr" + }; + int num_mappings = 0; + uint16_t *num = file_buf; + mapping mappings[MAX_FILE_BUF]; + int i = 0; + for (i = 0; i < 3; ++i) { + mapping *m = &mappings[num_mappings++]; + m->uuid = standard_names[i]; + uint16_t n = *num++; + m->current_fd = n < NUM_FDS ? n : -1; + m->wanted_fd = i; + } + for (; i < read; ++i) { + uint16_t n = *num++; + if (n >= NUM_FDS) + continue; + + mapping *m = &mappings[num_mappings++]; + m->current_fd = n; + m->wanted_fd = -1; + char buf[64]; + sprintf(buf, "file%d", i); + m->uuid = strdup(buf); + } + if (num_mappings > MAX_TOTAL_MAPPINGS) + return 0; + + for (unsigned n = 0; n < num_mappings; ++n) { + mapping *m = &mappings[n]; + int fd = m->current_fd; + if (fd < 0) + continue; + fake_close(fd); + fds[fd].open = true; + fds[fd].name = strdup(m->uuid); + fds[fd].cloexec = true; + } + + // Check in the final file mapping all valid mappings + // have an open file descriptor. + // There should be no duplicate numbers in current_fd. + // current_fd must be in a range. + // Only if wanted_fd >= 0 current_fd can be -1. + // There should be a correspondance between input and output names. + // If current_fd was -1 it will still be -1. + // If wanted_fd >= 0 current_fd should be the same. + + fd_operation operations[MAX_OPERATIONS]; + int num_operations = + redirect_mappings(mappings, num_mappings, operations); + assert(num_operations > 0); + assert(num_operations <= MAX_OPERATIONS); + + for (int i = 0; i < num_operations; ++i) { + const fd_operation* op = &operations[i]; + log("op %d %d %d", op->fd_from, op->fd_to, op->operation); + switch (op->operation) { + case FD_OP_DUP: + if (op->fd_from == op->fd_to) + fake_fcntl(op->fd_from); + else + fake_dup2(op->fd_from, op->fd_to); + break; + case FD_OP_MOVE: + assert(op->fd_from != op->fd_to); + fake_dup2(op->fd_from, op->fd_to); + fake_close(op->fd_from); + break; + case FD_OP_DEVNULL: + // first close old, then create new one + fake_close(op->fd_to); + // TODO ideally we want read only for input for Ocaml did the same... + assert(fake_open("/dev/null", O_WRONLY) == op->fd_to); + break; + case FD_OP_CLOSE_FROM: + fake_close_fds_from(op->fd_from); + break; + default: + assert(0); + } + } + + // check files opened + for (int fd = 0; fd < NUM_FDS; ++fd) + assert(fds[fd].open == (fd < num_mappings)); + + for (int fd = 0; fd < num_mappings; ++fd) { + assert(fds[fd].cloexec == false); + log("file %d %s", fd, fds[fd].name); + } + + // Check in the final file mapping all valid mappings + // has an open file descriptor. + bool already_found[NUM_FDS] = { false, }; + for (unsigned n = 0; n < num_mappings; ++n) { + const int fd = mappings[n].current_fd; + const int wanted = mappings[n].wanted_fd; + if (fd >= 0) { + assert(fd < NUM_FDS); + assert(fds[fd].open); + + // There should be no duplicate numbers in current_fd. + assert(!already_found[fd]); + already_found[fd] = true; + } else { + // Only if wanted_fd >= 0 current_fd can be -1. + assert(mappings[n].wanted_fd >= 0); + assert(fd == -1); + } + + // If wanted_fd >= 0 current_fd should be the same. + if (wanted >= 0) + assert(wanted == fd || fd == -1); + + // current_fd must be in a range. + assert(fd >= -1); + assert(fd < num_mappings); + } + + // There should be a correspondance between input and output names. + // If current_fd was -1 it will still be -1. +} diff --git a/ocaml/forkexecd/helper/close_from.c b/ocaml/forkexecd/helper/close_from.c new file mode 100644 index 00000000000..207325d7407 --- /dev/null +++ b/ocaml/forkexecd/helper/close_from.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include "close_from.h" + +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#include +#endif + +// try to use close_range on Linux even if not defined by headers +#if defined(__linux__) && !defined(SYS_close_range) +# if defined(__alpha__) +# define SYS_close_range 546 +# elif defined(__amd64__) || defined(__x86_64__) || defined(__arm__) || \ + defined(__aarch64__) || defined(__hppa__) || defined(__i386__) || \ + defined(__ia64__) || defined(__m68k__) || defined(__mips__) || \ + defined(__powerpc__) || defined(__powerpc64__) || defined(__sparc__) || \ + defined(__s390x__) +# define SYS_close_range 436 +# endif +#endif + +bool +close_fds_from(int fd_from) +{ + // first method, use close_range +#if (defined(__linux__) && defined(SYS_close_range)) \ + || (defined(__FreeBSD__) && defined(CLOSE_RANGE_CLOEXEC)) + static bool close_range_supported = true; + if (close_range_supported) { +#if defined(__linux__) + if (syscall(SYS_close_range, fd_from, ~0U, 0) == 0) +#else + if (close_range(fd_from, ~0U, 0) == 0) +#endif + return true; + + if (errno == ENOSYS) + close_range_supported = false; + } +#endif + + // second method, read fds list from /proc + DIR *dir = opendir("/proc/self/fd"); + if (dir) { + const int dir_fd = dirfd(dir); + struct dirent *ent; + while ((ent = readdir(dir)) != NULL) { + char *end = NULL; + unsigned long fd = strtoul(ent->d_name, &end, 10); + if (end == NULL || *end) + continue; + if (fd >= fd_from && fd != dir_fd) + close(fd); + } + closedir(dir); + return true; + } + + // third method, use just a loop + struct rlimit limit; + if (getrlimit(RLIMIT_NOFILE, &limit) < 0) + return false; + for (int fd = fd_from; fd < limit.rlim_cur; ++ fd) + close(fd); + + return true; +} diff --git a/ocaml/forkexecd/helper/close_from.h b/ocaml/forkexecd/helper/close_from.h new file mode 100644 index 00000000000..7a98b9a0ecc --- /dev/null +++ b/ocaml/forkexecd/helper/close_from.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#pragma once + +#include + +bool close_fds_from(int fd); diff --git a/ocaml/forkexecd/helper/dune b/ocaml/forkexecd/helper/dune new file mode 100644 index 00000000000..a22fdf45ce9 --- /dev/null +++ b/ocaml/forkexecd/helper/dune @@ -0,0 +1,21 @@ +(rule + (targets vfork_helper) + (package forkexec) + (deps + close_from.h + close_from.c + logs.h + logs.c + syslog.h + syslog.c + redirect_algo.h + vfork_helper.h + vfork_helper.c + Makefile) + (action + (run make))) + +(rule + (alias install) + (deps vfork_helper) + (action (run true))) diff --git a/ocaml/forkexecd/helper/logs.c b/ocaml/forkexecd/helper/logs.c new file mode 100644 index 00000000000..0b44bbb0ea4 --- /dev/null +++ b/ocaml/forkexecd/helper/logs.c @@ -0,0 +1,149 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include "logs.h" + +#if FORKEXECD_DEBUG_LOGS + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#define FILE_SIZE (32 * 1024) + +struct priv_mapped_logs { + uint32_t size; + + // Flags, we use characters instead of binary so + // easily see them easily with different tools. + char flags[4]; + char filename[64]; + pid_t pid; + int num; +}; + +// flags order +enum { SUCCESS, FAILURE }; + +mapped_logs mapped_logs_open(void) +{ + static int last_num = 0; + + // create a mapped file with a given size, will write header as structure + // and update using memory + mkdir("/tmp/fe_repl", 0755); + + char tmpl[] = "/tmp/fe_repl/logXXXXXX"; + int fd = mkstemp(tmpl); + if (!fd) + caml_raise_out_of_memory(); + + if (ftruncate(fd, FILE_SIZE) < 0) { + close(fd); + caml_raise_out_of_memory(); + } + + priv_mapped_logs *l = mmap(NULL, FILE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (l == MAP_FAILED) { + close(fd); + caml_raise_out_of_memory(); + } + close(fd); + + l->size = sizeof(*l); + memcpy(l->flags, "____", 4); + strncpy(l->filename, tmpl, sizeof(l->filename)); + l->pid = getpid(); + l->num = ++last_num; + + return (mapped_logs){l}; +} + +#define DEFINE_RANGE(start, end) \ + char *start = (char*) logs.priv + sizeof(priv_mapped_logs); \ + char *const end = (char*) logs.priv + FILE_SIZE + +void mapped_logs_close(mapped_logs logs) +{ + if (!logs.priv) + return; + DEFINE_RANGE(start, end); + bool written = false; + bool success = logs.priv->flags[FAILURE] == '_' && logs.priv->flags[SUCCESS] != '_'; + if (!success) { + FILE *f = fopen("/tmp/fe_repl/all_logs", "a"); + if (f) { + end[-1] = 0; + size_t len = strlen(start); + written = (fwrite(start, 1, len, f) == len); + fclose(f); + } + } + if (written || success) + unlink(logs.priv->filename); + munmap(logs.priv, FILE_SIZE); +} + +void mapped_logs_failure(mapped_logs logs) +{ + if (!logs.priv) + return; + logs.priv->flags[FAILURE] = 'F'; +} + +void mapped_logs_success(mapped_logs logs) +{ + if (!logs.priv) + return; + logs.priv->flags[SUCCESS] = 'S'; +} + +void mapped_logs_add(mapped_logs logs, const char *fmt, ...) +{ + if (!logs.priv) + return; + int save_errno = errno; + DEFINE_RANGE(start, end); + start += strlen(start); + if (start >= end -1) { + errno = save_errno; + return; // no more space + } + size_t len = end - start; + int l = snprintf(start, len, "%d:%d ", (int) logs.priv->pid, logs.priv->num); + if (l >= len) { + errno = save_errno; + return; + } + start += l; + len -= l; + va_list ap; + va_start(ap, fmt); + vsnprintf(start, len, fmt, ap); + va_end(ap); + + errno = save_errno; +} +#endif diff --git a/ocaml/forkexecd/helper/logs.h b/ocaml/forkexecd/helper/logs.h new file mode 100644 index 00000000000..13d54331f21 --- /dev/null +++ b/ocaml/forkexecd/helper/logs.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +// Definitions to write logs into memory mapped objects. +// We use a memory mapped object here because we close file descriptors +// so writing to file using them would cause logs to be lost. + +#pragma once + +#if !defined(FORKEXECD_DEBUG_LOGS) +#define FORKEXECD_DEBUG_LOGS 0 +#endif + +#if (FORKEXECD_DEBUG_LOGS) != 0 && (FORKEXECD_DEBUG_LOGS) != 1 +#error Expected FORKEXECD_DEBUG_LOGS to be defined either 0 or 1 +#endif + +typedef struct priv_mapped_logs priv_mapped_logs; +typedef struct mapped_logs mapped_logs; + +#if FORKEXECD_DEBUG_LOGS +struct mapped_logs { + priv_mapped_logs *priv; +}; +#define NULL_MAPPED_LOGS ((mapped_logs){0}) +mapped_logs mapped_logs_open(void); +void mapped_logs_close(mapped_logs logs); + +// Add a log entry, similar to printf. +void mapped_logs_add(mapped_logs logs, const char *fmt, ...); + +// Mark as failed, any failure will keep the log. +void mapped_logs_failure(mapped_logs logs); + +// Mark as successful, if successful and no failure during +// execution the log will be removed. +void mapped_logs_success(mapped_logs logs); +#else +// Use an empty structure, compiler will strip it passing +// it as a parameter without the needs to change the source +// code. +struct mapped_logs {}; +#define NULL_MAPPED_LOGS ((mapped_logs){}) +static inline mapped_logs mapped_logs_open(void) { + return (mapped_logs){}; +} + +static inline void mapped_logs_close(mapped_logs logs) { +} + +static inline void mapped_logs_failure(mapped_logs logs) { +} + +static inline void mapped_logs_success(mapped_logs logs) { +} + +#define mapped_logs_add(...) \ + do {} while(0) +#endif diff --git a/ocaml/forkexecd/helper/redirect_algo.h b/ocaml/forkexecd/helper/redirect_algo.h new file mode 100644 index 00000000000..367e5e1ec5a --- /dev/null +++ b/ocaml/forkexecd/helper/redirect_algo.h @@ -0,0 +1,210 @@ +/* Algorithm used to remap file handles before executing a process. + * The algorithm is separated in a different file in order to reuse for + * fuzzing it. + */ + +#pragma once + +#if !defined(DEBUG) +#define DEBUG 0 +#endif + +#if (DEBUG) != 0 && (DEBUG) != 1 +#error Expected DEBUG to be defined either 0 or 1 +#endif + +#ifndef log +#error Expected log macro to be defined +#endif + +#include +#include +#include + +typedef struct { + const char *uuid; + int current_fd; + int wanted_fd; +} mapping; + +typedef struct { + // source file + int fd_from; + // destination file + short fd_to; + // see FD_OP_ constants + uint8_t operation; +} fd_operation; + +typedef enum { + // Duplicate from fd_from to fd_to. + // If fd_from is the same as fd_to remove FD_CLOEXEC flag. + FD_OP_DUP, + // Duplicate from fd_from to fd_to and close fd_from. + FD_OP_MOVE, + // Open /dev/null on fd_to. + FD_OP_DEVNULL, + // Close from fd_from to the sky! + FD_OP_CLOSE_FROM, +} FD_OP; + +#define MAX_OPERATIONS 1024 +#define MAX_TOTAL_MAPPINGS (MAX_OPERATIONS - 4) + +static uint16_t remap_fds(mapping *const mappings, unsigned num_mappings, int from, int to); + +// Given the passed mappings update them (current_fd) and returns the +// requested operations to do the job. +// First 3 mappings should refer to standard file descriptors (stdin, +// stdout, stderr). +// Returns the number of operations to perform or negative if error. +static int +redirect_mappings(mapping *const mappings, const unsigned num_mappings, fd_operation *operations) +{ + mapping *const end_mappins = mappings + num_mappings; + uint16_t used_fds[MAX_OPERATIONS] = {0,}; + fd_operation *ops = operations; + +#define DUMP_MAPPINGS do { \ + if (DEBUG) { \ + for (unsigned i = 0; i < num_mappings; ++i) { \ + const mapping *m __attribute__((unused)) = &mappings[i]; \ + log("mapping %s %d %d", m->uuid, m->current_fd, m->wanted_fd); \ + } \ + char lbuf[MAX_OPERATIONS* 16]; \ + lbuf[0] = 0; \ + for (int i = 0; i < MAX_OPERATIONS; ++i) { \ + if (used_fds[i]) \ + sprintf(strchr(lbuf, 0), "%d=%d,", i, used_fds[i]); \ + } \ + log("used %s", lbuf); \ + } \ +} while(0); + + log("handle"); + + // parse all mappings + standard fds, mark ones using range 0-MAX_OPERATIONS + for (mapping *m = mappings; m < end_mappins; ++m) { + if (m->current_fd < 0 || m->current_fd >= MAX_OPERATIONS) + continue; + used_fds[m->current_fd]++; + } + DUMP_MAPPINGS; + + // Move standard file descriptors out of the way. + // Maximum 3 operations. + log("move standard fds away"); + for (mapping *m = mappings; m < end_mappins; ++m) { + const int current_fd = m->current_fd; + if (current_fd < 0 || current_fd > 2) + continue; + // find first available fd to use + int fd = 3; + while (used_fds[fd]) + ++fd; + *ops++ = (fd_operation){ current_fd, fd, FD_OP_DUP }; + uint16_t changed = remap_fds(mappings, num_mappings, current_fd, fd); + log("changed %d from %d to %d", changed, current_fd, fd); + used_fds[current_fd] = 0; + used_fds[fd] = changed; + } + DUMP_MAPPINGS; + + // Move standard fds into proper positions + // Maximum 3 operations (standard fds to be moved). + log("move standard fds correctly"); + for (mapping *m = mappings; m < end_mappins; ++m) { + const int current_fd = m->current_fd; + if (current_fd < 0 || m->wanted_fd < 0) + continue; + int fd = m->wanted_fd; + FD_OP op = FD_OP_DUP; + if (current_fd >= num_mappings) { + // move + op = FD_OP_MOVE; + uint16_t changed = remap_fds(mappings, num_mappings, current_fd, fd); + log("changed %d from %d to %d", changed, current_fd, fd); + used_fds[fd] = changed; + } else { + // duplicate + m->current_fd = fd; + if (--used_fds[current_fd] == 0) + op = FD_OP_MOVE; + used_fds[fd] = 1; + } + *ops++ = (fd_operation){ current_fd, fd, op }; + } + DUMP_MAPPINGS; + + // Remove cloexec on range [3, 3 + num mappings). + // Maximum no standard mappings operations. + log("remove cloexec flags"); + for (int fd = 3; fd < num_mappings; ++fd) { + if (!used_fds[fd]) + continue; + log("remove cloexec from %d", fd); + *ops++ = (fd_operation){ fd, fd, FD_OP_DUP }; + } + DUMP_MAPPINGS; + + // Move all fds left in range [3, 3 + num mappings). + // Maximum no standard mapping operations; then sum with + // the above is the no standard mapping operations. + log("move all fds left in range"); + int last_free = 3; + for (mapping *m = mappings; m < end_mappins; ++m) { + const int current_fd = m->current_fd; + if (m->wanted_fd >= 0) + continue; + if (current_fd < num_mappings && used_fds[current_fd] == 1) + continue; + while (used_fds[last_free]) + ++last_free; + int fd = last_free; + // TODO copied from above + FD_OP op = FD_OP_DUP; + if (current_fd >= num_mappings) { + // move + op = FD_OP_MOVE; + uint16_t changed = remap_fds(mappings, num_mappings, current_fd, fd); + log("changed %d from %d to %d", changed, current_fd, fd); + used_fds[fd] = changed; + } else { + // duplicate + m->current_fd = fd; + if (--used_fds[current_fd] == 0) + op = FD_OP_MOVE; + used_fds[fd] = 1; + } + *ops++ = (fd_operation){ current_fd, fd, op }; + } + DUMP_MAPPINGS; + + // Close extra fds. + *ops++ = (fd_operation){ num_mappings, 0, FD_OP_CLOSE_FROM }; + + // Create missing standard fds. + // Maximum standard mapping operations, but not the above, + // so the sum with move the standard is 3. + for (int fd = 0; fd < 3; ++fd) { + if (used_fds[fd]) + continue; + *ops++ = (fd_operation){ fd, fd, FD_OP_DEVNULL }; + } + + return ops - operations; +} + +static uint16_t +remap_fds(mapping *const mappings, unsigned num_mappings, int from, int to) +{ + uint16_t res = 0; + for (unsigned i = 0; i < num_mappings; ++i) { + mapping *m = &mappings[i]; + if (m->current_fd == from) { + m->current_fd = to; + res++; + } + } + return res; +} diff --git a/ocaml/forkexecd/helper/syslog.c b/ocaml/forkexecd/helper/syslog.c new file mode 100644 index 00000000000..ddfa50b66d6 --- /dev/null +++ b/ocaml/forkexecd/helper/syslog.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "syslog.h" + +#include +#include +#include + +static inline size_t quoted_length(const char c) +{ + return c == '\\' ? 2 : + (c >= ' ' && c < 0x7f) ? 1 : + 4; +} + +static const char hex[] = "0123456789ABCDEF"; + +static inline void write_quoted(char *const p, const char c) +{ + if (c == '\\') { + p[0] = p[1] = c; + } else if (c >= ' ' && c < 0x7f) { + p[0] = c; + } else { + p[0] = '\\'; + p[1] = 'x'; + p[2] = hex[(c>>4)&0xf]; + p[3] = hex[c&0xf]; + } +} + +static void syslog_line(const char *line, const char *key, int child_pid) +{ + syslog(LOG_DAEMON|LOG_INFO, "%s[%d]: %s", key, child_pid, line); +} + +// Quote and forward every line from "fd" to the syslog. +// "fd" will be closed. +bool forward_to_syslog(int fd, const char *key, int child_pid) +{ +#define syslog_line(line) syslog_line(line, key, child_pid) + FILE *f = fdopen(fd, "r"); + char quoted_buf[64000]; + char *dest = quoted_buf; + char *const dest_end = quoted_buf + sizeof(quoted_buf) - sizeof(" ...") - 1; + bool overflowed = false; + while (true) { + int ch = getc_unlocked(f); + + if (!overflowed && dest != quoted_buf && (ch == '\n' || ch == EOF)) { + *dest = 0; + syslog_line(quoted_buf); + } + + if (ch == EOF) { + bool res = !!feof(f); + fclose(f); + return res; + } + + if (ch == '\n') { + overflowed = false; + dest = quoted_buf; + continue; + } + + if (overflowed) + continue; + + const size_t quoted_len = quoted_length(ch); + if (dest + quoted_len >= dest_end) { + strcpy(dest, " ..."); + syslog_line(quoted_buf); + overflowed = true; + continue; + } + write_quoted(dest, ch); + dest += quoted_len; + } +} diff --git a/ocaml/forkexecd/helper/syslog.h b/ocaml/forkexecd/helper/syslog.h new file mode 100644 index 00000000000..8ca172b6324 --- /dev/null +++ b/ocaml/forkexecd/helper/syslog.h @@ -0,0 +1,21 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#pragma once + +#include + +// Quote and forward every line from "fd" to the syslog. +// "fd" will be closed. +bool forward_to_syslog(int fd, const char *key, int child_pid); diff --git a/ocaml/forkexecd/helper/vfork_helper.c b/ocaml/forkexecd/helper/vfork_helper.c new file mode 100644 index 00000000000..39ea01399ad --- /dev/null +++ b/ocaml/forkexecd/helper/vfork_helper.c @@ -0,0 +1,433 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "close_from.h" +#include "syslog.h" +#include "logs.h" +#include "vfork_helper.h" + +#define log(...) do {} while(0) +#include "redirect_algo.h" + +typedef struct { + char **args; + mapping *mappings; + fd_operation operations[MAX_OPERATIONS]; + int err; + const char *err_func; +} exec_info; + +static void adjust_args(char **args, mapping *const mappings, unsigned num_mappings); +static void reset_signal_handlers(void); +static void clear_cgroup(void); +static const char *get_arg(int *argc, char ***argv); +static int get_fd(int *argc, char ***argv); +static void error(int err, const char *msg, ...); +static void init_syslog(const char *key, bool redirect_stderr_to_stdout); + +static int error_fd = -1; + +int main(int argc, char **argv) +{ + unsigned num_mappings = 3; + bool redirect_stderr_to_stdout = false; + const char *key = NULL; + struct rlimit nofile_limit; + mapping mappings_buf[MAX_TOTAL_MAPPINGS]; + exec_info info[1] = { NULL, }; + const char *directory = "/"; + + mapped_logs logs = mapped_logs_open(); +#undef log +#define log(fmt, ...) mapped_logs_add(logs, fmt "\n", ## __VA_ARGS__) +#define log_fail(fmt, ...) do {\ + mapped_logs_failure(logs); \ + mapped_logs_add(logs, fmt "\n", ## __VA_ARGS__); \ +} while(0) + + log("starting"); + + info->mappings = mappings_buf; + for (int i = 0; i < 3; ++i) { + mapping *const m = &info->mappings[i]; + m->uuid = NULL; + m->current_fd = -1; + m->wanted_fd = i; + } + + // Scan all arguments, check them and collect some information. + ++argv; + --argc; + for (;;) { + // we must have an argument left + const char *arg = get_arg(&argc, &argv); + + // next must be a single letter option + if (arg[0] != '-' || arg[1] == 0 || arg[2] != 0) + error(EINVAL, "Invalid option %s", arg); + + // final "--" + if (arg[1] == '-') + break; + + switch (arg[1]) { + case 'I': // stdin + info->mappings[0].current_fd = get_fd(&argc, &argv); + break; + case 'O': // stdout + info->mappings[1].current_fd = get_fd(&argc, &argv); + break; + case 'E': // stderr + info->mappings[2].current_fd = get_fd(&argc, &argv); + break; + case 'm': { // mapping + if (num_mappings >= MAX_TOTAL_MAPPINGS) { + log_fail("too many mappings"); + mapped_logs_close(logs); + error(EINVAL, "Too many mappings"); + } + const char *uuid = get_arg(&argc, &argv); + if (strlen(uuid) != 36) { + log_fail("invalid mapping"); + mapped_logs_close(logs); + error(EINVAL, "Invalid mapping UUID"); + } + const int fd = get_fd(&argc, &argv); + mapping* const m = &info->mappings[num_mappings++]; + m->uuid = uuid; + m->current_fd = fd; + m->wanted_fd = -1; + } + break; + case 's': // syslog (with key) + key = get_arg(&argc, &argv); + break; + case 'S': // syslog stderr to stdout + redirect_stderr_to_stdout = true; + break; + case 'd': + directory = get_arg(&argc, &argv); + break; + case 'e': { // error file descriptor + error_fd = get_fd(&argc, &argv); + if (num_mappings >= MAX_TOTAL_MAPPINGS) { + log_fail("too many mappings"); + mapped_logs_close(logs); + error(EINVAL, "Too many mappings"); + } + mapping* const m = &info->mappings[num_mappings++]; + m->uuid = NULL; + m->current_fd = error_fd; + m->wanted_fd = -1; + } + break; + default: + error(EINVAL, "Invalid option %s", arg); + } + } + + if (argc < 1) { + log_fail("no args"); + mapped_logs_close(logs); + error(EINVAL, "No command arguments"); + } + + info->args = argv; + + if (getrlimit(RLIMIT_NOFILE, &nofile_limit) < 0) { + int err = errno; + log_fail("getrlimit error"); + mapped_logs_close(logs); + error(err, "getrlimit"); + } + + sigset_t sigset; + + // Compute the file operations we need to do for the file mappings + int num_operations = + redirect_mappings(info->mappings, num_mappings, info->operations); + + if (FORKEXECD_DEBUG_LOGS) { + for (size_t n = 0; info->args[n]; ++n) + log("arg %zd %s", n, info->args[n]); + } + + // Rename all command line. + adjust_args(info->args, info->mappings, num_mappings); + + if (FORKEXECD_DEBUG_LOGS) { + for (size_t n = 0; info->args[n]; ++n) + log("arg %zd %s", n, info->args[n]); + } + + reset_signal_handlers(); + + if (strcmp(directory, ".") != 0 && chdir(directory) < 0) { + int err = errno; + log_fail("chdir %d", err); + error(err, "chdir"); + } + + // Clear cgroup otherwise systemd will shutdown processes if + // toolstack is restarted. + clear_cgroup(); + + if (setsid() < 0) { + int err = errno; + log_fail("setsid %d", errno); + error(err, "setsid"); + } + + // Redirect file descriptors. + int err = 0; + const char *err_func = NULL; + for (int i = 0; i < num_operations && err == 0; ++i) { + const fd_operation* const op = &info->operations[i]; + log("op %d %d %d", op->fd_from, op->fd_to, op->operation); + switch (op->operation) { + case FD_OP_DUP: + if (op->fd_from == op->fd_to) { + // These file descriptors came from another process, + // so surely they have the CLOEXEC flag set, nothing + // to do. + break; + } else { + err_func = "dup2"; + if (dup2(op->fd_from, op->fd_to) < 0) + err = errno; + if (op->fd_from == error_fd) + error_fd = op->fd_to; + } + break; + case FD_OP_MOVE: + err_func = "dup2"; + if (dup2(op->fd_from, op->fd_to) < 0) + err = errno; + if (op->fd_from == error_fd) + error_fd = op->fd_to; + close(op->fd_from); + break; + case FD_OP_DEVNULL: + // first close old, then create new one + close(op->fd_to); + // TODO ideally we want read only for input for Ocaml did the same... + err_func = "open"; + if (open("/dev/null", O_WRONLY) != op->fd_to) + err = errno ? errno : EBADF; + break; + case FD_OP_CLOSE_FROM: + close_fds_from(op->fd_from); + break; + default: + err_func = "safe_exec"; + err = EINVAL; + } + } + if (err != 0) { + info->err = err; + info->err_func = err_func; + log_fail("redirect error %d in %s", err, err_func); + error(err, "%s", err_func); + } + + if (key) + init_syslog(key, redirect_stderr_to_stdout); + + // Limit number of files limits to standard limit to avoid + // creating bugs with old programs. + if (nofile_limit.rlim_cur > 1024) { + nofile_limit.rlim_cur = 1024; + setrlimit(RLIMIT_NOFILE, &nofile_limit); + } + + // Reset signal mask, inherited by the process we are going to execute + sigemptyset(&sigset); + pthread_sigmask(SIG_SETMASK, &sigset, NULL); + + log("execv..."); + mapped_logs_success(logs); + if (error_fd >= 0) + close(error_fd); + execv(info->args[0], info->args); + log_fail("execve failed %d", errno); + // Here we could set err and err_func but we kept compatibility + // with forkexecd daemon. + exit(errno == ENOENT ? 127 : 126); +} + +static void +adjust_args(char **args, mapping *const mappings, unsigned num_mappings) +{ + for (; *args; ++args) { + char *arg = *args; + size_t len = strlen(arg); + if (len < 36) + continue; + + // replace uuid with file descriptor + char *uuid = arg + len - 36; + for (unsigned i = 0; i < num_mappings; ++i) { + const mapping *m = &mappings[i]; + if (m->uuid == NULL || strcmp(m->uuid, uuid) != 0) + continue; + sprintf(uuid, "%d", m->current_fd); + } + } +} + +static void +reset_signal_handlers(void) +{ + for (int sig = 1; sig < NSIG; ++sig) { + // these signals can't be overridden + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + // Set signal dispositions. + // This avoids inherit unwanted overrides. + // Also prevent handling unwanted signal handler, especially using vfork(). + // Use ignore SIGPIPE for compatibility with forkexecd. + signal(sig, sig == SIGPIPE ? SIG_IGN : SIG_DFL); + } +} + +static void +clear_cgroup(void) +{ + int fd = open("/sys/fs/cgroup/systemd/cgroup.procs", O_WRONLY|O_CLOEXEC); + if (fd >= 0) { + char string_pid[32]; + int ignored __attribute__((unused)); + sprintf(string_pid, "%d\n", (int) getpid()); + ignored = write(fd, string_pid, strlen(string_pid)); + close(fd); + } +} + +static const char * +get_arg(int *argc, char ***argv) +{ + if (*argc < 0) + error(EINVAL, "Expected one more argument"); + + const char *arg = **argv; + --(*argc); + ++(*argv); + return arg; +} + +static int +get_fd(int *argc, char ***argv) +{ + const char *arg = get_arg(argc, argv); + unsigned long fd = strtoul(arg, NULL, 0); + if (fd < 0 || fd > INT_MAX) + error(EINVAL, "Expected valid file descriptor number"); + return (int) fd; +} + +static void +error(int err, const char *format, ...) +{ + if (error_fd >= 0) { + msg_t msg = { err }; + va_list ap; + va_start(ap, format); + int ignored __attribute__((unused)); + vsnprintf(msg.msg_buf, sizeof(msg.msg_buf), format, ap); + msg.msg_buf[sizeof(msg.msg_buf) - 1] = 0; + va_end(ap); + ignored = write(error_fd, &msg, offsetof(msg_t, msg_buf) + strlen(msg.msg_buf) + 1); + } + exit(125); +} + +static void +init_syslog(const char *key, bool redirect_stderr_to_stdout) +{ + int fds[2]; + if (pipe(fds) < 0) + error(errno, "pipe"); + dup2(fds[1], 1); + if (redirect_stderr_to_stdout) + dup2(fds[1], 2); + close(fds[1]); + + const int child_pid = (int) getpid(); + + pid_t pid = fork(); + if (pid < 0) + error(errno, "fork"); + + if (pid == 0) { + // child + close(0); + close(1); + if (open("/dev/null", O_RDONLY) != 0 + || open("/dev/null", O_WRONLY) != 1) + error(errno, "open"); + dup2(1, 2); + if (fds[0] != 3) { + dup2(fds[0], 3); + fds[0] = 3; + } + close_fds_from(4); + + pid = fork(); + if (pid < 0) + error(errno, "fork"); + if (pid > 0) + // parent + exit(0); + + openlog("forkexecd", 0, LOG_DAEMON); + forward_to_syslog(fds[0], key, child_pid); + exit(0); + } + + close(fds[0]); + + // parent + int status; + wait(&status); + if (!WIFEXITED(status)) + error(EPIPE, "syslogger"); + + switch (WEXITSTATUS(status)) { + case 0: + // success + return; + case 125: + // forward error, a proper message will be forwarded + exit(125); + } + error(EPIPE, "syslogger"); +} diff --git a/ocaml/forkexecd/helper/vfork_helper.h b/ocaml/forkexecd/helper/vfork_helper.h new file mode 100644 index 00000000000..08fd1ff21af --- /dev/null +++ b/ocaml/forkexecd/helper/vfork_helper.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#pragma once + +// Common structure to pass errors from helper to library +typedef struct { + // numeric C error + int err; + // message + char msg_buf[1000]; +} msg_t; diff --git a/ocaml/forkexecd/lib/dune b/ocaml/forkexecd/lib/dune index 749f173b977..b4e02786e67 100644 --- a/ocaml/forkexecd/lib/dune +++ b/ocaml/forkexecd/lib/dune @@ -15,4 +15,10 @@ xapi-stdext-unix xapi-tracing ) - (preprocess (per_module ((pps ppx_deriving_rpc) Fe)))) + (preprocess (per_module ((pps ppx_deriving_rpc) Fe))) + (foreign_stubs + (language c) + (names fe_stubs) + (include_dirs ../helper) + (flags :standard -Wall -Werror) + )) diff --git a/ocaml/forkexecd/lib/fe_stubs.c b/ocaml/forkexecd/lib/fe_stubs.c new file mode 100644 index 00000000000..813d06106d1 --- /dev/null +++ b/ocaml/forkexecd/lib/fe_stubs.c @@ -0,0 +1,405 @@ +/* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../helper/vfork_helper.h" + +#define FOREACH_LIST(name, list) \ + for(value name = (list); name != Val_emptylist; name = Field(name, 1)) + +// Create thread reducing stack usage to a minimum to reduce memory usage. +// Returns error number (like pthread_create). +static int create_thread_minstack(pthread_t *th, void *(*proc)(void *), void *arg); + +static inline void +reap_pid(pid_t pid) +{ + int status; + while (waitpid(pid, &status, 0) < 0 && errno == EINTR) + continue; +} + +static void * +thread_proc_reap(void *arg) +{ + pid_t pid = (pid_t) (intptr_t) arg; + + reap_pid(pid); + + return NULL; +} + +// Appends a string to *p_dest buffer. +// It updates *p_dest to point after copied string. +// Returns copied string. +static char * +append_string(char **p_dest, const char *s) +{ + char *const dest = *p_dest; + size_t const size = strlen(s) + 1; + memcpy(dest, s, size); + *p_dest = dest + size; + return dest; +} + +static char** +copy_string_list(value list) +{ + size_t strings_size = 0; + size_t list_size = 0; + char **res, **ptrs; + char *strings; + + FOREACH_LIST(item, list) { + strings_size += strlen(String_val(Field(item, 0))) + 1; + ++list_size; + } + + res = (char **) malloc(sizeof(char*) * (list_size + 1) + strings_size); + if (!res) + return NULL; + + ptrs = res; + strings = (char *) (res + (list_size + 1)); + FOREACH_LIST(item, list) + *ptrs++ = append_string(&strings, String_val(Field(item, 0))); + *ptrs = NULL; + + return res; +} + +static void +close_fd(int *const p_fd) +{ + const int fd = *p_fd; + if (fd >= 0) { + *p_fd = -1; + close(fd); + } +} + +typedef struct { + const char *err_msg; + pid_t pid; + msg_t msg; +} safe_exec_result; + +static int +safe_exec_with_helper(safe_exec_result *res, char **args, char **envs) +{ + int err = EINVAL; + char fd_string[48]; + int pipe_fds[2] = { -1, -1 }; + + res->err_msg = "safe_exec"; + + if (!args[0] || !args[1] || !args[2]) + return EINVAL; + + if (strcmp(args[1], "-e") == 0) { + if (pipe(pipe_fds) < 0) { + res->err_msg = "pipe"; + return errno; + } + sprintf(fd_string, "%d", pipe_fds[1]); + args[2] = fd_string; + } + + sigset_t sigset, old_sigset; + int cancellation_state; + + // Disable cancellation to avoid some signals. + // Glibc use some signals to handle thread cancellation. + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancellation_state); + + // Block all possible signals to avoid receiving some in the child. + // Signal mask is inherited to new process/thread will start with + // all signals disabled and we can safely change them. + sigfillset(&sigset); + pthread_sigmask(SIG_BLOCK, &sigset, &old_sigset); + + // fork + err = 0; + res->pid = vfork(); + if (res->pid < 0) { + err = errno; + } else if (res->pid == 0) { + // child + if (pipe_fds[0] >= 0) + close(pipe_fds[0]); + execve(args[0], args, envs); + // keep compatibility with forkexecd daemon. + _exit(errno == ENOENT ? 127 : 126); + } + + // Restore thread state + pthread_sigmask(SIG_SETMASK, &old_sigset, NULL); + pthread_setcancelstate(cancellation_state, NULL); + + // We don't need writing pipe anymore and we need to detect + // if closed so we can't keep it open + close_fd(&pipe_fds[1]); + + if (err != 0) { + close_fd(&pipe_fds[0]); + res->err_msg = "vfork"; + return err; + } + + // Handle errors from helper + if (pipe_fds[0] >= 0) { + int readed; + // Note that buffer is small and written atomically by + // the helper, no reason for the kernel to split it. + while ((readed = read(pipe_fds[0], &res->msg, sizeof(res->msg))) < 0 + && errno == EINTR) + continue; + close_fd(&pipe_fds[0]); + if (readed != 0 && readed < offsetof(msg_t, msg_buf) + 1) { + // This should never happen !!! + // At this point the process is created and we have a pid so + // we cannot just return an error. + // We could try to wait the process but it should fail, let + // returns success and let caller read process status result. + return 0; + } + res->msg.msg_buf[sizeof(res->msg.msg_buf) - 1] = 0; + if (readed > 0) { + // Wait the process otherwise we'll have a zombie + reap_pid(res->pid); + + res->err_msg = res->msg.msg_buf; + return res->msg.err; + } + } + return 0; +} + +CAMLprim value +caml_safe_exec_with_helper(value args, value environment) +{ + CAMLparam2(args, environment); + + // Copy parameters to C + char **c_args = copy_string_list(args); + char **c_envs = copy_string_list(environment); + if (!c_envs || !c_args) { + free(c_envs); + free(c_args); + caml_raise_out_of_memory(); + } + + // potentially slow section, release Ocaml engine + caml_enter_blocking_section(); + + safe_exec_result res; + int err = safe_exec_with_helper(&res, c_args, c_envs); + + free(c_envs); + free(c_args); + + caml_leave_blocking_section(); + + // error, notify with an exception + if (err != 0) + unix_error(err, res.err_msg, Nothing); + + CAMLreturn(Val_int(res.pid)); +} + +CAMLprim value +caml_pidwaiter_dontwait(value pid_val) +{ + CAMLparam1(pid_val); + pid_t pid = Int_val(pid_val); + + // reap the pid to avoid zombies + pthread_t th; + if (create_thread_minstack(&th, thread_proc_reap, (void *) (intptr_t) pid) == 0) + pthread_detach(th); + + CAMLreturn(Val_unit); +} + +typedef struct { + pid_t pid; + bool timed_out; + bool stop; + struct timespec deadline; + pthread_mutex_t mtx; + pthread_cond_t cond; +} timeout_kill; + +static void * +thread_proc_timeout_kill(void *arg) +{ + timeout_kill *tm = (timeout_kill *) arg; + + pthread_mutex_lock(&tm->mtx); + int res = tm->stop ? 0: + pthread_cond_timedwait(&tm->cond, &tm->mtx, &tm->deadline); + pthread_mutex_unlock(&tm->mtx); + + if (res == ETIMEDOUT) { + kill(tm->pid, SIGKILL); + tm->timed_out = true; + } + return NULL; +} + +static int +create_thread_minstack(pthread_t *th, void *(*proc)(void *), void *arg) +{ + int res; + + // disable any possible signal handler so we can safely use a small stack + // for the thread + sigset_t sigset, old_sigset; + sigfillset(&sigset); + pthread_sigmask(SIG_BLOCK, &sigset, &old_sigset); + + pthread_attr_t th_attr; + res = pthread_attr_init(&th_attr); + if (!res) { + pthread_attr_setstacksize(&th_attr, PTHREAD_STACK_MIN); + + res = pthread_create(th, &th_attr, proc, arg); + + pthread_attr_destroy(&th_attr); + } + pthread_sigmask(SIG_SETMASK, &old_sigset, NULL); + + return res; +} + +/* + * Wait a process with a given timeout. + * At the end of timeout (if trigger) kill the process. + * To avoid race we need to wait a specific process, but this is blocking + * and we use a timeout to implement the wait. Timer functions are per + * process, not per thread. + * Returns <0 if error, 0 if not timed out, >0 if timedout. + */ +static int +wait_process_timeout(pid_t pid, double timeout) +{ + int err; + + // compute deadline + timeout_kill tm = { pid, false, false }; + if (clock_gettime(CLOCK_MONOTONIC, &tm.deadline) < 0) + return -errno; + + double f = floor(timeout); + tm.deadline.tv_sec += f; + tm.deadline.tv_nsec += (timeout - f) * 1000000000.; + if (tm.deadline.tv_nsec >= 1000000000) { + tm.deadline.tv_nsec -= 1000000000; + tm.deadline.tv_sec += 1; + } + + pthread_condattr_t attr; + err = pthread_condattr_init(&attr); + if (err) + return -err; + err = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + if (!err) + err = pthread_cond_init(&tm.cond, &attr); + pthread_condattr_destroy(&attr); + if (err) + return -err; + + err = pthread_mutex_init(&tm.mtx, NULL); + if (err) { + pthread_cond_destroy(&tm.cond); + return -err; + } + + // Create timeout thread + pthread_t th; + err = create_thread_minstack(&th, thread_proc_timeout_kill, &tm); + if (err) { + pthread_cond_destroy(&tm.cond); + pthread_mutex_destroy(&tm.mtx); + return -err; + } + + // Wait the process, we avoid to reap the other process to avoid + // race conditions. Consider: + // - process exit; + // - we reap the thread; + // - OS reuse the pid; + // - timeout thread terminate the pid, now reused. + // Avoiding reaping the process will create a zombie process so + // the KILL would be directed to that. + siginfo_t info; + waitid(P_PID, pid, &info, WEXITED|WNOWAIT); + + // Close the timeout thread + pthread_mutex_lock(&tm.mtx); + // We use also a variable to avoid races like + // - we create the thread; + // - we start waiting the process which was already exited; + // - we came here trying to close the thread; + // - thread waits for signal. + tm.stop = true; + pthread_cond_broadcast(&tm.cond); + pthread_mutex_unlock(&tm.mtx); + pthread_join(th, NULL); + + // Cleanup + pthread_cond_destroy(&tm.cond); + pthread_mutex_destroy(&tm.mtx); + + return tm.timed_out ? 1 : 0; +} + +CAMLprim value +caml_pidwaiter_waitpid(value timeout_value, value pid_value) +{ + CAMLparam0(); + double timeout = timeout_value == Val_none ? 0 : Double_val(Some_val(timeout_value)); + pid_t pid = Int_val(pid_value); + + caml_enter_blocking_section(); + + bool timed_out = false; + int err = 0; + if (timeout > 0) { + int res = wait_process_timeout(pid, timeout); + if (res < 0) + err = -res; + else if (res != 0) + timed_out = true; + } + + caml_leave_blocking_section(); + + if (err) + unix_error(err, "waitpid", Nothing); + + CAMLreturn(timed_out ? Val_true: Val_false); +} diff --git a/ocaml/forkexecd/lib/forkhelpers.ml b/ocaml/forkexecd/lib/forkhelpers.ml index 7b7fc0b2247..9dc065e7ff9 100644 --- a/ocaml/forkexecd/lib/forkhelpers.ml +++ b/ocaml/forkexecd/lib/forkhelpers.ml @@ -40,12 +40,32 @@ let with_tracing ~tracing ~name f = Tracing.with_tracing ~parent:tracing ~name f let finally = Xapi_stdext_pervasives.Pervasiveext.finally -type pidty = Unix.file_descr * int +let use_daemon = ref false + +module FEStubs = struct + external safe_exec_with_helper : string list -> string list -> int + = "caml_safe_exec_with_helper" + + (* timeout <= 0 wait infinite *) + external pidwaiter_waitpid : ?timeout:float -> int -> bool + = "caml_pidwaiter_waitpid" + + (* do not wait for a process, release it, it won't generate a zombie process *) + external pidwaiter_dontwait : int -> unit = "caml_pidwaiter_dontwait" +end + +type waiter = Pidwaiter | Sock of Unix.file_descr + +type pidty = waiter * int (* The forking executioner has been used, therefore we need to tell *it* to waitpid *) -let string_of_pidty (fd, pid) = - Printf.sprintf "(FEFork (%d,%d))" (Fd_send_recv.int_of_fd fd) pid +let string_of_pidty (waiter, pid) = + match waiter with + | Pidwaiter -> + Printf.sprintf "(FEFork (%d))" pid + | Sock fd -> + Printf.sprintf "(FEFork (%d,%d))" (Fd_send_recv.int_of_fd fd) pid exception Subprocess_failed of int @@ -53,7 +73,7 @@ exception Subprocess_killed of int exception Subprocess_timeout -let waitpid (sock, pid) = +let waitpid_daemon sock pid = let status = Fecomms.read_raw_rpc sock in Unix.close sock ; match status with @@ -79,7 +99,7 @@ let waitpid (sock, pid) = (* [waitpid_nohang] reports the status of a socket to a process. The intention is to make this non-blocking. If the process is finished, the socket is closed and not otherwise. *) -let waitpid_nohang (sock, pid) = +let waitpid_nohang_daemon sock pid = let verbose = false in if verbose then D.debug "%s pid=%d" __FUNCTION__ pid ; let fail fmt = Printf.ksprintf failwith fmt in @@ -118,7 +138,7 @@ let waitpid_nohang (sock, pid) = fail "%s: error happened when trying to read the status. %s" __FUNCTION__ (Printexc.to_string exn) -let dontwaitpid (sock, _pid) = +let dontwaitpid_daemon sock _pid = ( try (* Try to tell the child fe that we're not going to wait for it. If the other end of the pipe has been closed then this doesn't matter, as this @@ -128,6 +148,27 @@ let dontwaitpid (sock, _pid) = ) ; Unix.close sock +let waitpid (waiter, pid) = + match waiter with + | Pidwaiter -> + Unix.waitpid [] pid + | Sock sock -> + waitpid_daemon sock pid + +let waitpid_nohang (waiter, pid) = + match waiter with + | Pidwaiter -> + Unix.waitpid [Unix.WNOHANG] pid + | Sock sock -> + waitpid_nohang_daemon sock pid + +let dontwaitpid (waiter, pid) = + match waiter with + | Pidwaiter -> + FEStubs.pidwaiter_dontwait pid + | Sock sock -> + dontwaitpid_daemon sock pid + let waitpid_fail_if_bad_exit ty = let _, status = waitpid ty in match status with @@ -140,7 +181,7 @@ let waitpid_fail_if_bad_exit ty = | Unix.WSTOPPED n -> raise (Subprocess_killed n) -let getpid (_sock, pid) = pid +let getpid (_waiter, pid) = pid type 'a result = Success of string * 'a | Failure of string * exn @@ -179,12 +220,9 @@ type syslog_stdout = | Syslog_DefaultKey | Syslog_WithKey of string -(** Safe function which forks a command, closing all fds except a whitelist and - having performed some fd operations in the child *) -let safe_close_and_exec ?tracing ?env stdin stdout stderr +let safe_close_and_exec_daemon ?tracing env stdin stdout stderr (fds : (string * Unix.file_descr) list) ?(syslog_stdout = NoSyslogging) - ?(redirect_stderr_to_stdout = false) (cmd : string) (args : string list) = - with_tracing ~tracing ~name:__FUNCTION__ @@ fun tracing -> + ?(redirect_stderr_to_stdout = false) args = let sock = Fecomms.open_unix_domain_sock_client ?tracing (Filename.concat runtime_path "/xapi/forker/main") @@ -228,7 +266,6 @@ let safe_close_and_exec ?tracing ?env stdin stdout stderr List.fold_left maybe_add_id_to_fd_map dest_named_fds predefined_fds in - let env = Option.value ~default:default_path_env_pair env in let syslog_stdout = match syslog_stdout with | NoSyslogging -> @@ -241,7 +278,7 @@ let safe_close_and_exec ?tracing ?env stdin stdout stderr Fecomms.write_raw_rpc ?tracing sock (Fe.Setup { - Fe.cmdargs= cmd :: args + Fe.cmdargs= args ; env= Array.to_list env ; id_to_fd_map ; syslog_stdout @@ -295,7 +332,7 @@ let safe_close_and_exec ?tracing ?env stdin stdout stderr match Fecomms.read_raw_rpc ?tracing sock with | Ok (Fe.Execed pid) -> remove_fd_from_close_list sock ; - (sock, pid) + (Sock sock, pid) | Ok status -> let msg = Printf.sprintf @@ -314,6 +351,64 @@ let safe_close_and_exec ?tracing ?env stdin stdout stderr ) close_fds +let safe_close_and_exec_vfork ?tracing env stdin stdout stderr + (fds : (string * Unix.file_descr) list) ?(syslog_stdout = NoSyslogging) + ?(redirect_stderr_to_stdout = false) cmd args = + let string_of_fd (fd : Unix.file_descr) = string_of_int (Obj.magic fd) in + let args = "--" :: args in + let args = if redirect_stderr_to_stdout then "-S" :: args else args in + let args = + match syslog_stdout with + | NoSyslogging -> + args + | Syslog_DefaultKey -> + "-s" :: Filename.basename cmd :: args + | Syslog_WithKey key -> + "-s" :: key :: args + in + let args = + List.fold_right + (fun (uuid, fd) args -> + Unix.clear_close_on_exec fd ; + "-m" :: uuid :: string_of_fd fd :: args + ) + fds args + in + let add_std args arg fd = + match fd with + | Some fd -> + Unix.clear_close_on_exec fd ; + arg :: string_of_fd fd :: args + | None -> + args + in + let args = add_std args "-E" stderr in + let args = add_std args "-O" stdout in + let args = add_std args "-I" stdin in + let args = "/usr/libexec/xapi/vfork_helper" :: "-e" :: "DUMMY" :: args in + (* Convert environment and add tracing variables. *) + let env = + List.append (Tracing.EnvHelpers.of_span tracing) (Array.to_list env) + in + let pid = FEStubs.safe_exec_with_helper args env in + (Pidwaiter, pid) + +(** Safe function which forks a command, closing all fds except a whitelist and + having performed some fd operations in the child *) +let safe_close_and_exec ?tracing ?env stdin stdout stderr + (fds : (string * Unix.file_descr) list) ?(syslog_stdout = NoSyslogging) + ?(redirect_stderr_to_stdout = false) (cmd : string) (args : string list) = + with_tracing ~tracing ~name:__FUNCTION__ @@ fun tracing -> + let args = cmd :: args in + let env = Option.value ~default:default_path_env_pair env in + + if not !use_daemon then (* Build a list of arguments as helper wants. *) + safe_close_and_exec_vfork ?tracing env stdin stdout stderr fds + ~syslog_stdout ~redirect_stderr_to_stdout cmd args + else + safe_close_and_exec_daemon ?tracing env stdin stdout stderr fds + ~syslog_stdout ~redirect_stderr_to_stdout args + let execute_command_get_output_inner ?tracing ?env ?stdin ?(syslog_stdout = NoSyslogging) ?(redirect_stderr_to_stdout = false) ?(timeout = -1.0) cmd args = @@ -342,7 +437,7 @@ let execute_command_get_output_inner ?tracing ?env ?stdin with_tracing ~tracing ~name:"Forkhelpers.with_logfile_err_fd" @@ fun tracing -> with_logfile_fd "execute_command_get_err" (fun err_fd -> - let sock, pid = + let waiter, pid = safe_close_and_exec ?tracing ?env (Option.map (fun (_, fd, _) -> fd) stdinandpipes) (Some out_fd) (Some err_fd) [] ~syslog_stdout @@ -354,14 +449,26 @@ let execute_command_get_output_inner ?tracing ?env ?stdin close wr ) stdinandpipes ; - if timeout > 0. then - Unix.setsockopt_float sock Unix.SO_RCVTIMEO timeout ; - with_tracing ~tracing ~name:"Forkhelpers.waitpid" @@ fun _ -> - try waitpid (sock, pid) - with Unix.(Unix_error ((EAGAIN | EWOULDBLOCK), _, _)) -> - Unix.kill pid Sys.sigkill ; - ignore (waitpid (sock, pid)) ; - raise Subprocess_timeout + match waiter with + | Pidwaiter -> + with_tracing ~tracing ~name:"Forkhelpers.waitpid" + @@ fun _ -> + let timedout = FEStubs.pidwaiter_waitpid ~timeout pid in + let res = Unix.waitpid [] pid in + + if timedout then raise Subprocess_timeout ; + res + | Sock sock -> ( + if timeout > 0. then + Unix.setsockopt_float sock Unix.SO_RCVTIMEO timeout ; + with_tracing ~tracing ~name:"Forkhelpers.waitpid" + @@ fun _ -> + try waitpid_daemon sock pid + with Unix.(Unix_error ((EAGAIN | EWOULDBLOCK), _, _)) -> + Unix.kill pid Sys.sigkill ; + ignore (waitpid_daemon sock pid) ; + raise Subprocess_timeout + ) ) ) with diff --git a/ocaml/forkexecd/test/dune b/ocaml/forkexecd/test/dune index b190bed81aa..4e57ff2804e 100644 --- a/ocaml/forkexecd/test/dune +++ b/ocaml/forkexecd/test/dune @@ -13,6 +13,6 @@ (rule (alias runtest) (package xapi-forkexecd) - (deps fe_test.sh fe_test.exe ../src/fe_main.exe syslog.so) + (deps fe_test.sh fe_test.exe ../src/fe_main.exe syslog.so ../helper/vfork_helper) (action (run ./fe_test.sh))) diff --git a/ocaml/forkexecd/test/fe_test.sh b/ocaml/forkexecd/test/fe_test.sh index fe454e89802..05d855f85e7 100755 --- a/ocaml/forkexecd/test/fe_test.sh +++ b/ocaml/forkexecd/test/fe_test.sh @@ -8,7 +8,9 @@ export FE_TEST=1 SOCKET=${XDG_RUNTIME_DIR}/xapi/forker/main rm -f "$SOCKET" -LD_PRELOAD="$PWD/syslog.so" ../src/fe_main.exe & +LD_PRELOAD="$PWD/syslog.so" \ +TEST_VFORK_HELPER="$PWD/../helper/vfork_helper" \ +../src/fe_main.exe & MAIN=$! cleanup () { kill $MAIN @@ -17,4 +19,6 @@ trap cleanup EXIT INT for _ in $(seq 1 10); do test -S ${SOCKET} || sleep 1 done -echo "" | LD_PRELOAD="$PWD/syslog.so" ./fe_test.exe 16 +echo "" | LD_PRELOAD="$PWD/syslog.so" \ +TEST_VFORK_HELPER="$PWD/../helper/vfork_helper" \ +./fe_test.exe 16 diff --git a/ocaml/forkexecd/test/syslog.c b/ocaml/forkexecd/test/syslog.c index 2316e84a25e..10e3dc3c79f 100644 --- a/ocaml/forkexecd/test/syslog.c +++ b/ocaml/forkexecd/test/syslog.c @@ -18,6 +18,23 @@ if (!old_func) \ old_func = (typeof(name) *) dlsym(RTLD_NEXT, #name); +#define strlcpy _strlcpy +static size_t +strlcpy(char *dest, const char *src, size_t len) +{ + size_t l = strlen(src); + + if (len) { + --len; + if (l <= len) + len = l; + + memcpy(dest, src, len); + dest[len] = 0; + } + return l; +} + int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) { static const char dev_log[] = "/dev/log"; @@ -119,3 +136,50 @@ void __vsyslog_chk(int priority, int flags, const char *format, va_list ap) { vsyslog_internal(priority, format, ap); } + +static char vfork_helper[256] = "/usr/libexec/xapi/vfork_helper"; +static char ld_preload[512]; + +static const char ld_prefix[] = "LD_PRELOAD="; +enum { len_prefix = sizeof(ld_prefix) - 1 }; + +__attribute__((constructor)) +static void initialize(void) +{ + const char *env; + env = getenv("TEST_VFORK_HELPER"); + if (env) + strlcpy(vfork_helper, env, sizeof(vfork_helper)); + env = getenv("LD_PRELOAD"); + if (env) { + strcpy(ld_preload, ld_prefix); + strlcpy(ld_preload + len_prefix, env, sizeof(ld_preload) - len_prefix); + } +} + +int execve(const char *pathname, char *const argv[], char *const envp[]) +{ + START(execve); + + if (strcmp(pathname, "/usr/libexec/xapi/vfork_helper") == 0) + pathname = vfork_helper; + + if (envp && ld_preload[0]) { + bool ok = false; + size_t num_env = 0; + for (char * const *e = envp; *e; ++e) { + ++num_env; + if (strncmp(*e, ld_prefix, len_prefix) == 0) + ok = true; + } + if (!ok) { + // allocate on stack, we could be inside a vfork() created process + char **new_envs = alloca(sizeof(char*) * (num_env + 2)); + *new_envs = ld_preload; + memcpy(new_envs + 1, envp, sizeof(char*) * (num_env + 1)); + envp = new_envs; + } + } + + return old_func(pathname, argv, envp); +} diff --git a/ocaml/libs/stunnel/stunnel.ml b/ocaml/libs/stunnel/stunnel.ml index 8d319b4b80d..c7fb013ff54 100644 --- a/ocaml/libs/stunnel/stunnel.ml +++ b/ocaml/libs/stunnel/stunnel.ml @@ -110,11 +110,6 @@ type pid = | FEFork of Forkhelpers.pidty (** the forkhelpers module did it for us. *) | Nopid -(* let string_of_pid = function - | StdFork x -> Printf.sprintf "(StdFork %d)" x - | FEFork x -> Forkhelpers.string_of_pidty x - | Nopid -> "None" *) - let getpid ty = match ty with | StdFork pid ->