From adef0dedef1dca84f0d03fa564b1b79812031faf Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Wed, 30 May 2018 11:49:43 +0900 Subject: [PATCH] oci-worker: experimental support for rootless mode Signed-off-by: Akihiro Suda --- README.md | 4 + cmd/buildkitd/main.go | 27 +- cmd/buildkitd/main_containerd_worker.go | 6 + cmd/buildkitd/main_linux.go | 9 + cmd/buildkitd/main_nolinux.go | 7 + cmd/buildkitd/main_oci_worker.go | 48 +- docs/rootless.md | 51 ++ executor/runcexecutor/executor.go | 49 +- util/appdefaults/appdefaults_unix.go | 47 + util/appdefaults/appdefaults_windows.go | 12 + util/libcontainer_specconv/README.md | 1 + util/libcontainer_specconv/example.go | 190 ++++ util/libcontainer_specconv/spec_linux_test.go | 190 ++++ vendor.conf | 3 +- .../runc/libcontainer/configs/blkio_device.go | 61 ++ .../runc/libcontainer/configs/cgroup_linux.go | 122 +++ .../libcontainer/configs/cgroup_windows.go | 6 + .../runc/libcontainer/configs/config.go | 349 ++++++++ .../runc/libcontainer/configs/config_linux.go | 61 ++ .../runc/libcontainer/configs/device.go | 57 ++ .../libcontainer/configs/device_defaults.go | 111 +++ .../libcontainer/configs/hugepage_limit.go | 9 + .../runc/libcontainer/configs/intelrdt.go | 7 + .../configs/interface_priority_map.go | 14 + .../runc/libcontainer/configs/mount.go | 39 + .../runc/libcontainer/configs/namespaces.go | 5 + .../libcontainer/configs/namespaces_linux.go | 122 +++ .../configs/namespaces_syscall.go | 31 + .../configs/namespaces_syscall_unsupported.go | 13 + .../configs/namespaces_unsupported.go | 8 + .../runc/libcontainer/configs/network.go | 72 ++ .../libcontainer/configs/validate/rootless.go | 117 +++ .../configs/validate/validator.go | 212 +++++ .../runc/libcontainer/intelrdt/intelrdt.go | 553 ++++++++++++ .../runc/libcontainer/intelrdt/stats.go | 24 + .../runc/libcontainer/nsenter/nsexec.c | 63 +- .../runc/libcontainer/seccomp/config.go | 76 ++ .../libcontainer/seccomp/seccomp_linux.go | 258 ++++++ .../seccomp/seccomp_unsupported.go | 24 + .../runc/libcontainer/specconv/example.go | 221 +++++ .../runc/libcontainer/specconv/spec_linux.go | 836 ++++++++++++++++++ .../runc/libcontainer/system/linux.go | 38 +- .../runc/libcontainer/system/unsupported.go | 18 + .../runc/libcontainer/user/lookup_unix.go | 26 + .../runc/libcontainer/user/user.go | 129 ++- .../runc/libcontainer/utils/cmsg.go | 93 ++ .../runc/libcontainer/utils/utils.go | 127 +++ .../runc/libcontainer/utils/utils_unix.go | 44 + .../github.com/opencontainers/selinux/LICENSE | 201 +++++ .../opencontainers/selinux/README.md | 7 + .../selinux/go-selinux/selinux.go | 688 ++++++++++++++ .../selinux/go-selinux/xattrs.go | 78 ++ worker/runc/runc.go | 9 +- worker/runc/runc_test.go | 3 +- 54 files changed, 5512 insertions(+), 64 deletions(-) create mode 100644 cmd/buildkitd/main_linux.go create mode 100644 cmd/buildkitd/main_nolinux.go create mode 100644 docs/rootless.md create mode 100644 util/libcontainer_specconv/README.md create mode 100644 util/libcontainer_specconv/example.go create mode 100644 util/libcontainer_specconv/spec_linux_test.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/config.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/device.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/network.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go create mode 100644 vendor/github.com/opencontainers/selinux/LICENSE create mode 100644 vendor/github.com/opencontainers/selinux/README.md create mode 100644 vendor/github.com/opencontainers/selinux/go-selinux/selinux.go create mode 100644 vendor/github.com/opencontainers/selinux/go-selinux/xattrs.go diff --git a/README.md b/README.md index a3612aecc956..4ca3b7b7ff7f 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ Key features: - Distributable workers - Multiple output formats - Pluggable architecture +- Execution without root privileges Read the proposal from https://github.com/moby/moby/issues/32925 @@ -225,6 +226,9 @@ export JAEGER_TRACE=0.0.0.0:6831 During development, BuildKit is tested with the version of runc that is being used by the containerd repository. Please refer to [runc.md](https://github.com/containerd/containerd/blob/v1.1.0/RUNC.md) for more information. +### Running BuildKit without root privileges + +Please refer to `[docs/rootless.md]`(docs/rootless.md). ### Contributing diff --git a/cmd/buildkitd/main.go b/cmd/buildkitd/main.go index 2e13223fce97..ee2b1e8ecebc 100644 --- a/cmd/buildkitd/main.go +++ b/cmd/buildkitd/main.go @@ -69,8 +69,24 @@ func main() { app := cli.NewApp() app.Name = "buildkitd" app.Usage = "build daemon" - - app.Flags = []cli.Flag{ + defaultRoot := appdefaults.Root + defaultAddress := appdefaults.Address + rootlessUsage := "set all the default options to be compatible with rootless containers" + if runningAsUnprivilegedUser() { + app.Flags = append(app.Flags, cli.BoolTFlag{ + Name: "rootless", + Usage: rootlessUsage + " (default: true)", + }) + defaultRoot = appdefaults.UserRoot() + defaultAddress = appdefaults.UserAddress() + appdefaults.EnsureUserAddressDir() + } else { + app.Flags = append(app.Flags, cli.BoolFlag{ + Name: "rootless", + Usage: rootlessUsage, + }) + } + app.Flags = append(app.Flags, cli.BoolFlag{ Name: "debug", Usage: "enable debug output in logs", @@ -78,12 +94,12 @@ func main() { cli.StringFlag{ Name: "root", Usage: "path to state directory", - Value: appdefaults.Root, + Value: defaultRoot, }, cli.StringSliceFlag{ Name: "addr", Usage: "listening address (socket or tcp)", - Value: &cli.StringSlice{appdefaults.Address}, + Value: &cli.StringSlice{defaultAddress}, }, cli.StringFlag{ Name: "group", @@ -107,8 +123,7 @@ func main() { Name: "tlscacert", Usage: "ca certificate to verify clients", }, - } - + ) app.Flags = append(app.Flags, appFlags...) app.Action = func(c *cli.Context) error { diff --git a/cmd/buildkitd/main_containerd_worker.go b/cmd/buildkitd/main_containerd_worker.go index 6730939cb44e..b028e25caf4d 100644 --- a/cmd/buildkitd/main_containerd_worker.go +++ b/cmd/buildkitd/main_containerd_worker.go @@ -52,6 +52,12 @@ func containerdWorkerInitializer(c *cli.Context, common workerInitializerOpt) ([ if err != nil { return nil, err } + // GlobalBool works for BoolT as well + rootless := c.GlobalBool("rootless") + if rootless { + logrus.Warn("rootless mode is not supported for containerd workers. disabling containerd worker.") + return nil, nil + } opt, err := containerd.NewWorkerOpt(common.root, socket, ctd.DefaultSnapshotter, labels) if err != nil { return nil, err diff --git a/cmd/buildkitd/main_linux.go b/cmd/buildkitd/main_linux.go new file mode 100644 index 000000000000..27823fd6e93f --- /dev/null +++ b/cmd/buildkitd/main_linux.go @@ -0,0 +1,9 @@ +// +build linux + +package main + +import "github.com/opencontainers/runc/libcontainer/system" + +func runningAsUnprivilegedUser() bool { + return system.GetParentNSeuid() != 0 || system.RunningInUserNS() +} diff --git a/cmd/buildkitd/main_nolinux.go b/cmd/buildkitd/main_nolinux.go new file mode 100644 index 000000000000..883baa1660ad --- /dev/null +++ b/cmd/buildkitd/main_nolinux.go @@ -0,0 +1,7 @@ +// +build !linux + +package main + +func runningAsUnprivilegedUser() bool { + return false +} diff --git a/cmd/buildkitd/main_oci_worker.go b/cmd/buildkitd/main_oci_worker.go index 1d40bd5c276c..82d65bf4895b 100644 --- a/cmd/buildkitd/main_oci_worker.go +++ b/cmd/buildkitd/main_oci_worker.go @@ -17,11 +17,7 @@ import ( ) func init() { - registerWorkerInitializer( - workerInitializer{ - fn: ociWorkerInitializer, - priority: 0, - }, + flags := []cli.Flag{ cli.StringFlag{ Name: "oci-worker", Usage: "enable oci workers (true/false/auto)", @@ -34,11 +30,30 @@ func init() { cli.StringFlag{ Name: "oci-worker-snapshotter", Usage: "name of snapshotter (overlayfs or native)", - // TODO(AkihiroSuda): autodetect overlayfs availability when the value is set to "auto"? - Value: "overlayfs", + Value: "auto", + }, + } + n := "oci-worker-rootless" + u := "enable rootless mode" + if runningAsUnprivilegedUser() { + flags = append(flags, cli.BoolTFlag{ + Name: n, + Usage: u, + }) + } else { + flags = append(flags, cli.BoolFlag{ + Name: n, + Usage: u, + }) + } + registerWorkerInitializer( + workerInitializer{ + fn: ociWorkerInitializer, + priority: 0, }, + flags..., ) - // TODO: allow multiple oci runtimes and snapshotters + // TODO: allow multiple oci runtimes } func ociWorkerInitializer(c *cli.Context, common workerInitializerOpt) ([]worker.Worker, error) { @@ -57,7 +72,12 @@ func ociWorkerInitializer(c *cli.Context, common workerInitializerOpt) ([]worker if err != nil { return nil, err } - opt, err := runc.NewWorkerOpt(common.root, snFactory, labels) + // GlobalBool works for BoolT as well + rootless := c.GlobalBool("oci-worker-rootless") || c.GlobalBool("rootless") + if rootless { + logrus.Debugf("running in rootless mode") + } + opt, err := runc.NewWorkerOpt(common.root, snFactory, rootless, labels) if err != nil { return nil, err } @@ -75,6 +95,16 @@ func snapshotterFactory(name string) (runc.SnapshotterFactory, error) { } var err error switch name { + case "auto": + snFactory.New = func(root string) (ctdsnapshot.Snapshotter, error) { + err := overlay.Supported(root) + if err == nil { + logrus.Debug("auto snapshotter: using overlayfs") + return overlay.NewSnapshotter(root) + } + logrus.Debugf("auto snapshotter: using native for %s: %v", root, err) + return native.NewSnapshotter(root) + } case "native": snFactory.New = native.NewSnapshotter case "overlayfs": // not "overlay", for consistency with containerd snapshotter plugin ID. diff --git a/docs/rootless.md b/docs/rootless.md new file mode 100644 index 000000000000..38200c973526 --- /dev/null +++ b/docs/rootless.md @@ -0,0 +1,51 @@ +# Rootless mode (Experimental) + +Requirements: +- runc (May 30, 2018) or later +- Some distros such as Debian and Arch Linux require `echo 1 > /proc/sys/kernel/unprivileged_userns_clone` +- `newuidmap` and `newgidmap` need to be installed on the host. These commands are provided by the `uidmap` package. +- `/etc/subuid` and `/etc/subgid` should contain >= 65536 sub-IDs. e.g. `penguin:231072:65536`. +- To run in a Docker container with non-root `USER`, `docker run --privileged` is still required. See also Jessie's blog: https://blog.jessfraz.com/post/building-container-images-securely-on-kubernetes/ + +Setting up rootless mode also requires some bothersome steps as follows, but we will soon have automation tool. + +## Terminal 1: + +``` +$ unshare -U -m +unshared$ echo $$ > /tmp/pid +``` + +Unsharing mountns (and userns) is required for mounting filesystems without real root privileges. + +## Terminal 2: + +``` +$ id -u +1001 +$ grep $(whoami) /etc/subuid +penguin:231072:65536 +$ grep $(whoami) /etc/subgid +penguin:231072:65536 +$ newuidmap $(cat /tmp/pid) 0 1001 1 1 231072 65536 +$ newgidmap $(cat /tmp/pid) 0 1001 1 1 231072 65536 +``` + +## Terminal 1: + +``` +unshared# buildkitd +``` + +* The data dir will be set to `/home/penguin/.local/share/buildkit` +* The address will be set to `unix:///run/user/1001/buildkit/buildkitd.sock` +* `overlayfs` snapshotter is not supported except Ubuntu-flavored kernel: http://kernel.ubuntu.com/git/ubuntu/ubuntu-artful.git/commit/fs/overlayfs?h=Ubuntu-4.13.0-25.29&id=0a414bdc3d01f3b61ed86cfe3ce8b63a9240eba7 +* containerd worker is not supported ( pending PR: https://github.com/containerd/containerd/pull/2006 ) +* Network namespace is not used at the moment. + +## Terminal 2: + +``` +$ go get ./examples/build-using-dockerfile +$ build-using-dockerfile --buildkit-addr unix:///run/user/1001/buildkit/buildkitd.sock -t foo /path/to/somewhere +``` diff --git a/executor/runcexecutor/executor.go b/executor/runcexecutor/executor.go index e81484eb25a8..593ecf0a1f47 100644 --- a/executor/runcexecutor/executor.go +++ b/executor/runcexecutor/executor.go @@ -4,9 +4,12 @@ import ( "context" "encoding/json" "io" + "io/ioutil" "os" "os/exec" "path/filepath" + "strconv" + "strings" "syscall" "github.com/containerd/containerd/contrib/seccomp" @@ -18,22 +21,28 @@ import ( "github.com/moby/buildkit/executor" "github.com/moby/buildkit/executor/oci" "github.com/moby/buildkit/identity" + "github.com/moby/buildkit/util/libcontainer_specconv" "github.com/moby/buildkit/util/system" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) type Opt struct { + // root directory Root string CommandCandidates []string + // without root privileges (has nothing to do with Opt.Root directory) + Rootless bool } var defaultCommandCandidates = []string{"buildkit-runc", "runc"} type runcExecutor struct { - runc *runc.Runc - root string - cmd string + runc *runc.Runc + root string + cmd string + rootless bool } func New(opt Opt) (executor.Executor, error) { @@ -75,8 +84,9 @@ func New(opt Opt) (executor.Executor, error) { } w := &runcExecutor{ - runc: runtime, - root: root, + runc: runtime, + root: root, + rootless: opt.Rootless, } return w, nil } @@ -156,6 +166,19 @@ func (w *runcExecutor) Exec(ctx context.Context, meta executor.Meta, root cache. return errors.Wrapf(err, "failed to create working directory %s", newp) } + if w.rootless { + specconv.ToRootless(spec, &specconv.RootlessOpts{ + MapSubUIDGID: true, + }) + // TODO(AkihiroSuda): keep Cgroups enabled if /sys/fs/cgroup/cpuset/buildkit exists and writable + spec.Linux.CgroupsPath = "" + // TODO(AkihiroSuda): ToRootless removes netns, but we should readd netns here + // if either SUID or userspace NAT is configured on the host. + if err := setOOMScoreAdj(spec); err != nil { + return err + } + } + if err := json.NewEncoder(f).Encode(spec); err != nil { return err } @@ -205,3 +228,19 @@ func (s *forwardIO) Stdout() io.ReadCloser { func (s *forwardIO) Stderr() io.ReadCloser { return nil } + +// setOOMScoreAdj comes from https://github.com/genuinetools/img/blob/2fabe60b7dc4623aa392b515e013bbc69ad510ab/executor/runc/executor.go#L182-L192 +func setOOMScoreAdj(spec *specs.Spec) error { + // Set the oom_score_adj of our children containers to that of the current process. + b, err := ioutil.ReadFile("/proc/self/oom_score_adj") + if err != nil { + return errors.Wrap(err, "failed to read /proc/self/oom_score_adj") + } + s := strings.TrimSpace(string(b)) + oom, err := strconv.Atoi(s) + if err != nil { + return errors.Wrapf(err, "failed to parse %s as int", s) + } + spec.Process.OOMScoreAdj = &oom + return nil +} diff --git a/util/appdefaults/appdefaults_unix.go b/util/appdefaults/appdefaults_unix.go index 4cc1b4547b0f..7b907ad32be0 100644 --- a/util/appdefaults/appdefaults_unix.go +++ b/util/appdefaults/appdefaults_unix.go @@ -2,7 +2,54 @@ package appdefaults +import ( + "os" + "path/filepath" + "strings" +) + const ( Address = "unix:///run/buildkit/buildkitd.sock" Root = "/var/lib/buildkit" ) + +// UserAddress typically returns /run/user/$UID/buildkit/buildkitd.sock +func UserAddress() string { + // pam_systemd sets XDG_RUNTIME_DIR but not other dirs. + xdgRuntimeDir := os.Getenv("XDG_RUNTIME_DIR") + if xdgRuntimeDir != "" { + dirs := strings.Split(xdgRuntimeDir, ":") + return "unix://" + filepath.Join(dirs[0], "buildkit", "buildkitd.sock") + } + return Address +} + +// EnsureUserAddressDir sets sticky bit on XDG_RUNTIME_DIR if XDG_RUNTIME_DIR is set. +// See https://github.com/opencontainers/runc/issues/1694 +func EnsureUserAddressDir() error { + xdgRuntimeDir := os.Getenv("XDG_RUNTIME_DIR") + if xdgRuntimeDir != "" { + dirs := strings.Split(xdgRuntimeDir, ":") + dir := filepath.Join(dirs[0], "buildkit") + if err := os.MkdirAll(dir, 0700); err != nil { + return err + } + return os.Chmod(dir, 0700|os.ModeSticky) + } + return nil +} + +// UserRoot typically returns /home/$USER/.local/share/buildkit +func UserRoot() string { + // pam_systemd sets XDG_RUNTIME_DIR but not other dirs. + xdgDataHome := os.Getenv("XDG_DATA_HOME") + if xdgDataHome != "" { + dirs := strings.Split(xdgDataHome, ":") + return filepath.Join(dirs[0], "buildkit") + } + home := os.Getenv("HOME") + if home != "" { + return filepath.Join(home, ".local", "share", "buildkit") + } + return Root +} diff --git a/util/appdefaults/appdefaults_windows.go b/util/appdefaults/appdefaults_windows.go index 37203ccd4d09..dbc96c809556 100644 --- a/util/appdefaults/appdefaults_windows.go +++ b/util/appdefaults/appdefaults_windows.go @@ -4,3 +4,15 @@ const ( Address = "npipe:////./pipe/buildkitd" Root = ".buildstate" ) + +func UserAddress() string { + return Address +} + +func EnsureUserAddressDir() error { + return nil +} + +func UserRoot() string { + return Root +} diff --git a/util/libcontainer_specconv/README.md b/util/libcontainer_specconv/README.md new file mode 100644 index 000000000000..7b985bafbf2f --- /dev/null +++ b/util/libcontainer_specconv/README.md @@ -0,0 +1 @@ +Temporary forked from https://github.com/opencontainers/runc/pull/1692 diff --git a/util/libcontainer_specconv/example.go b/util/libcontainer_specconv/example.go new file mode 100644 index 000000000000..6ca1e06d762b --- /dev/null +++ b/util/libcontainer_specconv/example.go @@ -0,0 +1,190 @@ +package specconv + +import ( + "os" + "sort" + "strings" + + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/user" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// RootlessOpts is an optional spec for ToRootless +type RootlessOpts struct { + // Add sub{u,g}id to spec.Linux.{U,G}IDMappings. + // Requires newuidmap(1) and newgidmap(1) with suid bit. + // Ignored when running in userns. + MapSubUIDGID bool +} + +// Run-time context for ToRootless. +type RootlessContext struct { + EUID uint32 + EGID uint32 + SubUIDs []user.SubID + SubGIDs []user.SubID + UIDMap []user.IDMap + GIDMap []user.IDMap + InUserNS bool +} + +// ToRootless converts the given spec file into one that should work with +// rootless containers, by removing incompatible options and adding others that +// are needed. +func ToRootless(spec *specs.Spec, opts *RootlessOpts) error { + var err error + ctx := RootlessContext{} + ctx.EUID = uint32(os.Geteuid()) + ctx.EGID = uint32(os.Getegid()) + ctx.SubUIDs, err = user.CurrentUserSubUIDs() + if err != nil && !os.IsNotExist(err) { + return err + } + ctx.SubGIDs, err = user.CurrentGroupSubGIDs() + if err != nil && !os.IsNotExist(err) { + return err + } + ctx.UIDMap, err = user.CurrentProcessUIDMap() + if err != nil && !os.IsNotExist(err) { + return err + } + uidMapExists := !os.IsNotExist(err) + ctx.GIDMap, err = user.CurrentProcessUIDMap() + if err != nil && !os.IsNotExist(err) { + return err + } + ctx.InUserNS = uidMapExists && system.UIDMapInUserNS(ctx.UIDMap) + return ToRootlessWithContext(ctx, spec, opts) +} + +// ToRootlessWithContext converts the spec with the run-time context. +// ctx can be internally modified for sorting. +func ToRootlessWithContext(ctx RootlessContext, spec *specs.Spec, opts *RootlessOpts) error { + if opts == nil { + opts = &RootlessOpts{} + } + var namespaces []specs.LinuxNamespace + + // Remove networkns from the spec. + for _, ns := range spec.Linux.Namespaces { + switch ns.Type { + case specs.NetworkNamespace, specs.UserNamespace: + // Do nothing. + default: + namespaces = append(namespaces, ns) + } + } + // Add userns to the spec. + namespaces = append(namespaces, specs.LinuxNamespace{ + Type: specs.UserNamespace, + }) + spec.Linux.Namespaces = namespaces + + // Add mappings for the current user. + if ctx.InUserNS { + uNextContainerID := 0 + sort.Sort(idmapSorter(ctx.UIDMap)) + for _, uidmap := range ctx.UIDMap { + spec.Linux.UIDMappings = append(spec.Linux.UIDMappings, + specs.LinuxIDMapping{ + HostID: uint32(uidmap.ID), + ContainerID: uint32(uNextContainerID), + Size: uint32(uidmap.Count), + }) + uNextContainerID += uidmap.Count + } + gNextContainerID := 0 + sort.Sort(idmapSorter(ctx.GIDMap)) + for _, gidmap := range ctx.GIDMap { + spec.Linux.GIDMappings = append(spec.Linux.GIDMappings, + specs.LinuxIDMapping{ + HostID: uint32(gidmap.ID), + ContainerID: uint32(gNextContainerID), + Size: uint32(gidmap.Count), + }) + gNextContainerID += gidmap.Count + } + // opts.MapSubUIDGID is ignored in userns + } else { + spec.Linux.UIDMappings = []specs.LinuxIDMapping{{ + HostID: ctx.EUID, + ContainerID: 0, + Size: 1, + }} + spec.Linux.GIDMappings = []specs.LinuxIDMapping{{ + HostID: ctx.EGID, + ContainerID: 0, + Size: 1, + }} + if opts.MapSubUIDGID { + uNextContainerID := 1 + sort.Sort(subIDSorter(ctx.SubUIDs)) + for _, subuid := range ctx.SubUIDs { + spec.Linux.UIDMappings = append(spec.Linux.UIDMappings, + specs.LinuxIDMapping{ + HostID: uint32(subuid.SubID), + ContainerID: uint32(uNextContainerID), + Size: uint32(subuid.Count), + }) + uNextContainerID += subuid.Count + } + gNextContainerID := 1 + sort.Sort(subIDSorter(ctx.SubGIDs)) + for _, subgid := range ctx.SubGIDs { + spec.Linux.GIDMappings = append(spec.Linux.GIDMappings, + specs.LinuxIDMapping{ + HostID: uint32(subgid.SubID), + ContainerID: uint32(gNextContainerID), + Size: uint32(subgid.Count), + }) + gNextContainerID += subgid.Count + } + } + } + + // Fix up mounts. + var mounts []specs.Mount + for _, mount := range spec.Mounts { + // Ignore all mounts that are under /sys. + if strings.HasPrefix(mount.Destination, "/sys") { + continue + } + + // Remove all gid= and uid= mappings. + var options []string + for _, option := range mount.Options { + if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") { + options = append(options, option) + } + } + + mount.Options = options + mounts = append(mounts, mount) + } + // Add the sysfs mount as an rbind. + mounts = append(mounts, specs.Mount{ + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }) + spec.Mounts = mounts + + // Remove cgroup settings. + spec.Linux.Resources = nil + return nil +} + +// subIDSorter is required for Go <= 1.7 +type subIDSorter []user.SubID + +func (x subIDSorter) Len() int { return len(x) } +func (x subIDSorter) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x subIDSorter) Less(i, j int) bool { return x[i].SubID < x[j].SubID } + +type idmapSorter []user.IDMap + +func (x idmapSorter) Len() int { return len(x) } +func (x idmapSorter) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x idmapSorter) Less(i, j int) bool { return x[i].ID < x[j].ID } diff --git a/util/libcontainer_specconv/spec_linux_test.go b/util/libcontainer_specconv/spec_linux_test.go new file mode 100644 index 000000000000..116112b0df65 --- /dev/null +++ b/util/libcontainer_specconv/spec_linux_test.go @@ -0,0 +1,190 @@ +// +build linux + +package specconv + +import ( + "reflect" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runc/libcontainer/user" + "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestRootlessSpecconvValidate(t *testing.T) { + specGen := func() *specs.Spec { + spec := specconv.Example() + spec.Root.Path = "/" + return spec + } + cases := []struct { + ctx RootlessContext + opts *RootlessOpts + additionalValidator func(t *testing.T, s *specs.Spec) + }{ + { + ctx: RootlessContext{ + EUID: 0, + EGID: 0, + }, + }, + { + ctx: RootlessContext{ + EUID: 4242, + EGID: 4242, + }, + }, + { + ctx: RootlessContext{ + EUID: 4242, + EGID: 4242, + // empty subuid / subgid + }, + opts: &RootlessOpts{ + MapSubUIDGID: true, + }, + }, + { + ctx: RootlessContext{ + EUID: 4242, + EGID: 4242, + SubUIDs: []user.SubID{ + { + Name: "dummy", + SubID: 14242, + Count: 65536, + }, + { + Name: "dummy", + SubID: 114242, + Count: 65536, + }, + }, + SubGIDs: []user.SubID{ + { + Name: "dummy", + SubID: 14242, + Count: 65536, + }, + { + Name: "dummy", + SubID: 114242, + Count: 65536, + }, + }, + }, + opts: &RootlessOpts{ + MapSubUIDGID: true, + }, + additionalValidator: func(t *testing.T, s *specs.Spec) { + expectedUIDMappings := []specs.LinuxIDMapping{ + { + HostID: 4242, + ContainerID: 0, + Size: 1, + }, + { + HostID: 14242, + ContainerID: 1, + Size: 65536, + }, + { + HostID: 114242, + ContainerID: 65537, + Size: 65536, + }, + } + if !reflect.DeepEqual(expectedUIDMappings, s.Linux.UIDMappings) { + t.Errorf("expected %#v, got %#v", expectedUIDMappings, s.Linux.UIDMappings) + } + expectedGIDMappings := expectedUIDMappings + if !reflect.DeepEqual(expectedGIDMappings, s.Linux.GIDMappings) { + t.Errorf("expected %#v, got %#v", expectedGIDMappings, s.Linux.GIDMappings) + } + }, + }, + { + ctx: RootlessContext{ + EUID: 0, + EGID: 0, + UIDMap: []user.IDMap{ + { + ID: 0, + ParentID: 4242, + Count: 1, + }, + { + ID: 1, + ParentID: 231072, + Count: 65536, + }, + }, + GIDMap: []user.IDMap{ + { + ID: 0, + ParentID: 4242, + Count: 1, + }, + { + ID: 1, + ParentID: 231072, + Count: 65536, + }, + }, + InUserNS: true, + }, + additionalValidator: func(t *testing.T, s *specs.Spec) { + expectedUIDMappings := []specs.LinuxIDMapping{ + { + HostID: 0, + ContainerID: 0, + Size: 1, + }, + { + HostID: 1, + ContainerID: 1, + Size: 65536, + }, + } + if !reflect.DeepEqual(expectedUIDMappings, s.Linux.UIDMappings) { + t.Errorf("expected %#v, got %#v", expectedUIDMappings, s.Linux.UIDMappings) + } + expectedGIDMappings := expectedUIDMappings + if !reflect.DeepEqual(expectedGIDMappings, s.Linux.GIDMappings) { + t.Errorf("expected %#v, got %#v", expectedGIDMappings, s.Linux.GIDMappings) + } + }, + }, + } + + for _, c := range cases { + spec := specGen() + err := ToRootlessWithContext(c.ctx, spec, c.opts) + if err != nil { + t.Errorf("Couldn't convert a rootful spec to rootless: %v", err) + } + + // t.Logf("%#v", spec) + if c.additionalValidator != nil { + c.additionalValidator(t, spec) + } + + opts := &specconv.CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + Rootless: true, + } + + config, err := specconv.CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid rootless container config: %v", err) + } + } +} diff --git a/vendor.conf b/vendor.conf index 06e5a91c3c43..8891d059ad68 100644 --- a/vendor.conf +++ b/vendor.conf @@ -18,7 +18,7 @@ github.com/gogo/googleapis 08a7655d27152912db7aaf4f983275eaf8d128ef github.com/golang/protobuf 1643683e1b54a9e88ad26d98f81400c8c9d9f4f9 github.com/containerd/continuity d3c23511c1bf5851696cba83143d9cbcd666869b github.com/opencontainers/image-spec v1.0.1 -github.com/opencontainers/runc 69663f0bd4b60df09991c08812a60108003fa340 +github.com/opencontainers/runc 0e561642f81e84ebd0b3afd6ec510c75a2ccb71b github.com/Microsoft/go-winio v0.4.5 github.com/containerd/fifo 3d5202aec260678c48179c56f40e6f38a095738c github.com/opencontainers/runtime-spec v1.0.1 @@ -61,3 +61,4 @@ github.com/uber/jaeger-lib c48167d9cae5887393dd5e61efd06a4a48b7fbb3 github.com/codahale/hdrhistogram f8ad88b59a584afeee9d334eff879b104439117b github.com/opentracing-contrib/go-stdlib b1a47cfbdd7543e70e9ef3e73d0802ad306cc1cc +github.com/opencontainers/selinux 74a747aeaf2d66097b6908f572794f49f07dda2c diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go new file mode 100644 index 000000000000..e0f3ca16533e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -0,0 +1,61 @@ +package configs + +import "fmt" + +// blockIODevice holds major:minor format supported in blkio cgroup +type blockIODevice struct { + // Major is the device's major number + Major int64 `json:"major"` + // Minor is the device's minor number + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair +type WeightDevice struct { + blockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight uint16 `json:"weight"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + LeafWeight uint16 `json:"leafWeight"` +} + +// NewWeightDevice returns a configured WeightDevice pointer +func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { + wd := &WeightDevice{} + wd.Major = major + wd.Minor = minor + wd.Weight = weight + wd.LeafWeight = leafWeight + return wd +} + +// WeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) WeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) +} + +// LeafWeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) LeafWeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + blockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// NewThrottleDevice returns a configured ThrottleDevice pointer +func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { + td := &ThrottleDevice{} + td.Major = major + td.Minor = minor + td.Rate = rate + return td +} + +// String formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) String() string { + return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go new file mode 100644 index 000000000000..e15a662f5228 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -0,0 +1,122 @@ +package configs + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +type Cgroup struct { + // Deprecated, use Path instead + Name string `json:"name,omitempty"` + + // name of parent of cgroup or slice + // Deprecated, use Path instead + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. + Path string `json:"path"` + + // ScopePrefix describes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Paths represent the absolute cgroups paths to join. + // This takes precedence over Path. + Paths map[string]string + + // Resources contains various cgroups settings to apply + *Resources +} + +type Resources struct { + // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. + // Deprecated + AllowAllDevices *bool `json:"allow_all_devices,omitempty"` + // Deprecated + AllowedDevices []*Device `json:"allowed_devices,omitempty"` + // Deprecated + DeniedDevices []*Device `json:"denied_devices,omitempty"` + + Devices []*Device `json:"devices"` + + // Memory limit (in bytes) + Memory int64 `json:"memory"` + + // Memory reservation or soft_limit (in bytes) + MemoryReservation int64 `json:"memory_reservation"` + + // Total memory usage (memory + swap); set `-1` to enable unlimited swap + MemorySwap int64 `json:"memory_swap"` + + // Kernel memory limit (in bytes) + KernelMemory int64 `json:"kernel_memory"` + + // Kernel memory limit for TCP use (in bytes) + KernelMemoryTCP int64 `json:"kernel_memory_tcp"` + + // CPU shares (relative weight vs. other containers) + CpuShares uint64 `json:"cpu_shares"` + + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuQuota int64 `json:"cpu_quota"` + + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpuPeriod uint64 `json:"cpu_period"` + + // How many time CPU will use in realtime scheduling (in usecs). + CpuRtRuntime int64 `json:"cpu_rt_quota"` + + // CPU period to be used for realtime scheduling (in usecs). + CpuRtPeriod uint64 `json:"cpu_rt_period"` + + // CPU to use + CpusetCpus string `json:"cpuset_cpus"` + + // MEM to use + CpusetMems string `json:"cpuset_mems"` + + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + + // Specifies per cgroup weight, range is from 10 to 1000. + BlkioWeight uint16 `json:"blkio_weight"` + + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` + + // Weight per cgroup per device, can override BlkioWeight. + BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` + + // IO read rate limit per cgroup per device, bytes per second. + BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` + + // IO write rate limit per cgroup per device, bytes per second. + BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` + + // IO read rate limit per cgroup per device, IO per second. + BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` + + // IO write rate limit per cgroup per device, IO per second. + BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` + + // set the freeze value for the process + Freezer FreezerState `json:"freezer"` + + // Hugetlb limit (in bytes) + HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` + + // Whether to disable OOM Killer + OomKillDisable bool `json:"oom_kill_disable"` + + // Tuning swappiness behaviour per cgroup + MemorySwappiness *uint64 `json:"memory_swappiness"` + + // Set priority of network traffic for container + NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` + + // Set class identifier for container's network packets + NetClsClassid uint32 `json:"net_cls_classid_u"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go new file mode 100644 index 000000000000..d74847b0db83 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go @@ -0,0 +1,6 @@ +package configs + +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. +type Cgroup struct { +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go new file mode 100644 index 000000000000..b1c4762fe20c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -0,0 +1,349 @@ +package configs + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "time" + + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" +) + +type Rlimit struct { + Type int `json:"type"` + Hard uint64 `json:"hard"` + Soft uint64 `json:"soft"` +} + +// IDMap represents UID/GID Mappings for User Namespaces. +type IDMap struct { + ContainerID int `json:"container_id"` + HostID int `json:"host_id"` + Size int `json:"size"` +} + +// Seccomp represents syscall restrictions +// By default, only the native architecture of the kernel is allowed to be used +// for syscalls. Additional architectures can be added by specifying them in +// Architectures. +type Seccomp struct { + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` +} + +// Action is taken upon rule match in Seccomp +type Action int + +const ( + Kill Action = iota + 1 + Errno + Trap + Allow + Trace +) + +// Operator is a comparison operator to be used when matching syscall arguments in Seccomp +type Operator int + +const ( + EqualTo Operator = iota + 1 + NotEqualTo + GreaterThan + GreaterThanOrEqualTo + LessThan + LessThanOrEqualTo + MaskEqualTo +) + +// Arg is a rule to match a specific syscall argument in Seccomp +type Arg struct { + Index uint `json:"index"` + Value uint64 `json:"value"` + ValueTwo uint64 `json:"value_two"` + Op Operator `json:"op"` +} + +// Syscall is a rule to match a syscall in Seccomp +type Syscall struct { + Name string `json:"name"` + Action Action `json:"action"` + Args []*Arg `json:"args"` +} + +// TODO Windows. Many of these fields should be factored out into those parts +// which are common across platforms, and those which are platform specific. + +// Config defines configuration options for executing a process inside a contained environment. +type Config struct { + // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs + // This is a common option when the container is running in ramdisk + NoPivotRoot bool `json:"no_pivot_root"` + + // ParentDeathSignal specifies the signal that is sent to the container's process in the case + // that the parent process dies. + ParentDeathSignal int `json:"parent_death_signal"` + + // Path to a directory containing the container's root filesystem. + Rootfs string `json:"rootfs"` + + // Readonlyfs will remount the container's rootfs as readonly where only externally mounted + // bind mounts are writtable. + Readonlyfs bool `json:"readonlyfs"` + + // Specifies the mount propagation flags to be applied to /. + RootPropagation int `json:"rootPropagation"` + + // Mounts specify additional source and destination paths that will be mounted inside the container's + // rootfs and mount namespace if specified + Mounts []*Mount `json:"mounts"` + + // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! + Devices []*Device `json:"devices"` + + MountLabel string `json:"mount_label"` + + // Hostname optionally sets the container's hostname if provided + Hostname string `json:"hostname"` + + // Namespaces specifies the container's namespaces that it should setup when cloning the init process + // If a namespace is not provided that namespace is shared from the container's parent process + Namespaces Namespaces `json:"namespaces"` + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *Capabilities `json:"capabilities"` + + // Networks specifies the container's network setup to be created + Networks []*Network `json:"networks"` + + // Routes can be specified to create entries in the route table as the container is started + Routes []*Route `json:"routes"` + + // Cgroups specifies specific cgroup settings for the various subsystems that the container is + // placed into to limit the resources the container has available + Cgroups *Cgroup `json:"cgroups"` + + // AppArmorProfile specifies the profile to apply to the process running in the container and is + // change at the time the process is execed + AppArmorProfile string `json:"apparmor_profile,omitempty"` + + // ProcessLabel specifies the label to apply to the process running in the container. It is + // commonly used by selinux + ProcessLabel string `json:"process_label,omitempty"` + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []Rlimit `json:"rlimits,omitempty"` + + // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores + // for a process. Valid values are between the range [-1000, '1000'], where processes with + // higher scores are preferred for being killed. If it is unset then we don't touch the current + // value. + // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ + OomScoreAdj *int `json:"oom_score_adj,omitempty"` + + // UidMappings is an array of User ID mappings for User Namespaces + UidMappings []IDMap `json:"uid_mappings"` + + // GidMappings is an array of Group ID mappings for User Namespaces + GidMappings []IDMap `json:"gid_mappings"` + + // MaskPaths specifies paths within the container's rootfs to mask over with a bind + // mount pointing to /dev/null as to prevent reads of the file. + MaskPaths []string `json:"mask_paths"` + + // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only + // so that these files prevent any writes. + ReadonlyPaths []string `json:"readonly_paths"` + + // Sysctl is a map of properties and their values. It is the equivalent of using + // sysctl -w my.property.name value in Linux. + Sysctl map[string]string `json:"sysctl"` + + // Seccomp allows actions to be taken whenever a syscall is made within the container. + // A number of rules are given, each having an action to be taken if a syscall matches it. + // A default action to be taken if no rules match is also given. + Seccomp *Seccomp `json:"seccomp"` + + // NoNewPrivileges controls whether processes in the container can gain additional privileges. + NoNewPrivileges bool `json:"no_new_privileges,omitempty"` + + // Hooks are a collection of actions to perform at various container lifecycle events. + // CommandHooks are serialized to JSON, but other hooks are not. + Hooks *Hooks + + // Version is the version of opencontainer specification that is supported. + Version string `json:"version"` + + // Labels are user defined metadata that is stored in the config and populated on the state + Labels []string `json:"labels"` + + // NoNewKeyring will not allocated a new session keyring for the container. It will use the + // callers keyring in this case. + NoNewKeyring bool `json:"no_new_keyring"` + + // Rootless specifies whether the container is a rootless container. + Rootless bool `json:"rootless"` + + // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into + // to limit the resources (e.g., L3 cache) the container has available + IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` +} + +type Hooks struct { + // Prestart commands are executed after the container namespaces are created, + // but before the user supplied command is executed from init. + Prestart []Hook + + // Poststart commands are executed after the container init process starts. + Poststart []Hook + + // Poststop commands are executed after the container init process exits. + Poststop []Hook +} + +type Capabilities struct { + // Bounding is the set of capabilities checked by the kernel. + Bounding []string + // Effective is the set of capabilities checked by the kernel. + Effective []string + // Inheritable is the capabilities preserved across execve. + Inheritable []string + // Permitted is the limiting superset for effective capabilities. + Permitted []string + // Ambient is the ambient set of capabilities that are kept. + Ambient []string +} + +func (hooks *Hooks) UnmarshalJSON(b []byte) error { + var state struct { + Prestart []CommandHook + Poststart []CommandHook + Poststop []CommandHook + } + + if err := json.Unmarshal(b, &state); err != nil { + return err + } + + deserialize := func(shooks []CommandHook) (hooks []Hook) { + for _, shook := range shooks { + hooks = append(hooks, shook) + } + + return hooks + } + + hooks.Prestart = deserialize(state.Prestart) + hooks.Poststart = deserialize(state.Poststart) + hooks.Poststop = deserialize(state.Poststop) + return nil +} + +func (hooks Hooks) MarshalJSON() ([]byte, error) { + serialize := func(hooks []Hook) (serializableHooks []CommandHook) { + for _, hook := range hooks { + switch chook := hook.(type) { + case CommandHook: + serializableHooks = append(serializableHooks, chook) + default: + logrus.Warnf("cannot serialize hook of type %T, skipping", hook) + } + } + + return serializableHooks + } + + return json.Marshal(map[string]interface{}{ + "prestart": serialize(hooks.Prestart), + "poststart": serialize(hooks.Poststart), + "poststop": serialize(hooks.Poststop), + }) +} + +// HookState is the payload provided to a hook on execution. +type HookState specs.State + +type Hook interface { + // Run executes the hook with the provided state. + Run(HookState) error +} + +// NewFunctionHook will call the provided function when the hook is run. +func NewFunctionHook(f func(HookState) error) FuncHook { + return FuncHook{ + run: f, + } +} + +type FuncHook struct { + run func(HookState) error +} + +func (f FuncHook) Run(s HookState) error { + return f.run(s) +} + +type Command struct { + Path string `json:"path"` + Args []string `json:"args"` + Env []string `json:"env"` + Dir string `json:"dir"` + Timeout *time.Duration `json:"timeout"` +} + +// NewCommandHook will execute the provided command when the hook is run. +func NewCommandHook(cmd Command) CommandHook { + return CommandHook{ + Command: cmd, + } +} + +type CommandHook struct { + Command +} + +func (c Command) Run(s HookState) error { + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: c.Path, + Args: c.Args, + Env: c.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + errC := make(chan error, 1) + go func() { + err := cmd.Wait() + if err != nil { + err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) + } + errC <- err + }() + var timerCh <-chan time.Time + if c.Timeout != nil { + timer := time.NewTimer(*c.Timeout) + defer timer.Stop() + timerCh = timer.C + } + select { + case err := <-errC: + return err + case <-timerCh: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go new file mode 100644 index 000000000000..07da10804540 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go @@ -0,0 +1,61 @@ +package configs + +import "fmt" + +// HostUID gets the translated uid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostUID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.UidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.") + } + id, found := c.hostIDFromMapping(containerId, c.UidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.") + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootUID gets the root uid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootUID() (int, error) { + return c.HostUID(0) +} + +// HostGID gets the translated gid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostGID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.GidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") + } + id, found := c.hostIDFromMapping(containerId, c.GidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.") + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootGID gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootGID() (int, error) { + return c.HostGID(0) +} + +// Utility function that gets a host ID for a container ID from user namespace map +// if that ID is present in the map. +func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { + for _, m := range uMap { + if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { + hostID := m.HostID + (containerID - m.ContainerID) + return hostID, true + } + } + return -1, false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go new file mode 100644 index 000000000000..8701bb212d7b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go @@ -0,0 +1,57 @@ +package configs + +import ( + "fmt" + "os" +) + +const ( + Wildcard = -1 +) + +// TODO Windows: This can be factored out in the future + +type Device struct { + // Device type, block, char, etc. + Type rune `json:"type"` + + // Path to the device. + Path string `json:"path"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Cgroup permissions format, rwm. + Permissions string `json:"permissions"` + + // FileMode permission bits for the device. + FileMode os.FileMode `json:"file_mode"` + + // Uid of the device. + Uid uint32 `json:"uid"` + + // Gid of the device. + Gid uint32 `json:"gid"` + + // Write the file to the allowed list + Allow bool `json:"allow"` +} + +func (d *Device) CgroupString() string { + return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions) +} + +func (d *Device) Mkdev() int { + return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12)) +} + +// deviceNumberString converts the device number to a string return result. +func deviceNumberString(number int64) string { + if number == Wildcard { + return "*" + } + return fmt.Sprint(number) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go new file mode 100644 index 000000000000..e4f423c523ff --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go @@ -0,0 +1,111 @@ +// +build linux + +package configs + +var ( + // DefaultSimpleDevices are devices that are to be both allowed and created. + DefaultSimpleDevices = []*Device{ + // /dev/null and zero + { + Path: "/dev/null", + Type: 'c', + Major: 1, + Minor: 3, + Permissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/zero", + Type: 'c', + Major: 1, + Minor: 5, + Permissions: "rwm", + FileMode: 0666, + }, + + { + Path: "/dev/full", + Type: 'c', + Major: 1, + Minor: 7, + Permissions: "rwm", + FileMode: 0666, + }, + + // consoles and ttys + { + Path: "/dev/tty", + Type: 'c', + Major: 5, + Minor: 0, + Permissions: "rwm", + FileMode: 0666, + }, + + // /dev/urandom,/dev/random + { + Path: "/dev/urandom", + Type: 'c', + Major: 1, + Minor: 9, + Permissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/random", + Type: 'c', + Major: 1, + Minor: 8, + Permissions: "rwm", + FileMode: 0666, + }, + } + DefaultAllowedDevices = append([]*Device{ + // allow mknod for any device + { + Type: 'c', + Major: Wildcard, + Minor: Wildcard, + Permissions: "m", + }, + { + Type: 'b', + Major: Wildcard, + Minor: Wildcard, + Permissions: "m", + }, + + { + Path: "/dev/console", + Type: 'c', + Major: 5, + Minor: 1, + Permissions: "rwm", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + Major: 136, + Minor: Wildcard, + Permissions: "rwm", + }, + { + Path: "", + Type: 'c', + Major: 5, + Minor: 2, + Permissions: "rwm", + }, + + // tuntap + { + Path: "", + Type: 'c', + Major: 10, + Minor: 200, + Permissions: "rwm", + }, + }, DefaultSimpleDevices...) + DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...) +) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go new file mode 100644 index 000000000000..d30216380b50 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go @@ -0,0 +1,9 @@ +package configs + +type HugepageLimit struct { + // which type of hugepage to limit. + Pagesize string `json:"page_size"` + + // usage limit for hugepage. + Limit uint64 `json:"limit"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go new file mode 100644 index 000000000000..36bd5f96a112 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go @@ -0,0 +1,7 @@ +package configs + +type IntelRdt struct { + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go new file mode 100644 index 000000000000..9a0395eaf5ff --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go @@ -0,0 +1,14 @@ +package configs + +import ( + "fmt" +) + +type IfPrioMap struct { + Interface string `json:"interface"` + Priority int64 `json:"priority"` +} + +func (i *IfPrioMap) CgroupString() string { + return fmt.Sprintf("%s %d", i.Interface, i.Priority) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go new file mode 100644 index 000000000000..670757ddb5f4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -0,0 +1,39 @@ +package configs + +const ( + // EXT_COPYUP is a directive to copy up the contents of a directory when + // a tmpfs is mounted over it. + EXT_COPYUP = 1 << iota +) + +type Mount struct { + // Source path for the mount. + Source string `json:"source"` + + // Destination path for the mount inside the container. + Destination string `json:"destination"` + + // Device the mount is for. + Device string `json:"device"` + + // Mount flags. + Flags int `json:"flags"` + + // Propagation Flags + PropagationFlags []int `json:"propagation_flags"` + + // Mount data applied to the mount. + Data string `json:"data"` + + // Relabel source if set, "z" indicates shared, "Z" indicates unshared. + Relabel string `json:"relabel"` + + // Extensions are additional flags that are specific to runc. + Extensions int `json:"extensions"` + + // Optional Command to be run before Source is mounted. + PremountCmds []Command `json:"premount_cmds"` + + // Optional Command to be run after Source is mounted. + PostmountCmds []Command `json:"postmount_cmds"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go new file mode 100644 index 000000000000..a3329a31a9cb --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go @@ -0,0 +1,5 @@ +package configs + +type NamespaceType string + +type Namespaces []Namespace diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go new file mode 100644 index 000000000000..5fc171a57b36 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -0,0 +1,122 @@ +package configs + +import ( + "fmt" + "os" + "sync" +) + +const ( + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" +) + +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// NsName converts the namespace type to its filename +func NsName(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := NsName(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + +func NamespaceTypes() []NamespaceType { + return []NamespaceType{ + NEWUSER, // Keep user NS always first, don't move it. + NEWIPC, + NEWUTS, + NEWNET, + NEWPID, + NEWNS, + } +} + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { + Type NamespaceType `json:"type"` + Path string `json:"path"` +} + +func (n *Namespace) GetPath(pid int) string { + return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) +} + +func (n *Namespaces) Remove(t NamespaceType) bool { + i := n.index(t) + if i == -1 { + return false + } + *n = append((*n)[:i], (*n)[i+1:]...) + return true +} + +func (n *Namespaces) Add(t NamespaceType, path string) { + i := n.index(t) + if i == -1 { + *n = append(*n, Namespace{Type: t, Path: path}) + return + } + (*n)[i].Path = path +} + +func (n *Namespaces) index(t NamespaceType) int { + for i, ns := range *n { + if ns.Type == t { + return i + } + } + return -1 +} + +func (n *Namespaces) Contains(t NamespaceType) bool { + return n.index(t) != -1 +} + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go new file mode 100644 index 000000000000..4ce6813d2330 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -0,0 +1,31 @@ +// +build linux + +package configs + +import "golang.org/x/sys/unix" + +func (n *Namespace) Syscall() int { + return namespaceInfo[n.Type] +} + +var namespaceInfo = map[NamespaceType]int{ + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + var flag int + for _, v := range *n { + if v.Path != "" { + continue + } + flag |= namespaceInfo[v.Type] + } + return uintptr(flag) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go new file mode 100644 index 000000000000..5d9a5c81f3fd --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux,!windows + +package configs + +func (n *Namespace) Syscall() int { + panic("No namespace syscall support") +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + panic("No namespace syscall support") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go new file mode 100644 index 000000000000..19bf713de3a3 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go @@ -0,0 +1,8 @@ +// +build !linux + +package configs + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go new file mode 100644 index 000000000000..ccdb228e14c8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go @@ -0,0 +1,72 @@ +package configs + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omitted from a container causing the +// container to be setup with the host's networking stack +type Network struct { + // Type sets the networks type, commonly veth and loopback + Type string `json:"type"` + + // Name of the network interface + Name string `json:"name"` + + // The bridge to use. + Bridge string `json:"bridge"` + + // MacAddress contains the MAC address to set on the network interface + MacAddress string `json:"mac_address"` + + // Address contains the IPv4 and mask to set on the network interface + Address string `json:"address"` + + // Gateway sets the gateway address that is used as the default for the interface + Gateway string `json:"gateway"` + + // IPv6Address contains the IPv6 and mask to set on the network interface + IPv6Address string `json:"ipv6_address"` + + // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface + IPv6Gateway string `json:"ipv6_gateway"` + + // Mtu sets the mtu value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + Mtu int `json:"mtu"` + + // TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + TxQueueLen int `json:"txqueuelen"` + + // HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the + // container. + HostInterfaceName string `json:"host_interface_name"` + + // HairpinMode specifies if hairpin NAT should be enabled on the virtual interface + // bridge port in the case of type veth + // Note: This is unsupported on some systems. + // Note: This does not apply to loopback interfaces. + HairpinMode bool `json:"hairpin_mode"` +} + +// Routes can be specified to create entries in the route table as the container is started +// +// All of destination, source, and gateway should be either IPv4 or IPv6. +// One of the three options must be present, and omitted entries will use their +// IP family default for the route table. For IPv4 for example, setting the +// gateway to 1.2.3.4 and the interface to eth0 will set up a standard +// destination of 0.0.0.0(or *) when viewed in the route table. +type Route struct { + // Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6 + Destination string `json:"destination"` + + // Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6 + Source string `json:"source"` + + // Sets the gateway. Accepts IPv4 and IPv6 + Gateway string `json:"gateway"` + + // The device to set this route up for, for example: eth0 + InterfaceName string `json:"interface_name"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go new file mode 100644 index 000000000000..e532ac8fe287 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -0,0 +1,117 @@ +package validate + +import ( + "fmt" + "os" + "reflect" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + geteuid = os.Geteuid + getegid = os.Getegid +) + +func (v *ConfigValidator) rootless(config *configs.Config) error { + if err := rootlessMappings(config); err != nil { + return err + } + if err := rootlessMount(config); err != nil { + return err + } + + // XXX: We currently can't verify the user config at all, because + // configs.Config doesn't store the user-related configs. So this + // has to be verified by setupUser() in init_linux.go. + + return nil +} + +func hasIDMapping(id int, mappings []configs.IDMap) bool { + for _, m := range mappings { + if id >= m.ContainerID && id < m.ContainerID+m.Size { + return true + } + } + return false +} + +func rootlessMappings(config *configs.Config) error { + if euid := geteuid(); euid != 0 { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless containers require user namespaces") + } + } + + if len(config.UidMappings) == 0 { + return fmt.Errorf("rootless containers requires at least one UID mapping") + } + if len(config.GidMappings) == 0 { + return fmt.Errorf("rootless containers requires at least one GID mapping") + } + + return nil +} + +// cgroup verifies that the user isn't trying to set any cgroup limits or paths. +func rootlessCgroup(config *configs.Config) error { + // Nothing set at all. + if config.Cgroups == nil || config.Cgroups.Resources == nil { + return nil + } + + // Used for comparing to the zero value. + left := reflect.ValueOf(*config.Cgroups.Resources) + right := reflect.Zero(left.Type()) + + // This is all we need to do, since specconv won't add cgroup options in + // rootless mode. + if !reflect.DeepEqual(left.Interface(), right.Interface()) { + return fmt.Errorf("cannot specify resource limits in rootless container") + } + + return nil +} + +// mount verifies that the user isn't trying to set up any mounts they don't have +// the rights to do. In addition, it makes sure that no mount has a `uid=` or +// `gid=` option that doesn't resolve to root. +func rootlessMount(config *configs.Config) error { + // XXX: We could whitelist allowed devices at this point, but I'm not + // convinced that's a good idea. The kernel is the best arbiter of + // access control. + + for _, mount := range config.Mounts { + // Check that the options list doesn't contain any uid= or gid= entries + // that don't resolve to root. + for _, opt := range strings.Split(mount.Data, ",") { + if strings.HasPrefix(opt, "uid=") { + var uid int + n, err := fmt.Sscanf(opt, "uid=%d", &uid) + if n != 1 || err != nil { + // Ignore unknown mount options. + continue + } + if !hasIDMapping(uid, config.UidMappings) { + return fmt.Errorf("cannot specify uid= mount options for unmapped uid in rootless containers") + } + } + + if strings.HasPrefix(opt, "gid=") { + var gid int + n, err := fmt.Sscanf(opt, "gid=%d", &gid) + if n != 1 || err != nil { + // Ignore unknown mount options. + continue + } + if !hasIDMapping(gid, config.GidMappings) { + return fmt.Errorf("cannot specify gid= mount options for unmapped gid in rootless containers") + } + } + } + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go new file mode 100644 index 000000000000..cbbba9a03a20 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -0,0 +1,212 @@ +package validate + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + selinux "github.com/opencontainers/selinux/go-selinux" +) + +type Validator interface { + Validate(*configs.Config) error +} + +func New() Validator { + return &ConfigValidator{} +} + +type ConfigValidator struct { +} + +func (v *ConfigValidator) Validate(config *configs.Config) error { + if err := v.rootfs(config); err != nil { + return err + } + if err := v.network(config); err != nil { + return err + } + if err := v.hostname(config); err != nil { + return err + } + if err := v.security(config); err != nil { + return err + } + if err := v.usernamespace(config); err != nil { + return err + } + if err := v.sysctl(config); err != nil { + return err + } + if err := v.intelrdt(config); err != nil { + return err + } + if config.Rootless { + if err := v.rootless(config); err != nil { + return err + } + } + return nil +} + +// rootfs validates if the rootfs is an absolute path and is not a symlink +// to the container's root filesystem. +func (v *ConfigValidator) rootfs(config *configs.Config) error { + if _, err := os.Stat(config.Rootfs); err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs) + } + return err + } + cleaned, err := filepath.Abs(config.Rootfs) + if err != nil { + return err + } + if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { + return err + } + if filepath.Clean(config.Rootfs) != cleaned { + return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) + } + return nil +} + +func (v *ConfigValidator) network(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWNET) { + if len(config.Networks) > 0 || len(config.Routes) > 0 { + return fmt.Errorf("unable to apply network settings without a private NET namespace") + } + } + return nil +} + +func (v *ConfigValidator) hostname(config *configs.Config) error { + if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) { + return fmt.Errorf("unable to set hostname without a private UTS namespace") + } + return nil +} + +func (v *ConfigValidator) security(config *configs.Config) error { + // restrict sys without mount namespace + if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) && + !config.Namespaces.Contains(configs.NEWNS) { + return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") + } + if config.ProcessLabel != "" && !selinux.GetEnabled() { + return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + } + + return nil +} + +func (v *ConfigValidator) usernamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWUSER) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + return fmt.Errorf("USER namespaces aren't enabled in the kernel") + } + } else { + if config.UidMappings != nil || config.GidMappings != nil { + return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config") + } + } + return nil +} + +// sysctl validates that the specified sysctl keys are valid or not. +// /proc/sys isn't completely namespaced and depending on which namespaces +// are specified, a subset of sysctls are permitted. +func (v *ConfigValidator) sysctl(config *configs.Config) error { + validSysctlMap := map[string]bool{ + "kernel.msgmax": true, + "kernel.msgmnb": true, + "kernel.msgmni": true, + "kernel.sem": true, + "kernel.shmall": true, + "kernel.shmmax": true, + "kernel.shmmni": true, + "kernel.shm_rmid_forced": true, + } + + for s := range config.Sysctl { + if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") { + if config.Namespaces.Contains(configs.NEWIPC) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s) + } + } + if strings.HasPrefix(s, "net.") { + if config.Namespaces.Contains(configs.NEWNET) { + if path := config.Namespaces.PathOf(configs.NEWNET); path != "" { + if err := checkHostNs(s, path); err != nil { + return err + } + } + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) + } + } + return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) + } + + return nil +} + +func (v *ConfigValidator) intelrdt(config *configs.Config) error { + if config.IntelRdt != nil { + if !intelrdt.IsEnabled() { + return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled") + } + if config.IntelRdt.L3CacheSchema == "" { + return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + } + } + + return nil +} + +func isSymbolicLink(path string) (bool, error) { + fi, err := os.Lstat(path) + if err != nil { + return false, err + } + + return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil +} + +// checkHostNs checks whether network sysctl is used in host namespace. +func checkHostNs(sysctlConfig string, path string) error { + var currentProcessNetns = "/proc/self/ns/net" + // readlink on the current processes network namespace + destOfCurrentProcess, err := os.Readlink(currentProcessNetns) + if err != nil { + return fmt.Errorf("read soft link %q error", currentProcessNetns) + } + + // First check if the provided path is a symbolic link + symLink, err := isSymbolicLink(path) + if err != nil { + return fmt.Errorf("could not check that %q is a symlink: %v", path, err) + } + + if symLink == false { + // The provided namespace is not a symbolic link, + // it is not the host namespace. + return nil + } + + // readlink on the path provided in the struct + destOfContainer, err := os.Readlink(path) + if err != nil { + return fmt.Errorf("read soft link %q error", path) + } + if destOfContainer == destOfCurrentProcess { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig) + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go new file mode 100644 index 000000000000..487c630af61c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go @@ -0,0 +1,553 @@ +// +build linux + +package intelrdt + +import ( + "bufio" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* + * About Intel RDT/CAT feature: + * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). + * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 + * Cache is the only resource that is supported in RDT. + * + * This feature provides a way for the software to restrict cache allocation to a + * defined 'subset' of L3 cache which may be overlapping with other 'subsets'. + * The different subsets are identified by class of service (CLOS) and each CLOS + * has a capacity bitmask (CBM). + * + * For more information about Intel RDT/CAT can be found in the section 17.17 + * of Intel Software Developer Manual. + * + * About Intel RDT/CAT kernel interface: + * In Linux 4.10 kernel or newer, the interface is defined and exposed via + * "resource control" filesystem, which is a "cgroup-like" interface. + * + * Comparing with cgroups, it has similar process management lifecycle and + * interfaces in a container. But unlike cgroups' hierarchy, it has single level + * filesystem layout. + * + * Intel RDT "resource control" filesystem hierarchy: + * mount -t resctrl resctrl /sys/fs/resctrl + * tree /sys/fs/resctrl + * /sys/fs/resctrl/ + * |-- info + * | |-- L3 + * | |-- cbm_mask + * | |-- min_cbm_bits + * | |-- num_closids + * |-- cpus + * |-- schemata + * |-- tasks + * |-- + * |-- cpus + * |-- schemata + * |-- tasks + * + * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache + * resource constraints. + * + * The file `tasks` has a list of tasks that belongs to this group (e.g., + * " group). Tasks can be added to a group by writing the task ID + * to the "tasks" file (which will automatically remove them from the previous + * group to which they belonged). New tasks created by fork(2) and clone(2) are + * added to the same group as their parent. If a pid is not in any sub group, it is + * in root group. + * + * The file `schemata` has allocation bitmasks/values for L3 cache on each socket, + * which contains L3 cache id and capacity bitmask (CBM). + * Format: "L3:=;=;..." + * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` + * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + * + * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can + * be set is less than the max bit. The max bits in the CBM is varied among + * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem + * layout, the CBM in a group should be a subset of the CBM in root. Kernel will + * check if it is valid when writing. e.g., 0xfffff in root indicates the max bits + * of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM + * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * + * For more information about Intel RDT/CAT kernel interface: + * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + * + * An example for runc: + * Consider a two-socket machine with two L3 caches where the default CBM is + * 0xfffff and the max CBM length is 20 bits. With this configuration, tasks + * inside the container only have access to the "upper" 80% of L3 cache id 0 and + * the "lower" 50% L3 cache id 1: + * + * "linux": { + * "intelRdt": { + * "l3CacheSchema": "L3:0=ffff0;1=3ff" + * } + * } + */ + +type Manager interface { + // Applies Intel RDT configuration to the process with the specified pid + Apply(pid int) error + + // Returns statistics for Intel RDT + GetStats() (*Stats, error) + + // Destroys the Intel RDT 'container_id' group + Destroy() error + + // Returns Intel RDT path to save in a state file and to be able to + // restore the object later + GetPath() string + + // Set Intel RDT "resource control" filesystem as configured. + Set(container *configs.Config) error +} + +// This implements interface Manager +type IntelRdtManager struct { + mu sync.Mutex + Config *configs.Config + Id string + Path string +} + +const ( + IntelRdtTasks = "tasks" +) + +var ( + // The absolute root path of the Intel RDT "resource control" filesystem + intelRdtRoot string + intelRdtRootLock sync.Mutex + + // The flag to indicate if Intel RDT is supported + isEnabled bool +) + +type intelRdtData struct { + root string + config *configs.Config + pid int +} + +// Check if Intel RDT is enabled in init() +func init() { + // 1. Check if hardware and kernel support Intel RDT/CAT feature + // "cat_l3" flag is set if supported + isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if !isFlagSet || err != nil { + isEnabled = false + return + } + + // 2. Check if Intel RDT "resource control" filesystem is mounted + // The user guarantees to mount the filesystem + isEnabled = isIntelRdtMounted() +} + +// Return the mount point path of Intel RDT "resource control" filesysem +func findIntelRdtMountpointDir() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "Intel RDT" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "resctrl" { + // Check that the mount is properly formated. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + return fields[4], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + + return "", NewNotFoundError("Intel RDT") +} + +// Gets the root path of Intel RDT "resource control" filesystem +func getIntelRdtRoot() (string, error) { + intelRdtRootLock.Lock() + defer intelRdtRootLock.Unlock() + + if intelRdtRoot != "" { + return intelRdtRoot, nil + } + + root, err := findIntelRdtMountpointDir() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + intelRdtRoot = root + return intelRdtRoot, nil +} + +func isIntelRdtMounted() bool { + _, err := getIntelRdtRoot() + if err != nil { + return false + } + + return true +} + +func parseCpuInfoFile(path string) (bool, error) { + f, err := os.Open(path) + if err != nil { + return false, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return false, err + } + + text := s.Text() + flags := strings.Split(text, " ") + + // "cat_l3" flag is set if Intel RDT/CAT is supported + for _, flag := range flags { + if flag == "cat_l3" { + return true, nil + } + } + } + return false, nil +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Gets a single uint64 value from the specified file. +func getIntelRdtParamUint(path, file string) (uint64, error) { + fileName := filepath.Join(path, file) + contents, err := ioutil.ReadFile(fileName) + if err != nil { + return 0, err + } + + res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName) + } + return res, nil +} + +// Gets a string value from the specified file +func getIntelRdtParamString(path, file string) (string, error) { + contents, err := ioutil.ReadFile(filepath.Join(path, file)) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(contents)), nil +} + +func readTasksFile(dir string) ([]int, error) { + f, err := os.Open(filepath.Join(dir, IntelRdtTasks)) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + +func writeFile(dir, file, data string) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", file) + } + if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + } + return nil +} + +func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + return &intelRdtData{ + root: rootPath, + config: c, + pid: pid, + }, nil +} + +// Get the read-only L3 cache information +func getL3CacheInfo() (*L3CacheInfo, error) { + l3CacheInfo := &L3CacheInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return l3CacheInfo, err + } + + path := filepath.Join(rootPath, "info", "L3") + cbmMask, err := getIntelRdtParamString(path, "cbm_mask") + if err != nil { + return l3CacheInfo, err + } + minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits") + if err != nil { + return l3CacheInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return l3CacheInfo, err + } + + l3CacheInfo.CbmMask = cbmMask + l3CacheInfo.MinCbmBits = minCbmBits + l3CacheInfo.NumClosids = numClosids + + return l3CacheInfo, nil +} + +// WriteIntelRdtTasks writes the specified pid into the "tasks" file +func WriteIntelRdtTasks(dir string, pid int) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", IntelRdtTasks) + } + + // Dont attach any pid if -1 is specified as a pid + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) + } + } + return nil +} + +// Check if Intel RDT is enabled +func IsEnabled() bool { + return isEnabled +} + +// Get the 'container_id' path in Intel RDT "resource control" filesystem +func GetIntelRdtPath(id string) (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, id) + return path, nil +} + +// Applies Intel RDT configuration to the process with the specified pid +func (m *IntelRdtManager) Apply(pid int) (err error) { + // If intelRdt is not specified in config, we do nothing + if m.Config.IntelRdt == nil { + return nil + } + d, err := getIntelRdtData(m.Config, pid) + if err != nil && !IsNotFound(err) { + return err + } + + m.mu.Lock() + defer m.mu.Unlock() + path, err := d.join(m.Id) + if err != nil { + return err + } + + m.Path = path + return nil +} + +// Destroys the Intel RDT 'container_id' group +func (m *IntelRdtManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + if err := os.RemoveAll(m.Path); err != nil { + return err + } + m.Path = "" + return nil +} + +// Returns Intel RDT path to save in a state file and to be able to +// restore the object later +func (m *IntelRdtManager) GetPath() string { + if m.Path == "" { + m.Path, _ = GetIntelRdtPath(m.Id) + } + return m.Path +} + +// Returns statistics for Intel RDT +func (m *IntelRdtManager) GetStats() (*Stats, error) { + // If intelRdt is not specified in config + if m.Config.IntelRdt == nil { + return nil, nil + } + + m.mu.Lock() + defer m.mu.Unlock() + stats := NewStats() + + // The read-only L3 cache information + l3CacheInfo, err := getL3CacheInfo() + if err != nil { + return nil, err + } + stats.L3CacheInfo = l3CacheInfo + + // The read-only L3 cache schema in root + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata") + if err != nil { + return nil, err + } + // L3 cache schema is in the first line + schemaRootStrings := strings.Split(tmpRootStrings, "\n") + stats.L3CacheSchemaRoot = schemaRootStrings[0] + + // The L3 cache schema in 'container_id' group + tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") + if err != nil { + return nil, err + } + // L3 cache schema is in the first line + schemaStrings := strings.Split(tmpStrings, "\n") + stats.L3CacheSchema = schemaStrings[0] + + return stats, nil +} + +// Set Intel RDT "resource control" filesystem as configured. +func (m *IntelRdtManager) Set(container *configs.Config) error { + path := m.GetPath() + + // About L3 cache schema file: + // The schema has allocation masks/values for L3 cache on each socket, + // which contains L3 cache id and capacity bitmask (CBM). + // Format: "L3:=;=;..." + // For example, on a two-socket machine, L3's schema line could be: + // L3:0=ff;1=c0 + // Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + // + // About L3 cache CBM validity: + // The valid L3 cache CBM is a *contiguous bits set* and number of + // bits that can be set is less than the max bit. The max bits in the + // CBM is varied among supported Intel Xeon platforms. In Intel RDT + // "resource control" filesystem layout, the CBM in a group should + // be a subset of the CBM in root. Kernel will check if it is valid + // when writing. + // e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, + // which mapping to entire L3 cache capacity. Some valid CBM values + // to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + if container.IntelRdt != nil { + l3CacheSchema := container.IntelRdt.L3CacheSchema + if l3CacheSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema); err != nil { + return err + } + } + } + + return nil +} + +func (raw *intelRdtData) join(id string) (string, error) { + path := filepath.Join(raw.root, id) + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + + if err := WriteIntelRdtTasks(path, raw.pid); err != nil { + return "", err + } + return path, nil +} + +type NotFoundError struct { + ResourceControl string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl) +} + +func NewNotFoundError(res string) error { + return &NotFoundError{ + ResourceControl: res, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := err.(*NotFoundError) + return ok +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go new file mode 100644 index 000000000000..095c0a380cd0 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go @@ -0,0 +1,24 @@ +// +build linux + +package intelrdt + +type L3CacheInfo struct { + CbmMask string `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type Stats struct { + // The read-only L3 cache information + L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` + + // The read-only L3 cache schema in root + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + + // The L3 cache schema in 'container_id' group + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} + +func NewStats() *Stats { + return &Stats{} +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index 2c69cee5d64e..a4cd1399d9eb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -505,7 +505,8 @@ void join_namespaces(char *nslist) ns->fd = fd; ns->ns = nsflag(namespace); - strncpy(ns->path, path, PATH_MAX); + strncpy(ns->path, path, PATH_MAX - 1); + ns->path[PATH_MAX - 1] = '\0'; } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); /* @@ -678,17 +679,15 @@ void nsexec(void) /* * Enable setgroups(2) if we've been asked to. But we also * have to explicitly disable setgroups(2) if we're - * creating a rootless container (this is required since - * Linux 3.19). + * creating a rootless container for single-entry mapping. + * i.e. config.is_setgroup == false. + * (this is required since Linux 3.19). + * + * For rootless multi-entry mapping, config.is_setgroup shall be true and + * newuidmap/newgidmap shall be used. */ - if (config.is_rootless && config.is_setgroup) { - kill(child, SIGKILL); - bail("cannot allow setgroup in an unprivileged user namespace setup"); - } - if (config.is_setgroup) - update_setgroups(child, SETGROUPS_ALLOW); - if (config.is_rootless) + if (config.is_rootless && !config.is_setgroup) update_setgroups(child, SETGROUPS_DENY); /* Set up mappings. */ @@ -809,25 +808,30 @@ void nsexec(void) if (config.namespaces) join_namespaces(config.namespaces); - /* - * Unshare all of the namespaces. Now, it should be noted that this - * ordering might break in the future (especially with rootless - * containers). But for now, it's not possible to split this into - * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. - * - * Note that we don't merge this with clone() because there were - * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) - * was broken, so we'll just do it the long way anyway. - */ - if (unshare(config.cloneflags) < 0) - bail("failed to unshare namespaces"); - /* * Deal with user namespaces first. They are quite special, as they * affect our ability to unshare other namespaces and are used as * context for privilege checks. + * + * We don't unshare all namespaces in one go. The reason for this + * is that, while the kernel documentation may claim otherwise, + * there are certain cases where unsharing all namespaces at once + * will result in namespace objects being owned incorrectly. + * Ideally we should just fix these kernel bugs, but it's better to + * be safe than sorry, and fix them separately. + * + * A specific case of this is that the SELinux label of the + * internal kern-mount that mqueue uses will be incorrect if the + * UTS namespace is cloned before the USER namespace is mapped. + * I've also heard of similar problems with the network namespace + * in some scenarios. This also mirrors how LXC deals with this + * problem. */ if (config.cloneflags & CLONE_NEWUSER) { + if (unshare(CLONE_NEWUSER) < 0) + bail("failed to unshare user namespace"); + config.cloneflags &= ~CLONE_NEWUSER; + /* * We don't have the privileges to do any mapping here (see the * clone_parent rant). So signal our parent to hook us up. @@ -853,8 +857,21 @@ void nsexec(void) if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to set process as dumpable"); } + + /* Become root in the namespace proper. */ + if (setresuid(0, 0, 0) < 0) + bail("failed to become root in user namespace"); } + /* + * Unshare all of the namespaces. Note that we don't merge this + * with clone() because there were some old kernel versions where + * clone(CLONE_PARENT | CLONE_NEWPID) was broken, so we'll just do + * it the long way. + */ + if (unshare(config.cloneflags) < 0) + bail("failed to unshare namespaces"); + /* * TODO: What about non-namespace clone flags that we're dropping here? * diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go new file mode 100644 index 000000000000..ded5a6bbc8bb --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -0,0 +1,76 @@ +package seccomp + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var operators = map[string]configs.Operator{ + "SCMP_CMP_NE": configs.NotEqualTo, + "SCMP_CMP_LT": configs.LessThan, + "SCMP_CMP_LE": configs.LessThanOrEqualTo, + "SCMP_CMP_EQ": configs.EqualTo, + "SCMP_CMP_GE": configs.GreaterThanOrEqualTo, + "SCMP_CMP_GT": configs.GreaterThan, + "SCMP_CMP_MASKED_EQ": configs.MaskEqualTo, +} + +var actions = map[string]configs.Action{ + "SCMP_ACT_KILL": configs.Kill, + "SCMP_ACT_ERRNO": configs.Errno, + "SCMP_ACT_TRAP": configs.Trap, + "SCMP_ACT_ALLOW": configs.Allow, + "SCMP_ACT_TRACE": configs.Trace, +} + +var archs = map[string]string{ + "SCMP_ARCH_X86": "x86", + "SCMP_ARCH_X86_64": "amd64", + "SCMP_ARCH_X32": "x32", + "SCMP_ARCH_ARM": "arm", + "SCMP_ARCH_AARCH64": "arm64", + "SCMP_ARCH_MIPS": "mips", + "SCMP_ARCH_MIPS64": "mips64", + "SCMP_ARCH_MIPS64N32": "mips64n32", + "SCMP_ARCH_MIPSEL": "mipsel", + "SCMP_ARCH_MIPSEL64": "mipsel64", + "SCMP_ARCH_MIPSEL64N32": "mipsel64n32", + "SCMP_ARCH_PPC": "ppc", + "SCMP_ARCH_PPC64": "ppc64", + "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_S390": "s390", + "SCMP_ARCH_S390X": "s390x", +} + +// ConvertStringToOperator converts a string into a Seccomp comparison operator. +// Comparison operators use the names they are assigned by Libseccomp's header. +// Attempting to convert a string that is not a valid operator results in an +// error. +func ConvertStringToOperator(in string) (configs.Operator, error) { + if op, ok := operators[in]; ok == true { + return op, nil + } + return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in) +} + +// ConvertStringToAction converts a string into a Seccomp rule match action. +// Actions use the names they are assigned in Libseccomp's header, though some +// (notable, SCMP_ACT_TRACE) are not available in this implementation and will +// return errors. +// Attempting to convert a string that is not a valid action results in an +// error. +func ConvertStringToAction(in string) (configs.Action, error) { + if act, ok := actions[in]; ok == true { + return act, nil + } + return 0, fmt.Errorf("string %s is not a valid action for seccomp", in) +} + +// ConvertStringToArch converts a string into a Seccomp comparison arch. +func ConvertStringToArch(in string) (string, error) { + if arch, ok := archs[in]; ok == true { + return arch, nil + } + return "", fmt.Errorf("string %s is not a valid arch for seccomp", in) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go new file mode 100644 index 000000000000..d99f3fe640c6 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -0,0 +1,258 @@ +// +build linux,cgo,seccomp + +package seccomp + +import ( + "bufio" + "fmt" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + libseccomp "github.com/seccomp/libseccomp-golang" + + "golang.org/x/sys/unix" +) + +var ( + actAllow = libseccomp.ActAllow + actTrap = libseccomp.ActTrap + actKill = libseccomp.ActKill + actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) + actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) +) + +const ( + // Linux system calls can have at most 6 arguments + syscallMaxArguments int = 6 +) + +// Filters given syscalls in a container, preventing them from being used +// Started in the container init process, and carried over to all child processes +// Setns calls, however, require a separate invocation, as they are not children +// of the init until they join the namespace +func InitSeccomp(config *configs.Seccomp) error { + if config == nil { + return fmt.Errorf("cannot initialize Seccomp - nil config passed") + } + + defaultAction, err := getAction(config.DefaultAction) + if err != nil { + return fmt.Errorf("error initializing seccomp - invalid default action") + } + + filter, err := libseccomp.NewFilter(defaultAction) + if err != nil { + return fmt.Errorf("error creating filter: %s", err) + } + + // Add extra architectures + for _, arch := range config.Architectures { + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + return fmt.Errorf("error validating Seccomp architecture: %s", err) + } + + if err := filter.AddArch(scmpArch); err != nil { + return fmt.Errorf("error adding architecture to seccomp filter: %s", err) + } + } + + // Unset no new privs bit + if err := filter.SetNoNewPrivsBit(false); err != nil { + return fmt.Errorf("error setting no new privileges: %s", err) + } + + // Add a rule for each syscall + for _, call := range config.Syscalls { + if call == nil { + return fmt.Errorf("encountered nil syscall while initializing Seccomp") + } + + if err = matchCall(filter, call); err != nil { + return err + } + } + + if err = filter.Load(); err != nil { + return fmt.Errorf("error loading seccomp filter into kernel: %s", err) + } + + return nil +} + +// IsEnabled returns if the kernel has been configured to support seccomp. +func IsEnabled() bool { + // Try to read from /proc/self/status for kernels > 3.8 + s, err := parseStatusFile("/proc/self/status") + if err != nil { + // Check if Seccomp is supported, via CONFIG_SECCOMP. + if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL { + // Make sure the kernel has CONFIG_SECCOMP_FILTER. + if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL { + return true + } + } + return false + } + _, ok := s["Seccomp"] + return ok +} + +// Convert Libcontainer Action to Libseccomp ScmpAction +func getAction(act configs.Action) (libseccomp.ScmpAction, error) { + switch act { + case configs.Kill: + return actKill, nil + case configs.Errno: + return actErrno, nil + case configs.Trap: + return actTrap, nil + case configs.Allow: + return actAllow, nil + case configs.Trace: + return actTrace, nil + default: + return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule") + } +} + +// Convert Libcontainer Operator to Libseccomp ScmpCompareOp +func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) { + switch op { + case configs.EqualTo: + return libseccomp.CompareEqual, nil + case configs.NotEqualTo: + return libseccomp.CompareNotEqual, nil + case configs.GreaterThan: + return libseccomp.CompareGreater, nil + case configs.GreaterThanOrEqualTo: + return libseccomp.CompareGreaterEqual, nil + case configs.LessThan: + return libseccomp.CompareLess, nil + case configs.LessThanOrEqualTo: + return libseccomp.CompareLessOrEqual, nil + case configs.MaskEqualTo: + return libseccomp.CompareMaskedEqual, nil + default: + return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule") + } +} + +// Convert Libcontainer Arg to Libseccomp ScmpCondition +func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { + cond := libseccomp.ScmpCondition{} + + if arg == nil { + return cond, fmt.Errorf("cannot convert nil to syscall condition") + } + + op, err := getOperator(arg.Op) + if err != nil { + return cond, err + } + + return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo) +} + +// Add a rule to match a single syscall +func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { + if call == nil || filter == nil { + return fmt.Errorf("cannot use nil as syscall to block") + } + + if len(call.Name) == 0 { + return fmt.Errorf("empty string is not a valid syscall") + } + + // If we can't resolve the syscall, assume it's not supported on this kernel + // Ignore it, don't error out + callNum, err := libseccomp.GetSyscallFromName(call.Name) + if err != nil { + return nil + } + + // Convert the call's action to the libseccomp equivalent + callAct, err := getAction(call.Action) + if err != nil { + return fmt.Errorf("action in seccomp profile is invalid: %s", err) + } + + // Unconditional match - just add the rule + if len(call.Args) == 0 { + if err = filter.AddRule(callNum, callAct); err != nil { + return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err) + } + } else { + // If two or more arguments have the same condition, + // Revert to old behavior, adding each condition as a separate rule + argCounts := make([]uint, syscallMaxArguments) + conditions := []libseccomp.ScmpCondition{} + + for _, cond := range call.Args { + newCond, err := getCondition(cond) + if err != nil { + return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err) + } + + argCounts[cond.Index] += 1 + + conditions = append(conditions, newCond) + } + + hasMultipleArgs := false + for _, count := range argCounts { + if count > 1 { + hasMultipleArgs = true + break + } + } + + if hasMultipleArgs { + // Revert to old behavior + // Add each condition attached to a separate rule + for _, cond := range conditions { + condArr := []libseccomp.ScmpCondition{cond} + + if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + } + } + } else { + // No conditions share same argument + // Use new, proper behavior + if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + } + } + } + + return nil +} + +func parseStatusFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + s := bufio.NewScanner(f) + status := make(map[string]string) + + for s.Scan() { + text := s.Text() + parts := strings.Split(text, ":") + + if len(parts) <= 1 { + continue + } + + status[parts[0]] = parts[1] + } + if err := s.Err(); err != nil { + return nil, err + } + + return status, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go new file mode 100644 index 000000000000..44df1ad4c269 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go @@ -0,0 +1,24 @@ +// +build !linux !cgo !seccomp + +package seccomp + +import ( + "errors" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") + +// InitSeccomp does nothing because seccomp is not supported. +func InitSeccomp(config *configs.Seccomp) error { + if config != nil { + return ErrSeccompNotEnabled + } + return nil +} + +// IsEnabled returns false, because it is not supported. +func IsEnabled() bool { + return false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go new file mode 100644 index 000000000000..c113b337f34a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go @@ -0,0 +1,221 @@ +package specconv + +import ( + "os" + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Example returns an example spec file, with many options set so a user can +// see what a standard spec file looks like. +func Example() *specs.Spec { + return &specs.Spec{ + Version: specs.Version, + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, + }, + Process: &specs.Process{ + Terminal: true, + User: specs.User{}, + Args: []string{ + "sh", + }, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: "/", + NoNewPrivileges: true, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Ambient: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: uint64(1024), + Soft: uint64(1024), + }, + }, + }, + Hostname: "runc", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "proc", + Source: "proc", + Options: nil, + }, + { + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/pts", + Type: "devpts", + Source: "devpts", + Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, + }, + { + Destination: "/dev/shm", + Type: "tmpfs", + Source: "shm", + Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, + }, + { + Destination: "/dev/mqueue", + Type: "mqueue", + Source: "mqueue", + Options: []string{"nosuid", "noexec", "nodev"}, + }, + { + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, + }, + }, + Linux: &specs.Linux{ + MaskedPaths: []string{ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi", + }, + ReadonlyPaths: []string{ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + }, + Resources: &specs.LinuxResources{ + Devices: []specs.LinuxDeviceCgroup{ + { + Allow: false, + Access: "rwm", + }, + }, + }, + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, + }, + }, + } +} + +// ToRootless converts the given spec file into one that should work with +// rootless containers, by removing incompatible options and adding others that +// are needed. +func ToRootless(spec *specs.Spec) { + var namespaces []specs.LinuxNamespace + + // Remove networkns from the spec. + for _, ns := range spec.Linux.Namespaces { + switch ns.Type { + case specs.NetworkNamespace, specs.UserNamespace: + // Do nothing. + default: + namespaces = append(namespaces, ns) + } + } + // Add userns to the spec. + namespaces = append(namespaces, specs.LinuxNamespace{ + Type: specs.UserNamespace, + }) + spec.Linux.Namespaces = namespaces + + // Add mappings for the current user. + spec.Linux.UIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Geteuid()), + ContainerID: 0, + Size: 1, + }} + spec.Linux.GIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Getegid()), + ContainerID: 0, + Size: 1, + }} + + // Fix up mounts. + var mounts []specs.Mount + for _, mount := range spec.Mounts { + // Ignore all mounts that are under /sys. + if strings.HasPrefix(mount.Destination, "/sys") { + continue + } + + // Remove all gid= and uid= mappings. + var options []string + for _, option := range mount.Options { + if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") { + options = append(options, option) + } + } + + mount.Options = options + mounts = append(mounts, mount) + } + // Add the sysfs mount as an rbind. + mounts = append(mounts, specs.Mount{ + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }) + spec.Mounts = mounts + + // Remove cgroup settings. + spec.Linux.Resources = nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go new file mode 100644 index 000000000000..1fb4f224da89 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go @@ -0,0 +1,836 @@ +// +build linux + +// Package specconv implements conversion of specifications to libcontainer +// configurations +package specconv + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/seccomp" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + + "golang.org/x/sys/unix" +) + +const wildcard = -1 + +var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ + specs.PIDNamespace: configs.NEWPID, + specs.NetworkNamespace: configs.NEWNET, + specs.MountNamespace: configs.NEWNS, + specs.UserNamespace: configs.NEWUSER, + specs.IPCNamespace: configs.NEWIPC, + specs.UTSNamespace: configs.NEWUTS, +} + +var mountPropagationMapping = map[string]int{ + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "private": unix.MS_PRIVATE, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "slave": unix.MS_SLAVE, + "rshared": unix.MS_SHARED | unix.MS_REC, + "shared": unix.MS_SHARED, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + "unbindable": unix.MS_UNBINDABLE, + "": 0, +} + +var allowedDevices = []*configs.Device{ + // allow mknod for any device + { + Type: 'c', + Major: wildcard, + Minor: wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: 'b', + Major: wildcard, + Minor: wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/null", + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/random", + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/full", + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/tty", + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/zero", + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/urandom", + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, + { + Path: "/dev/console", + Type: 'c', + Major: 5, + Minor: 1, + Permissions: "rwm", + Allow: true, + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + Major: 136, + Minor: wildcard, + Permissions: "rwm", + Allow: true, + }, + { + Path: "", + Type: 'c', + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, + // tuntap + { + Path: "", + Type: 'c', + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, +} + +type CreateOpts struct { + CgroupName string + UseSystemdCgroup bool + NoPivotRoot bool + NoNewKeyring bool + Spec *specs.Spec + Rootless bool +} + +// CreateLibcontainerConfig creates a new libcontainer configuration from a +// given specification and a cgroup name +func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { + // runc's cwd will always be the bundle path + rcwd, err := os.Getwd() + if err != nil { + return nil, err + } + cwd, err := filepath.Abs(rcwd) + if err != nil { + return nil, err + } + spec := opts.Spec + if spec.Root == nil { + return nil, fmt.Errorf("Root must be specified") + } + rootfsPath := spec.Root.Path + if !filepath.IsAbs(rootfsPath) { + rootfsPath = filepath.Join(cwd, rootfsPath) + } + labels := []string{} + for k, v := range spec.Annotations { + labels = append(labels, fmt.Sprintf("%s=%s", k, v)) + } + config := &configs.Config{ + Rootfs: rootfsPath, + NoPivotRoot: opts.NoPivotRoot, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), + NoNewKeyring: opts.NoNewKeyring, + Rootless: opts.Rootless, + } + + exists := false + for _, m := range spec.Mounts { + config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m)) + } + if err := createDevices(spec, config); err != nil { + return nil, err + } + c, err := createCgroupConfig(opts) + if err != nil { + return nil, err + } + config.Cgroups = c + // set linux-specific config + if spec.Linux != nil { + if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { + return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) + } + if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) { + return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root") + } + + for _, ns := range spec.Linux.Namespaces { + t, exists := namespaceMapping[ns.Type] + if !exists { + return nil, fmt.Errorf("namespace %q does not exist", ns) + } + if config.Namespaces.Contains(t) { + return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns) + } + config.Namespaces.Add(t, ns.Path) + } + if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" { + config.Networks = []*configs.Network{ + { + Type: "loopback", + }, + } + } + if config.Namespaces.Contains(configs.NEWUSER) { + if err := setupUserNamespace(spec, config); err != nil { + return nil, err + } + } + config.MaskPaths = spec.Linux.MaskedPaths + config.ReadonlyPaths = spec.Linux.ReadonlyPaths + config.MountLabel = spec.Linux.MountLabel + config.Sysctl = spec.Linux.Sysctl + if spec.Linux.Seccomp != nil { + seccomp, err := SetupSeccomp(spec.Linux.Seccomp) + if err != nil { + return nil, err + } + config.Seccomp = seccomp + } + } + if spec.Process.SelinuxLabel != "" { + config.ProcessLabel = spec.Process.SelinuxLabel + } + if spec.Process != nil { + config.OomScoreAdj = spec.Process.OOMScoreAdj + } + if spec.Process.Capabilities != nil { + config.Capabilities = &configs.Capabilities{ + Bounding: spec.Process.Capabilities.Bounding, + Effective: spec.Process.Capabilities.Effective, + Permitted: spec.Process.Capabilities.Permitted, + Inheritable: spec.Process.Capabilities.Inheritable, + Ambient: spec.Process.Capabilities.Ambient, + } + } + createHooks(spec, config) + config.Version = specs.Version + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + } + return config, nil +} + +func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { + flags, pgflags, data, ext := parseMountOptions(m.Options) + source := m.Source + device := m.Type + if flags|unix.MS_BIND != 0 { + if device == "" { + device = "bind" + } + if !filepath.IsAbs(source) { + source = filepath.Join(cwd, m.Source) + } + } + return &configs.Mount{ + Device: device, + Source: source, + Destination: m.Destination, + Data: data, + Flags: flags, + PropagationFlags: pgflags, + Extensions: ext, + } +} + +func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { + var ( + myCgroupPath string + + spec = opts.Spec + useSystemdCgroup = opts.UseSystemdCgroup + name = opts.CgroupName + ) + + c := &configs.Cgroup{ + Resources: &configs.Resources{}, + } + + if spec.Linux != nil && spec.Linux.CgroupsPath != "" { + myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath) + if useSystemdCgroup { + myCgroupPath = spec.Linux.CgroupsPath + } + } + + if useSystemdCgroup { + if myCgroupPath == "" { + c.Parent = "system.slice" + c.ScopePrefix = "runc" + c.Name = name + } else { + // Parse the path from expected "slice:prefix:name" + // for e.g. "system.slice:docker:1234" + parts := strings.Split(myCgroupPath, ":") + if len(parts) != 3 { + return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups") + } + c.Parent = parts[0] + c.ScopePrefix = parts[1] + c.Name = parts[2] + } + } else { + if myCgroupPath == "" { + c.Name = name + } + c.Path = myCgroupPath + } + + // In rootless containers, any attempt to make cgroup changes will fail. + // libcontainer will validate this and we shouldn't add any cgroup options + // the user didn't specify. + if !opts.Rootless { + c.Resources.AllowedDevices = allowedDevices + } + if spec.Linux != nil { + r := spec.Linux.Resources + if r == nil { + return c, nil + } + for i, d := range spec.Linux.Resources.Devices { + var ( + t = "a" + major = int64(-1) + minor = int64(-1) + ) + if d.Type != "" { + t = d.Type + } + if d.Major != nil { + major = *d.Major + } + if d.Minor != nil { + minor = *d.Minor + } + if d.Access == "" { + return nil, fmt.Errorf("device access at %d field cannot be empty", i) + } + dt, err := stringToCgroupDeviceRune(t) + if err != nil { + return nil, err + } + dd := &configs.Device{ + Type: dt, + Major: major, + Minor: minor, + Permissions: d.Access, + Allow: d.Allow, + } + c.Resources.Devices = append(c.Resources.Devices, dd) + } + if r.Memory != nil { + if r.Memory.Limit != nil { + c.Resources.Memory = *r.Memory.Limit + } + if r.Memory.Reservation != nil { + c.Resources.MemoryReservation = *r.Memory.Reservation + } + if r.Memory.Swap != nil { + c.Resources.MemorySwap = *r.Memory.Swap + } + if r.Memory.Kernel != nil { + c.Resources.KernelMemory = *r.Memory.Kernel + } + if r.Memory.KernelTCP != nil { + c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP + } + if r.Memory.Swappiness != nil { + c.Resources.MemorySwappiness = r.Memory.Swappiness + } + if r.Memory.DisableOOMKiller != nil { + c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller + } + } + if r.CPU != nil { + if r.CPU.Shares != nil { + c.Resources.CpuShares = *r.CPU.Shares + } + if r.CPU.Quota != nil { + c.Resources.CpuQuota = *r.CPU.Quota + } + if r.CPU.Period != nil { + c.Resources.CpuPeriod = *r.CPU.Period + } + if r.CPU.RealtimeRuntime != nil { + c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime + } + if r.CPU.RealtimePeriod != nil { + c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod + } + if r.CPU.Cpus != "" { + c.Resources.CpusetCpus = r.CPU.Cpus + } + if r.CPU.Mems != "" { + c.Resources.CpusetMems = r.CPU.Mems + } + } + if r.Pids != nil { + c.Resources.PidsLimit = r.Pids.Limit + } + if r.BlockIO != nil { + if r.BlockIO.Weight != nil { + c.Resources.BlkioWeight = *r.BlockIO.Weight + } + if r.BlockIO.LeafWeight != nil { + c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight + } + if r.BlockIO.WeightDevice != nil { + for _, wd := range r.BlockIO.WeightDevice { + var weight, leafWeight uint16 + if wd.Weight != nil { + weight = *wd.Weight + } + if wd.LeafWeight != nil { + leafWeight = *wd.LeafWeight + } + weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) + c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) + } + } + if r.BlockIO.ThrottleReadBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleReadBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleReadIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleReadIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) + } + } + } + for _, l := range r.HugepageLimits { + c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ + Pagesize: l.Pagesize, + Limit: l.Limit, + }) + } + if r.Network != nil { + if r.Network.ClassID != nil { + c.Resources.NetClsClassid = *r.Network.ClassID + } + for _, m := range r.Network.Priorities { + c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ + Interface: m.Name, + Priority: int64(m.Priority), + }) + } + } + } + if !opts.Rootless { + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + } + return c, nil +} + +func stringToCgroupDeviceRune(s string) (rune, error) { + switch s { + case "a": + return 'a', nil + case "b": + return 'b', nil + case "c": + return 'c', nil + default: + return 0, fmt.Errorf("invalid cgroup device type %q", s) + } +} + +func stringToDeviceRune(s string) (rune, error) { + switch s { + case "p": + return 'p', nil + case "u": + return 'u', nil + case "b": + return 'b', nil + case "c": + return 'c', nil + default: + return 0, fmt.Errorf("invalid device type %q", s) + } +} + +func createDevices(spec *specs.Spec, config *configs.Config) error { + // add whitelisted devices + config.Devices = []*configs.Device{ + { + Type: 'c', + Path: "/dev/null", + Major: 1, + Minor: 3, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/random", + Major: 1, + Minor: 8, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/full", + Major: 1, + Minor: 7, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/tty", + Major: 5, + Minor: 0, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/zero", + Major: 1, + Minor: 5, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/urandom", + Major: 1, + Minor: 9, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + } + // merge in additional devices from the spec + if spec.Linux != nil { + for _, d := range spec.Linux.Devices { + var uid, gid uint32 + var filemode os.FileMode = 0666 + + if d.UID != nil { + uid = *d.UID + } + if d.GID != nil { + gid = *d.GID + } + dt, err := stringToDeviceRune(d.Type) + if err != nil { + return err + } + if d.FileMode != nil { + filemode = *d.FileMode + } + device := &configs.Device{ + Type: dt, + Path: d.Path, + Major: d.Major, + Minor: d.Minor, + FileMode: filemode, + Uid: uid, + Gid: gid, + } + config.Devices = append(config.Devices, device) + } + } + return nil +} + +func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { + create := func(m specs.LinuxIDMapping) configs.IDMap { + return configs.IDMap{ + HostID: int(m.HostID), + ContainerID: int(m.ContainerID), + Size: int(m.Size), + } + } + if spec.Linux != nil { + for _, m := range spec.Linux.UIDMappings { + config.UidMappings = append(config.UidMappings, create(m)) + } + for _, m := range spec.Linux.GIDMappings { + config.GidMappings = append(config.GidMappings, create(m)) + } + } + rootUID, err := config.HostRootUID() + if err != nil { + return err + } + rootGID, err := config.HostRootGID() + if err != nil { + return err + } + for _, node := range config.Devices { + node.Uid = uint32(rootUID) + node.Gid = uint32(rootGID) + } + return nil +} + +// parseMountOptions parses the string and returns the flags, propagation +// flags and any mount data that it contains. +func parseMountOptions(options []string) (int, []int, string, int) { + var ( + flag int + pgflag []int + data []string + extFlags int + ) + flags := map[string]struct { + clear bool + flag int + }{ + "acl": {false, unix.MS_POSIXACL}, + "async": {true, unix.MS_SYNCHRONOUS}, + "atime": {true, unix.MS_NOATIME}, + "bind": {false, unix.MS_BIND}, + "defaults": {false, 0}, + "dev": {true, unix.MS_NODEV}, + "diratime": {true, unix.MS_NODIRATIME}, + "dirsync": {false, unix.MS_DIRSYNC}, + "exec": {true, unix.MS_NOEXEC}, + "iversion": {false, unix.MS_I_VERSION}, + "lazytime": {false, unix.MS_LAZYTIME}, + "loud": {true, unix.MS_SILENT}, + "mand": {false, unix.MS_MANDLOCK}, + "noacl": {true, unix.MS_POSIXACL}, + "noatime": {false, unix.MS_NOATIME}, + "nodev": {false, unix.MS_NODEV}, + "nodiratime": {false, unix.MS_NODIRATIME}, + "noexec": {false, unix.MS_NOEXEC}, + "noiversion": {true, unix.MS_I_VERSION}, + "nolazytime": {true, unix.MS_LAZYTIME}, + "nomand": {true, unix.MS_MANDLOCK}, + "norelatime": {true, unix.MS_RELATIME}, + "nostrictatime": {true, unix.MS_STRICTATIME}, + "nosuid": {false, unix.MS_NOSUID}, + "rbind": {false, unix.MS_BIND | unix.MS_REC}, + "relatime": {false, unix.MS_RELATIME}, + "remount": {false, unix.MS_REMOUNT}, + "ro": {false, unix.MS_RDONLY}, + "rw": {true, unix.MS_RDONLY}, + "silent": {false, unix.MS_SILENT}, + "strictatime": {false, unix.MS_STRICTATIME}, + "suid": {true, unix.MS_NOSUID}, + "sync": {false, unix.MS_SYNCHRONOUS}, + } + propagationFlags := map[string]int{ + "private": unix.MS_PRIVATE, + "shared": unix.MS_SHARED, + "slave": unix.MS_SLAVE, + "unbindable": unix.MS_UNBINDABLE, + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "rshared": unix.MS_SHARED | unix.MS_REC, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + } + extensionFlags := map[string]struct { + clear bool + flag int + }{ + "tmpcopyup": {false, configs.EXT_COPYUP}, + } + for _, o := range options { + // If the option does not exist in the flags table or the flag + // is not supported on the platform, + // then it is a data value for a specific fs type + if f, exists := flags[o]; exists && f.flag != 0 { + if f.clear { + flag &= ^f.flag + } else { + flag |= f.flag + } + } else if f, exists := propagationFlags[o]; exists && f != 0 { + pgflag = append(pgflag, f) + } else if f, exists := extensionFlags[o]; exists && f.flag != 0 { + if f.clear { + extFlags &= ^f.flag + } else { + extFlags |= f.flag + } + } else { + data = append(data, o) + } + } + return flag, pgflag, strings.Join(data, ","), extFlags +} + +func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { + if config == nil { + return nil, nil + } + + // No default action specified, no syscalls listed, assume seccomp disabled + if config.DefaultAction == "" && len(config.Syscalls) == 0 { + return nil, nil + } + + newConfig := new(configs.Seccomp) + newConfig.Syscalls = []*configs.Syscall{} + + if len(config.Architectures) > 0 { + newConfig.Architectures = []string{} + for _, arch := range config.Architectures { + newArch, err := seccomp.ConvertStringToArch(string(arch)) + if err != nil { + return nil, err + } + newConfig.Architectures = append(newConfig.Architectures, newArch) + } + } + + // Convert default action from string representation + newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction)) + if err != nil { + return nil, err + } + newConfig.DefaultAction = newDefaultAction + + // Loop through all syscall blocks and convert them to libcontainer format + for _, call := range config.Syscalls { + newAction, err := seccomp.ConvertStringToAction(string(call.Action)) + if err != nil { + return nil, err + } + + for _, name := range call.Names { + newCall := configs.Syscall{ + Name: name, + Action: newAction, + Args: []*configs.Arg{}, + } + // Loop through all the arguments of the syscall and convert them + for _, arg := range call.Args { + newOp, err := seccomp.ConvertStringToOperator(string(arg.Op)) + if err != nil { + return nil, err + } + + newArg := configs.Arg{ + Index: arg.Index, + Value: arg.Value, + ValueTwo: arg.ValueTwo, + Op: newOp, + } + + newCall.Args = append(newCall.Args, &newArg) + } + newConfig.Syscalls = append(newConfig.Syscalls, &newCall) + } + } + + return newConfig, nil +} + +func createHooks(rspec *specs.Spec, config *configs.Config) { + config.Hooks = &configs.Hooks{} + if rspec.Hooks != nil { + + for _, h := range rspec.Hooks.Prestart { + cmd := createCommandHook(h) + config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststart { + cmd := createCommandHook(h) + config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststop { + cmd := createCommandHook(h) + config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd)) + } + } +} + +func createCommandHook(h specs.Hook) configs.Command { + cmd := configs.Command{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + } + if h.Timeout != nil { + d := time.Duration(*h.Timeout) * time.Second + cmd.Timeout = &d + } + return cmd +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index 5f124cd8bbc7..8d353d984be4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -3,13 +3,12 @@ package system import ( - "bufio" - "fmt" "os" "os/exec" "syscall" // only for exec "unsafe" + "github.com/opencontainers/runc/libcontainer/user" "golang.org/x/sys/unix" ) @@ -102,34 +101,43 @@ func Setctty() error { } // RunningInUserNS detects whether we are currently running in a user namespace. -// Copied from github.com/lxc/lxd/shared/util.go +// Originally copied from github.com/lxc/lxd/shared/util.go func RunningInUserNS() bool { - file, err := os.Open("/proc/self/uid_map") + uidmap, err := user.CurrentProcessUIDMap() if err != nil { // This kernel-provided file only exists if user namespaces are supported return false } - defer file.Close() - - buf := bufio.NewReader(file) - l, _, err := buf.ReadLine() - if err != nil { - return false - } + return UIDMapInUserNS(uidmap) +} - line := string(l) - var a, b, c int64 - fmt.Sscanf(line, "%d %d %d", &a, &b, &c) +func UIDMapInUserNS(uidmap []user.IDMap) bool { /* * We assume we are in the initial user namespace if we have a full * range - 4294967295 uids starting at uid 0. */ - if a == 0 && b == 0 && c == 4294967295 { + if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { return false } return true } +// GetParentNSeuid returns the euid within the parent user namespace +func GetParentNSeuid() int { + euid := os.Geteuid() + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return euid + } + for _, um := range uidmap { + if um.ID <= euid && euid <= um.ID+um.Count-1 { + return um.ParentID + euid - um.ID + } + } + return euid +} + // SetSubreaper sets the value i as the subreaper setting for the calling process func SetSubreaper(i int) error { return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go index e7cfd62b293c..b94be74a6648 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go @@ -2,8 +2,26 @@ package system +import ( + "os" + + "github.com/opencontainers/runc/libcontainer/user" +) + // RunningInUserNS is a stub for non-Linux systems // Always returns false func RunningInUserNS() bool { return false } + +// UIDMapInUserNS is a stub for non-Linux systems +// Always returns false +func UIDMapInUserNS(uidmap []user.IDMap) bool { + return false +} + +// GetParentNSeuid returns the euid within the parent user namespace +// Always returns os.Geteuid on non-linux +func GetParentNSeuid() int { + return os.Geteuid() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go index c45e30041185..c1e634c949db 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go @@ -114,3 +114,29 @@ func CurrentUser() (User, error) { func CurrentGroup() (Group, error) { return LookupGid(unix.Getgid()) } + +func CurrentUserSubUIDs() ([]SubID, error) { + u, err := CurrentUser() + if err != nil { + return nil, err + } + return ParseSubIDFileFilter("/etc/subuid", + func(entry SubID) bool { return entry.Name == u.Name }) +} + +func CurrentGroupSubGIDs() ([]SubID, error) { + g, err := CurrentGroup() + if err != nil { + return nil, err + } + return ParseSubIDFileFilter("/etc/subgid", + func(entry SubID) bool { return entry.Name == g.Name }) +} + +func CurrentProcessUIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/uid_map") +} + +func CurrentProcessGIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/gid_map") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index 93414516caa8..37993da83313 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -75,12 +75,29 @@ func groupFromOS(g *user.Group) (Group, error) { return newGroup, nil } +// SubID represents an entry in /etc/sub{u,g}id +type SubID struct { + Name string + SubID int + Count int +} + +// IDMap represents an entry in /proc/PID/{u,g}id_map +type IDMap struct { + ID int + ParentID int + Count int +} + func parseLine(line string, v ...interface{}) { - if line == "" { + parseParts(strings.Split(line, ":"), v...) +} + +func parseParts(parts []string, v ...interface{}) { + if len(parts) == 0 { return } - parts := strings.Split(line, ":") for i, p := range parts { // Ignore cases where we don't have enough fields to populate the arguments. // Some configuration files like to misbehave. @@ -479,3 +496,111 @@ func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int } return GetAdditionalGroups(additionalGroups, group) } + +func ParseSubIDFile(path string) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubID(subid) +} + +func ParseSubID(subid io.Reader) ([]SubID, error) { + return ParseSubIDFilter(subid, nil) +} + +func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubIDFilter(subid, filter) +} + +func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { + if r == nil { + return nil, fmt.Errorf("nil source for subid-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []SubID{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 subuid + p := SubID{} + parseLine(line, &p.Name, &p.SubID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseIDMapFile(path string) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMap(r) +} + +func ParseIDMap(r io.Reader) ([]IDMap, error) { + return ParseIDMapFilter(r, nil) +} + +func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMapFilter(r, filter) +} + +func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { + if r == nil { + return nil, fmt.Errorf("nil source for idmap-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []IDMap{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 7 user_namespaces + p := IDMap{} + parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go new file mode 100644 index 000000000000..c8a9364d54d7 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go @@ -0,0 +1,93 @@ +// +build linux + +package utils + +/* + * Copyright 2016, 2017 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +// MaxSendfdLen is the maximum length of the name of a file descriptor being +// sent using SendFd. The name of the file handle returned by RecvFd will never +// be larger than this value. +const MaxNameLen = 4096 + +// oobSpace is the size of the oob slice required to store a single FD. Note +// that unix.UnixRights appears to make the assumption that fd is always int32, +// so sizeof(fd) = 4. +var oobSpace = unix.CmsgSpace(4) + +// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// socket. The file name of the remote file descriptor will be recreated +// locally (it is sent as non-auxiliary data in the same payload). +func RecvFd(socket *os.File) (*os.File, error) { + // For some reason, unix.Recvmsg uses the length rather than the capacity + // when passing the msg_controllen and other attributes to recvmsg. So we + // have to actually set the length. + name := make([]byte, MaxNameLen) + oob := make([]byte, oobSpace) + + sockfd := socket.Fd() + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + if err != nil { + return nil, err + } + + if n >= MaxNameLen || oobn != oobSpace { + return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + } + + // Truncate. + name = name[:n] + oob = oob[:oobn] + + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return nil, err + } + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + } + scm := scms[0] + + fds, err := unix.ParseUnixRights(&scm) + if err != nil { + return nil, err + } + if len(fds) != 1 { + return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) + } + fd := uintptr(fds[0]) + + return os.NewFile(fd, string(name)), nil +} + +// SendFd sends a file descriptor over the given AF_UNIX socket. In +// addition, the file.Name() of the given file will also be sent as +// non-auxiliary data in the same payload (allowing to send contextual +// information for a file descriptor). +func SendFd(socket *os.File, name string, fd uintptr) error { + if len(name) >= MaxNameLen { + return fmt.Errorf("sendfd: filename too long: %s", name) + } + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go new file mode 100644 index 000000000000..baa54c9ba2ce --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -0,0 +1,127 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "encoding/json" + "io" + "os" + "path/filepath" + "strings" + "unsafe" + + "golang.org/x/sys/unix" +) + +const ( + exitSignalOffset = 128 +) + +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + if size > 64 { + size = 64 + } + return prefix + hex.EncodeToString(id)[:size], nil +} + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +// ExitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func ExitStatus(status unix.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} + +// WriteJSON writes the provided struct v to w using standard json marshaling +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} + +// CleanPath makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// SearchLabels searches a list of key-value pairs for the provided key and +// returns the corresponding value. The pairs must be separated with '='. +func SearchLabels(labels []string, query string) string { + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == query { + return parts[1] + } + } + return "" +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontainer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +} + +func GetIntSize() int { + return int(unsafe.Sizeof(1)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go new file mode 100644 index 000000000000..c96088988a6d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -0,0 +1,44 @@ +// +build !windows + +package utils + +import ( + "io/ioutil" + "os" + "strconv" + + "golang.org/x/sys/unix" +) + +func CloseExecFrom(minFd int) error { + fdList, err := ioutil.ReadDir("/proc/self/fd") + if err != nil { + return err + } + for _, fi := range fdList { + fd, err := strconv.Atoi(fi.Name()) + if err != nil { + // ignore non-numeric file names + continue + } + + if fd < minFd { + // ignore descriptors lower than our specified minimum + continue + } + + // intentionally ignore errors from unix.CloseOnExec + unix.CloseOnExec(fd) + // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) + } + return nil +} + +// NewSockPair returns a new unix socket pair +func NewSockPair(name string) (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil +} diff --git a/vendor/github.com/opencontainers/selinux/LICENSE b/vendor/github.com/opencontainers/selinux/LICENSE new file mode 100644 index 000000000000..8dada3edaf50 --- /dev/null +++ b/vendor/github.com/opencontainers/selinux/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/opencontainers/selinux/README.md b/vendor/github.com/opencontainers/selinux/README.md new file mode 100644 index 000000000000..043a9293718f --- /dev/null +++ b/vendor/github.com/opencontainers/selinux/README.md @@ -0,0 +1,7 @@ +# selinux + +[![GoDoc](https://godoc.org/github.com/opencontainers/selinux?status.svg)](https://godoc.org/github.com/opencontainers/selinux) [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/selinux)](https://goreportcard.com/report/github.com/opencontainers/selinux) [![Build Status](https://travis-ci.org/opencontainers/selinux.svg?branch=master)](https://travis-ci.org/opencontainers/selinux) + +Common SELinux package used across the container ecosystem. + +Please see the [godoc](https://godoc.org/github.com/opencontainers/selinux) for more information. diff --git a/vendor/github.com/opencontainers/selinux/go-selinux/selinux.go b/vendor/github.com/opencontainers/selinux/go-selinux/selinux.go new file mode 100644 index 000000000000..17ba2c556144 --- /dev/null +++ b/vendor/github.com/opencontainers/selinux/go-selinux/selinux.go @@ -0,0 +1,688 @@ +// +build linux + +package selinux + +import ( + "bufio" + "bytes" + "crypto/rand" + "encoding/binary" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "sync" + "syscall" +) + +const ( + // Enforcing constant indicate SELinux is in enforcing mode + Enforcing = 1 + // Permissive constant to indicate SELinux is in permissive mode + Permissive = 0 + // Disabled constant to indicate SELinux is disabled + Disabled = -1 + selinuxDir = "/etc/selinux/" + selinuxConfig = selinuxDir + "config" + selinuxfsMount = "/sys/fs/selinux" + selinuxTypeTag = "SELINUXTYPE" + selinuxTag = "SELINUX" + xattrNameSelinux = "security.selinux" + stRdOnly = 0x01 + selinuxfsMagic = 0xf97cff8c +) + +type selinuxState struct { + enabledSet bool + enabled bool + selinuxfsSet bool + selinuxfs string + mcsList map[string]bool + sync.Mutex +} + +var ( + assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) + state = selinuxState{ + mcsList: make(map[string]bool), + } +) + +// Context is a representation of the SELinux label broken into 4 parts +type Context map[string]string + +func (s *selinuxState) setEnable(enabled bool) bool { + s.Lock() + defer s.Unlock() + s.enabledSet = true + s.enabled = enabled + return s.enabled +} + +func (s *selinuxState) getEnabled() bool { + s.Lock() + enabled := s.enabled + enabledSet := s.enabledSet + s.Unlock() + if enabledSet { + return enabled + } + + enabled = false + if fs := getSelinuxMountPoint(); fs != "" { + if con, _ := CurrentLabel(); con != "kernel" { + enabled = true + } + } + return s.setEnable(enabled) +} + +// SetDisabled disables selinux support for the package +func SetDisabled() { + state.setEnable(false) +} + +func (s *selinuxState) setSELinuxfs(selinuxfs string) string { + s.Lock() + defer s.Unlock() + s.selinuxfsSet = true + s.selinuxfs = selinuxfs + return s.selinuxfs +} + +func verifySELinuxfsMount(mnt string) bool { + var buf syscall.Statfs_t + for { + err := syscall.Statfs(mnt, &buf) + if err == nil { + break + } + if err == syscall.EAGAIN { + continue + } + return false + } + if uint32(buf.Type) != uint32(selinuxfsMagic) { + return false + } + if (buf.Flags & stRdOnly) != 0 { + return false + } + + return true +} + +func findSELinuxfs() string { + // fast path: check the default mount first + if verifySELinuxfsMount(selinuxfsMount) { + return selinuxfsMount + } + + // check if selinuxfs is available before going the slow path + fs, err := ioutil.ReadFile("/proc/filesystems") + if err != nil { + return "" + } + if !bytes.Contains(fs, []byte("\tselinuxfs\n")) { + return "" + } + + // slow path: try to find among the mounts + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "" + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for { + mnt := findSELinuxfsMount(scanner) + if mnt == "" { // error or not found + return "" + } + if verifySELinuxfsMount(mnt) { + return mnt + } + } +} + +// findSELinuxfsMount returns a next selinuxfs mount point found, +// if there is one, or an empty string in case of EOF or error. +func findSELinuxfsMount(s *bufio.Scanner) string { + for s.Scan() { + txt := s.Text() + // The first field after - is fs type. + // Safe as spaces in mountpoints are encoded as \040 + if !strings.Contains(txt, " - selinuxfs ") { + continue + } + const mPos = 5 // mount point is 5th field + fields := strings.SplitN(txt, " ", mPos+1) + if len(fields) < mPos+1 { + continue + } + return fields[mPos-1] + } + + return "" +} + +func (s *selinuxState) getSELinuxfs() string { + s.Lock() + selinuxfs := s.selinuxfs + selinuxfsSet := s.selinuxfsSet + s.Unlock() + if selinuxfsSet { + return selinuxfs + } + + return s.setSELinuxfs(findSELinuxfs()) +} + +// getSelinuxMountPoint returns the path to the mountpoint of an selinuxfs +// filesystem or an empty string if no mountpoint is found. Selinuxfs is +// a proc-like pseudo-filesystem that exposes the selinux policy API to +// processes. The existence of an selinuxfs mount is used to determine +// whether selinux is currently enabled or not. +func getSelinuxMountPoint() string { + return state.getSELinuxfs() +} + +// GetEnabled returns whether selinux is currently enabled. +func GetEnabled() bool { + return state.getEnabled() +} + +func readConfig(target string) (value string) { + var ( + val, key string + bufin *bufio.Reader + ) + + in, err := os.Open(selinuxConfig) + if err != nil { + return "" + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err != io.EOF { + return "" + } + done = true + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == target { + return strings.Trim(val, "\"") + } + } + } + return "" +} + +func getSELinuxPolicyRoot() string { + return selinuxDir + readConfig(selinuxTypeTag) +} + +func readCon(name string) (string, error) { + var val string + + in, err := os.Open(name) + if err != nil { + return "", err + } + defer in.Close() + + _, err = fmt.Fscanf(in, "%s", &val) + return strings.Trim(val, "\x00"), err +} + +// SetFileLabel sets the SELinux label for this path or returns an error. +func SetFileLabel(path string, label string) error { + return lsetxattr(path, xattrNameSelinux, []byte(label), 0) +} + +// FileLabel returns the SELinux label for this path or returns an error. +func FileLabel(path string) (string, error) { + label, err := lgetxattr(path, xattrNameSelinux) + if err != nil { + return "", err + } + // Trim the NUL byte at the end of the byte buffer, if present. + if len(label) > 0 && label[len(label)-1] == '\x00' { + label = label[:len(label)-1] + } + return string(label), nil +} + +/* +SetFSCreateLabel tells kernel the label to create all file system objects +created by this task. Setting label="" to return to default. +*/ +func SetFSCreateLabel(label string) error { + return writeCon(fmt.Sprintf("/proc/self/task/%d/attr/fscreate", syscall.Gettid()), label) +} + +/* +FSCreateLabel returns the default label the kernel which the kernel is using +for file system objects created by this task. "" indicates default. +*/ +func FSCreateLabel() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/fscreate", syscall.Gettid())) +} + +// CurrentLabel returns the SELinux label of the current process thread, or an error. +func CurrentLabel() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/current", syscall.Gettid())) +} + +// PidLabel returns the SELinux label of the given pid, or an error. +func PidLabel(pid int) (string, error) { + return readCon(fmt.Sprintf("/proc/%d/attr/current", pid)) +} + +/* +ExecLabel returns the SELinux label that the kernel will use for any programs +that are executed by the current process thread, or an error. +*/ +func ExecLabel() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid())) +} + +func writeCon(name string, val string) error { + out, err := os.OpenFile(name, os.O_WRONLY, 0) + if err != nil { + return err + } + defer out.Close() + + if val != "" { + _, err = out.Write([]byte(val)) + } else { + _, err = out.Write(nil) + } + return err +} + +/* +CanonicalizeContext takes a context string and writes it to the kernel +the function then returns the context that the kernel will use. This function +can be used to see if two contexts are equivalent +*/ +func CanonicalizeContext(val string) (string, error) { + return readWriteCon(filepath.Join(getSelinuxMountPoint(), "context"), val) +} + +func readWriteCon(name string, val string) (string, error) { + var retval string + f, err := os.OpenFile(name, os.O_RDWR, 0) + if err != nil { + return "", err + } + defer f.Close() + + _, err = f.Write([]byte(val)) + if err != nil { + return "", err + } + + _, err = fmt.Fscanf(f, "%s", &retval) + return strings.Trim(retval, "\x00"), err +} + +/* +SetExecLabel sets the SELinux label that the kernel will use for any programs +that are executed by the current process thread, or an error. +*/ +func SetExecLabel(label string) error { + return writeCon(fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid()), label) +} + +// Get returns the Context as a string +func (c Context) Get() string { + if c["level"] != "" { + return fmt.Sprintf("%s:%s:%s:%s", c["user"], c["role"], c["type"], c["level"]) + } + return fmt.Sprintf("%s:%s:%s", c["user"], c["role"], c["type"]) +} + +// NewContext creates a new Context struct from the specified label +func NewContext(label string) Context { + c := make(Context) + + if len(label) != 0 { + con := strings.SplitN(label, ":", 4) + c["user"] = con[0] + c["role"] = con[1] + c["type"] = con[2] + if len(con) > 3 { + c["level"] = con[3] + } + } + return c +} + +// ReserveLabel reserves the MLS/MCS level component of the specified label +func ReserveLabel(label string) { + if len(label) != 0 { + con := strings.SplitN(label, ":", 4) + if len(con) > 3 { + mcsAdd(con[3]) + } + } +} + +func selinuxEnforcePath() string { + return fmt.Sprintf("%s/enforce", getSelinuxMountPoint()) +} + +// EnforceMode returns the current SELinux mode Enforcing, Permissive, Disabled +func EnforceMode() int { + var enforce int + + enforceS, err := readCon(selinuxEnforcePath()) + if err != nil { + return -1 + } + + enforce, err = strconv.Atoi(string(enforceS)) + if err != nil { + return -1 + } + return enforce +} + +/* +SetEnforceMode sets the current SELinux mode Enforcing, Permissive. +Disabled is not valid, since this needs to be set at boot time. +*/ +func SetEnforceMode(mode int) error { + return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode)) +} + +/* +DefaultEnforceMode returns the systems default SELinux mode Enforcing, +Permissive or Disabled. Note this is is just the default at boot time. +EnforceMode tells you the systems current mode. +*/ +func DefaultEnforceMode() int { + switch readConfig(selinuxTag) { + case "enforcing": + return Enforcing + case "permissive": + return Permissive + } + return Disabled +} + +func mcsAdd(mcs string) error { + if mcs == "" { + return nil + } + state.Lock() + defer state.Unlock() + if state.mcsList[mcs] { + return fmt.Errorf("MCS Label already exists") + } + state.mcsList[mcs] = true + return nil +} + +func mcsDelete(mcs string) { + if mcs == "" { + return + } + state.Lock() + defer state.Unlock() + state.mcsList[mcs] = false +} + +func intToMcs(id int, catRange uint32) string { + var ( + SETSIZE = int(catRange) + TIER = SETSIZE + ORD = id + ) + + if id < 1 || id > 523776 { + return "" + } + + for ORD > TIER { + ORD = ORD - TIER + TIER-- + } + TIER = SETSIZE - TIER + ORD = ORD + TIER + return fmt.Sprintf("s0:c%d,c%d", TIER, ORD) +} + +func uniqMcs(catRange uint32) string { + var ( + n uint32 + c1, c2 uint32 + mcs string + ) + + for { + binary.Read(rand.Reader, binary.LittleEndian, &n) + c1 = n % catRange + binary.Read(rand.Reader, binary.LittleEndian, &n) + c2 = n % catRange + if c1 == c2 { + continue + } else { + if c1 > c2 { + c1, c2 = c2, c1 + } + } + mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2) + if err := mcsAdd(mcs); err != nil { + continue + } + break + } + return mcs +} + +/* +ReleaseLabel will unreserve the MLS/MCS Level field of the specified label. +Allowing it to be used by another process. +*/ +func ReleaseLabel(label string) { + if len(label) != 0 { + con := strings.SplitN(label, ":", 4) + if len(con) > 3 { + mcsDelete(con[3]) + } + } +} + +var roFileLabel string + +// ROFileLabel returns the specified SELinux readonly file label +func ROFileLabel() (fileLabel string) { + return roFileLabel +} + +/* +ContainerLabels returns an allocated processLabel and fileLabel to be used for +container labeling by the calling process. +*/ +func ContainerLabels() (processLabel string, fileLabel string) { + var ( + val, key string + bufin *bufio.Reader + ) + + if !GetEnabled() { + return "", "" + } + lxcPath := fmt.Sprintf("%s/contexts/lxc_contexts", getSELinuxPolicyRoot()) + in, err := os.Open(lxcPath) + if err != nil { + return "", "" + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err == io.EOF { + done = true + } else { + goto exit + } + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == "process" { + processLabel = strings.Trim(val, "\"") + } + if key == "file" { + fileLabel = strings.Trim(val, "\"") + } + if key == "ro_file" { + roFileLabel = strings.Trim(val, "\"") + } + } + } + + if processLabel == "" || fileLabel == "" { + return "", "" + } + + if roFileLabel == "" { + roFileLabel = fileLabel + } +exit: + scon := NewContext(processLabel) + if scon["level"] != "" { + mcs := uniqMcs(1024) + scon["level"] = mcs + processLabel = scon.Get() + scon = NewContext(fileLabel) + scon["level"] = mcs + fileLabel = scon.Get() + } + return processLabel, fileLabel +} + +// SecurityCheckContext validates that the SELinux label is understood by the kernel +func SecurityCheckContext(val string) error { + return writeCon(fmt.Sprintf("%s/context", getSelinuxMountPoint()), val) +} + +/* +CopyLevel returns a label with the MLS/MCS level from src label replaces on +the dest label. +*/ +func CopyLevel(src, dest string) (string, error) { + if src == "" { + return "", nil + } + if err := SecurityCheckContext(src); err != nil { + return "", err + } + if err := SecurityCheckContext(dest); err != nil { + return "", err + } + scon := NewContext(src) + tcon := NewContext(dest) + mcsDelete(tcon["level"]) + mcsAdd(scon["level"]) + tcon["level"] = scon["level"] + return tcon.Get(), nil +} + +// Prevent users from relabing system files +func badPrefix(fpath string) error { + var badprefixes = []string{"/usr"} + + for _, prefix := range badprefixes { + if fpath == prefix || strings.HasPrefix(fpath, fmt.Sprintf("%s/", prefix)) { + return fmt.Errorf("relabeling content in %s is not allowed", prefix) + } + } + return nil +} + +// Chcon changes the fpath file object to the SELinux label label. +// If the fpath is a directory and recurse is true Chcon will walk the +// directory tree setting the label +func Chcon(fpath string, label string, recurse bool) error { + if label == "" { + return nil + } + if err := badPrefix(fpath); err != nil { + return err + } + callback := func(p string, info os.FileInfo, err error) error { + return SetFileLabel(p, label) + } + + if recurse { + return filepath.Walk(fpath, callback) + } + + return SetFileLabel(fpath, label) +} + +// DupSecOpt takes an SELinux process label and returns security options that +// can will set the SELinux Type and Level for future container processes +func DupSecOpt(src string) []string { + if src == "" { + return nil + } + con := NewContext(src) + if con["user"] == "" || + con["role"] == "" || + con["type"] == "" { + return nil + } + dup := []string{"user:" + con["user"], + "role:" + con["role"], + "type:" + con["type"], + } + + if con["level"] != "" { + dup = append(dup, "level:"+con["level"]) + } + + return dup +} + +// DisableSecOpt returns a security opt that can be used to disabling SELinux +// labeling support for future container processes +func DisableSecOpt() []string { + return []string{"disable"} +} diff --git a/vendor/github.com/opencontainers/selinux/go-selinux/xattrs.go b/vendor/github.com/opencontainers/selinux/go-selinux/xattrs.go new file mode 100644 index 000000000000..7f2ef8504906 --- /dev/null +++ b/vendor/github.com/opencontainers/selinux/go-selinux/xattrs.go @@ -0,0 +1,78 @@ +// +build linux + +package selinux + +import ( + "syscall" + "unsafe" +) + +var _zero uintptr + +// Returns a []byte slice if the xattr is set and nil otherwise +// Requires path and its attribute as arguments +func lgetxattr(path string, attr string) ([]byte, error) { + var sz int + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return nil, err + } + attrBytes, err := syscall.BytePtrFromString(attr) + if err != nil { + return nil, err + } + + // Start with a 128 length byte array + sz = 128 + dest := make([]byte, sz) + destBytes := unsafe.Pointer(&dest[0]) + _sz, _, errno := syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(destBytes), uintptr(len(dest)), 0, 0) + + switch { + case errno == syscall.ENODATA: + return nil, errno + case errno == syscall.ENOTSUP: + return nil, errno + case errno == syscall.ERANGE: + // 128 byte array might just not be good enough, + // A dummy buffer is used ``uintptr(0)`` to get real size + // of the xattrs on disk + _sz, _, errno = syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(unsafe.Pointer(nil)), uintptr(0), 0, 0) + sz = int(_sz) + if sz < 0 { + return nil, errno + } + dest = make([]byte, sz) + destBytes := unsafe.Pointer(&dest[0]) + _sz, _, errno = syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(destBytes), uintptr(len(dest)), 0, 0) + if errno != 0 { + return nil, errno + } + case errno != 0: + return nil, errno + } + sz = int(_sz) + return dest[:sz], nil +} + +func lsetxattr(path string, attr string, data []byte, flags int) error { + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return err + } + attrBytes, err := syscall.BytePtrFromString(attr) + if err != nil { + return err + } + var dataBytes unsafe.Pointer + if len(data) > 0 { + dataBytes = unsafe.Pointer(&data[0]) + } else { + dataBytes = unsafe.Pointer(&_zero) + } + _, _, errno := syscall.Syscall6(syscall.SYS_LSETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(dataBytes), uintptr(len(data)), uintptr(flags), 0) + if errno != 0 { + return errno + } + return nil +} diff --git a/worker/runc/runc.go b/worker/runc/runc.go index c18668845f52..adb293dbe63b 100644 --- a/worker/runc/runc.go +++ b/worker/runc/runc.go @@ -29,7 +29,7 @@ type SnapshotterFactory struct { // NewWorkerOpt creates a WorkerOpt. // But it does not set the following fields: // - SessionManager -func NewWorkerOpt(root string, snFactory SnapshotterFactory, labels map[string]string) (base.WorkerOpt, error) { +func NewWorkerOpt(root string, snFactory SnapshotterFactory, rootless bool, labels map[string]string) (base.WorkerOpt, error) { var opt base.WorkerOpt name := "runc-" + snFactory.Name root = filepath.Join(root, name) @@ -40,7 +40,12 @@ func NewWorkerOpt(root string, snFactory SnapshotterFactory, labels map[string]s if err != nil { return opt, err } - exe, err := runcexecutor.New(runcexecutor.Opt{Root: filepath.Join(root, "executor")}) + exe, err := runcexecutor.New(runcexecutor.Opt{ + // Root directory + Root: filepath.Join(root, "executor"), + // without root privileges + Rootless: rootless, + }) if err != nil { return opt, err } diff --git a/worker/runc/runc_test.go b/worker/runc/runc_test.go index d3c8a76d3cac..c5c13224720e 100644 --- a/worker/runc/runc_test.go +++ b/worker/runc/runc_test.go @@ -46,7 +46,8 @@ func TestRuncWorker(t *testing.T) { return overlay.NewSnapshotter(root) }, } - workerOpt, err := NewWorkerOpt(tmpdir, snFactory, nil) + rootless := false + workerOpt, err := NewWorkerOpt(tmpdir, snFactory, rootless, nil) require.NoError(t, err) workerOpt.SessionManager, err = session.NewManager()