diff --git a/README.md b/README.md index d5f2810f..08c7ddcd 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ chmod: /: Bad message ``` ## Demo on Kubernetes -Before you install the demo on k8s, please ensure all [the requirements](./docs/install.md) are satisfied +Before you install the demo on k8s, please ensure all [the requirements](./docs/install.md) are satisfied. This demo shows that the Seccomp Agent can have different behaviour depending on the Kubernetes pod (in this case, the pod's namespace and name). @@ -82,11 +82,17 @@ apiVersion: v1 kind: Pod metadata: name: mynotifypod - # /var/lib/kubelet/seccomp/notify.json + # For older versions of Kubernetes (this annotation was deprecated in + # Kubernetes v1.19 and completely removed in v1.27): annotations: seccomp.security.alpha.kubernetes.io/pod: localhost/notify.json spec: restartPolicy: Never + securityContext: + # /var/lib/kubelet/seccomp/notify.json + seccompProfile: + type: Localhost + localhostProfile: notify.json containers: - name: container1 image: busybox @@ -108,3 +114,100 @@ proc on /root type proc (rw,relatime) / # time -f %E /bin/true 0m 2.00s ``` + +## Combining with user namespaces + +By combining this with Kubernetes's user namespace support it is possible to +allow a user within a user namespace to perform some operations which would +otherwise be limited to host root. + +One example is mounting other filesystem types. This is most useful combined +with user namespaces to allow mounting network file systems while a pod is +running. This is far safer than giving the container `privileged` access but +does expose more of the kernel to the pod, so you should consider your security +carefully. + +There is a possibility a process could change its user namespace after making +the mount system call, which could result in a confusing state. 
To fix this, the +seccomp notify policy should use the SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV +flag; however, this is [not yet available in +runc](https://github.com/opencontainers/runc/issues/3860) and requires Linux >= +5.19. + +Configure a policy, similar to above, but with the following metadata: +```json +{ + "architectures" : [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32" + ], + "defaultAction" : "SCMP_ACT_ALLOW", + "listenerPath": "/run/seccomp-agent.socket", + "listenerMetadata": "MOUNT_OTHER_FS_LIST=cifs\nMOUNT_NEED_CAP_ADMIN=true", + "syscalls" : [ + { + "action" : "SCMP_ACT_NOTIFY", + "names" : [ + "mount" + ] + }, + { + "action" : "SCMP_ACT_ALLOW", + "names" : [ + "umount" + ] + } + ] +} +``` + +(Policy cut down for the sake of example; it is recommended to use a full policy that +additionally configures notify for mount and allows umount.) + +This has currently been successfully tested with cifs. Other filesystem types +should work; NFS will need NFS client utilities installed within the container +*and* on the host (e.g. to make upcalls work). 
+ +* Deploy a pod with the seccomp policy and user namespaces: +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: mynotifypod-userns +spec: + restartPolicy: Never + # Needs "UserNamespacesSupport" feature gate currently + hostUsers: false + securityContext: + # /var/lib/kubelet/seccomp/notify.json + seccompProfile: + type: Localhost + localhostProfile: notify.json + containers: + - name: container1 + image: alpine + command: ["sh"] + args: ["-c", "sleep infinity"] + securityContext: + capabilities: + # This is safe combined with hostUsers: false + add: [SYS_ADMIN] +``` + +* Run commands in the pod: +```shell +$ kubectl exec -it mynotifypod-userns -- /bin/sh +/ # mkdir /mnt +/ # mount -t cifs -o username=user,password=pass '//10.0.0.1/C' /mnt +/ # df -h /mnt +/mnt # df -h /mnt +Filesystem Size Used Available Use% Mounted on +//10.0.0.1/C 95.4G 85.3G 10.1G 89% /mnt +/ # ls /mnt +$Recycle.Bin Documents and Settings Program files +[...] +/ # sed -i 's!^\(nobody.*/\)false!\1sh!' /etc/passwd +/ # su nobody +/ $ mount -t cifs -o username=user,password=pass '//10.0.0.1/C' /mnt +mount: permission denied (are you root?) +``` diff --git a/cmd/seccompagent/seccompagent.go b/cmd/seccompagent/seccompagent.go index e530df78..a275bcaa 100644 --- a/cmd/seccompagent/seccompagent.go +++ b/cmd/seccompagent/seccompagent.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build linux && cgo // +build linux,cgo package main @@ -121,7 +122,7 @@ func main() { // / # ls /root/self/cmdline // /root/self/cmdline allowedFilesystems := map[string]struct{}{"proc": struct{}{}} - r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems) + r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems, false /* do not check capabilities */) // Example: // # chmod 777 / @@ -214,8 +215,19 @@ func main() { if v, ok := metadata["MOUNT_SYSFS"]; ok && v == "true" { allowedFilesystems["sysfs"] = struct{}{} } + if v, ok := metadata["MOUNT_OTHER_FS_LIST"]; ok { + for _, fs := range strings.Split(v, ",") { + allowedFilesystems[fs] = struct{}{} + } + } + + requireCapsForMount := false + if v, ok := metadata["MOUNT_NEED_CAP_ADMIN"]; ok && v == "true" { + requireCapsForMount = true + } + if len(allowedFilesystems) > 0 { - r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems) + r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems, requireCapsForMount) } return r } diff --git a/go.mod b/go.mod index bd430698..042541d9 100644 --- a/go.mod +++ b/go.mod @@ -61,6 +61,8 @@ require ( k8s.io/klog/v2 v2.70.1 // indirect k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect + kernel.org/pub/linux/libs/security/libcap/cap v1.2.69 // indirect + kernel.org/pub/linux/libs/security/libcap/psx v1.2.69 // indirect sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect sigs.k8s.io/yaml v1.3.0 // indirect diff --git a/go.sum b/go.sum index be495193..d2129e66 100644 --- a/go.sum +++ b/go.sum @@ -329,6 +329,10 @@ k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkI k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU= k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4= k8s.io/utils 
v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= +kernel.org/pub/linux/libs/security/libcap/cap v1.2.69 h1:N0m3tKYbkRMmDobh/47ngz+AWeV7PcfXMDi8xu3Vrag= +kernel.org/pub/linux/libs/security/libcap/cap v1.2.69/go.mod h1:Tk5Ip2TuxaWGpccL7//rAsLRH6RQ/jfqTGxuN/+i/FQ= +kernel.org/pub/linux/libs/security/libcap/psx v1.2.69 h1:IdrOs1ZgwGw5CI+BH6GgVVlOt+LAXoPyh7enr8lfaXs= +kernel.org/pub/linux/libs/security/libcap/psx v1.2.69/go.mod h1:+l6Ee2F59XiJ2I6WR5ObpC1utCQJZ/VLsEbQCD8RG24= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= diff --git a/pkg/handlers/mount.go b/pkg/handlers/mount.go index 5ad1da4c..a0da64b9 100644 --- a/pkg/handlers/mount.go +++ b/pkg/handlers/mount.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build linux && cgo // +build linux,cgo package handlers @@ -28,6 +29,7 @@ import ( "github.com/kinvolk/seccompagent/pkg/nsenter" "github.com/kinvolk/seccompagent/pkg/readarg" "github.com/kinvolk/seccompagent/pkg/registry" + "github.com/kinvolk/seccompagent/pkg/userns" ) var _ = nsenter.RegisterModule("mount", runMountInNamespaces) @@ -37,6 +39,8 @@ type mountModuleParams struct { Source string `json:"source,omitempty"` Dest string `json:"dest,omitempty"` Filesystem string `json:"filesystem,omitempty"` + Flags int64 `json:"flags,omitempty"` + Options string `json:"options,omitempty"` } func runMountInNamespaces(param []byte) string { @@ -46,14 +50,14 @@ func runMountInNamespaces(param []byte) string { return fmt.Sprintf("%d", int(unix.ENOSYS)) } - err = unix.Mount(params.Source, params.Dest, params.Filesystem, 0, "") + err = unix.Mount(params.Source, params.Dest, params.Filesystem, 0, params.Options) if err != nil { return fmt.Sprintf("%d", int(err.(unix.Errno))) } return "0" } -func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc { +func Mount(allowedFilesystems map[string]struct{}, requireUserNamespaceAdmin bool) registry.HandlerFunc { return func(fd libseccomp.ScmpFd, req *libseccomp.ScmpNotifReq) (result registry.HandlerResult) { memFile, err := readarg.OpenMem(req.Pid) if err != nil { @@ -96,12 +100,17 @@ func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc { return registry.HandlerResultErrno(unix.EFAULT) } + // We don't handle flags, we may want to consider allowing a few. + // This is here so the debug logging makes it possible to see flags used. 
+ flags := int64(req.Data.Args[3]) + log.WithFields(log.Fields{ "fd": fd, "pid": req.Pid, "source": source, "dest": dest, "filesystem": filesystem, + "flags": flags, }).Debug("Mount") if _, ok := allowedFilesystems[filesystem]; !ok { @@ -110,11 +119,70 @@ func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc { return registry.HandlerResultContinue() } + var options string + if req.Data.Args[4] != 0/* NULL */ && filesystem != "sysfs" { + // Get options, we assume because this is specified in + // allowedFilesystems that the data argument to mount(2) + // is a string so this is safe now. We ignore options for sysfs, as it + // doesn't define options. + options, err = readarg.ReadString(memFile, int64(req.Data.Args[4])) + if err != nil { + log.WithFields(log.Fields{ + "fd": fd, + "pid": req.Pid, + "arg": 4, + "err": err, + }).Error("Cannot read argument") + return registry.HandlerResultErrno(unix.EFAULT) + } + + // Log this at trace level only as it could have user credentials. + log.WithFields(log.Fields{ + "fd": fd, + "pid": req.Pid, + "source": source, + "dest": dest, + "filesystem": filesystem, + "flags": flags, + "options": options, + }).Trace("Handle mount") + } + + if requireUserNamespaceAdmin { + ok, err := userns.IsPIDAdminCapable(req.Pid) + if err != nil { + log.WithFields(log.Fields{ + "fd": fd, + "pid": req.Pid, + "err": err, + }).Error("Cannot check user namespace capabilities") + return registry.HandlerResultErrno(unix.EFAULT) + } + if !ok { + log.WithFields(log.Fields{ + "fd": fd, + "pid": req.Pid, + }).Info("Mount attempted without CAP_SYS_ADMIN") + return registry.HandlerResultErrno(unix.EPERM) + } + + // Ensure the notification is still valid after checking user namespace capabilities. 
+ if err := libseccomp.NotifIDValid(fd, req.ID); err != nil { + log.WithFields(log.Fields{ + "fd": fd, + "req": req, + "err": err, + }).Debug("Notification no longer valid") + return registry.HandlerResultIntr() + } + } + params := mountModuleParams{ Module: "mount", Source: source, Dest: dest, Filesystem: filesystem, + Options: options, } mntns, err := nsenter.OpenNamespace(req.Pid, "mnt") diff --git a/pkg/userns/check.go b/pkg/userns/check.go new file mode 100644 index 00000000..d8c6ff5d --- /dev/null +++ b/pkg/userns/check.go @@ -0,0 +1,44 @@ +package userns + +import ( + "fmt" + + "golang.org/x/sys/unix" + "kernel.org/pub/linux/libs/security/libcap/cap" +) + +// IsPIDAdminCapable returns true if the PID is considered an admin of a user +// namespace, that is, it is either in the init user namespace or in one created +// by the host root and has CAP_SYS_ADMIN. This protects against a less +// privileged user either mounting a directory over a tree that gives them more +// access (e.g. /etc/sudoers.d) or hiding files. +func IsPIDAdminCapable(pid uint32) (bool, error) { + // We unfortunately need to reimplement some of the kernel's user namespace logic. + // Our goal is to allow a user with CAP_SYS_ADMIN inside the first user + // namespace to call mount(). If the user nests a user namespace below that, + // we don't want to allow that process to call mount. + + // This is security sensitive code, however TOCTOU isn't a concern in this case + // as this is designed to be used while blocked on a syscall and the kernel + // does not let multi-threaded processes change their user namespace (see + // setns() and unshare() docs). 
+ fd, err := unix.Open(fmt.Sprintf("/proc/%d/ns/user", pid), unix.O_RDONLY|unix.O_CLOEXEC, 0) // O_CLOEXEC: do not leak this fd into child processes + if err != nil { + return false, err + } + defer unix.Close(fd) + + uid, err := unix.IoctlGetInt(fd, unix.NS_GET_OWNER_UID) + if err != nil { + return false, err + } + if uid != 0 { + return false, nil // owner UID of the user namespace is not 0: not admin capable, and not an error + } + set, err := cap.GetPID(int(pid)) + if err != nil { + return false, err + } + + return set.GetFlag(cap.Effective, cap.SYS_ADMIN) +}