diff --git a/pkg/bypass4netns/bypass4netns.go b/pkg/bypass4netns/bypass4netns.go
index 44dea5a..c1170c2 100644
--- a/pkg/bypass4netns/bypass4netns.go
+++ b/pkg/bypass4netns/bypass4netns.go
@@ -417,7 +417,17 @@ func (h *notifHandler) registerSocket(pid int, sockfd int, syscallName string) (
 		sock.state = NotBypassable
 		logger.Debugf("failed to get socket args err=%q", err)
 	} else {
-		if sockDomain != syscall.AF_INET && sockDomain != syscall.AF_INET6 {
+		// Compare the hooked process's netns with the container init process's netns.
+		// Processes running in a nested netns are not handled by bypass4netns.
+		isSameNetNS, err := util.SameNetNS(h.containerInitPid, pid)
+		if err != nil {
+			// The netns could not be checked; do not also report it as a different netns.
+			logger.Errorf("failed to check NetNS: err=%q", err)
+			sock.state = NotBypassable
+		} else if !isSameNetNS {
+			logger.Infof("process seems to be executed in other netns. socket is NotBypassable and ignored")
+			sock.state = NotBypassable
+		} else if sockDomain != syscall.AF_INET && sockDomain != syscall.AF_INET6 {
 			// non IP sockets are not handled.
 			sock.state = NotBypassable
 			logger.Debugf("socket domain=0x%x", sockDomain)
@@ -629,6 +639,9 @@ type Handler struct {
 
 	// key is child port
 	forwardingPorts map[int]ForwardPortMapping
+
+	// ContainerInitPid is the pid recorded as the container's init process; -1 until set.
+	ContainerInitPid int
 }
 
 // NewHandler creates new seccomp notif handler
@@ -640,6 +653,7 @@ func NewHandler(socketPath, comSocketPath, tracerAgentLogPath string) *Handler {
 		ignoredSubnets:     []net.IPNet{},
 		forwardingPorts:    map[int]ForwardPortMapping{},
 		readyFd:            -1,
+		ContainerInitPid:   -1,
 	}
 
 	return &handler
@@ -711,6 +725,10 @@ type notifHandler struct {
 
 	// cache pidfd to reduce latency. key is pid.
 	pidInfos map[int]pidInfo
+
+	// pid of the container's init process.
+	// Its netns is compared against each hooked process's netns.
+	containerInitPid int
 }
 
 type containerInterface struct {
@@ -732,14 +750,15 @@ type pidInfo struct {
 	tgid int
 }
 
-func (h *Handler) newNotifHandler(fd uintptr, state *specs.ContainerProcessState) *notifHandler {
+func (h *Handler) newNotifHandler(fd uintptr, state *specs.ContainerProcessState, containerInitPid int) *notifHandler {
 	notifHandler := notifHandler{
-		fd:              libseccomp.ScmpFd(fd),
-		state:           state,
-		forwardingPorts: map[int]ForwardPortMapping{},
-		processes:       map[int]*processStatus{},
-		memfds:          map[int]int{},
-		pidInfos:        map[int]pidInfo{},
+		fd:               libseccomp.ScmpFd(fd),
+		state:            state,
+		forwardingPorts:  map[int]ForwardPortMapping{},
+		processes:        map[int]*processStatus{},
+		memfds:           map[int]int{},
+		pidInfos:         map[int]pidInfo{},
+		containerInitPid: containerInitPid,
 	}
 	notifHandler.nonBypassable = nonbypassable.New(h.ignoredSubnets)
 	notifHandler.nonBypassableAutoUpdate = h.ignoredSubnetsAutoUpdate
@@ -793,8 +812,17 @@ func (h *Handler) StartHandle(c2cConfig *C2CConnectionHandleConfig, multinodeCon
 			continue
 		}
 
+		// state.Pid can be a process in a nested netns when it is executed with 'ip netns exec',
+		// so comparing state.Pid with the hooked process's pid cannot distinguish the container
+		// netns from a nested one. The init process's pid is used instead:
+		// bypass4netns treats the first process it receives as the init process.
+		if h.ContainerInitPid < 0 {
+			h.ContainerInitPid = state.Pid
+			logrus.Infof("ContainerInitPid is %d", h.ContainerInitPid)
+		}
+
 		logrus.Infof("Received new seccomp fd: %v", newFd)
-		notifHandler := h.newNotifHandler(newFd, state)
+		notifHandler := h.newNotifHandler(newFd, state, h.ContainerInitPid)
 		notifHandler.c2cConnections = c2cConfig
 		notifHandler.multinode = multinodeConfig
 		if notifHandler.multinode.Enable {
diff --git a/pkg/util/util.go b/pkg/util/util.go
index 2aaf4af..9cc6888 100644
--- a/pkg/util/util.go
+++ b/pkg/util/util.go
@@ -20,8 +20,19 @@ func ShrinkID(id string) string {
 }
 
+// SameUserNS runs the sameNS check on the "user" namespace of pidX and pidY.
 func SameUserNS(pidX, pidY int) (bool, error) {
-	nsX := fmt.Sprintf("/proc/%d/ns/user", pidX)
-	nsY := fmt.Sprintf("/proc/%d/ns/user", pidY)
+	return sameNS(pidX, pidY, "user")
+}
+
+// SameNetNS runs the sameNS check on the "net" namespace of pidX and pidY.
+func SameNetNS(pidX, pidY int) (bool, error) {
+	return sameNS(pidX, pidY, "net")
+}
+
+// sameNS inspects the /proc/<pid>/ns/<nsName> links of pidX and pidY.
+func sameNS(pidX, pidY int, nsName string) (bool, error) {
+	nsX := fmt.Sprintf("/proc/%d/ns/%s", pidX, nsName)
+	nsY := fmt.Sprintf("/proc/%d/ns/%s", pidY, nsName)
 	nsXResolved, err := os.Readlink(nsX)
 	if err != nil {
 		return false, err
diff --git a/test/DockerfileNestedNetNS b/test/DockerfileNestedNetNS
new file mode 100644
index 0000000..8d08c6f
--- /dev/null
+++ b/test/DockerfileNestedNetNS
@@ -0,0 +1,4 @@
+FROM ubuntu:22.04
+
+RUN apt update && apt upgrade -y
+RUN apt install -y netcat iproute2 tcpdump iperf3
diff --git a/test/run_test.sh b/test/run_test.sh
index b0f0ba8..e6c0104 100755
--- a/test/run_test.sh
+++ b/test/run_test.sh
@@ -211,3 +211,47 @@ echo "===== multinode test (single node) ===="
   systemctl --user stop etcd.service
   systemctl --user reset-failed
 )
+
+echo "===== nested netns test ===="
+(
+  CONTAINER_NAME="test-nested"
+  set +e
+  nerdctl rm -f $CONTAINER_NAME
+  systemctl --user stop run-iperf3
+  systemctl --user stop run-bypass4netnsd
+  systemctl --user reset-failed
+  set -ex
+
+
+  IMAGE_NAME="b4ns:nested"
+  nerdctl build -f ./DockerfileNestedNetNS -t $IMAGE_NAME .
+
+  systemd-run --user --unit run-bypass4netnsd bypass4netnsd
+  sleep 1
+  nerdctl run --privileged --annotation nerdctl/bypass4netns=true -d -p 5202:5201 --name $CONTAINER_NAME $IMAGE_NAME sleep infinity
+
+  # server runs in the container's netns
+  systemd-run --user --unit run-iperf3 nerdctl exec $CONTAINER_NAME iperf3 -s
+  sleep 1
+  iperf3 -c localhost -t 1 -p 5202 --connect-timeout 1000 # the connection must succeed.
+  systemctl --user stop run-iperf3
+  systemctl --user reset-failed
+
+  # server runs in a nested netns
+  nerdctl exec $CONTAINER_NAME mkdir /sys2
+  nerdctl exec $CONTAINER_NAME mount -t sysfs --make-private /sys2
+  nerdctl exec $CONTAINER_NAME ip netns add nested
+  systemd-run --user --unit run-iperf3 nerdctl exec $CONTAINER_NAME ip netns exec nested iperf3 -s
+  sleep 1
+  set +e
+  iperf3 -c localhost -t 1 -p 5202 --connect-timeout 1000 # the connection must fail.
+  if [ $? -eq 0 ]; then
+    echo "iperf3 must not success to connect."
+    exit 1
+  fi
+  set -e
+  systemctl --user stop run-iperf3
+
+  nerdctl rm -f $CONTAINER_NAME
+  systemctl --user reset-failed
+)