From 54905ce52a70012713c3a88dea984e60c097e76b Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 25 Jun 2024 13:19:05 +1000
Subject: [PATCH 01/11] initial safe MkdirAll and MkdirAllHandle
 implementations

A fairly common usage of SecureJoin is of the form:

  path, _ := securejoin.SecureJoin(root, nonExistentPath)
  _ = os.MkdirAll(path, 0o755)

Unfortunately, such usage is not safe if the root can be modified
between SecureJoin and MkdirAll. This is a known issue with SecureJoin
(using path strings for the API means we cannot be sure the path is
safe).

However, by carefully using file handles it is possible to adapt
SecureJoin to be safe to use in a race-free way with MkdirAll by merging
them into a single function. The implementation is designed so that we
can easily add openat2 support, which is done in a follow-up commit.

MkdirAllHandle is a variant of MkdirAll which takes the root as an
*os.File and returns an *os.File for the final part of the created
directory tree. This is a preferable interface to use if you need a
handle to the final directory.

We have to update the minimum Go version to 1.20 so that we can have
errors that contain more than one wrapped error.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .github/workflows/ci.yml |   1 -
 go.mod                   |   4 +-
 go.sum                   |   2 +
 lookup_linux.go          | 189 +++++++++++++++++++++++++++++++++++++++
 mkdir_linux.go           | 163 +++++++++++++++++++++++++++++++++
 openat_linux.go          |  51 +++++++++++
 procfs_linux.go          | 147 ++++++++++++++++++++++++++++++
 7 files changed, 555 insertions(+), 2 deletions(-)
 create mode 100644 go.sum
 create mode 100644 lookup_linux.go
 create mode 100644 mkdir_linux.go
 create mode 100644 openat_linux.go
 create mode 100644 procfs_linux.go

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2920116..67a117b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,6 @@ jobs:
           - macos-latest
           - windows-latest
         go-version:
-          - "1.14"
           - "1.21"
           - "1.22"
           - "^1"
diff --git a/go.mod b/go.mod
index 0607c1f..45a25ec 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,5 @@
 module github.com/cyphar/filepath-securejoin
 
-go 1.13
+go 1.20
+
+require golang.org/x/sys v0.21.0 // indirect
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..ac7fb31
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
+golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
diff --git a/lookup_linux.go b/lookup_linux.go
new file mode 100644
index 0000000..10544bb
--- /dev/null
+++ b/lookup_linux.go
@@ -0,0 +1,189 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+
+	"golang.org/x/sys/unix"
+)
+
+// partialLookupInRoot tries to lookup as much of the request path as possible
+// within the provided root (a-la RESOLVE_IN_ROOT) and opens the final existing
+// component of the requested path, returning a file handle to the final
+// existing component and a string containing the remaining path components.
+func partialLookupInRoot(root *os.File, unsafePath string) (_ *os.File, _ string, Err error) {
+	unsafePath = filepath.ToSlash(unsafePath) // noop
+
+	// This is very similar to SecureJoin, except that we operate on the
+	// components using file descriptors. We then return the last component we
+	// managed open, along with the remaining path components not opened.
+
+	// Get the "actual" root path from /proc/self/fd. This is necessary if the
+	// root is some magic-link like /proc/$pid/root, in which case we want to
+	// make sure when we do checkProcSelfFdPath that we are using the correct
+	// root path.
+	logicalRootPath, err := procSelfFdReadlink(root)
+	if err != nil {
+		return nil, "", fmt.Errorf("get real root path: %w", err)
+	}
+
+	currentDir, err := dupFile(root)
+	if err != nil {
+		return nil, "", fmt.Errorf("clone root fd: %w", err)
+	}
+	defer func() {
+		if Err != nil {
+			_ = currentDir.Close()
+		}
+	}()
+
+	var (
+		linksWalked   int
+		currentPath   string
+		remainingPath = unsafePath
+	)
+	for remainingPath != "" {
+		// Save the current remaining path so if the part is not real we can
+		// return the path including the component.
+		oldRemainingPath := remainingPath
+
+		// Get the next path component.
+		var part string
+		if i := strings.IndexByte(remainingPath, '/'); i == -1 {
+			part, remainingPath = remainingPath, ""
+		} else {
+			part, remainingPath = remainingPath[:i], remainingPath[i+1:]
+		}
+		// Skip any "//" components.
+		if part == "" {
+			continue
+		}
+
+		// Apply the component lexically to the path we are building.
+		// currentPath does not contain any symlinks, and we are lexically
+		// dealing with a single component, so it's okay to do a filepath.Clean
+		// here.
+		nextPath := path.Join("/", currentPath, part)
+		// If we logically hit the root, just clone the root rather than
+		// opening the part and doing all of the other checks.
+		if nextPath == "/" {
+			// Jump to root.
+			rootClone, err := dupFile(root)
+			if err != nil {
+				return nil, "", fmt.Errorf("clone root fd: %w", err)
+			}
+			_ = currentDir.Close()
+			currentDir = rootClone
+			currentPath = nextPath
+			continue
+		}
+
+		// Try to open the next component.
+		nextDir, err := openatFile(currentDir, part, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+		switch {
+		case err == nil:
+			st, err := nextDir.Stat()
+			if err != nil {
+				_ = nextDir.Close()
+				return nil, "", fmt.Errorf("stat component %q: %w", part, err)
+			}
+
+			switch st.Mode() & os.ModeType {
+			case os.ModeDir:
+				// If we are dealing with a directory, simply walk into it.
+				_ = currentDir.Close()
+				currentDir = nextDir
+				currentPath = nextPath
+
+				// If we are operating on a .., make sure we haven't escaped.
+				// We only have to check for ".." here because walking down
+				// into a regular component component cannot cause you to
+				// escape. This mirrors the logic in RESOLVE_IN_ROOT, except we
+				// have to check every ".." rather than only checking after a
+				// rename or mount on the system.
+				if part == ".." {
+					// Make sure the root hasn't moved.
+					if err := checkProcSelfFdPath(logicalRootPath, root); err != nil {
+						return nil, "", fmt.Errorf("root path moved during lookup: %w", err)
+					}
+					// Make sure the path is what we expect.
+					fullPath := logicalRootPath + nextPath
+					if err := checkProcSelfFdPath(fullPath, currentDir); err != nil {
+						return nil, "", fmt.Errorf("walking into %q had unexpected result: %w", part, err)
+					}
+				}
+
+			case os.ModeSymlink:
+				// We don't need the handle anymore.
+				_ = nextDir.Close()
+
+				// Unfortunately, we cannot readlink through our handle and so
+				// we need to do a separate readlinkat (which could race to
+				// give us an error if the attacker swapped the symlink with a
+				// non-symlink).
+				linkDest, err := readlinkatFile(currentDir, part)
+				if err != nil {
+					if errors.Is(err, unix.EINVAL) {
+						// The part was not a symlink, so assume that it's a
+						// regular file. It is possible for it to be a
+						// directory (if an attacker is swapping a directory
+						// and non-directory at this subpath) but erroring out
+						// here is better anyway.
+						err = fmt.Errorf("%w: path component %q is invalid: %w", errPossibleAttack, part, unix.ENOTDIR)
+					}
+					return nil, "", err
+				}
+
+				linksWalked++
+				if linksWalked > maxSymlinkLimit {
+					return nil, "", &os.PathError{Op: "partialLookupInRoot", Path: logicalRootPath + "/" + unsafePath, Err: unix.ELOOP}
+				}
+
+				// Update our logical remaining path.
+				remainingPath = linkDest + "/" + remainingPath
+				// Absolute symlinks reset any work we've already done.
+				if path.IsAbs(linkDest) {
+					// Jump to root.
+					rootClone, err := dupFile(root)
+					if err != nil {
+						return nil, "", fmt.Errorf("clone root fd: %w", err)
+					}
+					_ = currentDir.Close()
+					currentDir = rootClone
+					currentPath = "/"
+				}
+			default:
+				// For any other file type, we can't walk further and so we've
+				// hit the end of the lookup. The handling is very similar to
+				// ENOENT from openat(2), except that we return a handle to the
+				// component we just walked into (and we drop the component
+				// from the symlink stack).
+				_ = currentDir.Close()
+
+				// The current component exists, so return it.
+				return nextDir, remainingPath, nil
+			}
+
+		case errors.Is(err, os.ErrNotExist):
+			// We have hit a final component that doesn't exist, so we have our
+			// partial open result. Note that we have to use the OLD remaining
+			// path, since the lookup failed.
+			return currentDir, oldRemainingPath, nil
+
+		default:
+			return nil, "", err
+		}
+	}
+	// All of the components existed!
+	return currentDir, "", nil
+}
diff --git a/mkdir_linux.go b/mkdir_linux.go
new file mode 100644
index 0000000..b5ac1e2
--- /dev/null
+++ b/mkdir_linux.go
@@ -0,0 +1,163 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"slices"
+	"strings"
+
+	"golang.org/x/sys/unix"
+)
+
+var (
+	errInvalidMode    = errors.New("invalid permission mode")
+	errPossibleAttack = errors.New("possible attack detected")
+)
+
+// MkdirAllHandle is equivalent to MkdirAll, except that it is safer to use in
+// two respects:
+//
+//   - The caller provides the root directory as an *os.File (preferably O_PATH)
+//     handle. This means that the caller can be sure which root directory is
+//     being used. Note that this can be emulated by using /proc/self/fd/... as
+//     the root path with MkdirAll.
+//
+//   - Once all of the directories have been created, an *os.File (O_PATH) handle
+//     to the directory at unsafePath is returned to the caller. This is done in
+//     an effectively-race-free way (an attacker would only be able to swap the
+//     final directory component), which is not possible to emulate with
+//     MkdirAll.
+//
+// In addition, the returned handle is obtained far more efficiently than doing
+// a brand new lookup of unsafePath (such as with SecureJoin or openat2) after
+// doing MkdirAll. If you intend to open the directory after creating it, you
+// should use MkdirAllHandle.
+func MkdirAllHandle(root *os.File, unsafePath string, mode int) (_ *os.File, Err error) {
+	// Make sure there are no os.FileMode bits set.
+	if mode&^0o7777 != 0 {
+		return nil, fmt.Errorf("%w for mkdir 0o%.3o", errInvalidMode, mode)
+	}
+
+	// Try to open as much of the path as possible.
+	currentDir, remainingPath, err := partialLookupInRoot(root, unsafePath)
+	if err != nil {
+		return nil, fmt.Errorf("find existing subpath of %q: %w", unsafePath, err)
+	}
+	defer func() {
+		if Err != nil {
+			_ = currentDir.Close()
+		}
+	}()
+
+	// If there is an attacker deleting directories as we walk into them,
+	// detect this proactively. Note this is guaranteed to detect if the
+	// attacker deleted any part of the tree up to currentDir.
+	//
+	// Once we walk into a dead directory, partialLookupInRoot would not be
+	// able to walk further down the tree (directories must be empty before
+	// they are deleted), and if the attacker has removed the entire tree we
+	// can be sure that anything that was originally inside a dead directory
+	// must also be deleted and thus is a dead directory in its own right.
+	//
+	// This is mostly a quality-of-life check, because mkdir will simply fail
+	// later if the attacker deletes the tree after this check.
+	if err := isDeadInode(currentDir); err != nil {
+		return nil, fmt.Errorf("finding existing subpath of %q: %w", unsafePath, err)
+	}
+
+	// Check that we actually got a directory.
+	if st, err := currentDir.Stat(); err != nil {
+		return nil, fmt.Errorf("stat existing subpath handle %q: %w", currentDir.Name(), err)
+	} else if !st.IsDir() {
+		return nil, fmt.Errorf("cannot create subdirectories in %q: %w", currentDir.Name(), unix.ENOTDIR)
+	}
+
+	remainingParts := strings.Split(remainingPath, string(filepath.Separator))
+	if slices.Contains(remainingParts, "..") {
+		// The path contained ".." components after the end of the "real"
+		// components. We could try to safely resolve ".." here but that would
+		// add a bunch of extra logic for something that it's not clear even
+		// needs to be supported. So just return an error.
+		//
+		// If we do filepath.Clean(remainingPath) then we end up with the
+		// problem that ".." can erase a trailing dangling symlink and produce
+		// a path that doesn't quite match what the user asked for.
+		return nil, fmt.Errorf("%w: yet-to-be-created path %q contains '..' components", unix.ENOENT, remainingPath)
+	}
+
+	// Create the remaining components.
+	for _, part := range remainingParts {
+		switch part {
+		case "", ".":
+			// Skip over no-op paths.
+			continue
+		}
+
+		// NOTE: mkdir(2) will not follow trailing symlinks, so we can safely
+		// create the finaly component without worrying about symlink-exchange
+		// attacks.
+		if err := unix.Mkdirat(int(currentDir.Fd()), part, uint32(mode)); err != nil {
+			err = &os.PathError{Op: "mkdirat", Path: currentDir.Name() + "/" + part, Err: err}
+			// Make the error a bit nicer if the directory is dead.
+			if err2 := isDeadInode(currentDir); err2 != nil {
+				err = fmt.Errorf("%w (%w)", err, err2)
+			}
+			return nil, err
+		}
+
+		// Get a handle to the next component.
+		nextDir, err := openatFile(currentDir, part, unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+		if err != nil {
+			return nil, err
+		}
+		_ = currentDir.Close()
+		currentDir = nextDir
+	}
+	return currentDir, nil
+}
+
+// MkdirAll is a race-safe alternative to the Go stdlib's os.MkdirAll function,
+// where the new directory is guaranteed to be within the root directory (if an
+// attacker can move directories from inside the root to outside the root, the
+// created directory tree might be outside of the root but the key constraint
+// is that at no point will we walk outside of the directory tree we are
+// creating).
+//
+// Effectively, MkdirAll(root, unsafePath, mode) is equivalent to
+//
+//	path, _ := securejoin.SecureJoin(root, unsafePath)
+//	err := os.MkdirAll(path, mode)
+//
+// But is much safer. The above implementation is unsafe because if an attacker
+// can modify the filesystem tree between SecureJoin and MkdirAll, it is
+// possible for MkdirAll to resolve unsafe symlink components and create
+// directories outside of the root.
+//
+// If you plan to open the directory after you have created it or want to use
+// an open directory handle as the root, you should use MkdirAllHandle instead.
+// This function is a wrapper around MkdirAllHandle.
+//
+// NOTE: The mode argument must be set the unix mode bits (unix.S_I...), not
+// the Go generic mode bits (os.Mode...).
+func MkdirAll(root, unsafePath string, mode int) error {
+	rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return err
+	}
+	defer rootDir.Close()
+
+	f, err := MkdirAllHandle(rootDir, unsafePath, mode)
+	if err != nil {
+		return err
+	}
+	_ = f.Close()
+	return nil
+}
diff --git a/openat_linux.go b/openat_linux.go
new file mode 100644
index 0000000..3291005
--- /dev/null
+++ b/openat_linux.go
@@ -0,0 +1,51 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"os"
+	"path/filepath"
+
+	"golang.org/x/sys/unix"
+)
+
+func dupFile(f *os.File) (*os.File, error) {
+	fd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
+	if err != nil {
+		return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err)
+	}
+	return os.NewFile(uintptr(fd), f.Name()), nil
+}
+
+func openatFile(dir *os.File, path string, flags int, mode int) (*os.File, error) {
+	// Make sure we always set O_CLOEXEC.
+	flags |= unix.O_CLOEXEC
+	fd, err := unix.Openat(int(dir.Fd()), path, flags, uint32(mode))
+	if err != nil {
+		return nil, &os.PathError{Op: "openat", Path: dir.Name() + "/" + path, Err: err}
+	}
+	// All of the paths we use with openatFile(2) are guaranteed to be
+	// lexically safe, so we can use path.Join here.
+	fullPath := filepath.Join(dir.Name(), path)
+	return os.NewFile(uintptr(fd), fullPath), nil
+}
+
+func readlinkatFile(dir *os.File, path string) (string, error) {
+	size := 4096
+	for {
+		linkBuf := make([]byte, size)
+		n, err := unix.Readlinkat(int(dir.Fd()), path, linkBuf)
+		if err != nil {
+			return "", &os.PathError{Op: "readlinkat", Path: dir.Name() + "/" + path, Err: err}
+		}
+		if n != size {
+			return string(linkBuf[:n]), nil
+		}
+		// Possible truncation, resize the buffer.
+		size *= 2
+	}
+}
diff --git a/procfs_linux.go b/procfs_linux.go
new file mode 100644
index 0000000..64bf334
--- /dev/null
+++ b/procfs_linux.go
@@ -0,0 +1,147 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"strconv"
+	"sync"
+
+	"golang.org/x/sys/unix"
+)
+
+func fstat(f *os.File) (unix.Stat_t, error) {
+	var stat unix.Stat_t
+	if err := unix.Fstat(int(f.Fd()), &stat); err != nil {
+		return stat, &os.PathError{Op: "fstat", Path: f.Name(), Err: err}
+	}
+	return stat, nil
+}
+
+func fstatfs(f *os.File) (unix.Statfs_t, error) {
+	var statfs unix.Statfs_t
+	if err := unix.Fstatfs(int(f.Fd()), &statfs); err != nil {
+		return statfs, &os.PathError{Op: "fstatfs", Path: f.Name(), Err: err}
+	}
+	return statfs, nil
+}
+
+// The kernel guarantees that the root inode of a procfs mount has an
+// f_type of PROC_SUPER_MAGIC and st_ino of PROC_ROOT_INO.
+const (
+	PROC_SUPER_MAGIC = 0x9fa0
+	PROC_ROOT_INO    = 1
+)
+
+func verifyProcRoot(procRoot *os.File) error {
+	if statfs, err := fstatfs(procRoot); err != nil {
+		return err
+	} else if statfs.Type != PROC_SUPER_MAGIC {
+		return fmt.Errorf("%w: incorrect procfs root filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
+	}
+	if stat, err := fstat(procRoot); err != nil {
+		return err
+	} else if stat.Ino != PROC_ROOT_INO {
+		return fmt.Errorf("%w: incorrect procfs root inode number %d", errUnsafeProcfs, stat.Ino)
+	}
+	return nil
+}
+
+var (
+	procSelfFdHandle *os.File
+	procSelfFdError  error
+	procSelfFdOnce   sync.Once
+
+	errUnsafeProcfs = errors.New("unsafe procfs detected")
+)
+
+func doGetProcSelfFd() (*os.File, error) {
+	// TODO: Use fsopen or open_tree to get a safe handle that cannot be
+	// over-mounted and we can absolutely verify.
+
+	procRoot, err := os.OpenFile("/proc", unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, err
+	}
+	defer procRoot.Close()
+	if err := verifyProcRoot(procRoot); err != nil {
+		return nil, err
+	}
+
+	handle, err := openatFile(procRoot, "self/fd", unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
+	}
+	// We can't detect bind-mounts of different parts of procfs on top of
+	// /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we
+	// aren't on the wrong filesystem here.
+	if statfs, err := fstatfs(handle); err != nil {
+		return nil, err
+	} else if statfs.Type != PROC_SUPER_MAGIC {
+		return nil, fmt.Errorf("%w: incorrect /proc/self/fd filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
+	}
+	return handle, nil
+}
+
+func getProcSelfFd() (*os.File, error) {
+	procSelfFdOnce.Do(func() {
+		procSelfFdHandle, procSelfFdError = doGetProcSelfFd()
+	})
+	return procSelfFdHandle, procSelfFdError
+}
+
+func procSelfFdReadlink(f *os.File) (string, error) {
+	// NOTE: It is possible for an attacker to bind-mount on top of the
+	// /proc/self/fd/... symlink, and there is currently no way for us to
+	// detect this. So we just have to assume that hasn't happened...
+	procSelfFd, err := getProcSelfFd()
+	if err != nil {
+		return "", fmt.Errorf("get safe procfs handle: %w", err)
+	}
+	// readlinkat(</proc/self/fd>, "42")
+	return readlinkatFile(procSelfFd, strconv.Itoa(int(f.Fd())))
+}
+
+var (
+	errPossibleBreakout = errors.New("possible breakout detected")
+	errInvalidDirectory = errors.New("wandered into deleted directory")
+	errDeletedInode     = errors.New("cannot verify path of deleted inode")
+)
+
+func isDeadInode(file *os.File) error {
+	// If the nlink of a file drops to 0, there is an attacker deleting
+	// directories during our walk, which could result in weird /proc values.
+	// It's better to error out in this case.
+	stat, err := fstat(file)
+	if err != nil {
+		return fmt.Errorf("check for dead inode: %w", err)
+	}
+	if stat.Nlink == 0 {
+		err := errDeletedInode
+		if stat.Mode&unix.S_IFMT == unix.S_IFDIR {
+			err = errInvalidDirectory
+		}
+		return fmt.Errorf("%w %q", err, file.Name())
+	}
+	return nil
+}
+
+func checkProcSelfFdPath(path string, file *os.File) error {
+	if err := isDeadInode(file); err != nil {
+		return err
+	}
+	actualPath, err := procSelfFdReadlink(file)
+	if err != nil {
+		return fmt.Errorf("get path of handle: %w", err)
+	}
+	if actualPath != path {
+		return fmt.Errorf("%w: handle path %q doesn't match expected path %q", errPossibleBreakout, actualPath, path)
+	}
+	return nil
+}

From b80d695cc19ef6a6b08b297b93fa8cb4ca24f28a Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 29 Jun 2024 16:15:55 +1000
Subject: [PATCH 02/11] mkdirall: verify that the newly created directory is
 sane

An attacker could swap out the directory we created between us creating
it and opening it. However, the worst thing they could do is force us to
make a directory inside a directory with different permissions or a
different mode. If they swap our directory with a directory that has the
same owner and permissions as the one we created, they didn't gain
anything. So, verify that the directory looks like the one we would've
created.

Unfortunately, there is no way to atomically create a directory and open
it. I discussed this with Al a while ago and I believe he said he didn't
like it (though I'm not sure if it was a blanket veto or not).

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 mkdir_linux.go  | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
 procfs_linux.go | 17 +++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/mkdir_linux.go b/mkdir_linux.go
index b5ac1e2..50896c8 100644
--- a/mkdir_linux.go
+++ b/mkdir_linux.go
@@ -9,6 +9,7 @@ package securejoin
 import (
 	"errors"
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 	"slices"
@@ -93,6 +94,27 @@ func MkdirAllHandle(root *os.File, unsafePath string, mode int) (_ *os.File, Err
 		return nil, fmt.Errorf("%w: yet-to-be-created path %q contains '..' components", unix.ENOENT, remainingPath)
 	}
 
+	// Make sure the mode doesn't have any type bits.
+	mode &^= unix.S_IFMT
+	// What properties do we expect any newly created directories to have?
+	var (
+		// While umask(2) is a per-thread property, and thus this value could
+		// vary between threads, a functioning Go program would LockOSThread
+		// threads with different umasks and so we don't need to LockOSThread
+		// for this entire mkdirat loop (if we are in the locked thread with a
+		// different umask, we are already locked and there's nothing for us to
+		// do -- and if not then it doesn't matter which thread we run on and
+		// there's nothing for us to do).
+		expectedMode = uint32(unix.S_IFDIR | (mode &^ getUmask()))
+
+		// We would want to get the fs[ug]id here, but we can't access those
+		// from userspace. In practice, nobody uses setfs[ug]id() anymore, so
+		// just use the effective [ug]id (which is equivalent to the fs[ug]id
+		// for programs that don't use setfs[ug]id).
+		expectedUid = uint32(unix.Geteuid())
+		expectedGid = uint32(unix.Getegid())
+	)
+
 	// Create the remaining components.
 	for _, part := range remainingParts {
 		switch part {
@@ -120,6 +142,42 @@ func MkdirAllHandle(root *os.File, unsafePath string, mode int) (_ *os.File, Err
 		}
 		_ = currentDir.Close()
 		currentDir = nextDir
+
+		// Make sure that the directory matches what we expect. An attacker
+		// could have swapped the directory between us making it and opening
+		// it. There's no way for us to be sure that the directory is
+		// _precisely_ the same as the directory we created, but if we are in
+		// an empty directory with the same owner and mode as the one we
+		// created then there is nothing the attacker could do with this new
+		// directory that they couldn't do with the old one.
+		if stat, err := fstat(currentDir); err != nil {
+			return nil, fmt.Errorf("check newly created directory: %w", err)
+		} else {
+			if stat.Mode != expectedMode {
+				return nil, fmt.Errorf("%w: newly created directory %q has incorrect mode 0o%.3o (expected 0o%.3o)", errPossibleAttack, currentDir.Name(), stat.Mode, expectedMode)
+			}
+			if stat.Uid != expectedUid || stat.Gid != expectedGid {
+				return nil, fmt.Errorf("%w: newly created directory %q has incorrect owner %d:%d (expected %d:%d)", errPossibleAttack, currentDir.Name(), stat.Uid, stat.Gid, expectedUid, expectedGid)
+			}
+			// Re-open our O_PATH handle for the directory with O_RDONLY so we
+			// can check if it the directory is empty. This is safe to do with
+			// open(2) because there is no way for "." to be replaced or
+			// mounted over.
+			if dir, err := openatFile(currentDir, ".", unix.O_RDONLY|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0); err != nil {
+				return nil, fmt.Errorf("%w: re-open newly created directory %q: %w", errPossibleAttack, currentDir.Name(), err)
+			} else {
+				// We only need to check for a single entry, and we should get EOF
+				// if the directory is empty.
+				_, err := dir.Readdirnames(1)
+				_ = dir.Close()
+				if !errors.Is(err, io.EOF) {
+					if err == nil {
+						err = fmt.Errorf("%w: newly created directory %q is non-empty", errPossibleAttack, currentDir.Name())
+					}
+					return nil, fmt.Errorf("check if newly created directory %q is empty: %w", currentDir.Name(), err)
+				}
+			}
+		}
 	}
 	return currentDir, nil
 }
diff --git a/procfs_linux.go b/procfs_linux.go
index 64bf334..8a6e902 100644
--- a/procfs_linux.go
+++ b/procfs_linux.go
@@ -10,6 +10,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"runtime"
 	"strconv"
 	"sync"
 
@@ -132,6 +133,22 @@ func isDeadInode(file *os.File) error {
 	return nil
 }
 
+func getUmask() int {
+	// umask is a per-thread property, but it is inherited by children, so we
+	// need to lock our OS thread to make sure that no other goroutine runs in
+	// this thread and no goroutines are spawned from this thread until we
+	// revert to the old umask.
+	//
+	// We could parse /proc/self/status to avoid this get-set problem, but
+	// /proc/thread-self requires LockOSThread anyway, so there's no real
+	// benefit over just using umask(2).
+	runtime.LockOSThread()
+	umask := unix.Umask(0)
+	unix.Umask(umask)
+	runtime.UnlockOSThread()
+	return umask
+}
+
 func checkProcSelfFdPath(path string, file *os.File) error {
 	if err := isDeadInode(file); err != nil {
 		return err

From ab8607e83aeaab22bcccce2c3c87e3885c43e054 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 28 Jun 2024 01:36:35 +1000
Subject: [PATCH 03/11] procfs: use /proc/thread-self to check

Rather than saving a handle to /proc/self/fd, save a /proc handle and
use /proc/thread-self/fd. This is necessary to handle Go programs with
locked threads that have unshare(CLONE_FS), such as runc.

This is modelled on similar logic that exists in runc (ProcThreadSelf),
and is necessary if we want to use MkdirAll in runc.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 openat_linux.go |   8 ++++
 procfs_linux.go | 107 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 93 insertions(+), 22 deletions(-)

diff --git a/openat_linux.go b/openat_linux.go
index 3291005..949fb5f 100644
--- a/openat_linux.go
+++ b/openat_linux.go
@@ -34,6 +34,14 @@ func openatFile(dir *os.File, path string, flags int, mode int) (*os.File, error
 	return os.NewFile(uintptr(fd), fullPath), nil
 }
 
+func fstatatFile(dir *os.File, path string, flags int) (unix.Stat_t, error) {
+	var stat unix.Stat_t
+	if err := unix.Fstatat(int(dir.Fd()), path, &stat, flags); err != nil {
+		return stat, &os.PathError{Op: "fstatat", Path: dir.Name() + "/" + path, Err: err}
+	}
+	return stat, nil
+}
+
 func readlinkatFile(dir *os.File, path string) (string, error) {
 	size := 4096
 	for {
diff --git a/procfs_linux.go b/procfs_linux.go
index 8a6e902..d43b018 100644
--- a/procfs_linux.go
+++ b/procfs_linux.go
@@ -55,14 +55,14 @@ func verifyProcRoot(procRoot *os.File) error {
 }
 
 var (
-	procSelfFdHandle *os.File
-	procSelfFdError  error
-	procSelfFdOnce   sync.Once
+	procRootHandle *os.File
+	procRootError  error
+	procRootOnce   sync.Once
 
 	errUnsafeProcfs = errors.New("unsafe procfs detected")
 )
 
-func doGetProcSelfFd() (*os.File, error) {
+func doGetProcRoot() (_ *os.File, Err error) {
 	// TODO: Use fsopen or open_tree to get a safe handle that cannot be
 	// over-mounted and we can absolutely verify.
 
@@ -70,42 +70,105 @@ func doGetProcSelfFd() (*os.File, error) {
 	if err != nil {
 		return nil, err
 	}
-	defer procRoot.Close()
+	defer func() {
+		if Err != nil {
+			_ = procRoot.Close()
+		}
+	}()
 	if err := verifyProcRoot(procRoot); err != nil {
 		return nil, err
 	}
+	return procRoot, nil
+}
+
+func getProcRoot() (*os.File, error) {
+	procRootOnce.Do(func() {
+		procRootHandle, procRootError = doGetProcRoot()
+	})
+	return procRootHandle, procRootError
+}
+
+var (
+	haveProcThreadSelf     bool
+	haveProcThreadSelfOnce sync.Once
+)
+
+type procThreadSelfCloser func()
+
+// procThreadSelf returns a handle to /proc/thread-self/<subpath> (or an
+// equivalent handle on older kernels where /proc/thread-self doesn't exist).
+// Once finished with the handle, you must call the returned closer function
+// (runtime.UnlockOSThread). You must not pass the returned *os.File to other
+// Go threads or use the handle after calling the closer.
+//
+// This is similar to ProcThreadSelf from runc, but with extra hardening
+// applied and using *os.File.
+func procThreadSelf(subpath string) (_ *os.File, _ procThreadSelfCloser, Err error) {
+	procRoot, err := getProcRoot()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	haveProcThreadSelfOnce.Do(func() {
+		// If the kernel doesn't support thread-self, it doesn't matter which
+		// /proc handle we use.
+		_, err := fstatatFile(procRoot, "thread-self", unix.AT_SYMLINK_NOFOLLOW)
+		haveProcThreadSelf = (err == nil)
+	})
+
+	// We need to lock our thread until the caller is done with the handle
+	// because between getting the handle and using it we could get interrupted
+	// by the Go runtime and hit the case where the underlying thread is
+	// swapped out and the original thread is killed, resulting in
+	// pull-your-hair-out-hard-to-debug issues in the caller.
+	runtime.LockOSThread()
+	defer func() {
+		if Err != nil {
+			runtime.UnlockOSThread()
+		}
+	}()
+
+	// Figure out what prefix we want to use.
+	threadSelf := "thread-self/"
+	if !haveProcThreadSelf {
+		/// Pre-3.17 kernels don't have /proc/thread-self, so do it manually.
+		threadSelf = "self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+		if _, err := fstatatFile(procRoot, threadSelf, unix.AT_SYMLINK_NOFOLLOW); err != nil {
+			// In this case, we running in a pid namespace that doesn't match
+			// the /proc mount we have. This can happen inside runc.
+			//
+			// Unfortunately, there is no nice way to get the correct TID to
+			// use here because of the age of the kernel, so we have to just
+			// use /proc/self and hope that it works.
+			threadSelf = "self/"
+		}
+	}
 
-	handle, err := openatFile(procRoot, "self/fd", unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+	// Grab the handle.
+	handle, err := openatFile(procRoot, threadSelf+subpath, unix.O_PATH|unix.O_CLOEXEC, 0)
 	if err != nil {
-		return nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
+		return nil, nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
 	}
 	// We can't detect bind-mounts of different parts of procfs on top of
 	// /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we
 	// aren't on the wrong filesystem here.
 	if statfs, err := fstatfs(handle); err != nil {
-		return nil, err
+		return nil, nil, err
 	} else if statfs.Type != PROC_SUPER_MAGIC {
-		return nil, fmt.Errorf("%w: incorrect /proc/self/fd filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
+		return nil, nil, fmt.Errorf("%w: incorrect /proc/%s%s filesystem type 0x%x", errUnsafeProcfs, threadSelf, subpath, statfs.Type)
 	}
-	return handle, nil
-}
-
-func getProcSelfFd() (*os.File, error) {
-	procSelfFdOnce.Do(func() {
-		procSelfFdHandle, procSelfFdError = doGetProcSelfFd()
-	})
-	return procSelfFdHandle, procSelfFdError
+	return handle, runtime.UnlockOSThread, nil
 }
 
 func procSelfFdReadlink(f *os.File) (string, error) {
+	procSelfFd, closer, err := procThreadSelf("fd/")
+	if err != nil {
+		return "", fmt.Errorf("get safe /proc/thread-self/fd handle: %w", err)
+	}
+	defer closer()
 	// NOTE: It is possible for an attacker to bind-mount on top of the
 	// /proc/self/fd/... symlink, and there is currently no way for us to
 	// detect this. So we just have to assume that hasn't happened...
-	procSelfFd, err := getProcSelfFd()
-	if err != nil {
-		return "", fmt.Errorf("get safe procfs handle: %w", err)
-	}
-	// readlinkat(</proc/self/fd>, "42")
 	return readlinkatFile(procSelfFd, strconv.Itoa(int(f.Fd())))
 }
 

From d2ccd9b73af97635a8287376dff4a9cce9437125 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 26 Jun 2024 18:46:25 +1000
Subject: [PATCH 04/11] mkdirall: add openat2 support

While adding openat2 support to SecureJoin doesn't make much sense (the
API is wrong for handling race-safe paths), MkdirAll can use openat2
internally to make sure we safely operate on /proc and more efficiently
do the partial-open necessary for MkdirAll.

The safe /proc handling is based on similar logic in libpathrs.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 lookup_linux.go  |   5 ++
 mkdir_linux.go   |  10 +++-
 openat2_linux.go | 122 +++++++++++++++++++++++++++++++++++++++++++++++
 procfs_linux.go  |  54 ++++++++++++++++-----
 4 files changed, 177 insertions(+), 14 deletions(-)
 create mode 100644 openat2_linux.go

diff --git a/lookup_linux.go b/lookup_linux.go
index 10544bb..ebec3fa 100644
--- a/lookup_linux.go
+++ b/lookup_linux.go
@@ -28,6 +28,11 @@ func partialLookupInRoot(root *os.File, unsafePath string) (_ *os.File, _ string
 	// components using file descriptors. We then return the last component we
 	// managed open, along with the remaining path components not opened.
 
+	// Try to use openat2 if possible.
+	if hasOpenat2() {
+		return partialLookupOpenat2(root, unsafePath)
+	}
+
 	// Get the "actual" root path from /proc/self/fd. This is necessary if the
 	// root is some magic-link like /proc/$pid/root, in which case we want to
 	// make sure when we do checkProcSelfFdPath that we are using the correct
diff --git a/mkdir_linux.go b/mkdir_linux.go
index 50896c8..ee47410 100644
--- a/mkdir_linux.go
+++ b/mkdir_linux.go
@@ -136,7 +136,15 @@ func MkdirAllHandle(root *os.File, unsafePath string, mode int) (_ *os.File, Err
 		}
 
 		// Get a handle to the next component.
-		nextDir, err := openatFile(currentDir, part, unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+		var nextDir *os.File
+		if hasOpenat2() {
+			nextDir, err = openat2File(currentDir, part, &unix.OpenHow{
+				Flags:   unix.O_PATH | unix.O_NOFOLLOW | unix.O_DIRECTORY | unix.O_CLOEXEC,
+				Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_SYMLINKS | unix.RESOLVE_NO_XDEV,
+			})
+		} else {
+			nextDir, err = openatFile(currentDir, part, unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+		}
 		if err != nil {
 			return nil, err
 		}
diff --git a/openat2_linux.go b/openat2_linux.go
new file mode 100644
index 0000000..173a784
--- /dev/null
+++ b/openat2_linux.go
@@ -0,0 +1,122 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+
+	"golang.org/x/sys/unix"
+)
+
+var (
+	hasOpenat2Bool bool
+	hasOpenat2Once sync.Once
+)
+
+func hasOpenat2() bool {
+	hasOpenat2Once.Do(func() {
+		fd, err := unix.Openat2(unix.AT_FDCWD, ".", &unix.OpenHow{
+			Flags:   unix.O_PATH | unix.O_CLOEXEC,
+			Resolve: unix.RESOLVE_NO_SYMLINKS | unix.RESOLVE_IN_ROOT,
+		})
+		if err == nil {
+			hasOpenat2Bool = true
+			_ = unix.Close(fd)
+		}
+	})
+	return hasOpenat2Bool
+}
+
+func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
+	// RESOLVE_IN_ROOT (and RESOLVE_BENEATH) can return -EAGAIN if we resolve
+	// ".." while a mount or rename occurs anywhere on the system. This could
+	// happen spuriously, or as the result of an attacker trying to mess with
+	// us during lookup.
+	//
+	// In addition, scoped lookups have a "safety check" at the end of
+	// complete_walk which will return -EXDEV if the final path is not in the
+	// root.
+	return how.Resolve&(unix.RESOLVE_IN_ROOT|unix.RESOLVE_BENEATH) != 0 &&
+		(errors.Is(err, unix.EAGAIN) || errors.Is(err, unix.EXDEV))
+}
+
+const scopedLookupMaxRetries = 10
+
+func openat2File(dir *os.File, path string, how *unix.OpenHow) (*os.File, error) {
+	fullPath := dir.Name() + "/" + path
+	// Make sure we always set O_CLOEXEC.
+	how.Flags |= unix.O_CLOEXEC
+	var tries int
+	for tries < scopedLookupMaxRetries {
+		fd, err := unix.Openat2(int(dir.Fd()), path, how)
+		if err != nil {
+			if scopedLookupShouldRetry(how, err) {
+				// We retry a couple of times to avoid the spurious errors, and
+				// if we are being attacked then returning -EAGAIN is the best
+				// we can do.
+				tries++
+				continue
+			}
+			return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: err}
+		}
+		// If we are using RESOLVE_IN_ROOT, the name we generated may be wrong.
+		// NOTE: The procRoot code MUST NOT use RESOLVE_IN_ROOT, otherwise
+		//       you'll get infinite recursion here.
+		if how.Resolve&unix.RESOLVE_IN_ROOT == unix.RESOLVE_IN_ROOT {
+			if actualPath, err := rawProcSelfFdReadlink(fd); err == nil {
+				fullPath = actualPath
+			}
+		}
+		return os.NewFile(uintptr(fd), fullPath), nil
+	}
+	return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: errPossibleAttack}
+}
+
+// partialLookupOpenat2 is an alternative implementation of
+// partialLookupInRoot, using openat2(RESOLVE_IN_ROOT) to more safely get a
+// handle to the deepest existing child of the requested path within the root.
+func partialLookupOpenat2(root *os.File, unsafePath string) (*os.File, string, error) {
+	// TODO: Implement this as a git-bisect-like binary search.
+
+	unsafePath = filepath.ToSlash(unsafePath) // noop
+	endIdx := len(unsafePath)
+	for endIdx > 0 {
+		subpath := unsafePath[:endIdx]
+
+		handle, err := openat2File(root, subpath, &unix.OpenHow{
+			Flags:   unix.O_PATH | unix.O_CLOEXEC,
+			Resolve: unix.RESOLVE_IN_ROOT | unix.RESOLVE_NO_MAGICLINKS,
+		})
+		if err == nil {
+			// Jump over the slash if we have a non-"" remainingPath.
+			if endIdx < len(unsafePath) {
+				endIdx += 1
+			}
+			// We found a subpath!
+			return handle, unsafePath[endIdx:], nil
+		}
+		if errors.Is(err, unix.ENOENT) || errors.Is(err, unix.ENOTDIR) {
+			// That path doesn't exist, let's try the next directory up.
+			endIdx = strings.LastIndexByte(subpath, '/')
+			continue
+		}
+		return nil, "", fmt.Errorf("open subpath: %w", err)
+	}
+	// If we couldn't open anything, the whole subpath is missing. Return a
+	// copy of the root fd so that the caller doesn't close this one by
+	// accident.
+	rootClone, err := dupFile(root)
+	if err != nil {
+		return nil, "", err
+	}
+	return rootClone, unsafePath, nil
+}
diff --git a/procfs_linux.go b/procfs_linux.go
index d43b018..8588a96 100644
--- a/procfs_linux.go
+++ b/procfs_linux.go
@@ -145,22 +145,46 @@ func procThreadSelf(subpath string) (_ *os.File, _ procThreadSelfCloser, Err err
 	}
 
 	// Grab the handle.
-	handle, err := openatFile(procRoot, threadSelf+subpath, unix.O_PATH|unix.O_CLOEXEC, 0)
-	if err != nil {
-		return nil, nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
-	}
-	// We can't detect bind-mounts of different parts of procfs on top of
-	// /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we
-	// aren't on the wrong filesystem here.
-	if statfs, err := fstatfs(handle); err != nil {
-		return nil, nil, err
-	} else if statfs.Type != PROC_SUPER_MAGIC {
-		return nil, nil, fmt.Errorf("%w: incorrect /proc/%s%s filesystem type 0x%x", errUnsafeProcfs, threadSelf, subpath, statfs.Type)
+	var handle *os.File
+	if hasOpenat2() {
+		// We prefer being able to use RESOLVE_NO_XDEV if we can, to be
+		// absolutely sure we are operating on a clean /proc handle that
+		// doesn't have any cheeky overmounts that could trick us (including
+		// symlink mounts on top of /proc/thread-self). RESOLVE_BENEATH isn't
+		// stricly needed, but just use it since we have it.
+		//
+		// NOTE: /proc/self is technically a magic-link (the contents of the
+		//       symlink are generated dynamically), but it doesn't use
+		//       nd_jump_link() so RESOLVE_NO_MAGICLINKS allows it.
+		//
+		// NOTE: We MUST NOT use RESOLVE_IN_ROOT here, as openat2File uses
+		//       procSelfFdReadlink to clean up the returned f.Name() if we use
+		//       RESOLVE_IN_ROOT (which would lead to an infinite recursion).
+		handle, err = openat2File(procRoot, threadSelf+subpath, &unix.OpenHow{
+			Flags:   unix.O_PATH | unix.O_CLOEXEC,
+			Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_MAGICLINKS,
+		})
+		if err != nil {
+			return nil, nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
+		}
+	} else {
+		handle, err = openatFile(procRoot, threadSelf+subpath, unix.O_PATH|unix.O_CLOEXEC, 0)
+		if err != nil {
+			return nil, nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
+		}
+		// We can't detect bind-mounts of different parts of procfs on top of
+		// /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we
+		// aren't on the wrong filesystem here.
+		if statfs, err := fstatfs(handle); err != nil {
+			return nil, nil, err
+		} else if statfs.Type != PROC_SUPER_MAGIC {
+			return nil, nil, fmt.Errorf("%w: incorrect /proc/self/fd filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
+		}
 	}
 	return handle, runtime.UnlockOSThread, nil
 }
 
-func procSelfFdReadlink(f *os.File) (string, error) {
+func rawProcSelfFdReadlink(fd int) (string, error) {
 	procSelfFd, closer, err := procThreadSelf("fd/")
 	if err != nil {
 		return "", fmt.Errorf("get safe /proc/thread-self/fd handle: %w", err)
@@ -169,7 +193,11 @@ func procSelfFdReadlink(f *os.File) (string, error) {
 	// NOTE: It is possible for an attacker to bind-mount on top of the
 	// /proc/self/fd/... symlink, and there is currently no way for us to
 	// detect this. So we just have to assume that hasn't happened...
-	return readlinkatFile(procSelfFd, strconv.Itoa(int(f.Fd())))
+	return readlinkatFile(procSelfFd, strconv.Itoa(fd))
+}
+
+func procSelfFdReadlink(f *os.File) (string, error) {
+	return rawProcSelfFdReadlink(int(f.Fd()))
 }
 
 var (

From 3d3771cde4d4cba0d91bc8385abcbaac86290628 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 28 Jun 2024 05:29:37 +1000
Subject: [PATCH 05/11] mkdirall: make partialOpen semantically identical to
 openat2 implementation

openat2(2) will fail when we try to follow a broken symlink, which means
that partialOpenat2() will treat the final symlink as a non-existent
path. In contrast, partialOpen() will happily return a handle and
remainingPath in the middle of the symlink resolution.

While either of the above semantics are acceptable, we need to unify the
behaviour so that users don't get different behaviour based on their
kernel version. We can't change the behaviour of openat2(2), and
emulating the current partialOpen() behaviour using openat2(2) is just
asking for trouble. So, make partialOpen() match the behaviour of
partialOpenat2().

Unfortunately, implementing this properly (including handling recursive
symlinks) requires emulating a symlink stack and making sure that we
return the top symlink we are in the mille of resolving if we hit a
non-existent or otherwise invalid path. If there is no last symlink, we
return the partial open as normal. Yeah, this is a little ugly, but I
don't see a nicer way of implementing these semantics...

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 lookup_linux.go | 186 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)

diff --git a/lookup_linux.go b/lookup_linux.go
index ebec3fa..140ac18 100644
--- a/lookup_linux.go
+++ b/lookup_linux.go
@@ -12,11 +12,147 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"slices"
 	"strings"
 
 	"golang.org/x/sys/unix"
 )
 
+type symlinkStackEntry struct {
+	// (dir, remainingPath) is what we would've returned if the link didn't
+	// exist. This matches what openat2(RESOLVE_IN_ROOT) would return in
+	// this case.
+	dir           *os.File
+	remainingPath string
+	// linkUnwalked is the remaining path components from the original
+	// Readlink which we have yet to walk. When this slice is empty, we
+	// drop the link from the stack.
+	linkUnwalked []string
+}
+
+func (se symlinkStackEntry) String() string {
+	return fmt.Sprintf("<%s>/%s [->%s]", se.dir.Name(), se.remainingPath, strings.Join(se.linkUnwalked, "/"))
+}
+
+func (se symlinkStackEntry) Close() {
+	_ = se.dir.Close()
+}
+
+type symlinkStack []*symlinkStackEntry
+
+func (s symlinkStack) IsEmpty() bool {
+	return len(s) == 0
+}
+
+func (s *symlinkStack) Close() {
+	for _, link := range *s {
+		link.Close()
+	}
+	// TODO: Switch to clear once we switch to Go 1.21.
+	*s = nil
+}
+
+var (
+	errEmptyStack         = errors.New("[internal] stack is empty")
+	errBrokenSymlinkStack = errors.New("[internal error] broken symlink stack")
+)
+
+func (s *symlinkStack) popPart(part string) error {
+	if s.IsEmpty() {
+		// If there is nothing in the symlink stack, then the part was from the
+		// real path provided by the user, and this is a no-op.
+		return errEmptyStack
+	}
+	tailEntry := (*s)[len(*s)-1]
+
+	// Double-check that we are popping the component we expect.
+	if len(tailEntry.linkUnwalked) == 0 {
+		return fmt.Errorf("%w: trying to pop component %q of empty stack entry %s", errBrokenSymlinkStack, part, tailEntry)
+	}
+	headPart := tailEntry.linkUnwalked[0]
+	if headPart != part {
+		return fmt.Errorf("%w: trying to pop component %q but the last stack entry is %s (%q)", errBrokenSymlinkStack, part, tailEntry, headPart)
+	}
+
+	// Drop the component, but keep the entry around in case we are dealing
+	// with a "tail-chained" symlink.
+	tailEntry.linkUnwalked = tailEntry.linkUnwalked[1:]
+	return nil
+}
+
+func (s *symlinkStack) PopPart(part string) error {
+	if err := s.popPart(part); err != nil {
+		if errors.Is(err, errEmptyStack) {
+			// Skip empty stacks.
+			err = nil
+		}
+		return err
+	}
+
+	// Clean up any of the trailing stack entries that are empty.
+	for lastGood := len(*s) - 1; lastGood >= 0; lastGood-- {
+		entry := (*s)[lastGood]
+		if len(entry.linkUnwalked) > 0 {
+			break
+		}
+		entry.Close()
+		(*s) = (*s)[:lastGood]
+	}
+	return nil
+}
+
+func (s *symlinkStack) push(dir *os.File, remainingPath, linkTarget string) error {
+	// Split the link target and clean up any "" parts.
+	linkTargetParts := slices.DeleteFunc(
+		strings.Split(linkTarget, "/"),
+		func(part string) bool { return part == "" })
+
+	// Don't add a no-op link to the stack. You can't create a no-op link
+	// symlink, but if the symlink is /, partialLookupInRoot has already jumped to the
+	// root and so there's nothing more to do.
+	if len(linkTargetParts) == 0 {
+		return nil
+	}
+
+	// Copy the directory so the caller doesn't close our copy.
+	dirCopy, err := dupFile(dir)
+	if err != nil {
+		return err
+	}
+
+	// Add to the stack.
+	*s = append(*s, &symlinkStackEntry{
+		dir:           dirCopy,
+		remainingPath: remainingPath,
+		linkUnwalked:  linkTargetParts,
+	})
+	return nil
+}
+
+func (s *symlinkStack) SwapLink(linkPart string, dir *os.File, remainingPath, linkTarget string) error {
+	// If we are currently inside a symlink resolution, remove the symlink
+	// component from the last symlink entry, but don't remove the entry even
+	// if it's empty. If we are a "tail-chained" symlink (a trailing symlink we
+	// hit during a symlink resolution) we need to keep the old symlink until
+	// we finish the resolution.
+	if err := s.popPart(linkPart); err != nil {
+		if !errors.Is(err, errEmptyStack) {
+			return err
+		}
+		// Push the component regardless of whether the stack was empty.
+	}
+	return s.push(dir, remainingPath, linkTarget)
+}
+
+func (s *symlinkStack) PopTopSymlink() (*os.File, string, bool) {
+	if s.IsEmpty() {
+		return nil, "", false
+	}
+	tailEntry := (*s)[0]
+	*s = (*s)[1:]
+	return tailEntry.dir, tailEntry.remainingPath, true
+}
+
 // partialLookupInRoot tries to lookup as much of the request path as possible
 // within the provided root (a-la RESOLVE_IN_ROOT) and opens the final existing
 // component of the requested path, returning a file handle to the final
@@ -52,6 +188,21 @@ func partialLookupInRoot(root *os.File, unsafePath string) (_ *os.File, _ string
 		}
 	}()
 
+	// symlinkStack is used to emulate how openat2(RESOLVE_IN_ROOT) treats
+	// dangling symlinks. If we hit a non-existent path while resolving a
+	// symlink, we need to return the (dir, remainingPath) that we had when we
+	// hit the symlink (treating the symlink as though it were a regular file).
+	// The set of (dir, remainingPath) sets is stored within the symlinkStack
+	// and we add and remove parts when we hit symlink and non-symlink
+	// components respectively. We need a stack because of recursive symlinks
+	// (symlinks that contain symlink components in their target).
+	//
+	// Note that the stack is ONLY used for book-keeping. All of the actual
+	// path walking logic is still based on currentPath/remainingPath and
+	// currentDir (as in SecureJoin).
+	var symlinkStack symlinkStack
+	defer symlinkStack.Close()
+
 	var (
 		linksWalked   int
 		currentPath   string
@@ -82,6 +233,9 @@ func partialLookupInRoot(root *os.File, unsafePath string) (_ *os.File, _ string
 		// If we logically hit the root, just clone the root rather than
 		// opening the part and doing all of the other checks.
 		if nextPath == "/" {
+			if err := symlinkStack.PopPart(part); err != nil {
+				return nil, "", fmt.Errorf("walking into root with part %q failed: %w", part, err)
+			}
 			// Jump to root.
 			rootClone, err := dupFile(root)
 			if err != nil {
@@ -110,6 +264,11 @@ func partialLookupInRoot(root *os.File, unsafePath string) (_ *os.File, _ string
 				currentDir = nextDir
 				currentPath = nextPath
 
+				// The part was real, so drop it from the symlink stack.
+				if err := symlinkStack.PopPart(part); err != nil {
+					return nil, "", fmt.Errorf("walking into directory %q failed: %w", part, err)
+				}
+
 				// If we are operating on a .., make sure we haven't escaped.
 				// We only have to check for ".." here because walking down
 				// into a regular component component cannot cause you to
@@ -154,6 +313,11 @@ func partialLookupInRoot(root *os.File, unsafePath string) (_ *os.File, _ string
 					return nil, "", &os.PathError{Op: "partialLookupInRoot", Path: logicalRootPath + "/" + unsafePath, Err: unix.ELOOP}
 				}
 
+				// Swap out the symlink's component for the link entry itself.
+				if err := symlinkStack.SwapLink(part, currentDir, oldRemainingPath, linkDest); err != nil {
+					return nil, "", fmt.Errorf("walking into symlink %q failed: push symlink: %w", part, err)
+				}
+
 				// Update our logical remaining path.
 				remainingPath = linkDest + "/" + remainingPath
 				// Absolute symlinks reset any work we've already done.
@@ -175,11 +339,33 @@ func partialLookupInRoot(root *os.File, unsafePath string) (_ *os.File, _ string
 				// from the symlink stack).
 				_ = currentDir.Close()
 
+				// The part existed, so drop it from the symlink stack.
+				if err := symlinkStack.PopPart(part); err != nil {
+					return nil, "", fmt.Errorf("walking into non-directory %q failed: %w", part, err)
+				}
+
+				// If there are any remaining components in the symlink stack,
+				// we are still within a symlink resolution and thus we hit a
+				// dangling symlink. So pretend that the first symlink in the
+				// stack we hit was an ENOENT (to match openat2).
+				if oldDir, remainingPath, ok := symlinkStack.PopTopSymlink(); ok {
+					_ = nextDir.Close()
+					return oldDir, remainingPath, nil
+				}
+
 				// The current component exists, so return it.
 				return nextDir, remainingPath, nil
 			}
 
 		case errors.Is(err, os.ErrNotExist):
+			// If there are any remaining components in the symlink stack, we
+			// are still within a symlink resolution and thus we hit a dangling
+			// symlink. So pretend that the first symlink in the stack we hit
+			// was an ENOENT (to match openat2).
+			if oldDir, remainingPath, ok := symlinkStack.PopTopSymlink(); ok {
+				_ = currentDir.Close()
+				return oldDir, remainingPath, nil
+			}
 			// We have hit a final component that doesn't exist, so we have our
 			// partial open result. Note that we have to use the OLD remaining
 			// path, since the lookup failed.

From 0355f24972b7d2c3abe9519b512c66ea3ea200ef Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 28 Jun 2024 05:28:19 +1000
Subject: [PATCH 06/11] mkdirall: add tests for MkdirAll and MkdirAllHandle

These are primarily correctness tests (81.3% code coverage), to make
sure we handle all of the ugly corner-cases with nested and dangling
symlinks correctly.

Almost all of the untested code is related to errors (most of which are
annoying to mock). The one set of error paths it might be worth adding
errors for is the isDeadInode() checks.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 go.mod              |  11 +-
 go.sum              |  10 ++
 mkdir_linux_test.go | 242 ++++++++++++++++++++++++++++++++++++++++++++
 openat2_linux.go    |   6 ++
 util_linux_test.go  |  80 +++++++++++++++
 5 files changed, 348 insertions(+), 1 deletion(-)
 create mode 100644 mkdir_linux_test.go
 create mode 100644 util_linux_test.go

diff --git a/go.mod b/go.mod
index 45a25ec..623b38e 100644
--- a/go.mod
+++ b/go.mod
@@ -2,4 +2,13 @@ module github.com/cyphar/filepath-securejoin
 
 go 1.20
 
-require golang.org/x/sys v0.21.0 // indirect
+require (
+	github.com/stretchr/testify v1.9.0
+	golang.org/x/sys v0.21.0
+)
+
+require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+)
diff --git a/go.sum b/go.sum
index ac7fb31..9058965 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,12 @@
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
 golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/mkdir_linux_test.go b/mkdir_linux_test.go
new file mode 100644
index 0000000..c2afd09
--- /dev/null
+++ b/mkdir_linux_test.go
@@ -0,0 +1,242 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/sys/unix"
+)
+
+func testMkdirAll_Basic(t *testing.T, mkdirAll func(t *testing.T, root, unsafePath string, mode int) error) {
+	// We create a new tree for each test, but the template is the same.
+	tree := []string{
+		"dir a",
+		"dir b/c/d/e/f",
+		"file b/c/file",
+		"symlink e /b/c/d/e",
+		"symlink b-file b/c/file",
+		// Dangling symlinks.
+		"symlink a-fake1 a/fake",
+		"symlink a-fake2 a/fake/foo/bar/..",
+		"symlink a-fake3 a/fake/../../b",
+		// Test non-lexical symlinks.
+		"dir target",
+		"dir link1",
+		"symlink link1/target_abs /target",
+		"symlink link1/target_rel ../target",
+		"dir link2",
+		"symlink link2/link1_abs /link1",
+		"symlink link2/link1_rel ../link1",
+		"dir link3",
+		"symlink link3/target_abs /link2/link1_rel/target_rel",
+		"symlink link3/target_rel ../link2/link1_rel/target_rel",
+		"symlink link3/deep_dangling1 ../link2/link1_rel/target_rel/nonexist",
+		"symlink link3/deep_dangling2 ../link2/link1_rel/target_rel/nonexist",
+		// Symlink loop.
+		"dir loop",
+		"symlink loop/link ../loop/link",
+	}
+
+	withWithoutOpenat2(t, func(t *testing.T) {
+		for name, test := range map[string]struct {
+			unsafePath  string
+			expectedErr error
+		}{
+			"existing":              {unsafePath: "a"},
+			"basic":                 {unsafePath: "a/b/c/d/e/f/g/h/i/j"},
+			"dotdot-in-nonexisting": {unsafePath: "a/b/c/d/e/f/g/h/i/j/k/../lmnop", expectedErr: unix.ENOENT},
+			"dotdot-in-existing":    {unsafePath: "b/c/../c/./d/e/f/g/h"},
+			"dotdot-after-symlink":  {unsafePath: "e/../dd/ee/ff"},
+			// Check that trying to create under a file fails.
+			"nondir-trailing":         {unsafePath: "b/c/file", expectedErr: unix.ENOTDIR},
+			"nondir-dotdot":           {unsafePath: "b/c/file/../d", expectedErr: unix.ENOTDIR},
+			"nondir-subdir":           {unsafePath: "b/c/file/subdir", expectedErr: unix.ENOTDIR},
+			"nondir-symlink-trailing": {unsafePath: "b-file", expectedErr: unix.ENOTDIR},
+			"nondir-symlink-dotdot":   {unsafePath: "b-file/../d", expectedErr: unix.ENOTDIR},
+			"nondir-symlink-subdir":   {unsafePath: "b-file/subdir", expectedErr: unix.ENOTDIR},
+			// Dangling symlinks are not followed.
+			"dangling1-trailing": {unsafePath: "a-fake1", expectedErr: unix.EEXIST},
+			"dangling1-basic":    {unsafePath: "a-fake1/foo", expectedErr: unix.EEXIST},
+			"dangling1-dotdot":   {unsafePath: "a-fake1/../bar/baz", expectedErr: unix.ENOENT},
+			"dangling2-trailing": {unsafePath: "a-fake2", expectedErr: unix.EEXIST},
+			"dangling2-basic":    {unsafePath: "a-fake2/foo", expectedErr: unix.EEXIST},
+			"dangling2-dotdot":   {unsafePath: "a-fake2/../bar/baz", expectedErr: unix.ENOENT},
+			"dangling3-trailing": {unsafePath: "a-fake3", expectedErr: unix.EEXIST},
+			"dangling3-basic":    {unsafePath: "a-fake3/foo", expectedErr: unix.EEXIST},
+			"dangling3-dotdot":   {unsafePath: "a-fake3/../bar/baz", expectedErr: unix.ENOENT},
+			// Non-lexical symlinks should work.
+			"nonlexical-basic":           {unsafePath: "target/foo"},
+			"nonlexical-level1-abs":      {unsafePath: "link1/target_abs/foo"},
+			"nonlexical-level1-rel":      {unsafePath: "link1/target_rel/foo"},
+			"nonlexical-level2-abs-abs":  {unsafePath: "link2/link1_abs/target_abs/foo"},
+			"nonlexical-level2-abs-rel":  {unsafePath: "link2/link1_abs/target_rel/foo"},
+			"nonlexical-level2-abs-open": {unsafePath: "link2/link1_abs/../target/foo"},
+			"nonlexical-level2-rel-abs":  {unsafePath: "link2/link1_rel/target_abs/foo"},
+			"nonlexical-level2-rel-rel":  {unsafePath: "link2/link1_rel/target_rel/foo"},
+			"nonlexical-level2-rel-open": {unsafePath: "link2/link1_rel/../target/foo"},
+			"nonlexical-level3-abs":      {unsafePath: "link3/target_abs/foo"},
+			"nonlexical-level3-rel":      {unsafePath: "link3/target_rel/foo"},
+			// But really tricky dangling symlinks should fail.
+			"dangling-tricky1-trailing": {unsafePath: "link3/deep_dangling1", expectedErr: unix.EEXIST},
+			"dangling-tricky1-basic":    {unsafePath: "link3/deep_dangling1/foo", expectedErr: unix.EEXIST},
+			"dangling-tricky1-dotdot":   {unsafePath: "link3/deep_dangling1/../bar", expectedErr: unix.ENOENT},
+			"dangling-tricky2-trailing": {unsafePath: "link3/deep_dangling2", expectedErr: unix.EEXIST},
+			"dangling-tricky2-basic":    {unsafePath: "link3/deep_dangling2/foo", expectedErr: unix.EEXIST},
+			"dangling-tricky2-dotdot":   {unsafePath: "link3/deep_dangling2/../bar", expectedErr: unix.ENOENT},
+			// And trying to mkdir inside a loop should fail.
+			"loop-trailing": {unsafePath: "loop/link", expectedErr: unix.ELOOP},
+			"loop-basic":    {unsafePath: "loop/link/foo", expectedErr: unix.ELOOP},
+			"loop-dotdot":   {unsafePath: "loop/link/../foo", expectedErr: unix.ELOOP},
+		} {
+			test := test // copy iterator
+			t.Run(name, func(t *testing.T) {
+				root := createTree(t, tree...)
+
+				rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+				require.NoError(t, err)
+				defer rootDir.Close()
+
+				// Before trying to make the tree, figure out what
+				// components don't exist yet so we can check them later.
+				handle, remainingPath, err := partialLookupInRoot(rootDir, test.unsafePath)
+				handleName := "<nil>"
+				if handle != nil {
+					handleName = handle.Name()
+					defer handle.Close()
+				}
+				defer func() {
+					if t.Failed() {
+						t.Logf("partialLookupInRoot(%s, %s) -> (<%s>, %s, %v)", root, test.unsafePath, handleName, remainingPath, err)
+					}
+				}()
+
+				// This mode is different to the one set up by createTree.
+				const expectedMode = 0o711
+
+				// Actually make the tree.
+				err = mkdirAll(t, root, test.unsafePath, 0o711)
+				assert.ErrorIsf(t, err, test.expectedErr, "MkdirAll(%q, %q)", root, test.unsafePath)
+
+				remainingPath = filepath.Join("/", remainingPath)
+				for remainingPath != filepath.Dir(remainingPath) {
+					stat, err := fstatatFile(handle, "./"+remainingPath, unix.AT_SYMLINK_NOFOLLOW)
+					if test.expectedErr == nil {
+						// Check that the new components have the right
+						// mode.
+						if assert.NoErrorf(t, err, "unexpected error when checking new directory %q", remainingPath) {
+							assert.Equalf(t, uint32(unix.S_IFDIR|expectedMode), stat.Mode, "new directory %q has the wrong mode", remainingPath)
+						}
+					} else {
+						// Check that none of the components are
+						// directories (i.e. make sure that the MkdirAll
+						// was a no-op).
+						if err == nil {
+							assert.NotEqualf(t, uint32(unix.S_IFDIR), stat.Mode&unix.S_IFMT, "failed MkdirAll created a new directory at %q", remainingPath)
+						}
+					}
+					// Jump up a level.
+					remainingPath = filepath.Dir(remainingPath)
+				}
+			})
+		}
+	})
+}
+
+func TestMkdirAll_Basic(t *testing.T) {
+	testMkdirAll_Basic(t, func(t *testing.T, root, unsafePath string, mode int) error {
+		// We can't check expectedPath here.
+		return MkdirAll(root, unsafePath, mode)
+	})
+}
+
+func TestMkdirAllHandle_Basic(t *testing.T) {
+	testMkdirAll_Basic(t, func(t *testing.T, root, unsafePath string, mode int) error {
+		// Same logic as MkdirAll.
+		rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+		if err != nil {
+			return err
+		}
+		defer rootDir.Close()
+		handle, err := MkdirAllHandle(rootDir, unsafePath, mode)
+		if err != nil {
+			return err
+		}
+		defer handle.Close()
+
+		// We can use SecureJoin here becuase we aren't being attacked in this
+		// particular test. Obviously this check is bogus for actual programs.
+		expectedPath, err := SecureJoin(root, unsafePath)
+		require.NoError(t, err)
+
+		// Now double-check that the handle is correct.
+		gotPath, err := procSelfFdReadlink(handle)
+		require.NoError(t, err, "get real path of returned handle")
+		assert.Equal(t, expectedPath, gotPath, "wrong final path from MkdirAllHandle")
+		// Also check that the f.Name() is correct while we're at it (this is
+		// not always guaranteed but it's better to try at least).
+		assert.Equal(t, expectedPath, handle.Name(), "handle from MkdirAllHandle has the wrong .Name()")
+		return nil
+	})
+}
+
+func testMkdirAll_InvalidMode(t *testing.T, mkdirAll func(t *testing.T, root, unsafePath string, mode int) error) {
+	for _, test := range []struct {
+		mode        int
+		expectedErr error
+	}{
+		// os.FileMode bits are invalid.
+		{int(os.ModeDir | 0o777), errInvalidMode},
+		{int(os.ModeSticky | 0o777), errInvalidMode},
+		{int(os.ModeIrregular | 0o777), errInvalidMode},
+		// unix.S_IFMT bits are also invalid.
+		{unix.S_IFDIR | 0o777, errInvalidMode},
+		{unix.S_IFREG | 0o777, errInvalidMode},
+		{unix.S_IFIFO | 0o777, errInvalidMode},
+		// suid/sgid bits are valid but you get an error because they don't get
+		// applied by mkdirat.
+		// TODO: Figure out if we want to allow this.
+		{unix.S_ISUID | 0o777, errPossibleAttack},
+		{unix.S_ISGID | 0o777, errPossibleAttack},
+		{unix.S_ISUID | unix.S_ISGID | unix.S_ISVTX | 0o777, errPossibleAttack},
+		// Proper sticky bit should work.
+		{unix.S_ISVTX | 0o777, nil},
+		// Regular mode bits.
+		{0o777, nil},
+		{0o711, nil},
+	} {
+		root := t.TempDir()
+		err := mkdirAll(t, root, "a/b/c", test.mode)
+		assert.ErrorIsf(t, err, test.expectedErr, "mkdirall 0o%.3o", test.mode)
+	}
+}
+
+func TestMkdirAll_InvalidMode(t *testing.T) {
+	testMkdirAll_InvalidMode(t, func(t *testing.T, root, unsafePath string, mode int) error {
+		return MkdirAll(root, unsafePath, mode)
+	})
+}
+
+func TestMkdirAllHandle_InvalidMode(t *testing.T) {
+	testMkdirAll_InvalidMode(t, func(t *testing.T, root, unsafePath string, mode int) error {
+		rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+		if err != nil {
+			return err
+		}
+		defer rootDir.Close()
+		handle, err := MkdirAllHandle(rootDir, unsafePath, mode)
+		if err != nil {
+			return err
+		}
+		_ = handle.Close()
+		return nil
+	})
+}
diff --git a/openat2_linux.go b/openat2_linux.go
index 173a784..fc93db8 100644
--- a/openat2_linux.go
+++ b/openat2_linux.go
@@ -13,6 +13,7 @@ import (
 	"path/filepath"
 	"strings"
 	"sync"
+	"testing"
 
 	"golang.org/x/sys/unix"
 )
@@ -20,9 +21,14 @@ import (
 var (
 	hasOpenat2Bool bool
 	hasOpenat2Once sync.Once
+
+	testingForceHasOpenat2 *bool
 )
 
 func hasOpenat2() bool {
+	if testing.Testing() && testingForceHasOpenat2 != nil {
+		return *testingForceHasOpenat2
+	}
 	hasOpenat2Once.Do(func() {
 		fd, err := unix.Openat2(unix.AT_FDCWD, ".", &unix.OpenHow{
 			Flags:   unix.O_PATH | unix.O_CLOEXEC,
diff --git a/util_linux_test.go b/util_linux_test.go
new file mode 100644
index 0000000..b50fc4a
--- /dev/null
+++ b/util_linux_test.go
@@ -0,0 +1,80 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func withWithoutOpenat2(t *testing.T, testFn func(t *testing.T)) {
+	t.Run("openat2=auto", testFn)
+
+	for _, useOpenat2 := range []bool{true, false} {
+		useOpenat2 := useOpenat2 // copy iterator
+		t.Run(fmt.Sprintf("openat2=%v", useOpenat2), func(t *testing.T) {
+			if useOpenat2 && !hasOpenat2() {
+				t.Skip("no openat2 support")
+			}
+			testingForceHasOpenat2 = &useOpenat2
+			defer func() { testingForceHasOpenat2 = nil }()
+
+			testFn(t)
+		})
+	}
+}
+
+// Format:
+//
+//	dir <name>
+//	file <name> <?content>
+//	symlink <name> <target>
+func createInTree(t *testing.T, root, spec string) {
+	f := strings.Fields(spec)
+	if len(f) < 2 {
+		t.Fatalf("invalid spec %q", spec)
+	}
+	inoType, subPath, f := f[0], f[1], f[2:]
+	fullPath := filepath.Join(root, subPath)
+	switch inoType {
+	case "dir":
+		err := os.MkdirAll(fullPath, 0o755)
+		require.NoError(t, err)
+	case "file":
+		var contents []byte
+		if len(f) >= 1 {
+			contents = []byte(f[0])
+		}
+		err := os.WriteFile(fullPath, contents, 0o644)
+		require.NoError(t, err)
+	case "symlink":
+		if len(f) < 1 {
+			t.Fatalf("invalid spec %q", spec)
+		}
+		target := f[0]
+		err := os.Symlink(target, fullPath)
+		require.NoError(t, err)
+	}
+}
+
+func createTree(t *testing.T, specs ...string) string {
+	root := t.TempDir()
+
+	// Put the root in a subdir.
+	treeRoot := filepath.Join(root, "tree")
+	os.MkdirAll(treeRoot, 0o755)
+
+	for _, spec := range specs {
+		createInTree(t, treeRoot, spec)
+	}
+	return treeRoot
+}

From 4f530fd438160e9aa1f135321b6992579c1292e8 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 26 Jun 2024 19:44:38 +1000
Subject: [PATCH 07/11] procfs: harden proc handle using fsopen() and
 open_tree()

Because we depend on procfs to be correct when operating on other
filesystems, having a safe procfs handle is vital. Unfortunately, if we
are an administrative program running inside a container that can modify
its mount configuration, our current checks are not sufficient to avoid
being tricked into thinking a path is real.

Luckily, with fsopen() and open_tree() it is possible to create a
completely private procfs instance that an attacker cannot modify. Note
that while they are both safe, they are safe for different reasons:

 1. fsopen() is safe because the created mount is completely separate to
    any other procfs mount, so any changes to mounts on the host are
    irrelevant. fsopen() can fail if we trip the mnt_too_revealing()
    check, so we may have to fall back to open_tree() in some cases.

 2. open_tree() creates a clone of a snapshot of the mount tree (or just
    the top mount if can avoid using AT_RECURSIVE, but
    mnt_too_revealing() may force us to use AT_RECURSIVE). While the
    tree we clone might have been messed with by an attacker, after
    cloning there is no way for the attacker to affect our clone (even
    mount propagation won't propagate into a clone[1]).

    The only risk is whether there are over-mounts with AT_RECURSIVE.
    Because anonymous mounts don't show up in mountinfo, the best we can
    do is check the mount id through statx to see whether a target has
    an overmount. We only do this for symlink operations (in particular,
    readlink) because openat2 handles the non-magiclink case for us and
    statx gained STATX_MNT_ID support in Linux 5.8.

    We could do this check only for the open_tree(AT_RECURSIVE) case,
    but it's simpler to always do it (of course, an attacker could add
    the mount after the check if we use the host /proc the "standard"
    way, but at least there's a chance we'll detect it).

    listmounts(2) would let us detect any overmounts at creation time,
    but it's not clear whether you want to error out if there is a mount
    over a path you never use (lxcfs has overmounts of /proc/cpuinfo and
    /proc/meminfo, which we never use).

Unfortunately, we can only use these when running as a privileged user.
In theory we could create a user namespace to gain the necessary
privileges to create these handles, but this would require spawning a
proper subprocess (CLONE_NEWUSER must be done in a single-threaded
program, which is not possible in Go without using Go's ForkExec) which
won't always work when filepath-securejoin is used as a library. It's
also far too complicated to deal with in practice.

This is based on similar logic I'm working on for libpathrs.

[1]: This is true since at least Linux 5.12. See commit ee2e3f50629f
     ("mount: fix mounting of detached mounts onto targets that reside
     on shared mounts").

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 procfs_linux.go | 227 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 219 insertions(+), 8 deletions(-)

diff --git a/procfs_linux.go b/procfs_linux.go
index 8588a96..22b8dd6 100644
--- a/procfs_linux.go
+++ b/procfs_linux.go
@@ -54,6 +54,124 @@ func verifyProcRoot(procRoot *os.File) error {
 	return nil
 }
 
+var (
+	hasNewMountApiBool bool
+	hasNewMountApiOnce sync.Once
+)
+
+func hasNewMountApi() bool {
+	hasNewMountApiOnce.Do(func() {
+		// All of the pieces of the new mount API we use (fsopen, fsconfig,
+		// fsmount, open_tree) were added together in Linux 5.1[1,2], so we can
+		// just check for one of the syscalls and the others should also be
+		// available.
+		//
+		// Just try to use open_tree(2) to open a file without OPEN_TREE_CLONE.
+		// This is equivalent to openat(2), but tells us if open_tree is
+		// available (and thus all of the other basic new mount API syscalls).
+		// open_tree(2) is most light-weight syscall to test here.
+		//
+		// [1]: merge commit 400913252d09
+		// [2]: <https://lore.kernel.org/lkml/153754740781.17872.7869536526927736855.stgit@warthog.procyon.org.uk/>
+		fd, err := unix.OpenTree(-int(unix.EBADF), "/", unix.OPEN_TREE_CLOEXEC)
+		if err == nil {
+			hasNewMountApiBool = true
+			_ = unix.Close(fd)
+		}
+	})
+	return hasNewMountApiBool
+}
+
+func fsopen(fsName string, flags int) (*os.File, error) {
+	// Make sure we always set O_CLOEXEC.
+	flags |= unix.FSOPEN_CLOEXEC
+	fd, err := unix.Fsopen(fsName, flags)
+	if err != nil {
+		return nil, os.NewSyscallError("fsopen "+fsName, err)
+	}
+	return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
+}
+
+func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
+	// Make sure we always set O_CLOEXEC.
+	flags |= unix.FSMOUNT_CLOEXEC
+	fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
+	if err != nil {
+		return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
+	}
+	return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
+}
+
+func newPrivateProcMount() (*os.File, error) {
+	procfsCtx, err := fsopen("proc", unix.FSOPEN_CLOEXEC)
+	if err != nil {
+		return nil, err
+	}
+	defer procfsCtx.Close()
+
+	// Try to configure hidepid=ptraceable,subset=pid if possible, but ignore errors.
+	_ = unix.FsconfigSetString(int(procfsCtx.Fd()), "hidepid", "ptraceable")
+	_ = unix.FsconfigSetString(int(procfsCtx.Fd()), "subset", "pid")
+
+	// Get an actual handle.
+	if err := unix.FsconfigCreate(int(procfsCtx.Fd())); err != nil {
+		return nil, os.NewSyscallError("fsconfig create procfs", err)
+	}
+	return fsmount(procfsCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID)
+}
+
+func openTree(dir *os.File, path string, flags uint) (*os.File, error) {
+	dirFd := -int(unix.EBADF)
+	dirName := "."
+	if dir != nil {
+		dirFd = int(dir.Fd())
+		dirName = dir.Name()
+	}
+	// Make sure we always set O_CLOEXEC.
+	flags |= unix.OPEN_TREE_CLOEXEC
+	fd, err := unix.OpenTree(dirFd, path, flags)
+	if err != nil {
+		return nil, &os.PathError{Op: "open_tree", Path: path, Err: err}
+	}
+	return os.NewFile(uintptr(fd), dirName+"/"+path), nil
+}
+
+func clonePrivateProcMount() (_ *os.File, Err error) {
+	// Try to make a clone without using AT_RECURSIVE if we can. If this works,
+	// we can be sure there are no over-mounts and so if the root is valid then
+	// we're golden. Otherwise, we have to deal with over-mounts.
+	procfsHandle, err := openTree(nil, "/proc", unix.OPEN_TREE_CLONE)
+	if err != nil {
+		procfsHandle, err = openTree(nil, "/proc", unix.OPEN_TREE_CLONE|unix.AT_RECURSIVE)
+	}
+	if err != nil {
+		return nil, fmt.Errorf("creating a detached procfs clone: %w", err)
+	}
+	defer func() {
+		if Err != nil {
+			_ = procfsHandle.Close()
+		}
+	}()
+	if err := verifyProcRoot(procfsHandle); err != nil {
+		return nil, err
+	}
+	return procfsHandle, nil
+}
+
+func privateProcRoot() (*os.File, error) {
+	if !hasNewMountApi() {
+		return nil, fmt.Errorf("new mount api: %w", unix.ENOTSUP)
+	}
+	// Try to create a new procfs mount from scratch if we can. This ensures we
+	// can get a procfs mount even if /proc is fake (for whatever reason).
+	procRoot, err := newPrivateProcMount()
+	if err != nil {
+		// Try to clone /proc then...
+		procRoot, err = clonePrivateProcMount()
+	}
+	return procRoot, err
+}
+
 var (
 	procRootHandle *os.File
 	procRootError  error
@@ -62,10 +180,7 @@ var (
 	errUnsafeProcfs = errors.New("unsafe procfs detected")
 )
 
-func doGetProcRoot() (_ *os.File, Err error) {
-	// TODO: Use fsopen or open_tree to get a safe handle that cannot be
-	// over-mounted and we can absolutely verify.
-
+func unsafeHostProcRoot() (_ *os.File, Err error) {
 	procRoot, err := os.OpenFile("/proc", unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
 	if err != nil {
 		return nil, err
@@ -81,6 +196,17 @@ func doGetProcRoot() (_ *os.File, Err error) {
 	return procRoot, nil
 }
 
+func doGetProcRoot() (*os.File, error) {
+	procRoot, err := privateProcRoot()
+	if err != nil {
+		// Fall back to using a /proc handle if making a private mount failed.
+		// If we have openat2, at least we can avoid some kinds of over-mount
+		// attacks, but without openat2 there's not much we can do.
+		procRoot, err = unsafeHostProcRoot()
+	}
+	return procRoot, err
+}
+
 func getProcRoot() (*os.File, error) {
 	procRootOnce.Do(func() {
 		procRootHandle, procRootError = doGetProcRoot()
@@ -172,6 +298,11 @@ func procThreadSelf(subpath string) (_ *os.File, _ procThreadSelfCloser, Err err
 		if err != nil {
 			return nil, nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
 		}
+		defer func() {
+			if Err != nil {
+				_ = handle.Close()
+			}
+		}()
 		// We can't detect bind-mounts of different parts of procfs on top of
 		// /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we
 		// aren't on the wrong filesystem here.
@@ -184,16 +315,96 @@ func procThreadSelf(subpath string) (_ *os.File, _ procThreadSelfCloser, Err err
 	return handle, runtime.UnlockOSThread, nil
 }
 
+var (
+	hasStatxMountIdBool bool
+	hasStatxMountIdOnce sync.Once
+)
+
+func hasStatxMountId() bool {
+	hasStatxMountIdOnce.Do(func() {
+		var (
+			stx unix.Statx_t
+			// We don't care which mount ID we get. The kernel will give us the
+			// unique one if it is supported.
+			wantStxMask uint32 = unix.STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID
+		)
+		err := unix.Statx(-int(unix.EBADF), "/", 0, int(wantStxMask), &stx)
+		hasStatxMountIdBool = (err == nil && (stx.Mask&wantStxMask != 0))
+	})
+	return hasStatxMountIdBool
+}
+
+func checkSymlinkOvermount(dir *os.File, path string) error {
+	// If we don't have statx(STATX_MNT_ID*) support, we can't do anything.
+	if !hasStatxMountId() {
+		return nil
+	}
+
+	var (
+		stx unix.Statx_t
+		// We don't care which mount ID we get. The kernel will give us the
+		// unique one if it is supported.
+		wantStxMask uint32 = unix.STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID
+	)
+
+	// Get the mntId of our procfs handle.
+	err := unix.Statx(int(dir.Fd()), "", unix.AT_EMPTY_PATH, int(wantStxMask), &stx)
+	if err != nil {
+		return &os.PathError{Op: "statx", Path: dir.Name(), Err: err}
+	}
+	if stx.Mask&wantStxMask == 0 {
+		// It's not a kernel limitation, for some reason we couldn't get a
+		// mount ID. Assume it's some kind of attack.
+		return fmt.Errorf("%w: could not get mnt id of dir %s", errUnsafeProcfs, dir.Name())
+	}
+	expectedMountId := stx.Mnt_id
+
+	// Get the mntId of the target symlink.
+	stx = unix.Statx_t{}
+	err = unix.Statx(int(dir.Fd()), path, unix.AT_SYMLINK_NOFOLLOW, int(wantStxMask), &stx)
+	if err != nil {
+		return &os.PathError{Op: "statx", Path: dir.Name() + "/" + path, Err: err}
+	}
+	if stx.Mask&wantStxMask == 0 {
+		// It's not a kernel limitation, for some reason we couldn't get a
+		// mount ID. Assume it's some kind of attack.
+		return fmt.Errorf("%w: could not get mnt id of symlink %s", errUnsafeProcfs, path)
+	}
+	gotMountId := stx.Mnt_id
+
+	// As long as the directory mount is alive, even with wrapping mount IDs,
+	// we would expect to see a different mount ID here. (Of course, if we're
+	// using unsafeHostProcRoot() then an attaker could change this after we
+	// did this check.)
+	if expectedMountId != gotMountId {
+		return fmt.Errorf("%w: symlink %s/%s has an overmount obscuring the real link (mount ids do not match %d != %d)", errUnsafeProcfs, dir.Name(), path, expectedMountId, gotMountId)
+	}
+	return nil
+}
+
 func rawProcSelfFdReadlink(fd int) (string, error) {
 	procSelfFd, closer, err := procThreadSelf("fd/")
 	if err != nil {
 		return "", fmt.Errorf("get safe /proc/thread-self/fd handle: %w", err)
 	}
+	defer procSelfFd.Close()
 	defer closer()
-	// NOTE: It is possible for an attacker to bind-mount on top of the
-	// /proc/self/fd/... symlink, and there is currently no way for us to
-	// detect this. So we just have to assume that hasn't happened...
-	return readlinkatFile(procSelfFd, strconv.Itoa(fd))
+
+	// Try to detect if there is a mount on top of the symlink we are about to
+	// read. If we are using unsafeHostProcRoot(), this could change after we
+	// check it (and there's nothing we can do about that) but for
+	// privateProcRoot() this should be guaranteed to be safe (at least since
+	// Linux 5.12[1], when anonymous mount namespaces were completely isolated
+	// from external mounts including mount propagation events).
+	//
+	// [1]: Linux commit ee2e3f50629f ("mount: fix mounting of detached mounts
+	// onto targets that reside on shared mounts").
+	fdStr := strconv.Itoa(fd)
+	if err := checkSymlinkOvermount(procSelfFd, fdStr); err != nil {
+		return "", fmt.Errorf("check safety of fd %d proc magiclink: %w", fd, err)
+	}
+	// Get the contents of the link.
+	return readlinkatFile(procSelfFd, fdStr)
 }
 
 func procSelfFdReadlink(f *os.File) (string, error) {

From 45ab189c15dc2920d84dcad7b2a3584561678cbc Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 28 Jun 2024 16:51:48 +1000
Subject: [PATCH 08/11] procfs: add tests for our safe /proc helpers

This ensures we test all of the fallbacks even if we don't use them in
practice when doing MkdirAll in our test suite. In addition, add some
hooks to allow the tests to force the usage of fallbacks when using the
standard getProcRoot() function everything else uses. This raises the
test coverage to 87.1%.

In terms of correctness tests, there are a couple of extra tests it's
probably worth adding (see procfs_linux_test.go for a list).

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .github/workflows/ci.yml |   4 +
 procfs_linux.go          |  34 ++--
 procfs_linux_test.go     | 399 +++++++++++++++++++++++++++++++++++++++
 testing_mocks_linux.go   |  68 +++++++
 util_linux_test.go       |  46 +++++
 5 files changed, 537 insertions(+), 14 deletions(-)
 create mode 100644 procfs_linux_test.go
 create mode 100644 testing_mocks_linux.go

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 67a117b..1e819e3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -35,3 +35,7 @@ jobs:
           go-version: ${{ matrix.go-version }}
       - name: unit tests
         run: go test -v -cover ./...
+      # TODO: Merge the code coverage stats from both runs...
+      - name: unit tests (root)
+        if: ${{ ! startsWith(matrix.os, 'windows-') }}
+        run: sudo go test -v -cover ./...
diff --git a/procfs_linux.go b/procfs_linux.go
index 22b8dd6..48419e2 100644
--- a/procfs_linux.go
+++ b/procfs_linux.go
@@ -141,7 +141,7 @@ func clonePrivateProcMount() (_ *os.File, Err error) {
 	// we can be sure there are no over-mounts and so if the root is valid then
 	// we're golden. Otherwise, we have to deal with over-mounts.
 	procfsHandle, err := openTree(nil, "/proc", unix.OPEN_TREE_CLONE)
-	if err != nil {
+	if err != nil || testingForcePrivateProcRootOpenTreeAtRecursive(procfsHandle) {
 		procfsHandle, err = openTree(nil, "/proc", unix.OPEN_TREE_CLONE|unix.AT_RECURSIVE)
 	}
 	if err != nil {
@@ -165,7 +165,7 @@ func privateProcRoot() (*os.File, error) {
 	// Try to create a new procfs mount from scratch if we can. This ensures we
 	// can get a procfs mount even if /proc is fake (for whatever reason).
 	procRoot, err := newPrivateProcMount()
-	if err != nil {
+	if err != nil || testingForcePrivateProcRootOpenTree(procRoot) {
 		// Try to clone /proc then...
 		procRoot, err = clonePrivateProcMount()
 	}
@@ -198,7 +198,7 @@ func unsafeHostProcRoot() (_ *os.File, Err error) {
 
 func doGetProcRoot() (*os.File, error) {
 	procRoot, err := privateProcRoot()
-	if err != nil {
+	if err != nil || testingForceGetProcRootUnsafe(procRoot) {
 		// Fall back to using a /proc handle if making a private mount failed.
 		// If we have openat2, at least we can avoid some kinds of over-mount
 		// attacks, but without openat2 there's not much we can do.
@@ -229,12 +229,7 @@ type procThreadSelfCloser func()
 //
 // This is similar to ProcThreadSelf from runc, but with extra hardening
 // applied and using *os.File.
-func procThreadSelf(subpath string) (_ *os.File, _ procThreadSelfCloser, Err error) {
-	procRoot, err := getProcRoot()
-	if err != nil {
-		return nil, nil, err
-	}
-
+func procThreadSelf(procRoot *os.File, subpath string) (_ *os.File, _ procThreadSelfCloser, Err error) {
 	haveProcThreadSelfOnce.Do(func() {
 		// If the kernel doesn't support thread-self, it doesn't matter which
 		// /proc handle we use.
@@ -256,10 +251,10 @@ func procThreadSelf(subpath string) (_ *os.File, _ procThreadSelfCloser, Err err
 
 	// Figure out what prefix we want to use.
 	threadSelf := "thread-self/"
-	if !haveProcThreadSelf {
+	if !haveProcThreadSelf || testingForceProcSelfTask() {
 		/// Pre-3.17 kernels don't have /proc/thread-self, so do it manually.
 		threadSelf = "self/task/" + strconv.Itoa(unix.Gettid()) + "/"
-		if _, err := fstatatFile(procRoot, threadSelf, unix.AT_SYMLINK_NOFOLLOW); err != nil {
+		if _, err := fstatatFile(procRoot, threadSelf, unix.AT_SYMLINK_NOFOLLOW); err != nil || testingForceProcSelf() {
 			// In this case, we running in a pid namespace that doesn't match
 			// the /proc mount we have. This can happen inside runc.
 			//
@@ -271,7 +266,10 @@ func procThreadSelf(subpath string) (_ *os.File, _ procThreadSelfCloser, Err err
 	}
 
 	// Grab the handle.
-	var handle *os.File
+	var (
+		handle *os.File
+		err    error
+	)
 	if hasOpenat2() {
 		// We prefer being able to use RESOLVE_NO_XDEV if we can, to be
 		// absolutely sure we are operating on a clean /proc handle that
@@ -382,8 +380,8 @@ func checkSymlinkOvermount(dir *os.File, path string) error {
 	return nil
 }
 
-func rawProcSelfFdReadlink(fd int) (string, error) {
-	procSelfFd, closer, err := procThreadSelf("fd/")
+func doRawProcSelfFdReadlink(procRoot *os.File, fd int) (string, error) {
+	procSelfFd, closer, err := procThreadSelf(procRoot, "fd/")
 	if err != nil {
 		return "", fmt.Errorf("get safe /proc/thread-self/fd handle: %w", err)
 	}
@@ -407,6 +405,14 @@ func rawProcSelfFdReadlink(fd int) (string, error) {
 	return readlinkatFile(procSelfFd, fdStr)
 }
 
+func rawProcSelfFdReadlink(fd int) (string, error) {
+	procRoot, err := getProcRoot()
+	if err != nil {
+		return "", err
+	}
+	return doRawProcSelfFdReadlink(procRoot, fd)
+}
+
 func procSelfFdReadlink(f *os.File) (string, error) {
 	return rawProcSelfFdReadlink(int(f.Fd()))
 }
diff --git a/procfs_linux_test.go b/procfs_linux_test.go
new file mode 100644
index 0000000..57a37d0
--- /dev/null
+++ b/procfs_linux_test.go
@@ -0,0 +1,399 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"fmt"
+	"os"
+	"path"
+	"runtime"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/sys/unix"
+)
+
+func doMount(t *testing.T, source, target, fsType string, flags uintptr) {
+	var sourcePath string
+	if source != "" {
+		// In order to be able to bind-mount a symlink source we need to
+		// bind-mount using an O_PATH|O_NOFOLLOW of the source.
+		file, err := os.OpenFile(source, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+		require.NoError(t, err)
+		defer runtime.KeepAlive(file)
+		defer file.Close()
+		sourcePath = fmt.Sprintf("/proc/self/fd/%d", file.Fd())
+	}
+
+	var targetPath string
+	if target != "" {
+		// In order to be able to mount on top of symlinks we need to
+		// bind-mount through an O_PATH|O_NOFOLLOW of the target.
+		file, err := os.OpenFile(target, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+		require.NoError(t, err)
+		defer runtime.KeepAlive(file)
+		defer file.Close()
+		targetPath = fmt.Sprintf("/proc/self/fd/%d", file.Fd())
+	}
+
+	err := unix.Mount(sourcePath, targetPath, fsType, flags, "")
+	require.NoErrorf(t, err, "mount(%s<%s>, %s<%s>, %s, 0x%x)", sourcePath, source, targetPath, target, fsType, flags)
+}
+
+func setupMountNamespace(t *testing.T) {
+	requireRoot(t)
+
+	// Lock our thread because we need to create a custom mount namespace. Each
+	// test run is run in its own goroutine (this is not _explicitly_
+	// guaranteed by Go but t.FailNow() uses Goexit, which means it has to be
+	// true in practice) so locking the test to this thread means the other
+	// tests will run on different goroutines.
+	//
+	// There is no UnlockOSThread() here, to ensure that the Go runtime will
+	// kill this thread once this goroutine returns (ensuring no other
+	// goroutines run in this context).
+	runtime.LockOSThread()
+
+	// New mount namespace (we are multi-threaded with a shared fs so we need
+	// CLONE_FS to split us from the other threads in the Go process).
+	err := unix.Unshare(unix.CLONE_FS | unix.CLONE_NEWNS)
+	require.NoError(t, err, "new mount namespace")
+
+	// Private /.
+	err = unix.Mount("", "/", "", unix.MS_PRIVATE|unix.MS_REC, "")
+	require.NoError(t, err)
+}
+
+func testProcThreadSelf(t *testing.T, procRoot *os.File, subpath string, expectErr bool) {
+	handle, closer, err := procThreadSelf(procRoot, subpath)
+	if expectErr {
+		assert.ErrorIsf(t, err, errUnsafeProcfs, "should have detected /proc/thread-self/%s overmount", subpath)
+	} else if assert.NoErrorf(t, err, "/proc/thread-self/%s open should succeed", subpath) {
+		_ = handle.Close()
+		closer() // LockOSThread stacks, so we can call this safely.
+	}
+}
+
+type procRootFunc func() (*os.File, error)
+
+func testProcOvermountSubdir(t *testing.T, procRootFn procRootFunc, expectOvermounts bool) {
+	testForceProcThreadSelf(t, func(t *testing.T) {
+		setupMountNamespace(t)
+
+		// Create some overmounts on /proc/{thread-self/,self/}.
+		for _, procThreadSelfPath := range []string{
+			fmt.Sprintf("/proc/self/task/%d", unix.Gettid()),
+			"/proc/self",
+		} {
+			for _, mount := range []struct {
+				source, targetSubPath, fsType string
+				flags                         uintptr
+			}{
+				// A tmpfs on top of /proc/thread-self/fdinfo to check whether
+				// verifyProcRoot() works on old kernels.
+				{"", "fdinfo", "tmpfs", 0},
+				// A bind-mount of noop-write real procfs file on top of
+				// /proc/thread-self/attr/current so we can test whether
+				// verifyProcRoot() works for the file case.
+				//
+				// We don't use procThreadSelf for files in filepath-securejoin, but
+				// this is to test the runc-equivalent behaviour for when this logic is
+				// moved to libpathrs.
+				{"/proc/self/sched", "attr/current", "", unix.MS_BIND},
+				// Bind-mounts on top of symlinks should be detected by
+				// checkSymlinkOvermount.
+				{"/proc/1/fd/0", "exe", "", unix.MS_BIND},
+				{"/proc/1/exe", "fd/0", "", unix.MS_BIND},
+				// TODO: Add a test for mounting on top of /proc/self or
+				//       /proc/thread-self. This should be detected with openat2.
+			} {
+				target := path.Join(procThreadSelfPath, mount.targetSubPath)
+				doMount(t, mount.source, target, mount.fsType, mount.flags)
+			}
+		}
+
+		procRoot, err := procRootFn()
+		require.NoError(t, err)
+		defer procRoot.Close()
+
+		// We expect to always detect tmpfs overmounts if we have a /proc with
+		// overmounts.
+		detectFdinfo := expectOvermounts
+		testProcThreadSelf(t, procRoot, "fdinfo", detectFdinfo)
+		// We only expect to detect procfs bind-mounts if there are /proc
+		// overmounts and we have openat2.
+		detectAttrCurrent := expectOvermounts && hasOpenat2()
+		testProcThreadSelf(t, procRoot, "attr/current", detectAttrCurrent)
+
+		// For magic-links we expect to detect overmounts if there are any.
+		symlinkOvermountErr := errUnsafeProcfs
+		if !expectOvermounts {
+			symlinkOvermountErr = nil
+		}
+
+		// Check that /proc/self/exe and .
+		procThreadSelf, closer, err := procThreadSelf(procRoot, ".")
+		require.NoError(t, err)
+		defer procThreadSelf.Close()
+		defer closer()
+
+		// no overmount
+		err = checkSymlinkOvermount(procThreadSelf, "cwd")
+		assert.NoError(t, err, "checking /proc/self/cwd with no overmount should succeed")
+		// basic overmount
+		err = checkSymlinkOvermount(procThreadSelf, "exe")
+		assert.ErrorIs(t, err, symlinkOvermountErr, "unexpected /proc/self/exe overmount result")
+
+		// fd no overmount
+		_, err = doRawProcSelfFdReadlink(procRoot, 1)
+		assert.NoError(t, err, "checking /proc/self/fd/1 with no overmount should succeed")
+		// fd overmount
+		link, err := doRawProcSelfFdReadlink(procRoot, 0)
+		assert.ErrorIs(t, err, symlinkOvermountErr, "unexpected /proc/self/fd/0 overmount result: got link %q", link)
+	})
+}
+
+func TestProcOvermountSubdir_unsafeHostProcRoot(t *testing.T) {
+	withWithoutOpenat2(t, func(t *testing.T) {
+		// If we use the host /proc directly, we should see overmounts.
+		testProcOvermountSubdir(t, unsafeHostProcRoot, true)
+	})
+}
+
+func TestProcOvermountSubdir_newPrivateProcMount(t *testing.T) {
+	if !hasNewMountApi() {
+		t.Skip("test requires fsopen/open_tree support")
+	}
+	withWithoutOpenat2(t, func(t *testing.T) {
+		// If we create our own procfs, the overmounts shouldn't appear.
+		testProcOvermountSubdir(t, newPrivateProcMount, false)
+	})
+}
+
+func TestProcOvermountSubdir_clonePrivateProcMount(t *testing.T) {
+	if !hasNewMountApi() {
+		t.Skip("test requires fsopen/open_tree support")
+	}
+	withWithoutOpenat2(t, func(t *testing.T) {
+		// If we use open_tree(2), we don't use AT_RECURSIVE when running in
+		// this test (because the overmounts are not locked mounts) and so we
+		// don't expect to see overmounts.
+		testProcOvermountSubdir(t, clonePrivateProcMount, false)
+	})
+}
+
+func TestProcOvermountSubdir_doGetProcRoot(t *testing.T) {
+	withWithoutOpenat2(t, func(t *testing.T) {
+		// We expect to not get overmounts if we have the new mount API.
+		// FIXME: It's possible to hit overmounts if there are locked mounts
+		// and we hit the AT_RECURSIVE case...
+		testProcOvermountSubdir(t, doGetProcRoot, !hasNewMountApi())
+	})
+}
+
+func TestProcOvermountSubdir_doGetProcRoot_Mocked(t *testing.T) {
+	if !hasNewMountApi() {
+		t.Skip("test requires fsopen/open_tree support")
+	}
+	withWithoutOpenat2(t, func(t *testing.T) {
+		testForceGetProcRoot(t, func(t *testing.T, expectOvermounts bool) {
+			testProcOvermountSubdir(t, doGetProcRoot, expectOvermounts)
+		})
+	})
+}
+
+func canFsOpen() bool {
+	f, err := fsopen("tmpfs", 0)
+	if f != nil {
+		_ = f.Close()
+	}
+	return err == nil
+}
+
+func testProcOvermount(t *testing.T, procRootFn procRootFunc, privateProcMount bool) {
+	testForceProcThreadSelf(t, func(t *testing.T) {
+		for _, mount := range []struct {
+			source, fsType string
+			flags          uintptr
+		}{
+			// Try a non-procfs filesystem overmount.
+			{"", "tmpfs", 0},
+			// Try a procfs subdir overmount.
+			{"/proc/tty", "bind", unix.MS_BIND},
+		} {
+			mount := mount // copy iterator
+			t.Run("procmount="+mount.fsType, func(t *testing.T) {
+				setupMountNamespace(t)
+				doMount(t, mount.source, "/proc", mount.fsType, mount.flags)
+
+				procRoot, err := procRootFn()
+				if procRoot != nil {
+					defer procRoot.Close()
+				}
+				if privateProcMount {
+					assert.NoError(t, err, "get proc handle should succeed")
+					assert.NoError(t, verifyProcRoot(procRoot), "verify private proc mount should succeed")
+				} else {
+					if !assert.ErrorIs(t, err, errUnsafeProcfs, "get proc handle should fail") {
+						t.Logf("procRootFn() = %v, %v", procRoot, err)
+					}
+				}
+			})
+		}
+	})
+}
+
+func TestProcOvermount_unsafeHostProcRoot(t *testing.T) {
+	testProcOvermount(t, unsafeHostProcRoot, false)
+}
+
+func TestProcOvermount_clonePrivateProcMount(t *testing.T) {
+	if !hasNewMountApi() {
+		t.Skip("test requires open_tree support")
+	}
+	testProcOvermount(t, clonePrivateProcMount, false)
+}
+
+func TestProcOvermount_newPrivateProcMount(t *testing.T) {
+	if !hasNewMountApi() || !canFsOpen() {
+		t.Skip("test requires fsopen support")
+	}
+	testProcOvermount(t, newPrivateProcMount, true)
+}
+
+func TestProcOvermount_doGetProcRoot(t *testing.T) {
+	privateProcMount := canFsOpen() && !testingForcePrivateProcRootOpenTree(nil)
+	testProcOvermount(t, doGetProcRoot, privateProcMount)
+}
+
+func TestProcOvermount_doGetProcRoot_Mocked(t *testing.T) {
+	if !hasNewMountApi() {
+		t.Skip("test requires fsopen/open_tree support")
+	}
+	testForceGetProcRoot(t, func(t *testing.T, expectOvermounts bool) {
+		privateProcMount := canFsOpen() && !testingForcePrivateProcRootOpenTree(nil)
+		testProcOvermount(t, doGetProcRoot, privateProcMount)
+	})
+}
+
+func TestProcSelfFdPath(t *testing.T) {
+	testForceProcThreadSelf(t, func(t *testing.T) {
+		root := t.TempDir()
+
+		filePath := path.Join(root, "file")
+		err := unix.Mknod(filePath, unix.S_IFREG|0o644, 0)
+		require.NoError(t, err)
+
+		symPath := path.Join(root, "sym")
+		err = unix.Symlink(filePath, symPath)
+		require.NoError(t, err)
+
+		// Open through the symlink.
+		handle, err := os.Open(symPath)
+		defer handle.Close()
+
+		// The check should fail if we expect the symlink path.
+		err = checkProcSelfFdPath(symPath, handle)
+		assert.ErrorIs(t, err, errPossibleBreakout, "checkProcSelfFdPath should fail for wrong path")
+
+		// The check should fail if we expect the symlink path.
+		err = checkProcSelfFdPath(filePath, handle)
+		assert.NoError(t, err)
+	})
+}
+
+func TestProcSelfFdPath_DeadFile(t *testing.T) {
+	testForceProcThreadSelf(t, func(t *testing.T) {
+		root := t.TempDir()
+
+		fullPath := path.Join(root, "file")
+		handle, err := os.Create(fullPath)
+		require.NoError(t, err)
+		defer handle.Close()
+
+		// The path still exists.
+		err = checkProcSelfFdPath(fullPath, handle)
+		assert.NoError(t, err, "checkProcSelfFdPath should succeed with regular file")
+
+		// Delete the path.
+		err = os.Remove(fullPath)
+		require.NoError(t, err)
+
+		// The check should fail now.
+		err = checkProcSelfFdPath(fullPath, handle)
+		assert.ErrorIs(t, err, errDeletedInode, "checkProcSelfFdPath should fail after deletion")
+
+		// The check should fail even if the expected path ends with " (deleted)".
+		err = checkProcSelfFdPath(fullPath+" (deleted)", handle)
+		assert.ErrorIs(t, err, errDeletedInode, "checkProcSelfFdPath should fail after deletion even with (deleted) suffix")
+	})
+}
+
+func TestProcSelfFdPath_DeadDir(t *testing.T) {
+	testForceProcThreadSelf(t, func(t *testing.T) {
+		root := t.TempDir()
+
+		fullPath := path.Join(root, "dir")
+		err := os.Mkdir(fullPath, 0o755)
+		require.NoError(t, err)
+
+		handle, err := os.OpenFile(fullPath, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+		require.NoError(t, err)
+		defer handle.Close()
+
+		// The path still exists.
+		err = checkProcSelfFdPath(fullPath, handle)
+		assert.NoError(t, err, "checkProcSelfFdPath should succeed with regular directory")
+
+		// Delete the path.
+		err = os.Remove(fullPath)
+		require.NoError(t, err)
+
+		// The check should fail now.
+		err = checkProcSelfFdPath(fullPath, handle)
+		assert.ErrorIs(t, err, errInvalidDirectory, "checkProcSelfFdPath should fail after deletion")
+
+		// The check should fail even if the expected path ends with " (deleted)".
+		err = checkProcSelfFdPath(fullPath+" (deleted)", handle)
+		assert.ErrorIs(t, err, errInvalidDirectory, "checkProcSelfFdPath should fail after deletion even with (deleted) suffix")
+	})
+}
+
+func testVerifyProcRoot(t *testing.T, procRoot string, expectedErr error, errString string) {
+	fakeProcRoot, err := os.OpenFile(procRoot, unix.O_PATH|unix.O_CLOEXEC, 0)
+	require.NoError(t, err)
+	defer fakeProcRoot.Close()
+
+	err = verifyProcRoot(fakeProcRoot)
+	assert.ErrorIsf(t, err, expectedErr, "verifyProcRoot(%s)", procRoot)
+	if expectedErr != nil {
+		assert.ErrorContainsf(t, err, errString, "verifyProcRoot(%s)", procRoot)
+	}
+}
+
+func TestVerifyProcRoot_Regular(t *testing.T) {
+	testForceProcThreadSelf(t, func(t *testing.T) {
+		testVerifyProcRoot(t, "/proc", nil, "")
+	})
+}
+
+func TestVerifyProcRoot_ProcNonRoot(t *testing.T) {
+	testForceProcThreadSelf(t, func(t *testing.T) {
+		testVerifyProcRoot(t, "/proc/self", errUnsafeProcfs, "incorrect procfs root inode number")
+		testVerifyProcRoot(t, "/proc/mounts", errUnsafeProcfs, "incorrect procfs root inode number")
+		testVerifyProcRoot(t, "/proc/stat", errUnsafeProcfs, "incorrect procfs root inode number")
+	})
+}
+
+func TestVerifyProcRoot_NotProc(t *testing.T) {
+	testForceProcThreadSelf(t, func(t *testing.T) {
+		testVerifyProcRoot(t, "/", errUnsafeProcfs, "incorrect procfs root filesystem type")
+		testVerifyProcRoot(t, ".", errUnsafeProcfs, "incorrect procfs root filesystem type")
+		testVerifyProcRoot(t, t.TempDir(), errUnsafeProcfs, "incorrect procfs root filesystem type")
+	})
+}
diff --git a/testing_mocks_linux.go b/testing_mocks_linux.go
new file mode 100644
index 0000000..2a25d08
--- /dev/null
+++ b/testing_mocks_linux.go
@@ -0,0 +1,68 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"os"
+	"testing"
+)
+
+type forceGetProcRootLevel int
+
+const (
+	forceGetProcRootDefault             forceGetProcRootLevel = iota
+	forceGetProcRootOpenTree                                  // force open_tree()
+	forceGetProcRootOpenTreeAtRecursive                       // force open_tree(AT_RECURSIVE)
+	forceGetProcRootUnsafe                                    // force open()
+)
+
+var testingForceGetProcRoot *forceGetProcRootLevel
+
+func testingCheckClose(check bool, f *os.File) bool {
+	if check {
+		if f != nil {
+			_ = f.Close()
+		}
+		return true
+	}
+	return false
+}
+
+func testingForcePrivateProcRootOpenTree(f *os.File) bool {
+	return testing.Testing() && testingForceGetProcRoot != nil &&
+		testingCheckClose(*testingForceGetProcRoot >= forceGetProcRootOpenTree, f)
+}
+
+func testingForcePrivateProcRootOpenTreeAtRecursive(f *os.File) bool {
+	return testing.Testing() && testingForceGetProcRoot != nil &&
+		testingCheckClose(*testingForceGetProcRoot >= forceGetProcRootOpenTreeAtRecursive, f)
+}
+
+func testingForceGetProcRootUnsafe(f *os.File) bool {
+	return testing.Testing() && testingForceGetProcRoot != nil &&
+		testingCheckClose(*testingForceGetProcRoot >= forceGetProcRootUnsafe, f)
+}
+
+type forceProcThreadSelfLevel int
+
+const (
+	forceProcThreadSelfDefault forceProcThreadSelfLevel = iota
+	forceProcSelfTask
+	forceProcSelf
+)
+
+var testingForceProcThreadSelf *forceProcThreadSelfLevel
+
+func testingForceProcSelfTask() bool {
+	return testing.Testing() && testingForceProcThreadSelf != nil &&
+		*testingForceProcThreadSelf >= forceProcSelfTask
+}
+
+func testingForceProcSelf() bool {
+	return testing.Testing() && testingForceProcThreadSelf != nil &&
+		*testingForceProcThreadSelf >= forceProcSelf
+}
diff --git a/util_linux_test.go b/util_linux_test.go
index b50fc4a..4bf85cc 100644
--- a/util_linux_test.go
+++ b/util_linux_test.go
@@ -16,6 +16,12 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+func requireRoot(t *testing.T) {
+	if os.Geteuid() != 0 {
+		t.Skip("test requires root")
+	}
+}
+
 func withWithoutOpenat2(t *testing.T, testFn func(t *testing.T)) {
 	t.Run("openat2=auto", testFn)
 
@@ -33,6 +39,46 @@ func withWithoutOpenat2(t *testing.T, testFn func(t *testing.T)) {
 	}
 }
 
+func testForceGetProcRoot(t *testing.T, testFn func(t *testing.T, expectOvermounts bool)) {
+	for _, test := range []struct {
+		name             string
+		forceGetProcRoot forceGetProcRootLevel
+		expectOvermounts bool
+	}{
+		{`procfd="fsopen()"`, forceGetProcRootDefault, false},
+		{`procfd="open_tree_clone"`, forceGetProcRootOpenTree, false},
+		{`procfd="open_tree_clone(AT_RECURSIVE)"`, forceGetProcRootOpenTreeAtRecursive, true},
+		{`procfd="open()"`, forceGetProcRootUnsafe, true},
+	} {
+		test := test // copy iterator
+		t.Run(test.name, func(t *testing.T) {
+			testingForceGetProcRoot = &test.forceGetProcRoot
+			defer func() { testingForceGetProcRoot = nil }()
+
+			testFn(t, test.expectOvermounts)
+		})
+	}
+}
+
+func testForceProcThreadSelf(t *testing.T, testFn func(t *testing.T)) {
+	for _, test := range []struct {
+		name                string
+		forceProcThreadSelf forceProcThreadSelfLevel
+	}{
+		{`thread-self="thread-self"`, forceProcThreadSelfDefault},
+		{`thread-self="self/task"`, forceProcSelfTask},
+		{`thread-self="self"`, forceProcSelf},
+	} {
+		test := test // copy iterator
+		t.Run(test.name, func(t *testing.T) {
+			testingForceProcThreadSelf = &test.forceProcThreadSelf
+			defer func() { testingForceProcThreadSelf = nil }()
+
+			testFn(t)
+		})
+	}
+}
+
 // Format:
 //
 //	dir <name>

From b6aed59e8af9852879a3da32c37af0e3238827dc Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 4 Jul 2024 00:06:41 +1000
Subject: [PATCH 09/11] lookup: add tests for partialLookupInRoot

This lets us make sure we are correctly handling all sorts of lookups
that aren't obvious just by looking at MkdirAll. Of particular note are
the tail-chained and other deep symlink resolution stacks (MkdirAll will
just return an error, but the remainingPath we get is quite important).

In addition, add some explicit tests for the error paths in
symlinkStack. We can't force these with actual lookups (since they
indicate a programming bug), but at least make sure we're testing them
properly.

This raises the test coverage to 88.6% now that we are testing more of
the error paths in partialLookupInRoot.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 lookup_linux_test.go | 584 +++++++++++++++++++++++++++++++++++++++++++
 util_linux_test.go   |  36 +++
 2 files changed, 620 insertions(+)
 create mode 100644 lookup_linux_test.go

diff --git a/lookup_linux_test.go b/lookup_linux_test.go
new file mode 100644
index 0000000..79dfabe
--- /dev/null
+++ b/lookup_linux_test.go
@@ -0,0 +1,584 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/sys/unix"
+)
+
+type partialLookupFunc func(root *os.File, unsafePath string) (*os.File, string, error)
+
+type lookupResult struct {
+	handlePath, remainingPath string
+	err                       error
+	fileType                  uint32
+}
+
+func checkPartialLookup(t *testing.T, partialLookupFn partialLookupFunc, rootDir *os.File, unsafePath string, expected lookupResult) {
+	handle, remainingPath, err := partialLookupFn(rootDir, unsafePath)
+	if handle != nil {
+		defer handle.Close()
+	}
+	if expected.err == nil {
+		if assert.NoError(t, err) {
+			// Check the remainingPath.
+			assert.Equal(t, expected.remainingPath, remainingPath, "remaining path")
+
+			// Check the handle filepath.
+			gotPath, err := procSelfFdReadlink(handle)
+			require.NoError(t, err, "get real path of returned handle")
+			assert.Equal(t, expected.handlePath, gotPath, "real handle path")
+			// Make sure the handle matches the readlink filepath.
+			assert.Equal(t, gotPath, handle.Name(), "handle.Name() matching real handle path")
+
+			// Check the handle type.
+			unixStat, err := fstat(handle)
+			require.NoError(t, err, "fstat handle")
+			assert.Equal(t, expected.fileType, unixStat.Mode&unix.S_IFMT, "handle S_IFMT type")
+		}
+	} else {
+		if assert.Error(t, err) {
+			assert.ErrorIs(t, err, expected.err)
+		} else {
+			t.Errorf("unexpected handle %q", handle.Name())
+		}
+	}
+}
+
+func testPartialLookup(t *testing.T, partialLookupFn partialLookupFunc) {
+	tree := []string{
+		"dir a",
+		"dir b/c/d/e/f",
+		"file b/c/file",
+		"symlink e /b/c/d/e",
+		"symlink b-file b/c/file",
+		// Dangling symlinks.
+		"symlink a-fake1 a/fake",
+		"symlink a-fake2 a/fake/foo/bar/..",
+		"symlink a-fake3 a/fake/../../b",
+		"dir c",
+		"symlink c/a-fake1 a/fake",
+		"symlink c/a-fake2 a/fake/foo/bar/..",
+		"symlink c/a-fake3 a/fake/../../b",
+		// Test non-lexical symlinks.
+		"dir target",
+		"dir link1",
+		"symlink link1/target_abs /target",
+		"symlink link1/target_rel ../target",
+		"dir link2",
+		"symlink link2/link1_abs /link1",
+		"symlink link2/link1_rel ../link1",
+		"dir link3",
+		"symlink link3/target_abs /link2/link1_rel/target_rel",
+		"symlink link3/target_rel ../link2/link1_rel/target_rel",
+		"symlink link3/deep_dangling1 ../link2/link1_rel/target_rel/nonexist",
+		"symlink link3/deep_dangling2 ../link2/link1_rel/target_rel/nonexist",
+		// Deep dangling symlinks (with single components).
+		"dir dangling",
+		"symlink dangling/a b/c",
+		"dir dangling/b",
+		"symlink dangling/b/c ../c",
+		"symlink dangling/c d/e",
+		"dir dangling/d",
+		"symlink dangling/d/e ../e",
+		"symlink dangling/e f/../g",
+		"dir dangling/f",
+		"symlink dangling/g h/i/j/nonexistent",
+		"dir dangling/h/i/j",
+		// Deep dangling symlink using a non-dir component.
+		"dir dangling-file",
+		"symlink dangling-file/a b/c",
+		"dir dangling-file/b",
+		"symlink dangling-file/b/c ../c",
+		"symlink dangling-file/c d/e",
+		"dir dangling-file/d",
+		"symlink dangling-file/d/e ../e",
+		"symlink dangling-file/e f/../g",
+		"dir dangling-file/f",
+		"symlink dangling-file/g h/i/j/file/foo",
+		"dir dangling-file/h/i/j",
+		"file dangling-file/h/i/j/file",
+		// Some "bad" inodes that a regular user can create.
+		"fifo b/fifo",
+		"sock b/sock",
+		// Symlink loops.
+		"dir loop",
+		"symlink loop/basic-loop1 basic-loop1",
+		"symlink loop/basic-loop2 /loop/basic-loop2",
+		"symlink loop/basic-loop3 ../loop/basic-loop3",
+		"dir loop/a",
+		"symlink loop/a/link ../b/link",
+		"dir loop/b",
+		"symlink loop/b/link /loop/c/link",
+		"dir loop/c",
+		"symlink loop/c/link /loop/d/link",
+		"symlink loop/d e",
+		"dir loop/e",
+		"symlink loop/e/link ../a/link",
+		"symlink loop/link a/link",
+	}
+
+	root := createTree(t, tree...)
+
+	rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+	require.NoError(t, err)
+	defer rootDir.Close()
+
+	for name, test := range map[string]struct {
+		unsafePath string
+		expected   lookupResult
+	}{
+		// Complete lookups.
+		"complete-dir1": {"a", lookupResult{handlePath: "/a", remainingPath: "", fileType: unix.S_IFDIR}},
+		"complete-dir2": {"b/c/d/e/f", lookupResult{handlePath: "/b/c/d/e/f", remainingPath: "", fileType: unix.S_IFDIR}},
+		"complete-fifo": {"b/fifo", lookupResult{handlePath: "/b/fifo", remainingPath: "", fileType: unix.S_IFIFO}},
+		"complete-sock": {"b/sock", lookupResult{handlePath: "/b/sock", remainingPath: "", fileType: unix.S_IFSOCK}},
+		// Partial lookups.
+		"partial-dir-basic":  {"a/b/c/d/e/f/g/h", lookupResult{handlePath: "/a", remainingPath: "b/c/d/e/f/g/h", fileType: unix.S_IFDIR}},
+		"partial-dir-dotdot": {"a/foo/../bar/baz", lookupResult{handlePath: "/a", remainingPath: "foo/../bar/baz", fileType: unix.S_IFDIR}},
+		// Complete lookups of non-lexical symlinks.
+		"nonlexical-basic-complete":                 {"target", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-basic-partial":                  {"target/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-basic-partial-dotdot":           {"target/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level1-abs-complete":            {"link1/target_abs", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level1-abs-partial":             {"link1/target_abs/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level1-abs-partial-dotdot":      {"link1/target_abs/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level1-rel-complete":            {"link1/target_rel", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level1-rel-partial":             {"link1/target_rel/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level1-rel-partial-dotdot":      {"link1/target_rel/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-abs-complete":        {"link2/link1_abs/target_abs", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-abs-partial":         {"link2/link1_abs/target_abs/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-abs-partial-dotdot":  {"link2/link1_abs/target_abs/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-rel-complete":        {"link2/link1_abs/target_rel", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-rel-partial":         {"link2/link1_abs/target_rel/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-rel-partial-dotdot":  {"link2/link1_abs/target_rel/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-open-complete":       {"link2/link1_abs/../target", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-open-partial":        {"link2/link1_abs/../target/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-abs-open-partial-dotdot": {"link2/link1_abs/../target/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-abs-complete":        {"link2/link1_rel/target_abs", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-abs-partial":         {"link2/link1_rel/target_abs/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-abs-partial-dotdot":  {"link2/link1_rel/target_abs/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-rel-complete":        {"link2/link1_rel/target_rel", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-rel-partial":         {"link2/link1_rel/target_rel/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-rel-partial-dotdot":  {"link2/link1_rel/target_rel/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-open-complete":       {"link2/link1_rel/../target", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-open-partial":        {"link2/link1_rel/../target/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level2-rel-open-partial-dotdot": {"link2/link1_rel/../target/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level3-abs-complete":            {"link3/target_abs", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level3-abs-partial":             {"link3/target_abs/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level3-abs-partial-dotdot":      {"link3/target_abs/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		"nonlexical-level3-rel-complete":            {"link3/target_rel", lookupResult{handlePath: "/target", remainingPath: "", fileType: unix.S_IFDIR}},
+		"nonlexical-level3-rel-partial":             {"link3/target_rel/foo", lookupResult{handlePath: "/target", remainingPath: "foo", fileType: unix.S_IFDIR}},
+		"nonlexical-level3-rel-partial-dotdot":      {"link3/target_rel/../target/foo/bar/../baz", lookupResult{handlePath: "/target", remainingPath: "foo/bar/../baz", fileType: unix.S_IFDIR}},
+		// Partial lookups due to hitting a non-directory.
+		"partial-nondir-dot":             {"b/c/file/.", lookupResult{handlePath: "/b/c/file", remainingPath: ".", fileType: unix.S_IFREG}},
+		"partial-nondir-dotdot1":         {"b/c/file/..", lookupResult{handlePath: "/b/c/file", remainingPath: "..", fileType: unix.S_IFREG}},
+		"partial-nondir-dotdot2":         {"b/c/file/../foo/bar", lookupResult{handlePath: "/b/c/file", remainingPath: "../foo/bar", fileType: unix.S_IFREG}},
+		"partial-nondir-symlink-dot":     {"b-file/.", lookupResult{handlePath: "/b/c/file", remainingPath: ".", fileType: unix.S_IFREG}},
+		"partial-nondir-symlink-dotdot1": {"b-file/..", lookupResult{handlePath: "/b/c/file", remainingPath: "..", fileType: unix.S_IFREG}},
+		"partial-nondir-symlink-dotdot2": {"b-file/../foo/bar", lookupResult{handlePath: "/b/c/file", remainingPath: "../foo/bar", fileType: unix.S_IFREG}},
+		"partial-fifo-dot":               {"b/fifo/.", lookupResult{handlePath: "/b/fifo", remainingPath: ".", fileType: unix.S_IFIFO}},
+		"partial-fifo-dotdot1":           {"b/fifo/..", lookupResult{handlePath: "/b/fifo", remainingPath: "..", fileType: unix.S_IFIFO}},
+		"partial-fifo-dotdot2":           {"b/fifo/../foo/bar", lookupResult{handlePath: "/b/fifo", remainingPath: "../foo/bar", fileType: unix.S_IFIFO}},
+		"partial-sock-dot":               {"b/sock/.", lookupResult{handlePath: "/b/sock", remainingPath: ".", fileType: unix.S_IFSOCK}},
+		"partial-sock-dotdot1":           {"b/sock/..", lookupResult{handlePath: "/b/sock", remainingPath: "..", fileType: unix.S_IFSOCK}},
+		"partial-sock-dotdot2":           {"b/sock/../foo/bar", lookupResult{handlePath: "/b/sock", remainingPath: "../foo/bar", fileType: unix.S_IFSOCK}},
+		// Dangling symlinks are treated as though they are non-existent.
+		"dangling1-inroot-trailing":       {"a-fake1", lookupResult{handlePath: "/", remainingPath: "a-fake1", fileType: unix.S_IFDIR}},
+		"dangling1-inroot-partial":        {"a-fake1/foo", lookupResult{handlePath: "/", remainingPath: "a-fake1/foo", fileType: unix.S_IFDIR}},
+		"dangling1-inroot-partial-dotdot": {"a-fake1/../bar/baz", lookupResult{handlePath: "/", remainingPath: "a-fake1/../bar/baz", fileType: unix.S_IFDIR}},
+		"dangling1-sub-trailing":          {"c/a-fake1", lookupResult{handlePath: "/c", remainingPath: "a-fake1", fileType: unix.S_IFDIR}},
+		"dangling1-sub-partial":           {"c/a-fake1/foo", lookupResult{handlePath: "/c", remainingPath: "a-fake1/foo", fileType: unix.S_IFDIR}},
+		"dangling1-sub-partial-dotdot":    {"c/a-fake1/../bar/baz", lookupResult{handlePath: "/c", remainingPath: "a-fake1/../bar/baz", fileType: unix.S_IFDIR}},
+		"dangling2-inroot-trailing":       {"a-fake2", lookupResult{handlePath: "/", remainingPath: "a-fake2", fileType: unix.S_IFDIR}},
+		"dangling2-inroot-partial":        {"a-fake2/foo", lookupResult{handlePath: "/", remainingPath: "a-fake2/foo", fileType: unix.S_IFDIR}},
+		"dangling2-inroot-partial-dotdot": {"a-fake2/../bar/baz", lookupResult{handlePath: "/", remainingPath: "a-fake2/../bar/baz", fileType: unix.S_IFDIR}},
+		"dangling2-sub-trailing":          {"c/a-fake2", lookupResult{handlePath: "/c", remainingPath: "a-fake2", fileType: unix.S_IFDIR}},
+		"dangling2-sub-partial":           {"c/a-fake2/foo", lookupResult{handlePath: "/c", remainingPath: "a-fake2/foo", fileType: unix.S_IFDIR}},
+		"dangling2-sub-partial-dotdot":    {"c/a-fake2/../bar/baz", lookupResult{handlePath: "/c", remainingPath: "a-fake2/../bar/baz", fileType: unix.S_IFDIR}},
+		"dangling3-inroot-trailing":       {"a-fake3", lookupResult{handlePath: "/", remainingPath: "a-fake3", fileType: unix.S_IFDIR}},
+		"dangling3-inroot-partial":        {"a-fake3/foo", lookupResult{handlePath: "/", remainingPath: "a-fake3/foo", fileType: unix.S_IFDIR}},
+		"dangling3-inroot-partial-dotdot": {"a-fake3/../bar/baz", lookupResult{handlePath: "/", remainingPath: "a-fake3/../bar/baz", fileType: unix.S_IFDIR}},
+		"dangling3-sub-trailing":          {"c/a-fake3", lookupResult{handlePath: "/c", remainingPath: "a-fake3", fileType: unix.S_IFDIR}},
+		"dangling3-sub-partial":           {"c/a-fake3/foo", lookupResult{handlePath: "/c", remainingPath: "a-fake3/foo", fileType: unix.S_IFDIR}},
+		"dangling3-sub-partial-dotdot":    {"c/a-fake3/../bar/baz", lookupResult{handlePath: "/c", remainingPath: "a-fake3/../bar/baz", fileType: unix.S_IFDIR}},
+		// Tricky dangling symlinks.
+		"dangling-tricky1-trailing":       {"link3/deep_dangling1", lookupResult{handlePath: "/link3", remainingPath: "deep_dangling1", fileType: unix.S_IFDIR}},
+		"dangling-tricky1-partial":        {"link3/deep_dangling1/foo", lookupResult{handlePath: "/link3", remainingPath: "deep_dangling1/foo", fileType: unix.S_IFDIR}},
+		"dangling-tricky1-partial-dotdot": {"link3/deep_dangling1/..", lookupResult{handlePath: "/link3", remainingPath: "deep_dangling1/..", fileType: unix.S_IFDIR}},
+		"dangling-tricky2-trailing":       {"link3/deep_dangling2", lookupResult{handlePath: "/link3", remainingPath: "deep_dangling2", fileType: unix.S_IFDIR}},
+		"dangling-tricky2-partial":        {"link3/deep_dangling2/foo", lookupResult{handlePath: "/link3", remainingPath: "deep_dangling2/foo", fileType: unix.S_IFDIR}},
+		"dangling-tricky2-partial-dotdot": {"link3/deep_dangling2/..", lookupResult{handlePath: "/link3", remainingPath: "deep_dangling2/..", fileType: unix.S_IFDIR}},
+		// Really deep dangling links.
+		"deep-dangling1":           {"dangling/a", lookupResult{handlePath: "/dangling", remainingPath: "a", fileType: unix.S_IFDIR}},
+		"deep-dangling2":           {"dangling/b/c", lookupResult{handlePath: "/dangling/b", remainingPath: "c", fileType: unix.S_IFDIR}},
+		"deep-dangling3":           {"dangling/c", lookupResult{handlePath: "/dangling", remainingPath: "c", fileType: unix.S_IFDIR}},
+		"deep-dangling4":           {"dangling/d/e", lookupResult{handlePath: "/dangling/d", remainingPath: "e", fileType: unix.S_IFDIR}},
+		"deep-dangling5":           {"dangling/e", lookupResult{handlePath: "/dangling", remainingPath: "e", fileType: unix.S_IFDIR}},
+		"deep-dangling6":           {"dangling/g", lookupResult{handlePath: "/dangling", remainingPath: "g", fileType: unix.S_IFDIR}},
+		"deep-dangling-fileasdir1": {"dangling-file/a", lookupResult{handlePath: "/dangling-file", remainingPath: "a", fileType: unix.S_IFDIR}},
+		"deep-dangling-fileasdir2": {"dangling-file/b/c", lookupResult{handlePath: "/dangling-file/b", remainingPath: "c", fileType: unix.S_IFDIR}},
+		"deep-dangling-fileasdir3": {"dangling-file/c", lookupResult{handlePath: "/dangling-file", remainingPath: "c", fileType: unix.S_IFDIR}},
+		"deep-dangling-fileasdir4": {"dangling-file/d/e", lookupResult{handlePath: "/dangling-file/d", remainingPath: "e", fileType: unix.S_IFDIR}},
+		"deep-dangling-fileasdir5": {"dangling-file/e", lookupResult{handlePath: "/dangling-file", remainingPath: "e", fileType: unix.S_IFDIR}},
+		"deep-dangling-fileasdir6": {"dangling-file/g", lookupResult{handlePath: "/dangling-file", remainingPath: "g", fileType: unix.S_IFDIR}},
+		// Symlink loops.
+		"loop":        {"loop/link", lookupResult{err: unix.ELOOP}},
+		"loop-basic1": {"loop/basic-loop1", lookupResult{err: unix.ELOOP}},
+		"loop-basic2": {"loop/basic-loop2", lookupResult{err: unix.ELOOP}},
+		"loop-basic3": {"loop/basic-loop3", lookupResult{err: unix.ELOOP}},
+	} {
+		test := test // copy iterator
+		// Update the handlePath to be inside our root.
+		if test.expected.handlePath != "" {
+			test.expected.handlePath = filepath.Join(root, test.expected.handlePath)
+		}
+		t.Run(name, func(t *testing.T) {
+			checkPartialLookup(t, partialLookupFn, rootDir, test.unsafePath, test.expected)
+		})
+	}
+}
+
+func TestPartialLookupInRoot(t *testing.T) {
+	withWithoutOpenat2(t, func(t *testing.T) {
+		testPartialLookup(t, partialLookupInRoot)
+	})
+}
+
+func TestPartialOpenat2(t *testing.T) {
+	testPartialLookup(t, partialLookupOpenat2)
+}
+
+func TestPartialLookupInRoot_BadInode(t *testing.T) {
+	requireRoot(t) // mknod
+
+	withWithoutOpenat2(t, func(t *testing.T) {
+		partialLookupFn := partialLookupInRoot
+
+		tree := []string{
+			// Make sure we don't open "bad" inodes.
+			"dir foo",
+			"char foo/whiteout 0 0",
+			"block foo/whiteout-blk 0 0",
+		}
+
+		root := createTree(t, tree...)
+
+		rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+		require.NoError(t, err)
+		defer rootDir.Close()
+
+		for name, test := range map[string]struct {
+			unsafePath string
+			expected   lookupResult
+		}{
+			// Complete lookups.
+			"char-trailing": {"foo/whiteout", lookupResult{handlePath: "/foo/whiteout", remainingPath: "", fileType: unix.S_IFCHR}},
+			"blk-trailing":  {"foo/whiteout-blk", lookupResult{handlePath: "/foo/whiteout-blk", remainingPath: "", fileType: unix.S_IFBLK}},
+			// Partial lookups due to hitting a non-directory.
+			"char-dot":     {"foo/whiteout/.", lookupResult{handlePath: "/foo/whiteout", remainingPath: ".", fileType: unix.S_IFCHR}},
+			"char-dotdot1": {"foo/whiteout/..", lookupResult{handlePath: "/foo/whiteout", remainingPath: "..", fileType: unix.S_IFCHR}},
+			"char-dotdot2": {"foo/whiteout/../foo/bar", lookupResult{handlePath: "/foo/whiteout", remainingPath: "../foo/bar", fileType: unix.S_IFCHR}},
+			"blk-dot":      {"foo/whiteout-blk/.", lookupResult{handlePath: "/foo/whiteout-blk", remainingPath: ".", fileType: unix.S_IFBLK}},
+			"blk-dotdot1":  {"foo/whiteout-blk/..", lookupResult{handlePath: "/foo/whiteout-blk", remainingPath: "..", fileType: unix.S_IFBLK}},
+			"blk-dotdot2":  {"foo/whiteout-blk/../foo/bar", lookupResult{handlePath: "/foo/whiteout-blk", remainingPath: "../foo/bar", fileType: unix.S_IFBLK}},
+		} {
+			test := test // copy iterator
+			// Update the handlePath to be inside our root.
+			if test.expected.handlePath != "" {
+				test.expected.handlePath = filepath.Join(root, test.expected.handlePath)
+			}
+			t.Run(name, func(t *testing.T) {
+				checkPartialLookup(t, partialLookupFn, rootDir, test.unsafePath, test.expected)
+			})
+		}
+	})
+}
+
+type ssOperation interface {
+	String() string
+	Do(*testing.T, *symlinkStack) error
+}
+
+type ssOpPop struct{ part string }
+
+func (op ssOpPop) Do(t *testing.T, s *symlinkStack) error { return s.PopPart(op.part) }
+
+func (op ssOpPop) String() string { return fmt.Sprintf("PopPart(%q)", op.part) }
+
+type ssOpSwapLink struct {
+	part, dirName, expectedPath, linkTarget string
+}
+
+func fakeFile(name string) (*os.File, error) {
+	fd, err := unix.Open(".", unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, &os.PathError{Op: "open", Path: ".", Err: err}
+	}
+	return os.NewFile(uintptr(fd), name), nil
+}
+
+func (op ssOpSwapLink) Do(t *testing.T, s *symlinkStack) error {
+	f, err := fakeFile(op.dirName)
+	require.NoErrorf(t, err, "make fake file with %q name", op.dirName)
+	return s.SwapLink(op.part, f, op.expectedPath, op.linkTarget)
+}
+
+func (op ssOpSwapLink) String() string {
+	return fmt.Sprintf("SwapLink(%q, <%s>, %q, %q)", op.part, op.dirName, op.expectedPath, op.linkTarget)
+}
+
+type ssOp struct {
+	op          ssOperation
+	expectedErr error
+}
+
+func (t ssOp) String() string { return fmt.Sprintf("%s = %v", t.op, t.expectedErr) }
+
+func dumpStack(t *testing.T, ss symlinkStack) {
+	for i, sym := range ss {
+		t.Logf("ss[%d] %s", i, sym)
+	}
+}
+
+func testSymlinkStack(t *testing.T, ops ...ssOp) symlinkStack {
+	var ss symlinkStack
+	for _, op := range ops {
+		err := op.op.Do(t, &ss)
+		if !assert.ErrorIsf(t, err, op.expectedErr, "%s", op) {
+			dumpStack(t, ss)
+			ss.Close()
+			t.FailNow()
+		}
+	}
+	return ss
+}
+
+func TestSymlinkStackBasic(t *testing.T) {
+	ss := testSymlinkStack(t,
+		ssOp{op: ssOpSwapLink{"foo", "A", "", "bar/baz"}},
+		ssOp{op: ssOpSwapLink{"bar", "B", "baz", "abcd"}},
+		ssOp{op: ssOpPop{"abcd"}},
+		ssOp{op: ssOpSwapLink{"baz", "C", "", "taillink"}},
+		ssOp{op: ssOpPop{"taillink"}},
+		ssOp{op: ssOpPop{"anotherbit"}},
+	)
+	defer ss.Close()
+
+	if !assert.True(t, ss.IsEmpty()) {
+		dumpStack(t, ss)
+		t.FailNow()
+	}
+}
+
+func TestSymlinkStackBadPop(t *testing.T) {
+	ss := testSymlinkStack(t,
+		ssOp{op: ssOpSwapLink{"foo", "A", "", "bar/baz"}},
+		ssOp{op: ssOpSwapLink{"bar", "B", "baz", "abcd"}},
+		ssOp{op: ssOpSwapLink{"bad", "C", "", "abcd"}, expectedErr: errBrokenSymlinkStack},
+		ssOp{op: ssOpPop{"abcd"}},
+		ssOp{op: ssOpSwapLink{"baz", "C", "", "abcd"}},
+		ssOp{op: ssOpSwapLink{"abcd", "D", "", ""}}, // TODO: This is technically an invalid thing to push.
+		ssOp{op: ssOpSwapLink{"another", "E", "", ""}, expectedErr: errBrokenSymlinkStack},
+	)
+	defer ss.Close()
+}
+
+type expectedStackEntry struct {
+	expectedDirName  string
+	expectedUnwalked []string
+}
+
+func testStackContents(t *testing.T, msg string, ss symlinkStack, expected ...expectedStackEntry) {
+	if len(expected) > 0 {
+		require.Equalf(t, len(ss), len(expected), "%s: stack should be the expected length", msg)
+		require.Falsef(t, ss.IsEmpty(), "%s: stack IsEmpty should be false", msg)
+	} else {
+		require.Emptyf(t, len(ss), "%s: stack should be empty", msg)
+		require.Truef(t, ss.IsEmpty(), "%s: stack IsEmpty should be true", msg)
+	}
+
+	for idx, entry := range expected {
+		assert.Equalf(t, ss[idx].dir.Name(), entry.expectedDirName, "%s: stack entry %d name mismatch", msg, idx)
+		if len(entry.expectedUnwalked) > 0 {
+			assert.Equalf(t, ss[idx].linkUnwalked, entry.expectedUnwalked, "%s: stack entry %d unwalked link entries mismatch", msg, idx)
+		} else {
+			assert.Emptyf(t, ss[idx].linkUnwalked, "%s: stack entry %d unwalked link entries", msg, idx)
+		}
+	}
+
+	// Fail the test immediately so we can get the current stack in the test output.
+	if t.Failed() {
+		t.FailNow()
+	}
+}
+
+func TestSymlinkStackBasicTailChain(t *testing.T) {
+	ss := testSymlinkStack(t,
+		ssOp{op: ssOpSwapLink{"foo", "A", "", "tailA"}},
+		ssOp{op: ssOpSwapLink{"tailA", "B", "", "tailB"}},
+		ssOp{op: ssOpSwapLink{"tailB", "C", "", "tailC"}},
+		ssOp{op: ssOpSwapLink{"tailC", "D", "", "tailD"}},
+		ssOp{op: ssOpSwapLink{"tailD", "E", "", "foo/taillink"}},
+	)
+	defer func() {
+		if t.Failed() {
+			dumpStack(t, ss)
+		}
+	}()
+	defer ss.Close()
+
+	// Basic expected contents.
+	testStackContents(t, "initial state", ss,
+		// The top 4 entries should have no unwalked links.
+		expectedStackEntry{"A", nil},
+		expectedStackEntry{"B", nil},
+		expectedStackEntry{"C", nil},
+		expectedStackEntry{"D", nil},
+		// And the final entry should just be foo/taillink.
+		expectedStackEntry{"E", []string{"foo", "taillink"}},
+	)
+
+	// Popping "foo" should keep the tail-chain.
+	require.NoError(t, ss.PopPart("foo"), "pop foo")
+	testStackContents(t, "pop tail-chain end", ss,
+		// The top 4 entries should have no unwalked links.
+		expectedStackEntry{"A", nil},
+		expectedStackEntry{"B", nil},
+		expectedStackEntry{"C", nil},
+		expectedStackEntry{"D", nil},
+		// And the final entry should just be foo/taillink.
+		expectedStackEntry{"E", []string{"taillink"}},
+	)
+
+	// Dropping taillink should empty the stack.
+	require.NoError(t, ss.PopPart("taillink"), "pop taillink")
+	testStackContents(t, "pop last element in tail-chain", ss)
+	assert.True(t, ss.IsEmpty(), "pop last element in tail-chain should empty chain")
+}
+
+func TestSymlinkStackTailChain(t *testing.T) {
+	ss := testSymlinkStack(t,
+		ssOp{op: ssOpSwapLink{"foo", "A", "", "tailA/subdir1"}},
+		// First tail-chain.
+		ssOp{op: ssOpSwapLink{"tailA", "B", "", "tailB"}},
+		ssOp{op: ssOpSwapLink{"tailB", "C", "", "tailC"}},
+		ssOp{op: ssOpSwapLink{"tailC", "D", "", "tailD"}},
+		ssOp{op: ssOpSwapLink{"tailD", "E", "", "taillink1/subdir2"}},
+		// Second tail-chain.
+		ssOp{op: ssOpSwapLink{"taillink1", "F", "", "tailE"}},
+		ssOp{op: ssOpSwapLink{"tailE", "G", "", "tailF"}},
+		ssOp{op: ssOpSwapLink{"tailF", "H", "", "tailG"}},
+		ssOp{op: ssOpSwapLink{"tailG", "I", "", "tailH"}},
+		ssOp{op: ssOpSwapLink{"tailH", "J", "", "tailI"}},
+		ssOp{op: ssOpSwapLink{"tailI", "K", "", "taillink2/.."}},
+	)
+	defer func() {
+		if t.Failed() {
+			dumpStack(t, ss)
+		}
+	}()
+	defer ss.Close()
+
+	// Basic expected contents.
+	testStackContents(t, "initial state", ss,
+		// Top entry is not a tail-chain.
+		expectedStackEntry{"A", []string{"subdir1"}},
+		// The first tail-chain should have no unwalked links.
+		expectedStackEntry{"B", nil},
+		expectedStackEntry{"C", nil},
+		expectedStackEntry{"D", nil},
+		// Final entry in the first tail-chain.
+		expectedStackEntry{"E", []string{"subdir2"}},
+		// The second tail-chain should have no unwalked links.
+		expectedStackEntry{"F", nil},
+		expectedStackEntry{"G", nil},
+		expectedStackEntry{"H", nil},
+		expectedStackEntry{"I", nil},
+		expectedStackEntry{"J", nil},
+		// Final entry in the second tail-chain.
+		expectedStackEntry{"K", []string{"taillink2", ".."}},
+	)
+
+	// Popping any of the early tail chain entries must fail.
+	for _, badPart := range []string{"subdir1", "subdir2", "..", "."} {
+		require.ErrorIsf(t, ss.PopPart(badPart), errBrokenSymlinkStack, "bad pop %q", badPart)
+
+		// NOTE: Same contents as above.
+		testStackContents(t, "bad pop "+badPart, ss,
+			// Top entry is not a tail-chain.
+			expectedStackEntry{"A", []string{"subdir1"}},
+			// The first tail-chain should have no unwalked links.
+			expectedStackEntry{"B", nil},
+			expectedStackEntry{"C", nil},
+			expectedStackEntry{"D", nil},
+			// Final entry in the first tail-chain.
+			expectedStackEntry{"E", []string{"subdir2"}},
+			// The second tail-chain should have no unwalked links.
+			expectedStackEntry{"F", nil},
+			expectedStackEntry{"G", nil},
+			expectedStackEntry{"H", nil},
+			expectedStackEntry{"I", nil},
+			expectedStackEntry{"J", nil},
+			// Final entry in the second tail-chain.
+			expectedStackEntry{"K", []string{"taillink2", ".."}},
+		)
+	}
+
+	// Dropping the second-last entry should keep the tail-chain.
+	require.NoError(t, ss.PopPart("taillink2"), "pop taillink2")
+	testStackContents(t, "pop non-last element in second tail-chain", ss,
+		// Top entry is not a tail-chain.
+		expectedStackEntry{"A", []string{"subdir1"}},
+		// The first tail-chain should have no unwalked links.
+		expectedStackEntry{"B", nil},
+		expectedStackEntry{"C", nil},
+		expectedStackEntry{"D", nil},
+		// Final entry in the first tail-chain.
+		expectedStackEntry{"E", []string{"subdir2"}},
+		// The second tail-chain should have no unwalked links.
+		expectedStackEntry{"F", nil},
+		expectedStackEntry{"G", nil},
+		expectedStackEntry{"H", nil},
+		expectedStackEntry{"I", nil},
+		expectedStackEntry{"J", nil},
+		// Final entry in the second tail-chain.
+		expectedStackEntry{"K", []string{".."}},
+	)
+
+	// Dropping the last entry should only drop the final tail-chain.
+	require.NoError(t, ss.PopPart(".."), "pop ..")
+	testStackContents(t, "pop last element in second tail-chain", ss,
+		// Top entry is not a tail-chain.
+		expectedStackEntry{"A", []string{"subdir1"}},
+		// The first tail-chain should have no unwalked links.
+		expectedStackEntry{"B", nil},
+		expectedStackEntry{"C", nil},
+		expectedStackEntry{"D", nil},
+		// Final entry in the first tail-chain.
+		expectedStackEntry{"E", []string{"subdir2"}},
+	)
+
+	// Dropping the last entry should only drop the tail-chain.
+	require.NoError(t, ss.PopPart("subdir2"), "pop subdir2")
+	testStackContents(t, "pop last element in first tail-chain", ss,
+		// Top entry is not a tail-chain.
+		expectedStackEntry{"A", []string{"subdir1"}},
+	)
+
+	// Dropping the last entry should empty the stack.
+	require.NoError(t, ss.PopPart("subdir1"), "pop subdir1")
+	testStackContents(t, "pop last element", ss)
+	assert.True(t, ss.IsEmpty(), "pop last element should empty stack")
+}
diff --git a/util_linux_test.go b/util_linux_test.go
index 4bf85cc..57a65ef 100644
--- a/util_linux_test.go
+++ b/util_linux_test.go
@@ -10,10 +10,12 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"testing"
 
 	"github.com/stretchr/testify/require"
+	"golang.org/x/sys/unix"
 )
 
 func requireRoot(t *testing.T) {
@@ -84,6 +86,10 @@ func testForceProcThreadSelf(t *testing.T, testFn func(t *testing.T)) {
 //	dir <name>
 //	file <name> <?content>
 //	symlink <name> <target>
+//	char <name> <major> <minor>
+//	block <name> <major> <minor>
+//	fifo <name>
+//	sock <name>
 func createInTree(t *testing.T, root, spec string) {
 	f := strings.Fields(spec)
 	if len(f) < 2 {
@@ -109,6 +115,36 @@ func createInTree(t *testing.T, root, spec string) {
 		target := f[0]
 		err := os.Symlink(target, fullPath)
 		require.NoError(t, err)
+	case "char", "block":
+		if len(f) < 2 {
+			t.Fatalf("invalid spec %q", spec)
+		}
+
+		major, err := strconv.Atoi(f[0])
+		require.NoErrorf(t, err, "mknod %s: parse major", subPath)
+		minor, err := strconv.Atoi(f[1])
+		require.NoErrorf(t, err, "mknod %s: parse minor", subPath)
+		dev := unix.Mkdev(uint32(major), uint32(minor))
+
+		var mode uint32 = 0o644
+		switch inoType {
+		case "char":
+			mode |= unix.S_IFCHR
+		case "block":
+			mode |= unix.S_IFBLK
+		}
+		err = unix.Mknod(fullPath, mode, int(dev))
+		require.NoErrorf(t, err, "mknod (%s %d:%d) %s", inoType, major, minor, fullPath)
+	case "fifo", "sock":
+		var mode uint32 = 0o644
+		switch inoType {
+		case "fifo":
+			mode |= unix.S_IFIFO
+		case "sock":
+			mode |= unix.S_IFSOCK
+		}
+		err := unix.Mknod(fullPath, mode, 0)
+		require.NoErrorf(t, err, "mk%s %s", inoType, fullPath)
 	}
 }
 

From 633f3d635e492927e4c1ce185c2d6a1f8a724dc9 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 4 Jul 2024 23:19:07 +1000
Subject: [PATCH 10/11] gha: collate and calculate coverage for all test runs

By combining the coverage from all platforms as well as privileged and
non-privileged Linux runs, the coverage is now 87.7% for the entire
project.

It was necessary to split the Windows CI scripts because mktemp doesn't
work in Powershell and it's necessary to reimplement the whole thing to
get what you need (not to mention that -test.gocoverdir needs to be
quoted in a particular way to not make Powershell angry).

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .github/workflows/ci.yml | 98 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 91 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1e819e3..09f1d79 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,14 +11,51 @@ on:
     - cron: "30 10 * * 0"
 
 jobs:
-  test:
+  test-windows:
+    strategy:
+      fail-fast: false
+      matrix:
+        go-version:
+          - "1.21"
+          - "1.22"
+          - "^1"
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      - name: mkdir gocoverdir
+        run: |
+          # mktemp --tmpdir -d gocoverdir.XXXXXXXX
+          function New-TemporaryDirectory {
+            param (
+              [string] $Prefix
+            )
+            $parent = [System.IO.Path]::GetTempPath()
+            do {
+              [string] $guid = [System.Guid]::NewGuid()
+              $item = New-Item -Path "$parent" -Name "$Prefix.$guid" -ItemType "directory" -ErrorAction SilentlyContinue
+            } while (-not "$item")
+            return $item.FullName
+          }
+          $GOCOVERDIR = (New-TemporaryDirectory -Prefix "gocoverdir")
+          echo "GOCOVERDIR=$GOCOVERDIR" >>"$env:GITHUB_ENV"
+      - name: unit tests
+        run: go test -v -cover '-test.gocoverdir' "$env:GOCOVERDIR" ./...
+      - name: upload coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-${{ runner.os }}-${{ github.job }}-${{ strategy.job-index }}
+          path: ${{ env.GOCOVERDIR }}
+
+  test-unix:
     strategy:
       fail-fast: false
       matrix:
         os:
           - ubuntu-latest
           - macos-latest
-          - windows-latest
         go-version:
           - "1.21"
           - "1.22"
@@ -33,9 +70,56 @@ jobs:
       - uses: actions/setup-go@v4
         with:
           go-version: ${{ matrix.go-version }}
+      - name: mkdir gocoverdir
+        run: |
+          GOCOVERDIR="$(mktemp --tmpdir -d gocoverdir.XXXXXXXX)"
+          echo "GOCOVERDIR=$GOCOVERDIR" >>"$GITHUB_ENV"
       - name: unit tests
-        run: go test -v -cover ./...
-      # TODO: Merge the code coverage stats from both runs...
-      - name: unit tests (root)
-        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        run: sudo go test -v -cover ./...
+        run: |
+          go test -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
+          sudo go test -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
+      - name: upload coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-${{ runner.os }}-${{ github.job }}-${{ strategy.job-index }}
+          path: ${{ env.GOCOVERDIR }}
+
+  coverage:
+    runs-on: ubuntu-latest
+    needs:
+      - test-windows
+      - test-unix
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-go@v4
+        with:
+          go-version: "^1"
+      - name: download all coverage
+        uses: actions/download-artifact@v4
+        with:
+          path: coverage
+      - name: generate coverage list
+        run: |
+          find coverage/
+          GOCOVERDIRS="$(printf '%s,' coverage/* | sed 's|,$||')"
+          echo "GOCOVERDIRS=$GOCOVERDIRS" >>"$GITHUB_ENV"
+          FULLCOVERAGE_FILE="$(mktemp --tmpdir fullcoverage.XXXXXXXX)"
+          echo "FULLCOVERAGE_FILE=$FULLCOVERAGE_FILE" >>"$GITHUB_ENV"
+      - name: compute coverage
+        run: go tool covdata percent -i "$GOCOVERDIRS"
+      - name: compute func coverage
+        run: go tool covdata func -i "$GOCOVERDIRS" | sort -k 3gr
+      - name: merge coverage
+        run: |
+          go tool covdata textfmt -i "$GOCOVERDIRS" -o "$FULLCOVERAGE_FILE"
+          go tool cover -html="$FULLCOVERAGE_FILE" -o "$FULLCOVERAGE_FILE.html"
+      - name: upload merged coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: fullcoverage-${{ github.job }}
+          path: ${{ env.FULLCOVERAGE_FILE }}
+      - name: upload coverage html
+        uses: actions/upload-artifact@v4
+        with:
+          name: fullcoverage-${{ github.job }}.html
+          path: ${{ env.FULLCOVERAGE_FILE }}.html

From ac327434f02ed5186ad6751df3c87ca436a2f887 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 9 Jul 2024 10:54:48 +1000
Subject: [PATCH 11/11] tests: add racing tests for partialLokupInRoot and
 MkdirAllHandle

In order to get some confidence in saying that our code safe against
these kinds of racing attacks, add some tests to cover this.

The full suite (with 50k runs of each new subtest) takes about 10
minutes, so we should use "go test -short" for the full matrix and add a
single separate test run which can run for longer (we don't need to
merge the coverage from the stress test because we hit the same
codepaths as in the short test).

While working on this, I found what seems to be a kernel bug with
readlink(/proc/self/fd/$n). There is a workaround in the tests for this,
but it is possible that the test will become flaky.

This brings the test coverage up to 91.1% on Linux (91.3% combined).

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .github/workflows/ci.yml |  19 ++-
 lookup_linux_test.go     | 271 ++++++++++++++++++++++++++++++++++++++-
 mkdir_linux_test.go      | 225 +++++++++++++++++++++++++++++++-
 procfs_linux_test.go     |  10 +-
 util_linux_test.go       |  89 +++++++++++--
 5 files changed, 593 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 09f1d79..1d5c7f4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -49,7 +49,7 @@ jobs:
           name: coverage-${{ runner.os }}-${{ github.job }}-${{ strategy.job-index }}
           path: ${{ env.GOCOVERDIR }}
 
-  test-unix:
+  test-unix-short:
     strategy:
       fail-fast: false
       matrix:
@@ -76,19 +76,30 @@ jobs:
           echo "GOCOVERDIR=$GOCOVERDIR" >>"$GITHUB_ENV"
       - name: unit tests
         run: |
-          go test -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
-          sudo go test -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
+          go test -short -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
+          sudo go test -short -v -cover -test.gocoverdir="$GOCOVERDIR" ./...
       - name: upload coverage
         uses: actions/upload-artifact@v4
         with:
           name: coverage-${{ runner.os }}-${{ github.job }}-${{ strategy.job-index }}
           path: ${{ env.GOCOVERDIR }}
 
+  test-linux-stress:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-go@v4
+        with:
+          go-version: "^1"
+      - name: stress test race protections
+        run: |
+          sudo go test -timeout 1h -v -cover -run 'Test.*Racing' ./...
+
   coverage:
     runs-on: ubuntu-latest
     needs:
       - test-windows
-      - test-unix
+      - test-unix-short
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-go@v4
diff --git a/lookup_linux_test.go b/lookup_linux_test.go
index 79dfabe..07e46fb 100644
--- a/lookup_linux_test.go
+++ b/lookup_linux_test.go
@@ -7,9 +7,11 @@
 package securejoin
 
 import (
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
+	"slices"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -251,7 +253,7 @@ func testPartialLookup(t *testing.T, partialLookupFn partialLookupFunc) {
 }
 
 func TestPartialLookupInRoot(t *testing.T) {
-	withWithoutOpenat2(t, func(t *testing.T) {
+	withWithoutOpenat2(t, true, func(t *testing.T) {
 		testPartialLookup(t, partialLookupInRoot)
 	})
 }
@@ -263,7 +265,7 @@ func TestPartialOpenat2(t *testing.T) {
 func TestPartialLookupInRoot_BadInode(t *testing.T) {
 	requireRoot(t) // mknod
 
-	withWithoutOpenat2(t, func(t *testing.T) {
+	withWithoutOpenat2(t, true, func(t *testing.T) {
 		partialLookupFn := partialLookupInRoot
 
 		tree := []string{
@@ -306,6 +308,271 @@ func TestPartialLookupInRoot_BadInode(t *testing.T) {
 	})
 }
 
+type racingLookupMeta struct {
+	pauseCh                                                      chan struct{}
+	passOkCount, passErrCount, skipCount, failCount, badErrCount int // test state counts
+	badNameCount, fixRemainingPathCount, unstableProcSelfFdCount int // workaround counts
+	skipErrCounts                                                map[error]int
+}
+
+func newRacingLookupMeta(pauseCh chan struct{}) *racingLookupMeta {
+	return &racingLookupMeta{
+		pauseCh:       pauseCh,
+		skipErrCounts: map[error]int{},
+	}
+}
+
+func (m *racingLookupMeta) checkPartialLookup(t *testing.T, rootDir *os.File, unsafePath string, skipErrs []error, allowedResults []lookupResult) {
+	// Similar to checkPartialLookup, but with extra logic for
+	// handling the lookup stopping partly through the lookup.
+	handle, remainingPath, err := partialLookupInRoot(rootDir, unsafePath)
+	if err != nil {
+		for _, skipErr := range skipErrs {
+			if errors.Is(err, skipErr) {
+				m.skipErrCounts[skipErr]++
+				m.skipCount++
+				return
+			}
+		}
+		for _, allowed := range allowedResults {
+			if allowed.err != nil && errors.Is(err, allowed.err) {
+				m.passErrCount++
+				return
+			}
+		}
+		// If we didn't hit any of the allowed errors, it's an
+		// unexpected error.
+		assert.NoError(t, err)
+		m.badErrCount++
+		return
+	}
+
+	var (
+		handleName string
+		realPath   string
+		unixStat   unix.Stat_t
+		realPaths  []string
+	)
+	if handle != nil {
+		handleName = handle.Name()
+
+		// Get the "proper" name from procSelfFdReadlink.
+		m.pauseCh <- struct{}{}
+		// For some reason, it seems that (at a rate of 0.01% or so) even
+		// though we pause the rename thread is a nonsensical path from
+		// procSelfFdReadlink that looks like the path is still swapped. The
+		// key thing to note is that adding sleeps doesn't seem to help much,
+		// but retrying several times usually ends up with us getting the
+		// "right" result eventually.
+		//
+		// This seems like a kernel bug (or a weird issue with Go channels),
+		// but for now let's just retry a few times and keep track of how many
+		// transitions we saw. If we only see one change, then we can use the
+		// last one and just track the statistics.
+		const readlinkRetryMax = 500
+		realPath, err = procSelfFdReadlink(handle)
+		for i := 0; err == nil && i < readlinkRetryMax; i++ {
+			if last := len(realPaths) - 1; last < 0 || realPath != realPaths[last] {
+				realPaths = append(realPaths, realPath)
+			}
+			realPath, err = procSelfFdReadlink(handle)
+		}
+		<-m.pauseCh
+		require.NoError(t, err, "get real path of returned handle")
+
+		unixStat, err = fstat(handle)
+		require.NoError(t, err, "stat handle")
+
+		_ = handle.Close()
+	}
+
+	assert.LessOrEqualf(t, len(realPaths), 2, "saw more than one transition in procSelfFdReadlink results: %v", realPaths)
+	if len(realPaths) > 1 {
+		m.unstableProcSelfFdCount++
+		assert.Contains(t, realPaths, handleName, "at least one real path should match the handle name if we got more than one real path")
+	}
+
+	if realPath != handleName {
+		// It's possible for handle.Name() to be wrong because while it was
+		// correct when it was set, it might not match if the path was swapped
+		// afterwards (for both openat2 and partialLookupInRoot).
+		m.badNameCount++
+	}
+
+	// It's possible for lookups with ".." components to decide to cut off the
+	// lookup partially through the resolution when dealing with a swapping
+	// attack, so for the purposes of validating our tests we clean up the
+	// remainingPath so that it has all of the ".." components removed (but
+	// include this in our statistics).
+	fullLogicalPath := filepath.Join(realPath, remainingPath)
+	newRemainingPath, err := filepath.Rel(realPath, fullLogicalPath)
+	require.NoErrorf(t, err, "clean remaining path %s", remainingPath)
+	if remainingPath != newRemainingPath {
+		m.fixRemainingPathCount++
+	}
+	remainingPath = newRemainingPath
+
+	gotResult := lookupResult{
+		handlePath:    realPath,
+		remainingPath: remainingPath,
+		fileType:      unixStat.Mode & unix.S_IFMT,
+	}
+	counter := &m.passOkCount
+	if !assert.Contains(t, allowedResults, gotResult) {
+		counter = &m.failCount
+	}
+	(*counter)++
+}
+
+func TestPartialLookup_RacingRename(t *testing.T) {
+	if !hasRenameExchange() {
+		t.Skip("test requires RENAME_EXCHANGE support")
+	}
+
+	withWithoutOpenat2(t, false, func(t *testing.T) {
+		tree := []string{
+			"dir a/b/c/d",
+			"symlink b-link ../b/../b/../b/../b/../b/../b/../b/../b/../b/../b/../b/../b/../b",
+			"symlink c-link ../../b/c/../../b/c/../../b/c/../../b/c/../../b/c/../../b/c/../../b/c/../../b/c/../../b/c",
+			"file file",
+			"symlink bad-link /foobar",
+		}
+
+		var (
+			handlePath      = "/a/b/c/d"
+			remainingPath   = "e"
+			defaultExpected []lookupResult
+		)
+		// The lookup could stop at any component other than /a, so allow all
+		// of them.
+		for handlePath != "/" {
+			defaultExpected = append(defaultExpected, lookupResult{
+				handlePath:    handlePath,
+				remainingPath: remainingPath,
+				fileType:      unix.S_IFDIR,
+			})
+			handlePath, remainingPath = filepath.Dir(handlePath), filepath.Join(filepath.Base(handlePath), remainingPath)
+		}
+		for name, test := range map[string]struct {
+			subPathA, subPathB string
+			unsafePath         string
+			skipErrs           []error
+			allowedResults     []lookupResult
+		}{
+			// Swap a symlink in and out.
+			"swap-dir-link1-basic":   {"a/b", "b-link", "a/b/c/d/e", nil, slices.Clone(defaultExpected)},
+			"swap-dir-link2-basic":   {"a/b/c", "c-link", "a/b/c/d/e", nil, slices.Clone(defaultExpected)},
+			"swap-dir-link1-dotdot1": {"a/b", "b-link", "a/b/../b/../b/../b/../b/../b/../b/c/d/../d/../d/../d/../d/../d/e", nil, slices.Clone(defaultExpected)},
+			"swap-dir-link1-dotdot2": {"a/b", "b-link", "a/b/c/../c/../c/../c/../c/../c/../c/d/../d/../d/../d/../d/../d/e", nil, slices.Clone(defaultExpected)},
+			"swap-dir-link2-dotdot":  {"a/b/c", "c-link", "a/b/c/../c/../c/../c/../c/../c/../c/d/../d/../d/../d/../d/../d/e", nil, slices.Clone(defaultExpected)},
+			// TODO: Swap a directory.
+			// Swap a non-directory.
+			"swap-dir-file-basic": {"a/b", "file", "a/b/c/d/e", []error{unix.ENOTDIR, unix.ENOENT}, append(
+				// We could hit one of the final paths.
+				slices.Clone(defaultExpected),
+				// We could hit the file and stop resolving.
+				lookupResult{handlePath: "/file", remainingPath: "c/d/e", fileType: unix.S_IFREG},
+			)},
+			"swap-dir-file-dotdot": {"a/b", "file", "a/b/c/../c/../c/../c/../c/../c/../c/d/../d/../d/../d/../d/../d/e", []error{unix.ENOTDIR, unix.ENOENT}, append(
+				// We could hit one of the final paths.
+				slices.Clone(defaultExpected),
+				// We could hit the file and stop resolving.
+				lookupResult{handlePath: "/file", remainingPath: "c/d/e", fileType: unix.S_IFREG},
+			)},
+			// Swap a dangling symlink.
+			"swap-dir-danglinglink-basic":  {"a/b", "bad-link", "a/b/c/d/e", []error{unix.ENOENT}, slices.Clone(defaultExpected)},
+			"swap-dir-danglinglink-dotdot": {"a/b", "bad-link", "a/b/c/../c/../c/../c/../c/../c/../c/d/../d/../d/../d/../d/../d/e", []error{unix.ENOENT}, slices.Clone(defaultExpected)},
+			// Swap the root.
+			"swap-root-basic":  {".", "../outsideroot", "a/b/c/d/e", nil, slices.Clone(defaultExpected)},
+			"swap-root-dotdot": {".", "../outsideroot", "a/b/../../a/b/../../a/b/../../a/b/../../a/b/../../a/b/../../a/b/../../a/b/c/d/e", nil, slices.Clone(defaultExpected)},
+			// Swap one of our walking paths outside the root.
+			"swap-dir-outsideroot-basic": {"a/b", "../outsideroot", "a/b/c/d/e", nil, append(
+				// We could hit the expected path.
+				slices.Clone(defaultExpected),
+				// We could also land in the "outsideroot" path. This is okay
+				// because there was a moment when this directory was inside
+				// the root, and the attacker moved it outside the root. If we
+				// were to go into "..", the lookup would've failed (and we
+				// would get an error here if that wasn't the case).
+				lookupResult{handlePath: "../outsideroot", remainingPath: "c/d/e", fileType: unix.S_IFDIR},
+				lookupResult{handlePath: "../outsideroot/c", remainingPath: "d/e", fileType: unix.S_IFDIR},
+				lookupResult{handlePath: "../outsideroot/c/d", remainingPath: "e", fileType: unix.S_IFDIR},
+			)},
+			"swap-dir-outsideroot-dotdot": {"a/b", "../outsideroot", "a/b/../../a/b/../../a/b/../../a/b/../../a/b/../../a/b/../../a/b/../../a/b/c/d/e", nil, append(
+				// We could hit the expected path.
+				slices.Clone(defaultExpected),
+				// We could also land in the "outsideroot" path. This is okay
+				// because there was a moment when this directory was inside
+				// the root, and the attacker moved it outside the root.
+				//
+				// Neither openat2 nor partialLookupInRoot will allow us to
+				// walk into ".." in this case (escaping the root), and we
+				// would catch that if it did happen.
+				lookupResult{handlePath: "../outsideroot", remainingPath: "c/d/e", fileType: unix.S_IFDIR},
+				lookupResult{handlePath: "../outsideroot/c", remainingPath: "d/e", fileType: unix.S_IFDIR},
+				lookupResult{handlePath: "../outsideroot/c/d", remainingPath: "e", fileType: unix.S_IFDIR},
+			)},
+		} {
+			test := test // copy iterator
+			test.skipErrs = append(test.skipErrs, errPossibleAttack, errPossibleBreakout)
+			t.Run(name, func(t *testing.T) {
+				root := createTree(t, tree...)
+
+				// Update the handlePath to be inside our root.
+				for idx := range test.allowedResults {
+					test.allowedResults[idx].handlePath = filepath.Join(root, test.allowedResults[idx].handlePath)
+				}
+
+				// Create an "outsideroot" path as a sibling to our root, for
+				// swapping.
+				err := os.MkdirAll(filepath.Join(root, "../outsideroot"), 0o755)
+				require.NoError(t, err)
+
+				rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+				require.NoError(t, err)
+				defer rootDir.Close()
+
+				// Run a goroutine that spams a rename in the root.
+				pauseCh := make(chan struct{})
+				exitCh := make(chan struct{})
+				defer close(exitCh)
+				go doRenameExchangeLoop(pauseCh, exitCh, rootDir, test.subPathA, test.subPathB)
+
+				// Do several runs to try to catch bugs.
+				var testRuns = 10000
+				if testing.Short() {
+					testRuns = 300
+				}
+				m := newRacingLookupMeta(pauseCh)
+				for i := 0; i < testRuns; i++ {
+					m.checkPartialLookup(t, rootDir, test.unsafePath, test.skipErrs, test.allowedResults)
+				}
+
+				pct := func(count int) string {
+					return fmt.Sprintf("%d(%.3f%%)", count, 100.0*float64(count)/float64(testRuns))
+				}
+
+				// Output some stats.
+				t.Logf("after %d runs: passOk=%s passErr=%s skip=%s fail=%s (+badErr=%s)",
+					// runs and breakdown of path-related (pass, fail) as well as skipped runs
+					testRuns, pct(m.passOkCount), pct(m.passErrCount), pct(m.skipCount), pct(m.failCount),
+					// failures due to incorrect errors (rather than bad paths)
+					pct(m.badErrCount))
+				t.Logf("  badHandleName=%s fixRemainingPath=%s unstableProcSelfFdCount=%s",
+					// stats for how many test runs had to have some "workarounds"
+					pct(m.badNameCount), pct(m.fixRemainingPathCount), pct(m.unstableProcSelfFdCount))
+				if len(m.skipErrCounts) > 0 {
+					t.Logf("  skipErr breakdown:")
+					for err, count := range m.skipErrCounts {
+						t.Logf("   %3.d: %v", count, err)
+					}
+				}
+			})
+		}
+
+	})
+}
+
 type ssOperation interface {
 	String() string
 	Do(*testing.T, *symlinkStack) error
diff --git a/mkdir_linux_test.go b/mkdir_linux_test.go
index c2afd09..879e62e 100644
--- a/mkdir_linux_test.go
+++ b/mkdir_linux_test.go
@@ -7,6 +7,8 @@
 package securejoin
 
 import (
+	"errors"
+	"fmt"
 	"os"
 	"path/filepath"
 	"testing"
@@ -46,7 +48,7 @@ func testMkdirAll_Basic(t *testing.T, mkdirAll func(t *testing.T, root, unsafePa
 		"symlink loop/link ../loop/link",
 	}
 
-	withWithoutOpenat2(t, func(t *testing.T) {
+	withWithoutOpenat2(t, true, func(t *testing.T) {
 		for name, test := range map[string]struct {
 			unsafePath  string
 			expectedErr error
@@ -240,3 +242,224 @@ func TestMkdirAllHandle_InvalidMode(t *testing.T) {
 		return nil
 	})
 }
+
+type racingMkdirMeta struct {
+	passOkCount, passErrCount, failCount int
+	passErrCounts                        map[error]int
+}
+
+func newRacingMkdirMeta() *racingMkdirMeta {
+	return &racingMkdirMeta{
+		passErrCounts: map[error]int{},
+	}
+}
+
+func (m *racingMkdirMeta) checkMkdirAllHandle_Racing(t *testing.T, root, unsafePath string, mode int, allowedErrs []error) {
+	rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+	require.NoError(t, err, "open root")
+	defer rootDir.Close()
+
+	handle, err := MkdirAllHandle(rootDir, unsafePath, mode)
+	if err != nil {
+		for _, allowedErr := range allowedErrs {
+			if errors.Is(err, allowedErr) {
+				m.passErrCounts[allowedErr]++
+				m.passErrCount++
+				return
+			}
+		}
+		assert.NoError(t, err)
+		m.failCount++
+		return
+	}
+	defer handle.Close()
+
+	// Make sure the handle has the right owner/mode.
+	unixStat, err := fstat(handle)
+	require.NoError(t, err, "stat mkdirall handle")
+	assert.Equal(t, uint32(unix.S_IFDIR|mode), unixStat.Mode, "mkdirall handle mode")
+	assert.Equal(t, uint32(unix.Geteuid()), unixStat.Uid, "mkdirall handle uid")
+	assert.Equal(t, uint32(unix.Getegid()), unixStat.Gid, "mkdirall handle gid")
+	// TODO: Does it make sense to even try to check the handle path?
+	m.passOkCount++
+}
+
+func TestMkdirAllHandle_RacingRename(t *testing.T) {
+	withWithoutOpenat2(t, false, func(t *testing.T) {
+		treeSpec := []string{
+			"dir target/a/b/c",
+			"dir swapdir-empty-ok ::0711",
+			"dir swapdir-empty-badmode ::0777",
+			"dir swapdir-nonempty1 ::0711",
+			"file swapdir-nonempty1/aaa",
+			"dir swapdir-nonempty2 ::0711",
+			"dir swapdir-nonempty2/f ::0711",
+			"file swapfile foobar ::0711",
+		}
+
+		type test struct {
+			name         string
+			pathA, pathB string
+			unsafePath   string
+			allowedErrs  []error
+		}
+
+		tests := []test{
+			{"good", "target/a/b/c/d/e", "swapdir-empty-ok", "target/a/b/c/d/e/f/g/h/i/j/k", nil},
+			{"trailing", "target/a/b/c/d/e", "swapdir-empty-badmode", "target/a/b/c/d/e", []error{errPossibleAttack}},
+			{"partial", "target/a/b/c/d/e", "swapdir-empty-badmode", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errPossibleAttack}},
+			{"trailing", "target/a/b/c/d/e", "swapdir-nonempty1", "target/a/b/c/d/e", []error{errPossibleAttack}},
+			{"partial", "target/a/b/c/d/e", "swapdir-nonempty1", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errPossibleAttack}},
+			{"trailing", "target/a/b/c/d/e", "swapdir-nonempty2", "target/a/b/c/d/e", []error{errPossibleAttack}},
+			{"partial", "target/a/b/c/d/e", "swapdir-nonempty2", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errPossibleAttack}},
+			{"trailing", "target/a/b/c/d/e", "swapfile", "target/a/b/c/d/e", []error{unix.ENOTDIR}},
+			{"partial", "target/a/b/c/d/e", "swapfile", "target/a/b/c/d/e/f/g/h/i/j/k", []error{unix.ENOTDIR}},
+		}
+
+		if unix.Geteuid() == 0 {
+			// Add some wrong-uid cases if we are root.
+			treeSpec = append(treeSpec,
+				"dir swapdir-empty-badowner1 123:0:0711",
+				"dir swapdir-empty-badowner2 0:456:0711",
+				"dir swapdir-empty-badowner3 111:222:0711",
+			)
+			tests = append(tests, []test{
+				{"trailing", "target/a/b/c/d/e", "swapdir-empty-badowner1", "target/a/b/c/d/e", []error{errPossibleAttack}},
+				{"partial", "target/a/b/c/d/e", "swapdir-empty-badowner1", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errPossibleAttack}},
+				{"trailing", "target/a/b/c/d/e", "swapdir-empty-badowner2", "target/a/b/c/d/e", []error{errPossibleAttack}},
+				{"partial", "target/a/b/c/d/e", "swapdir-empty-badowner2", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errPossibleAttack}},
+				{"trailing", "target/a/b/c/d/e", "swapdir-empty-badowner3", "target/a/b/c/d/e", []error{errPossibleAttack}},
+				{"partial", "target/a/b/c/d/e", "swapdir-empty-badowner3", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errPossibleAttack}},
+			}...)
+		}
+
+		for _, test := range tests {
+			test := test // copy iterator
+			t.Run(fmt.Sprintf("%s-%s", test.pathB, test.name), func(t *testing.T) {
+				rootCh := make(chan string)
+				defer close(rootCh)
+				go func(rootCh <-chan string) {
+					var root string
+					for {
+						select {
+						case newRoot, ok := <-rootCh:
+							if !ok {
+								return
+							}
+							root = newRoot
+						default:
+							if root != "" {
+								pathA := filepath.Join(root, test.pathA)
+								pathB := filepath.Join(root, test.pathB)
+								_ = unix.Renameat2(unix.AT_FDCWD, pathA, unix.AT_FDCWD, pathB, unix.RENAME_EXCHANGE)
+							}
+						}
+					}
+				}(rootCh)
+
+				// Do several runs to try to catch bugs.
+				var testRuns = 10000
+				if testing.Short() {
+					testRuns = 300
+				}
+				m := newRacingMkdirMeta()
+				for i := 0; i < testRuns; i++ {
+					root := createTree(t, treeSpec...)
+					rootCh <- root
+
+					m.checkMkdirAllHandle_Racing(t, root, test.unsafePath, 0o711, test.allowedErrs)
+
+					// Clean up the root after each run so we don't exhaust all
+					// space in the tmpfs.
+					_ = os.RemoveAll(root)
+				}
+
+				pct := func(count int) string {
+					return fmt.Sprintf("%d(%.3f%%)", count, 100.0*float64(count)/float64(testRuns))
+				}
+
+				// Output some stats.
+				t.Logf("after %d runs: passOk=%s passErr=%s fail=%s",
+					testRuns, pct(m.passOkCount), pct(m.passErrCount), pct(m.failCount))
+				if len(m.passErrCounts) > 0 {
+					t.Logf("  passErr breakdown:")
+					for err, count := range m.passErrCounts {
+						t.Logf("   %3.d: %v", count, err)
+					}
+				}
+			})
+		}
+	})
+}
+
+func TestMkdirAllHandle_RacingDelete(t *testing.T) {
+	withWithoutOpenat2(t, false, func(t *testing.T) {
+		treeSpec := []string{
+			"dir target/a/b/c",
+		}
+
+		for _, test := range []struct {
+			name        string
+			rmPath      string
+			unsafePath  string
+			allowedErrs []error
+		}{
+			{"rm-top", "target", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errInvalidDirectory, unix.ENOENT}},
+			{"rm-existing", "target/a/b/c", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errInvalidDirectory, unix.ENOENT}},
+			{"rm-nonexisting", "target/a/b/c/d/e", "target/a/b/c/d/e/f/g/h/i/j/k", []error{errInvalidDirectory, unix.ENOENT}},
+		} {
+			test := test // copy iterator
+			t.Run(test.rmPath, func(t *testing.T) {
+				rootCh := make(chan string)
+				defer close(rootCh)
+				go func(rootCh <-chan string) {
+					var root string
+					for {
+						select {
+						case newRoot, ok := <-rootCh:
+							if !ok {
+								return
+							}
+							root = newRoot
+						default:
+							if root != "" {
+								_ = os.RemoveAll(filepath.Join(root, test.rmPath))
+							}
+						}
+					}
+				}(rootCh)
+
+				// Do several runs to try to catch bugs.
+				var testRuns = 10000
+				if testing.Short() {
+					testRuns = 300
+				}
+				m := newRacingMkdirMeta()
+				for i := 0; i < testRuns; i++ {
+					root := createTree(t, treeSpec...)
+					rootCh <- root
+
+					m.checkMkdirAllHandle_Racing(t, root, test.unsafePath, 0o711, test.allowedErrs)
+
+					// Clean up the root after each run so we don't exhaust all
+					// space in the tmpfs.
+					_ = os.RemoveAll(root + "/..")
+				}
+
+				pct := func(count int) string {
+					return fmt.Sprintf("%d(%.3f%%)", count, 100.0*float64(count)/float64(testRuns))
+				}
+
+				// Output some stats.
+				t.Logf("after %d runs: passOk=%s passErr=%s fail=%s",
+					testRuns, pct(m.passOkCount), pct(m.passErrCount), pct(m.failCount))
+				if len(m.passErrCounts) > 0 {
+					t.Logf("  passErr breakdown:")
+					for err, count := range m.passErrCounts {
+						t.Logf("   %3.d: %v", count, err)
+					}
+				}
+			})
+		}
+	})
+}
diff --git a/procfs_linux_test.go b/procfs_linux_test.go
index 57a37d0..947f387 100644
--- a/procfs_linux_test.go
+++ b/procfs_linux_test.go
@@ -159,7 +159,7 @@ func testProcOvermountSubdir(t *testing.T, procRootFn procRootFunc, expectOvermo
 }
 
 func TestProcOvermountSubdir_unsafeHostProcRoot(t *testing.T) {
-	withWithoutOpenat2(t, func(t *testing.T) {
+	withWithoutOpenat2(t, true, func(t *testing.T) {
 		// If we use the host /proc directly, we should see overmounts.
 		testProcOvermountSubdir(t, unsafeHostProcRoot, true)
 	})
@@ -169,7 +169,7 @@ func TestProcOvermountSubdir_newPrivateProcMount(t *testing.T) {
 	if !hasNewMountApi() {
 		t.Skip("test requires fsopen/open_tree support")
 	}
-	withWithoutOpenat2(t, func(t *testing.T) {
+	withWithoutOpenat2(t, true, func(t *testing.T) {
 		// If we create our own procfs, the overmounts shouldn't appear.
 		testProcOvermountSubdir(t, newPrivateProcMount, false)
 	})
@@ -179,7 +179,7 @@ func TestProcOvermountSubdir_clonePrivateProcMount(t *testing.T) {
 	if !hasNewMountApi() {
 		t.Skip("test requires fsopen/open_tree support")
 	}
-	withWithoutOpenat2(t, func(t *testing.T) {
+	withWithoutOpenat2(t, true, func(t *testing.T) {
 		// If we use open_tree(2), we don't use AT_RECURSIVE when running in
 		// this test (because the overmounts are not locked mounts) and so we
 		// don't expect to see overmounts.
@@ -188,7 +188,7 @@ func TestProcOvermountSubdir_clonePrivateProcMount(t *testing.T) {
 }
 
 func TestProcOvermountSubdir_doGetProcRoot(t *testing.T) {
-	withWithoutOpenat2(t, func(t *testing.T) {
+	withWithoutOpenat2(t, true, func(t *testing.T) {
 		// We expect to not get overmounts if we have the new mount API.
 		// FIXME: It's possible to hit overmounts if there are locked mounts
 		// and we hit the AT_RECURSIVE case...
@@ -200,7 +200,7 @@ func TestProcOvermountSubdir_doGetProcRoot_Mocked(t *testing.T) {
 	if !hasNewMountApi() {
 		t.Skip("test requires fsopen/open_tree support")
 	}
-	withWithoutOpenat2(t, func(t *testing.T) {
+	withWithoutOpenat2(t, true, func(t *testing.T) {
 		testForceGetProcRoot(t, func(t *testing.T, expectOvermounts bool) {
 			testProcOvermountSubdir(t, doGetProcRoot, expectOvermounts)
 		})
diff --git a/util_linux_test.go b/util_linux_test.go
index 57a65ef..b3f94f5 100644
--- a/util_linux_test.go
+++ b/util_linux_test.go
@@ -7,6 +7,7 @@
 package securejoin
 
 import (
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -24,9 +25,10 @@ func requireRoot(t *testing.T) {
 	}
 }
 
-func withWithoutOpenat2(t *testing.T, testFn func(t *testing.T)) {
-	t.Run("openat2=auto", testFn)
-
+func withWithoutOpenat2(t *testing.T, doAuto bool, testFn func(t *testing.T)) {
+	if doAuto {
+		t.Run("openat2=auto", testFn)
+	}
 	for _, useOpenat2 := range []bool{true, false} {
 		useOpenat2 := useOpenat2 // copy iterator
 		t.Run(fmt.Sprintf("openat2=%v", useOpenat2), func(t *testing.T) {
@@ -81,15 +83,48 @@ func testForceProcThreadSelf(t *testing.T, testFn func(t *testing.T)) {
 	}
 }
 
+func hasRenameExchange() bool {
+	err := unix.Renameat2(unix.AT_FDCWD, ".", unix.AT_FDCWD, ".", unix.RENAME_EXCHANGE)
+	return !errors.Is(err, unix.ENOSYS)
+}
+
+func doRenameExchangeLoop(pauseCh chan struct{}, exitCh <-chan struct{}, dir *os.File, pathA, pathB string) {
+	var swapped bool
+	swap := func() {
+		_ = unix.Renameat2(int(dir.Fd()), pathA, int(dir.Fd()), pathB, unix.RENAME_EXCHANGE)
+		//unix.Sync()
+		swapped = !swapped
+	}
+
+	for {
+		select {
+		case <-exitCh:
+			return
+		case <-pauseCh:
+			if swapped {
+				swap()
+			}
+			// Wait for caller to unpause us.
+			select {
+			case pauseCh <- struct{}{}:
+			case <-exitCh:
+				return
+			}
+		default:
+			swap()
+		}
+	}
+}
+
 // Format:
 //
-//	dir <name>
-//	file <name> <?content>
+//	dir <name> <?uid:gid:mode>
+//	file <name> <?content> <?uid:gid:mode>
 //	symlink <name> <target>
-//	char <name> <major> <minor>
-//	block <name> <major> <minor>
-//	fifo <name>
-//	sock <name>
+//	char <name> <major> <minor> <?uid:gid:mode>
+//	block <name> <major> <minor> <?uid:gid:mode>
+//	fifo <name> <?uid:gid:mode>
+//	sock <name> <?uid:gid:mode>
 func createInTree(t *testing.T, root, spec string) {
 	f := strings.Fields(spec)
 	if len(f) < 2 {
@@ -97,8 +132,13 @@ func createInTree(t *testing.T, root, spec string) {
 	}
 	inoType, subPath, f := f[0], f[1], f[2:]
 	fullPath := filepath.Join(root, subPath)
+
+	var setOwnerMode *string
 	switch inoType {
 	case "dir":
+		if len(f) >= 1 {
+			setOwnerMode = &f[0]
+		}
 		err := os.MkdirAll(fullPath, 0o755)
 		require.NoError(t, err)
 	case "file":
@@ -106,6 +146,9 @@ func createInTree(t *testing.T, root, spec string) {
 		if len(f) >= 1 {
 			contents = []byte(f[0])
 		}
+		if len(f) >= 2 {
+			setOwnerMode = &f[1]
+		}
 		err := os.WriteFile(fullPath, contents, 0o644)
 		require.NoError(t, err)
 	case "symlink":
@@ -119,6 +162,9 @@ func createInTree(t *testing.T, root, spec string) {
 		if len(f) < 2 {
 			t.Fatalf("invalid spec %q", spec)
 		}
+		if len(f) >= 3 {
+			setOwnerMode = &f[2]
+		}
 
 		major, err := strconv.Atoi(f[0])
 		require.NoErrorf(t, err, "mknod %s: parse major", subPath)
@@ -136,6 +182,9 @@ func createInTree(t *testing.T, root, spec string) {
 		err = unix.Mknod(fullPath, mode, int(dev))
 		require.NoErrorf(t, err, "mknod (%s %d:%d) %s", inoType, major, minor, fullPath)
 	case "fifo", "sock":
+		if len(f) >= 1 {
+			setOwnerMode = &f[0]
+		}
 		var mode uint32 = 0o644
 		switch inoType {
 		case "fifo":
@@ -146,6 +195,28 @@ func createInTree(t *testing.T, root, spec string) {
 		err := unix.Mknod(fullPath, mode, 0)
 		require.NoErrorf(t, err, "mk%s %s", inoType, fullPath)
 	}
+	if setOwnerMode != nil {
+		// <?uid>:<?gid>:<?mode>
+		fields := strings.Split(*setOwnerMode, ":")
+		require.Lenf(t, fields, 3, "set owner-mode format uid:gid:mode")
+		uidStr, gidStr, modeStr := fields[0], fields[1], fields[2]
+
+		if uidStr != "" && gidStr != "" {
+			uid, err := strconv.Atoi(uidStr)
+			require.NoErrorf(t, err, "chown %s: parse uid", fullPath)
+			gid, err := strconv.Atoi(gidStr)
+			require.NoErrorf(t, err, "chown %s: parse gid", fullPath)
+			err = unix.Chown(fullPath, uid, gid)
+			require.NoErrorf(t, err, "chown %s", fullPath)
+		}
+
+		if modeStr != "" {
+			mode, err := strconv.ParseUint(modeStr, 8, 32)
+			require.NoErrorf(t, err, "chmod %s: parse mode", fullPath)
+			err = unix.Chmod(fullPath, uint32(mode))
+			require.NoErrorf(t, err, "chmod %s", fullPath)
+		}
+	}
 }
 
 func createTree(t *testing.T, specs ...string) string {