File CVE-2024-21626.patch of Package runc.32152

From 98f4aa981a769cb51549573b7224f2ac0432e56b Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 18 Jan 2024 11:48:01 +1100
Subject: [PATCH] CVE-2024-21626

This is a roll-up of the following patches provided by the upstream
advisory[1]:

 * Fix File to Close
 * init: verify after chdir that cwd is inside the container
 * setns init: do explicit lookup of execve argument early
 * init: close internal fds before execve
 * cgroup: plug leaks of /sys/fs/cgroup handle
 * libcontainer: mark all non-stdio fds O_CLOEXEC before spawning init

The embargo date is *2024-01-31 20:00:00 UTC*.

[1]: https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv

Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/cgroups/file.go             | 31 +++++-----
 libcontainer/cgroups/fs/paths.go         |  1 +
 libcontainer/container_linux.go          |  9 +++
 libcontainer/init_linux.go               | 31 ++++++++++
 libcontainer/integration/seccomp_test.go | 20 +++----
 libcontainer/logs/logs.go                |  9 +++
 libcontainer/setns_init_linux.go         | 37 +++++++++++-
 libcontainer/standard_init_linux.go      | 19 +++++++
 libcontainer/utils/utils_unix.go         | 72 +++++++++++++++++++++---
 update.go                                |  1 +
 10 files changed, 196 insertions(+), 34 deletions(-)

diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go
index 48b263a1661e..f6e1b73bd921 100644
--- a/libcontainer/cgroups/file.go
+++ b/libcontainer/cgroups/file.go
@@ -77,16 +77,16 @@ var (
 	// TestMode is set to true by unit tests that need "fake" cgroupfs.
 	TestMode bool
 
-	cgroupFd     int = -1
-	prepOnce     sync.Once
-	prepErr      error
-	resolveFlags uint64
+	cgroupRootHandle *os.File
+	prepOnce         sync.Once
+	prepErr          error
+	resolveFlags     uint64
 )
 
 func prepareOpenat2() error {
 	prepOnce.Do(func() {
 		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
-			Flags: unix.O_DIRECTORY | unix.O_PATH,
+			Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
 		})
 		if err != nil {
 			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
@@ -97,15 +97,16 @@ func prepareOpenat2() error {
 			}
 			return
 		}
+		file := os.NewFile(uintptr(fd), cgroupfsDir)
+
 		var st unix.Statfs_t
-		if err = unix.Fstatfs(fd, &st); err != nil {
+		if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
 			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
 			logrus.Warnf("falling back to securejoin: %s", prepErr)
 			return
 		}
 
-		cgroupFd = fd
-
+		cgroupRootHandle = file
 		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
 		if st.Type == unix.CGROUP2_SUPER_MAGIC {
 			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
@@ -132,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		return openFallback(path, flags, mode)
 	}
 
-	fd, err := unix.Openat2(cgroupFd, relPath,
+	fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
 		&unix.OpenHow{
 			Resolve: resolveFlags,
 			Flags:   uint64(flags) | unix.O_CLOEXEC,
@@ -140,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		})
 	if err != nil {
 		err = &os.PathError{Op: "openat2", Path: path, Err: err}
-		// Check if cgroupFd is still opened to cgroupfsDir
+		// Check if cgroupRootHandle is still opened to cgroupfsDir
 		// (happens when this package is incorrectly used
 		// across the chroot/pivot_root/mntns boundary, or
 		// when /sys/fs/cgroup is remounted).
 		//
 		// TODO: if such usage will ever be common, amend this
-		// to reopen cgroupFd and retry openat2.
-		fdStr := strconv.Itoa(cgroupFd)
+		// to reopen cgroupRootHandle and retry openat2.
+		fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
 		fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
 		if fdDest != cgroupfsDir {
-			// Wrap the error so it is clear that cgroupFd
+			// Wrap the error so it is clear that cgroupRootHandle
 			// is opened to an unexpected/wrong directory.
-			err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
-				fdStr, fdDest, cgroupfsDir, err)
+			err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
+				cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
 		}
 		return nil, err
 	}
diff --git a/libcontainer/cgroups/fs/paths.go b/libcontainer/cgroups/fs/paths.go
index 1092331b25d8..2cb970a3d55b 100644
--- a/libcontainer/cgroups/fs/paths.go
+++ b/libcontainer/cgroups/fs/paths.go
@@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string {
 	if err != nil {
 		return ""
 	}
+	defer dir.Close()
 	names, err := dir.Readdirnames(1)
 	if err != nil {
 		return ""
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 59aa0338ac6d..3a329806849a 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -353,6 +353,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) {
 		}()
 	}
 
+	// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
+	// to make sure we don't leak any files into "runc init". Any files to be
+	// passed to "runc init" throuhg ExtraFiles will get dup2'd by the Go
+	// runtime and thus their O_CLOEXEC flag will be cleared. This is some
+	// additional protection against attacks like CVE-2024-21626, by making
+	// sure we never leak files to "runc init" we didn't intend to.
+	if err := utils.CloseExecFrom(3); err != nil {
+		return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
+	}
 	if err := parent.start(); err != nil {
 		return fmt.Errorf("unable to start container process: %w", err)
 	}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 5b88c71fc83a..057b30669811 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"net"
 	"os"
+	"path/filepath"
 	"strings"
 	"unsafe"
 
@@ -135,6 +136,32 @@ func populateProcessEnvironment(env []string) error {
 	return nil
 }
 
+// verifyCwd ensures that the current directory is actually inside the mount
+// namespace root of the current process.
+func verifyCwd() error {
+	// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
+	// current mount namespace root, and in that case prefixes "(unreachable)"
+	// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
+	// when this happens and return ENOENT rather than returning a non-absolute
+	// path. In both cases we can therefore easily detect if we have an invalid
+	// cwd by checking the return value of getcwd(3). See getcwd(3) for more
+	// details, and CVE-2024-21626 for the security issue that motivated this
+	// check.
+	//
+	// We have to use unix.Getwd() here because os.Getwd() has a workaround for
+	// $PWD which involves doing stat(.), which can fail if the current
+	// directory is inaccessible to the container process.
+	if wd, err := unix.Getwd(); err == unix.ENOENT {
+		return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
+	} else if err != nil {
+		return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
+	} else if !filepath.IsAbs(wd) {
+		// We shouldn't ever hit this, but check just in case.
+		return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
+	}
+	return nil
+}
+
 // finalizeNamespace drops the caps, sets the correct user
 // and working dir, and closes any leaked file descriptors
 // before executing the command inside the namespace
@@ -193,6 +220,10 @@ func finalizeNamespace(config *initConfig) error {
 			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
 		}
 	}
+	// Make sure our final working directory is inside the container.
+	if err := verifyCwd(); err != nil {
+		return err
+	}
 	if err := system.ClearKeepCaps(); err != nil {
 		return fmt.Errorf("unable to clear keep caps: %w", err)
 	}
diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go
index 31092a0a5d21..ecdfa7957df1 100644
--- a/libcontainer/integration/seccomp_test.go
+++ b/libcontainer/integration/seccomp_test.go
@@ -13,7 +13,7 @@ import (
 	libseccomp "github.com/seccomp/libseccomp-golang"
 )
 
-func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
+func TestSeccompDenySyslogWithErrno(t *testing.T) {
 	if testing.Short() {
 		return
 	}
@@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
 		DefaultAction: configs.Allow,
 		Syscalls: []*configs.Syscall{
 			{
-				Name:     "getcwd",
+				Name:     "syslog",
 				Action:   configs.Errno,
 				ErrnoRet: &errnoRet,
 			},
@@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
 	buffers := newStdBuffers()
 	pwd := &libcontainer.Process{
 		Cwd:    "/",
-		Args:   []string{"pwd"},
+		Args:   []string{"dmesg"},
 		Env:    standardEnvironment,
 		Stdin:  buffers.Stdin,
 		Stdout: buffers.Stdout,
@@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
 	}
 
 	if exitCode == 0 {
-		t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
+		t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
 	}
 
-	expected := "pwd: getcwd: No such process"
+	expected := "dmesg: klogctl: No such process"
 	actual := strings.Trim(buffers.Stderr.String(), "\n")
 	if actual != expected {
 		t.Fatalf("Expected output %s but got %s\n", expected, actual)
 	}
 }
 
-func TestSeccompDenyGetcwd(t *testing.T) {
+func TestSeccompDenySyslog(t *testing.T) {
 	if testing.Short() {
 		return
 	}
@@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
 		DefaultAction: configs.Allow,
 		Syscalls: []*configs.Syscall{
 			{
-				Name:   "getcwd",
+				Name:   "syslog",
 				Action: configs.Errno,
 			},
 		},
@@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
 	buffers := newStdBuffers()
 	pwd := &libcontainer.Process{
 		Cwd:    "/",
-		Args:   []string{"pwd"},
+		Args:   []string{"dmesg"},
 		Env:    standardEnvironment,
 		Stdin:  buffers.Stdin,
 		Stdout: buffers.Stdout,
@@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) {
 	}
 
 	if exitCode == 0 {
-		t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
+		t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
 	}
 
-	expected := "pwd: getcwd: Operation not permitted"
+	expected := "dmesg: klogctl: Operation not permitted"
 	actual := strings.Trim(buffers.Stderr.String(), "\n")
 	if actual != expected {
 		t.Fatalf("Expected output %s but got %s\n", expected, actual)
diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go
index 95deb0d6ca7a..349a18ed3839 100644
--- a/libcontainer/logs/logs.go
+++ b/libcontainer/logs/logs.go
@@ -4,10 +4,19 @@ import (
 	"bufio"
 	"encoding/json"
 	"io"
+	"os"
 
 	"github.com/sirupsen/logrus"
 )
 
+// IsLogrusFd returns whether the provided fd matches the one that logrus is
+// currently outputting to. This should only ever be called by UnsafeCloseFrom
+// from `runc init`.
+func IsLogrusFd(fd uintptr) bool {
+	file, ok := logrus.StandardLogger().Out.(*os.File)
+	return ok && file.Fd() == fd
+}
+
 func ForwardLogs(logPipe io.ReadCloser) chan error {
 	done := make(chan error, 1)
 	s := bufio.NewScanner(logPipe)
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index 09ab552b3d12..d1bb12273c04 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"os/exec"
 	"strconv"
 
 	"github.com/opencontainers/selinux/go-selinux"
@@ -14,6 +15,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
 // linuxSetnsInit performs the container's initialization for running a new process
@@ -82,6 +84,21 @@ func (l *linuxSetnsInit) Init() error {
 	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
 		return err
 	}
+
+	// Check for the arg before waiting to make sure it exists and it is
+	// returned as a create time error.
+	name, err := exec.LookPath(l.config.Args[0])
+	if err != nil {
+		return err
+	}
+	// exec.LookPath in Go < 1.20 might return no error for an executable
+	// residing on a file system mounted with noexec flag, so perform this
+	// extra check now while we can still return a proper error.
+	// TODO: remove this once go < 1.20 is not supported.
+	if err := eaccess(name); err != nil {
+		return &os.PathError{Op: "eaccess", Path: name, Err: err}
+	}
+
 	// Set seccomp as close to execve as possible, so as few syscalls take
 	// place afterward (reducing the amount of syscalls that users need to
 	// enable in their seccomp profiles).
@@ -101,5 +118,23 @@ func (l *linuxSetnsInit) Init() error {
 		return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
 	}
 
-	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
+	// Close all file descriptors we are not passing to the container. This is
+	// necessary because the execve target could use internal runc fds as the
+	// execve path, potentially giving access to binary files from the host
+	// (which can then be opened by container processes, leading to container
+	// escapes). Note that because this operation will close any open file
+	// descriptors that are referenced by (*os.File) handles from underneath
+	// the Go runtime, we must not do any file operations after this point
+	// (otherwise the (*os.File) finaliser could close the wrong file). See
+	// CVE-2024-21626 for more information as to why this protection is
+	// necessary.
+	//
+	// This is not needed for runc-dmz, because the extra execve(2) step means
+	// that all O_CLOEXEC file descriptors have already been closed and thus
+	// the second execve(2) from runc-dmz cannot access internal file
+	// descriptors from runc.
+	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
+		return err
+	}
+	return system.Exec(name, l.config.Args[0:], os.Environ())
 }
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index c09a7bed30ea..d1d94352f93d 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -17,6 +17,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
 type linuxStandardInit struct {
@@ -258,5 +259,23 @@ func (l *linuxStandardInit) Init() error {
 		return err
 	}
 
+	// Close all file descriptors we are not passing to the container. This is
+	// necessary because the execve target could use internal runc fds as the
+	// execve path, potentially giving access to binary files from the host
+	// (which can then be opened by container processes, leading to container
+	// escapes). Note that because this operation will close any open file
+	// descriptors that are referenced by (*os.File) handles from underneath
+	// the Go runtime, we must not do any file operations after this point
+	// (otherwise the (*os.File) finaliser could close the wrong file). See
+	// CVE-2024-21626 for more information as to why this protection is
+	// necessary.
+	//
+	// This is not needed for runc-dmz, because the extra execve(2) step means
+	// that all O_CLOEXEC file descriptors have already been closed and thus
+	// the second execve(2) from runc-dmz cannot access internal file
+	// descriptors from runc.
+	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
+		return err
+	}
 	return system.Exec(name, l.config.Args[0:], os.Environ())
 }
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
index 220d0b439379..15533df1f2b2 100644
--- a/libcontainer/utils/utils_unix.go
+++ b/libcontainer/utils/utils_unix.go
@@ -7,8 +7,11 @@ import (
 	"fmt"
 	"os"
 	"strconv"
+	_ "unsafe" // for go:linkname
 
 	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/logs"
 )
 
 // EnsureProcHandle returns whether or not the given file handle is on procfs.
@@ -23,9 +26,11 @@ func EnsureProcHandle(fh *os.File) error {
 	return nil
 }
 
-// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
-// the process (except for those below the given fd value).
-func CloseExecFrom(minFd int) error {
+type fdFunc func(fd int)
+
+// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
+// the current process.
+func fdRangeFrom(minFd int, fn fdFunc) error {
 	fdDir, err := os.Open("/proc/self/fd")
 	if err != nil {
 		return err
@@ -50,15 +55,66 @@ func CloseExecFrom(minFd int) error {
 		if fd < minFd {
 			continue
 		}
-		// Intentionally ignore errors from unix.CloseOnExec -- the cases where
-		// this might fail are basically file descriptors that have already
-		// been closed (including and especially the one that was created when
-		// os.ReadDir did the "opendir" syscall).
-		unix.CloseOnExec(fd)
+		// Ignore the file descriptor we used for readdir, as it will be closed
+		// when we return.
+		if uintptr(fd) == fdDir.Fd() {
+			continue
+		}
+		// Run the closure.
+		fn(fd)
 	}
 	return nil
 }
 
+// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
+// equal to minFd in the current process.
+func CloseExecFrom(minFd int) error {
+	return fdRangeFrom(minFd, unix.CloseOnExec)
+}
+
+//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
+
+// In order to make sure we do not close the internal epoll descriptors the Go
+// runtime uses, we need to ensure that we skip descriptors that match
+// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
+// unfortunately there's no other way to be sure we're only keeping the file
+// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
+func runtime_IsPollDescriptor(fd uintptr) bool
+
+// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
+// current process, except for those critical to Go's runtime (such as the
+// netpoll management descriptors).
+//
+// NOTE: That this function is incredibly dangerous to use in most Go code, as
+// closing file descriptors from underneath *os.File handles can lead to very
+// bad behaviour (the closed file descriptor can be re-used and then any
+// *os.File operations would apply to the wrong file). This function is only
+// intended to be called from the last stage of runc init.
+func UnsafeCloseFrom(minFd int) error {
+	// We must not close some file descriptors.
+	return fdRangeFrom(minFd, func(fd int) {
+		if runtime_IsPollDescriptor(uintptr(fd)) {
+			// These are the Go runtimes internal netpoll file descriptors.
+			// These file descriptors are operated on deep in the Go scheduler,
+			// and closing those files from underneath Go can result in panics.
+			// There is no issue with keeping them because they are not
+			// executable and are not useful to an attacker anyway. Also we
+			// don't have any choice.
+			return
+		}
+		if logs.IsLogrusFd(uintptr(fd)) {
+			// Do not close the logrus output fd. We cannot exec a pipe, and
+			// the contents are quite limited (very little attacker control,
+			// JSON-encoded) making shellcode attacks unlikely.
+			return
+		}
+		// There's nothing we can do about errors from close(2), and the
+		// only likely error to be seen is EBADF which indicates the fd was
+		// already closed (in which case, we got what we wanted).
+		_ = unix.Close(fd)
+	})
+}
+
 // NewSockPair returns a new unix socket pair
 func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
 	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
diff --git a/update.go b/update.go
index 9ce5a2e835b2..6d582ddddecb 100644
--- a/update.go
+++ b/update.go
@@ -174,6 +174,7 @@ other options are ignored.
 				if err != nil {
 					return err
 				}
+				defer f.Close()
 			}
 			err = json.NewDecoder(f).Decode(&r)
 			if err != nil {
-- 
2.43.0

openSUSE Build Service is sponsored by