File 0006-bsc1190670-seccomp-add-support-for-clone3-syscall-in.patch of Package docker.21393

From 9cc9665d00293bdff2420a4db49278bc7bb9ed72 Mon Sep 17 00:00:00 2001
From: Tianon Gravi <admwiggin@gmail.com>
Date: Thu, 9 Sep 2021 11:31:30 -0700
Subject: [PATCH 6/6] bsc1190670: seccomp: add support for "clone3" syscall in
 default policy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a backport of 9f6b562dd12ef7b1f9e2f8e6f2ab6477790a6594, adapted to avoid the refactoring that happened in d92739713c633c155c0f3d8065c8278b1d8a44e7.

Original commit message is as follows:

> If no seccomp policy is requested, then the built-in default policy in
> dockerd applies. This has no rule for "clone3" defined, nor any default
> errno defined. So when runc receives the config it attempts to determine
> a default errno, using logic defined in its commit:
>
>   opencontainers/runc@7a8d716
>
> As explained in the above commit message, runc uses a heuristic to
> decide which errno to return by default:
>
> [quote]
>   The solution applied here is to prepend a "stub" filter which returns
>   -ENOSYS if the requested syscall has a larger syscall number than any
>   syscall mentioned in the filter. The reason for this specific rule is
>   that syscall numbers are (roughly) allocated sequentially and thus newer
>   syscalls will (usually) have a larger syscall number -- thus causing our
>   filters to produce -ENOSYS if the filter was written before the syscall
>   existed.
> [/quote]
>
> Unfortunately clone3 appears to one of the edge cases that does not
> result in use of ENOSYS, instead ending up with the historical EPERM
> errno.
>
> Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use
> clone3 by default. If it sees ENOSYS then it will automatically
> fallback to using clone. Any other errno is treated as a fatal
> error. Thus when docker seccomp policy triggers EPERM from clone3,
> no fallback occurs and programs are thus unable to spawn threads.
>
> The clone3 syscall is much more complicated than clone, most notably its
> flags are not exposed as a directly argument any more. Instead they are
> hidden inside a struct. This means that seccomp filters are unable to
> apply policy based on values seen in flags. Thus we can't directly
> replicate the current "clone" filtering for "clone3". We can at least
> ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone"
> at which point we can filter on flags.

SUSE-Bugs: bsc#1190670
Signed-off-by: Tianon Gravi <admwiggin@gmail.com>
Co-authored-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 profiles/seccomp/default.json     | 16 ++++++++++++++++
 profiles/seccomp/default_linux.go | 13 +++++++++++++
 profiles/seccomp/seccomp.go       |  1 +
 profiles/seccomp/seccomp_linux.go | 28 ++++++++++++----------------
 4 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/profiles/seccomp/default.json b/profiles/seccomp/default.json
index 4213799ddb5c..ee5e04f781a8 100644
--- a/profiles/seccomp/default.json
+++ b/profiles/seccomp/default.json
@@ -591,6 +591,7 @@
 			"names": [
 				"bpf",
 				"clone",
+				"clone3",
 				"fanotify_init",
 				"fsconfig",
 				"fsmount",
@@ -670,6 +671,21 @@
 				]
 			}
 		},
+		{
+			"names": [
+				"clone3"
+			],
+			"action": "SCMP_ACT_ERRNO",
+			"errnoRet": 38,
+			"args": [],
+			"comment": "",
+			"includes": {},
+			"excludes": {
+				"caps": [
+					"CAP_SYS_ADMIN"
+				]
+			}
+		},
 		{
 			"names": [
 				"reboot"
diff --git a/profiles/seccomp/default_linux.go b/profiles/seccomp/default_linux.go
index 879eb88c64f1..fb593f336f7a 100644
--- a/profiles/seccomp/default_linux.go
+++ b/profiles/seccomp/default_linux.go
@@ -42,6 +42,7 @@ func arches() []Architecture {
 
 // DefaultProfile defines the allowed syscalls for the default seccomp profile.
 func DefaultProfile() *Seccomp {
+	nosys := uint(unix.ENOSYS)
 	syscalls := []*Syscall{
 		{
 			Names: []string{
@@ -522,6 +523,7 @@ func DefaultProfile() *Seccomp {
 			Names: []string{
 				"bpf",
 				"clone",
+				"clone3",
 				"fanotify_init",
 				"fsconfig",
 				"fsmount",
@@ -587,6 +589,17 @@ func DefaultProfile() *Seccomp {
 				Caps: []string{"CAP_SYS_ADMIN"},
 			},
 		},
+		{
+			Names: []string{
+				"clone3",
+			},
+			Action:   specs.ActErrno,
+			ErrnoRet: &nosys,
+			Args:     []*specs.LinuxSeccompArg{},
+			Excludes: Filter{
+				Caps: []string{"CAP_SYS_ADMIN"},
+			},
+		},
 		{
 			Names: []string{
 				"reboot",
diff --git a/profiles/seccomp/seccomp.go b/profiles/seccomp/seccomp.go
index d2a21cddc4b2..9edec72db546 100644
--- a/profiles/seccomp/seccomp.go
+++ b/profiles/seccomp/seccomp.go
@@ -45,6 +45,7 @@ type Syscall struct {
 	Name     string                   `json:"name,omitempty"`
 	Names    []string                 `json:"names,omitempty"`
 	Action   specs.LinuxSeccompAction `json:"action"`
+	ErrnoRet *uint                    `json:"errnoRet,omitempty"`
 	Args     []*specs.LinuxSeccompArg `json:"args"`
 	Comment  string                   `json:"comment"`
 	Includes Filter                   `json:"includes"`
diff --git a/profiles/seccomp/seccomp_linux.go b/profiles/seccomp/seccomp_linux.go
index 566f173acd3a..e35e242cd500 100644
--- a/profiles/seccomp/seccomp_linux.go
+++ b/profiles/seccomp/seccomp_linux.go
@@ -150,29 +150,25 @@ Loop:
 			}
 		}
 
+		newCall := specs.LinuxSyscall{
+			Action:   call.Action,
+			ErrnoRet: call.ErrnoRet,
+		}
 		if call.Name != "" && len(call.Names) != 0 {
 			return nil, errors.New("'name' and 'names' were specified in the seccomp profile, use either 'name' or 'names'")
 		}
-
 		if call.Name != "" {
-			newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall([]string{call.Name}, call.Action, call.Args))
+			newCall.Names = []string{call.Name}
 		} else {
-			newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall(call.Names, call.Action, call.Args))
+			newCall.Names = call.Names
+		}
+		// Loop through all the arguments of the syscall and convert them
+		for _, arg := range call.Args {
+			newCall.Args = append(newCall.Args, *arg)
 		}
-	}
-
-	return newConfig, nil
-}
 
-func createSpecsSyscall(names []string, action specs.LinuxSeccompAction, args []*specs.LinuxSeccompArg) specs.LinuxSyscall {
-	newCall := specs.LinuxSyscall{
-		Names:  names,
-		Action: action,
+		newConfig.Syscalls = append(newConfig.Syscalls, newCall)
 	}
 
-	// Loop through all the arguments of the syscall and convert them
-	for _, arg := range args {
-		newCall.Args = append(newCall.Args, *arg)
-	}
-	return newCall
+	return newConfig, nil
 }
-- 
2.33.0

openSUSE Build Service is sponsored by