From b528ef292d8e993619e3eb5ab26a4f18155203f8 Mon Sep 17 00:00:00 2001 From: Ruidong Cao Date: Tue, 10 Apr 2018 23:30:22 +0800 Subject: [PATCH] cli: implement spec command Add spec command that generates a basic config.json for kata. fixes #188 Signed-off-by: Ruidong Cao Signed-off-by: Ruidong --- Gopkg.lock | 10 +- cli/main.go | 1 + cli/spec.go | 105 ++ cli/spec_test.go | 35 + .../runc/libcontainer/seccomp/config.go | 76 ++ .../libcontainer/seccomp/seccomp_linux.go | 227 +++++ .../seccomp/seccomp_unsupported.go | 24 + .../runc/libcontainer/specconv/example.go | 220 +++++ .../runc/libcontainer/specconv/spec_linux.go | 828 ++++++++++++++++ .../seccomp/libseccomp-golang/LICENSE | 22 + .../seccomp/libseccomp-golang/seccomp.go | 902 ++++++++++++++++++ .../libseccomp-golang/seccomp_internal.go | 514 ++++++++++ 12 files changed, 2963 insertions(+), 1 deletion(-) create mode 100644 cli/spec.go create mode 100644 cli/spec_test.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go create mode 100644 vendor/github.com/seccomp/libseccomp-golang/LICENSE create mode 100644 vendor/github.com/seccomp/libseccomp-golang/seccomp.go create mode 100644 vendor/github.com/seccomp/libseccomp-golang/seccomp_internal.go diff --git a/Gopkg.lock b/Gopkg.lock index 96e175c52d..e5fca2a596 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -114,6 +114,8 @@ name = "github.com/opencontainers/runc" packages = [ "libcontainer/configs", + "libcontainer/seccomp", + "libcontainer/specconv", "libcontainer/utils" ] revision = "0351df1c5a66838d0c392b4ac4cf9450de844e2d" @@ -134,6 +136,12 @@ packages = ["."] revision = "79559b488d8848b53a8e34c330140c3fc37ee246" +[[projects]] + name = "github.com/seccomp/libseccomp-golang" + packages = ["."] + revision = "e3496e3a417d1dc9ecdceca5af2513271fed37a0" + version = "v0.9.0" + [[projects]] name = "github.com/sirupsen/logrus" packages = [ @@ -249,6 +257,6 @@ [solve-meta] analyzer-name = "dep" analyzer-version = 1 - inputs-digest = "cd6f57ab58ff674de73039bb4337cd1f2b24d069199704c510f8950e1121f395" + inputs-digest = "dd9532d9bae03bb3dc43414908beecb8b498c14bea354e261341ff81dbf01b4d" solver-name = "gps-cdcl" solver-version = 1 diff --git a/cli/main.go b/cli/main.go index 5dbfdb3e13..420d615f97 100644 --- a/cli/main.go +++ b/cli/main.go @@ -119,6 +119,7 @@ var runtimeCommands = []cli.Command{ psCLICommand, resumeCLICommand, runCLICommand, + specCLICommand, startCLICommand, stateCLICommand, versionCLICommand, diff --git a/cli/spec.go b/cli/spec.go new file mode 100644 index 0000000000..ba367c8b20 --- /dev/null +++ b/cli/spec.go @@ -0,0 +1,105 @@ +// Copyright (c) 2014,2015,2016,2017 Docker, Inc. +// Copyright (c) 2018 Huawei Corporation. +// +// SPDX-License-Identifier: Apache-2.0 +// + +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + + "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/urfave/cli" +) + +var specCLICommand = cli.Command{ + Name: "spec", + Usage: "create a new specification file", + ArgsUsage: "", + Description: `The spec command creates the new specification file named "` + specConfig + `" for +the bundle. + +The spec generated is just a starter file. Editing of the spec is required to +achieve desired results. For example, the newly generated spec includes an args +parameter that is initially set to call the "sh" command when the container is +started. Calling "sh" may work for an ubuntu container or busybox, but will not +work for containers that do not include the "sh" program. + +EXAMPLE: + To run docker's hello-world container one needs to set the args parameter +in the spec to call hello. This can be done using the sed command or a text +editor. The following commands create a bundle for hello-world, change the +default args parameter in the spec from "sh" to "/hello", then run the hello +command in a new hello-world container named container1: + + mkdir hello + cd hello + docker pull hello-world + docker export $(docker create hello-world) > hello-world.tar + mkdir rootfs + tar -C rootfs -xf hello-world.tar + kata-runtime spec + sed -i 's;"sh";"/hello";' ` + specConfig + ` + kata-runtime run container1 + +In the run command above, "container1" is the name for the instance of the +container that you are starting. The name you provide for the container instance +must be unique on your host. + +An alternative for generating a customized spec config is to use "oci-runtime-tool", the +sub-command "oci-runtime-tool generate" has lots of options that can be used to do any +customizations as you want, see runtime-tools (https://github.com/opencontainers/runtime-tools) +to get more information. + +When starting a container through kata-runtime, kata-runtime needs root privilege. If not +already running as root, you can use sudo to give kata-runtime root privilege. For +example: "sudo kata-runtime start container1" will give kata-runtime root privilege to start the +container on your host. + +Alternatively, you can start a rootless container, which has the ability to run +without root privileges. For this to work, the specification file needs to be +adjusted accordingly. You can pass the parameter --rootless to this command to +generate a proper rootless spec file.`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: "path to the root of the bundle directory", + }, + }, + Action: func(context *cli.Context) error { + spec := specconv.Example() + + checkNoFile := func(name string) error { + _, err := os.Stat(name) + if err == nil { + return fmt.Errorf("File %s exists. Remove it first", name) + } + if !os.IsNotExist(err) { + return err + } + return nil + } + bundle := context.String("bundle") + if bundle != "" { + if err := os.Chdir(bundle); err != nil { + return err + } + } + if err := checkNoFile(specConfig); err != nil { + return err + } + data, err := json.MarshalIndent(spec, "", "\t") + if err != nil { + return err + } + if err := ioutil.WriteFile(specConfig, data, 0640); err != nil { + return err + } + return nil + }, +} diff --git a/cli/spec_test.go b/cli/spec_test.go new file mode 100644 index 0000000000..ff6d9a6f9a --- /dev/null +++ b/cli/spec_test.go @@ -0,0 +1,35 @@ +// Copyright (c) 2018 Huawei Corporation. +// +// SPDX-License-Identifier: Apache-2.0 +// + +package main + +import ( + "flag" + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/urfave/cli" +) + +func TestSpecCliAction(t *testing.T) { + assert := assert.New(t) + + actionFunc, ok := specCLICommand.Action.(func(context *cli.Context) error) + assert.True(ok) + + flagSet := flag.NewFlagSet("flag", flag.ContinueOnError) + ctx := cli.NewContext(&cli.App{}, flagSet, nil) + defer os.Remove(specConfig) + err := actionFunc(ctx) + assert.NoError(err) + + pattern := "gid=5" + patternRootless := "uidMappings" + err = grep(pattern, specConfig) + assert.NoError(err) + err = grep(patternRootless, specConfig) + assert.Error(err) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go new file mode 100644 index 0000000000..ded5a6bbc8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -0,0 +1,76 @@ +package seccomp + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var operators = map[string]configs.Operator{ + "SCMP_CMP_NE": configs.NotEqualTo, + "SCMP_CMP_LT": configs.LessThan, + "SCMP_CMP_LE": configs.LessThanOrEqualTo, + "SCMP_CMP_EQ": configs.EqualTo, + "SCMP_CMP_GE": configs.GreaterThanOrEqualTo, + "SCMP_CMP_GT": configs.GreaterThan, + "SCMP_CMP_MASKED_EQ": configs.MaskEqualTo, +} + +var actions = map[string]configs.Action{ + "SCMP_ACT_KILL": configs.Kill, + "SCMP_ACT_ERRNO": configs.Errno, + "SCMP_ACT_TRAP": configs.Trap, + "SCMP_ACT_ALLOW": configs.Allow, + "SCMP_ACT_TRACE": configs.Trace, +} + +var archs = map[string]string{ + "SCMP_ARCH_X86": "x86", + "SCMP_ARCH_X86_64": "amd64", + "SCMP_ARCH_X32": "x32", + "SCMP_ARCH_ARM": "arm", + "SCMP_ARCH_AARCH64": "arm64", + "SCMP_ARCH_MIPS": "mips", + "SCMP_ARCH_MIPS64": "mips64", + "SCMP_ARCH_MIPS64N32": "mips64n32", + "SCMP_ARCH_MIPSEL": "mipsel", + "SCMP_ARCH_MIPSEL64": "mipsel64", + "SCMP_ARCH_MIPSEL64N32": "mipsel64n32", + "SCMP_ARCH_PPC": "ppc", + "SCMP_ARCH_PPC64": "ppc64", + "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_S390": "s390", + "SCMP_ARCH_S390X": "s390x", +} + +// ConvertStringToOperator converts a string into a Seccomp comparison operator. +// Comparison operators use the names they are assigned by Libseccomp's header. +// Attempting to convert a string that is not a valid operator results in an +// error. +func ConvertStringToOperator(in string) (configs.Operator, error) { + if op, ok := operators[in]; ok == true { + return op, nil + } + return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in) +} + +// ConvertStringToAction converts a string into a Seccomp rule match action. +// Actions use the names they are assigned in Libseccomp's header, though some +// (notable, SCMP_ACT_TRACE) are not available in this implementation and will +// return errors. +// Attempting to convert a string that is not a valid action results in an +// error. +func ConvertStringToAction(in string) (configs.Action, error) { + if act, ok := actions[in]; ok == true { + return act, nil + } + return 0, fmt.Errorf("string %s is not a valid action for seccomp", in) +} + +// ConvertStringToArch converts a string into a Seccomp comparison arch. +func ConvertStringToArch(in string) (string, error) { + if arch, ok := archs[in]; ok == true { + return arch, nil + } + return "", fmt.Errorf("string %s is not a valid arch for seccomp", in) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go new file mode 100644 index 0000000000..2523cbf990 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -0,0 +1,227 @@ +// +build linux,cgo,seccomp + +package seccomp + +import ( + "bufio" + "fmt" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + libseccomp "github.com/seccomp/libseccomp-golang" + + "golang.org/x/sys/unix" +) + +var ( + actAllow = libseccomp.ActAllow + actTrap = libseccomp.ActTrap + actKill = libseccomp.ActKill + actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) + actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) +) + +// Filters given syscalls in a container, preventing them from being used +// Started in the container init process, and carried over to all child processes +// Setns calls, however, require a separate invocation, as they are not children +// of the init until they join the namespace +func InitSeccomp(config *configs.Seccomp) error { + if config == nil { + return fmt.Errorf("cannot initialize Seccomp - nil config passed") + } + + defaultAction, err := getAction(config.DefaultAction) + if err != nil { + return fmt.Errorf("error initializing seccomp - invalid default action") + } + + filter, err := libseccomp.NewFilter(defaultAction) + if err != nil { + return fmt.Errorf("error creating filter: %s", err) + } + + // Add extra architectures + for _, arch := range config.Architectures { + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + return err + } + + if err := filter.AddArch(scmpArch); err != nil { + return err + } + } + + // Unset no new privs bit + if err := filter.SetNoNewPrivsBit(false); err != nil { + return fmt.Errorf("error setting no new privileges: %s", err) + } + + // Add a rule for each syscall + for _, call := range config.Syscalls { + if call == nil { + return fmt.Errorf("encountered nil syscall while initializing Seccomp") + } + + if err = matchCall(filter, call); err != nil { + return err + } + } + + if err = filter.Load(); err != nil { + return fmt.Errorf("error loading seccomp filter into kernel: %s", err) + } + + return nil +} + +// IsEnabled returns if the kernel has been configured to support seccomp. +func IsEnabled() bool { + // Try to read from /proc/self/status for kernels > 3.8 + s, err := parseStatusFile("/proc/self/status") + if err != nil { + // Check if Seccomp is supported, via CONFIG_SECCOMP. + if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL { + // Make sure the kernel has CONFIG_SECCOMP_FILTER. + if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL { + return true + } + } + return false + } + _, ok := s["Seccomp"] + return ok +} + +// Convert Libcontainer Action to Libseccomp ScmpAction +func getAction(act configs.Action) (libseccomp.ScmpAction, error) { + switch act { + case configs.Kill: + return actKill, nil + case configs.Errno: + return actErrno, nil + case configs.Trap: + return actTrap, nil + case configs.Allow: + return actAllow, nil + case configs.Trace: + return actTrace, nil + default: + return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule") + } +} + +// Convert Libcontainer Operator to Libseccomp ScmpCompareOp +func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) { + switch op { + case configs.EqualTo: + return libseccomp.CompareEqual, nil + case configs.NotEqualTo: + return libseccomp.CompareNotEqual, nil + case configs.GreaterThan: + return libseccomp.CompareGreater, nil + case configs.GreaterThanOrEqualTo: + return libseccomp.CompareGreaterEqual, nil + case configs.LessThan: + return libseccomp.CompareLess, nil + case configs.LessThanOrEqualTo: + return libseccomp.CompareLessOrEqual, nil + case configs.MaskEqualTo: + return libseccomp.CompareMaskedEqual, nil + default: + return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule") + } +} + +// Convert Libcontainer Arg to Libseccomp ScmpCondition +func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { + cond := libseccomp.ScmpCondition{} + + if arg == nil { + return cond, fmt.Errorf("cannot convert nil to syscall condition") + } + + op, err := getOperator(arg.Op) + if err != nil { + return cond, err + } + + return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo) +} + +// Add a rule to match a single syscall +func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { + if call == nil || filter == nil { + return fmt.Errorf("cannot use nil as syscall to block") + } + + if len(call.Name) == 0 { + return fmt.Errorf("empty string is not a valid syscall") + } + + // If we can't resolve the syscall, assume it's not supported on this kernel + // Ignore it, don't error out + callNum, err := libseccomp.GetSyscallFromName(call.Name) + if err != nil { + return nil + } + + // Convert the call's action to the libseccomp equivalent + callAct, err := getAction(call.Action) + if err != nil { + return err + } + + // Unconditional match - just add the rule + if len(call.Args) == 0 { + if err = filter.AddRule(callNum, callAct); err != nil { + return err + } + } else { + // Conditional match - convert the per-arg rules into library format + conditions := []libseccomp.ScmpCondition{} + + for _, cond := range call.Args { + newCond, err := getCondition(cond) + if err != nil { + return err + } + + conditions = append(conditions, newCond) + } + + if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return err + } + } + + return nil +} + +func parseStatusFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + s := bufio.NewScanner(f) + status := make(map[string]string) + + for s.Scan() { + text := s.Text() + parts := strings.Split(text, ":") + + if len(parts) <= 1 { + continue + } + + status[parts[0]] = parts[1] + } + if err := s.Err(); err != nil { + return nil, err + } + + return status, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go new file mode 100644 index 0000000000..44df1ad4c2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go @@ -0,0 +1,24 @@ +// +build !linux !cgo !seccomp + +package seccomp + +import ( + "errors" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") + +// InitSeccomp does nothing because seccomp is not supported. +func InitSeccomp(config *configs.Seccomp) error { + if config != nil { + return ErrSeccompNotEnabled + } + return nil +} + +// IsEnabled returns false, because it is not supported. +func IsEnabled() bool { + return false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go new file mode 100644 index 0000000000..0b1cd3b621 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go @@ -0,0 +1,220 @@ +package specconv + +import ( + "os" + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Example returns an example spec file, with many options set so a user can +// see what a standard spec file looks like. +func Example() *specs.Spec { + return &specs.Spec{ + Version: specs.Version, + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, + }, + Process: &specs.Process{ + Terminal: true, + User: specs.User{}, + Args: []string{ + "sh", + }, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: "/", + NoNewPrivileges: true, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Ambient: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: uint64(1024), + Soft: uint64(1024), + }, + }, + }, + Hostname: "runc", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "proc", + Source: "proc", + Options: nil, + }, + { + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/pts", + Type: "devpts", + Source: "devpts", + Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, + }, + { + Destination: "/dev/shm", + Type: "tmpfs", + Source: "shm", + Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, + }, + { + Destination: "/dev/mqueue", + Type: "mqueue", + Source: "mqueue", + Options: []string{"nosuid", "noexec", "nodev"}, + }, + { + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, + }, + }, + Linux: &specs.Linux{ + MaskedPaths: []string{ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + }, + ReadonlyPaths: []string{ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + }, + Resources: &specs.LinuxResources{ + Devices: []specs.LinuxDeviceCgroup{ + { + Allow: false, + Access: "rwm", + }, + }, + }, + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, + }, + }, + } +} + +// ExampleRootless returns an example spec file that works with rootless +// containers. It's essentially a modified version of the specfile from +// Example(). +func ToRootless(spec *specs.Spec) { + var namespaces []specs.LinuxNamespace + + // Remove networkns from the spec. + for _, ns := range spec.Linux.Namespaces { + switch ns.Type { + case specs.NetworkNamespace, specs.UserNamespace: + // Do nothing. + default: + namespaces = append(namespaces, ns) + } + } + // Add userns to the spec. + namespaces = append(namespaces, specs.LinuxNamespace{ + Type: specs.UserNamespace, + }) + spec.Linux.Namespaces = namespaces + + // Add mappings for the current user. + spec.Linux.UIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Geteuid()), + ContainerID: 0, + Size: 1, + }} + spec.Linux.GIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Getegid()), + ContainerID: 0, + Size: 1, + }} + + // Fix up mounts. + var mounts []specs.Mount + for _, mount := range spec.Mounts { + // Ignore all mounts that are under /sys. + if strings.HasPrefix(mount.Destination, "/sys") { + continue + } + + // Remove all gid= and uid= mappings. + var options []string + for _, option := range mount.Options { + if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") { + options = append(options, option) + } + } + + mount.Options = options + mounts = append(mounts, mount) + } + // Add the sysfs mount as an rbind. + mounts = append(mounts, specs.Mount{ + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }) + spec.Mounts = mounts + + // Remove cgroup settings. + spec.Linux.Resources = nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go new file mode 100644 index 0000000000..9a17d1e7ab --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go @@ -0,0 +1,828 @@ +// +build linux + +// Package specconv implements conversion of specifications to libcontainer +// configurations +package specconv + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/seccomp" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + + "golang.org/x/sys/unix" +) + +const wildcard = -1 + +var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ + specs.PIDNamespace: configs.NEWPID, + specs.NetworkNamespace: configs.NEWNET, + specs.MountNamespace: configs.NEWNS, + specs.UserNamespace: configs.NEWUSER, + specs.IPCNamespace: configs.NEWIPC, + specs.UTSNamespace: configs.NEWUTS, +} + +var mountPropagationMapping = map[string]int{ + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "private": unix.MS_PRIVATE, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "slave": unix.MS_SLAVE, + "rshared": unix.MS_SHARED | unix.MS_REC, + "shared": unix.MS_SHARED, + "": 0, +} + +var allowedDevices = []*configs.Device{ + // allow mknod for any device + { + Type: 'c', + Major: wildcard, + Minor: wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: 'b', + Major: wildcard, + Minor: wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/null", + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/random", + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/full", + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/tty", + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/zero", + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/urandom", + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, + { + Path: "/dev/console", + Type: 'c', + Major: 5, + Minor: 1, + Permissions: "rwm", + Allow: true, + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + Major: 136, + Minor: wildcard, + Permissions: "rwm", + Allow: true, + }, + { + Path: "", + Type: 'c', + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, + // tuntap + { + Path: "", + Type: 'c', + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, +} + +type CreateOpts struct { + CgroupName string + UseSystemdCgroup bool + NoPivotRoot bool + NoNewKeyring bool + Spec *specs.Spec + Rootless bool +} + +// CreateLibcontainerConfig creates a new libcontainer configuration from a +// given specification and a cgroup name +func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { + // runc's cwd will always be the bundle path + rcwd, err := os.Getwd() + if err != nil { + return nil, err + } + cwd, err := filepath.Abs(rcwd) + if err != nil { + return nil, err + } + spec := opts.Spec + if spec.Root == nil { + return nil, fmt.Errorf("Root must be specified") + } + rootfsPath := spec.Root.Path + if !filepath.IsAbs(rootfsPath) { + rootfsPath = filepath.Join(cwd, rootfsPath) + } + labels := []string{} + for k, v := range spec.Annotations { + labels = append(labels, fmt.Sprintf("%s=%s", k, v)) + } + config := &configs.Config{ + Rootfs: rootfsPath, + NoPivotRoot: opts.NoPivotRoot, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), + NoNewKeyring: opts.NoNewKeyring, + Rootless: opts.Rootless, + } + + exists := false + for _, m := range spec.Mounts { + config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m)) + } + if err := createDevices(spec, config); err != nil { + return nil, err + } + if err := setupUserNamespace(spec, config); err != nil { + return nil, err + } + c, err := createCgroupConfig(opts) + if err != nil { + return nil, err + } + config.Cgroups = c + // set linux-specific config + if spec.Linux != nil { + if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { + return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) + } + + for _, ns := range spec.Linux.Namespaces { + t, exists := namespaceMapping[ns.Type] + if !exists { + return nil, fmt.Errorf("namespace %q does not exist", ns) + } + if config.Namespaces.Contains(t) { + return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns) + } + config.Namespaces.Add(t, ns.Path) + } + if config.Namespaces.Contains(configs.NEWNET) { + config.Networks = []*configs.Network{ + { + Type: "loopback", + }, + } + } + config.MaskPaths = spec.Linux.MaskedPaths + config.ReadonlyPaths = spec.Linux.ReadonlyPaths + config.MountLabel = spec.Linux.MountLabel + config.Sysctl = spec.Linux.Sysctl + if spec.Linux.Seccomp != nil { + seccomp, err := setupSeccomp(spec.Linux.Seccomp) + if err != nil { + return nil, err + } + config.Seccomp = seccomp + } + } + if spec.Process.SelinuxLabel != "" { + config.ProcessLabel = spec.Process.SelinuxLabel + } + if spec.Process != nil && spec.Process.OOMScoreAdj != nil { + config.OomScoreAdj = *spec.Process.OOMScoreAdj + } + if spec.Process.Capabilities != nil { + config.Capabilities = &configs.Capabilities{ + Bounding: spec.Process.Capabilities.Bounding, + Effective: spec.Process.Capabilities.Effective, + Permitted: spec.Process.Capabilities.Permitted, + Inheritable: spec.Process.Capabilities.Inheritable, + Ambient: spec.Process.Capabilities.Ambient, + } + } + createHooks(spec, config) + config.Version = specs.Version + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + } + return config, nil +} + +func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { + flags, pgflags, data, ext := parseMountOptions(m.Options) + source := m.Source + if m.Type == "bind" { + if !filepath.IsAbs(source) { + source = filepath.Join(cwd, m.Source) + } + } + return &configs.Mount{ + Device: m.Type, + Source: source, + Destination: m.Destination, + Data: data, + Flags: flags, + PropagationFlags: pgflags, + Extensions: ext, + } +} + +func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { + var ( + myCgroupPath string + + spec = opts.Spec + useSystemdCgroup = opts.UseSystemdCgroup + name = opts.CgroupName + ) + + c := &configs.Cgroup{ + Resources: &configs.Resources{}, + } + + if spec.Linux != nil && spec.Linux.CgroupsPath != "" { + myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath) + if useSystemdCgroup { + myCgroupPath = spec.Linux.CgroupsPath + } + } + + if useSystemdCgroup { + if myCgroupPath == "" { + c.Parent = "system.slice" + c.ScopePrefix = "runc" + c.Name = name + } else { + // Parse the path from expected "slice:prefix:name" + // for e.g. "system.slice:docker:1234" + parts := strings.Split(myCgroupPath, ":") + if len(parts) != 3 { + return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups") + } + c.Parent = parts[0] + c.ScopePrefix = parts[1] + c.Name = parts[2] + } + } else { + if myCgroupPath == "" { + c.Name = name + } + c.Path = myCgroupPath + } + + // In rootless containers, any attempt to make cgroup changes will fail. + // libcontainer will validate this and we shouldn't add any cgroup options + // the user didn't specify. + if !opts.Rootless { + c.Resources.AllowedDevices = allowedDevices + } + if spec.Linux != nil { + r := spec.Linux.Resources + if r == nil { + return c, nil + } + for i, d := range spec.Linux.Resources.Devices { + var ( + t = "a" + major = int64(-1) + minor = int64(-1) + ) + if d.Type != "" { + t = d.Type + } + if d.Major != nil { + major = *d.Major + } + if d.Minor != nil { + minor = *d.Minor + } + if d.Access == "" { + return nil, fmt.Errorf("device access at %d field cannot be empty", i) + } + dt, err := stringToCgroupDeviceRune(t) + if err != nil { + return nil, err + } + dd := &configs.Device{ + Type: dt, + Major: major, + Minor: minor, + Permissions: d.Access, + Allow: d.Allow, + } + c.Resources.Devices = append(c.Resources.Devices, dd) + } + if r.Memory != nil { + if r.Memory.Limit != nil { + c.Resources.Memory = *r.Memory.Limit + } + if r.Memory.Reservation != nil { + c.Resources.MemoryReservation = *r.Memory.Reservation + } + if r.Memory.Swap != nil { + c.Resources.MemorySwap = *r.Memory.Swap + } + if r.Memory.Kernel != nil { + c.Resources.KernelMemory = *r.Memory.Kernel + } + if r.Memory.KernelTCP != nil { + c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP + } + if r.Memory.Swappiness != nil { + c.Resources.MemorySwappiness = r.Memory.Swappiness + } + if r.Memory.DisableOOMKiller != nil { + c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller + } + } + if r.CPU != nil { + if r.CPU.Shares != nil { + c.Resources.CpuShares = *r.CPU.Shares + } + if r.CPU.Quota != nil { + c.Resources.CpuQuota = *r.CPU.Quota + } + if r.CPU.Period != nil { + c.Resources.CpuPeriod = *r.CPU.Period + } + if r.CPU.RealtimeRuntime != nil { + c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime + } + if r.CPU.RealtimePeriod != nil { + c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod + } + if r.CPU.Cpus != "" { + c.Resources.CpusetCpus = r.CPU.Cpus + } + if r.CPU.Mems != "" { + c.Resources.CpusetMems = r.CPU.Mems + } + } + if r.Pids != nil { + c.Resources.PidsLimit = r.Pids.Limit + } + if r.BlockIO != nil { + if r.BlockIO.Weight != nil { + c.Resources.BlkioWeight = *r.BlockIO.Weight + } + if r.BlockIO.LeafWeight != nil { + c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight + } + if r.BlockIO.WeightDevice != nil { + for _, wd := range r.BlockIO.WeightDevice { + var weight, leafWeight uint16 + if wd.Weight != nil { + weight = *wd.Weight + } + if wd.LeafWeight != nil { + leafWeight = *wd.LeafWeight + } + weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) + c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) + } + } + if r.BlockIO.ThrottleReadBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleReadBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleReadIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleReadIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) + } + } + } + for _, l := range r.HugepageLimits { + c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ + Pagesize: l.Pagesize, + Limit: l.Limit, + }) + } + if r.Network != nil { + if r.Network.ClassID != nil { + c.Resources.NetClsClassid = *r.Network.ClassID + } + for _, m := range r.Network.Priorities { + c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ + Interface: m.Name, + Priority: int64(m.Priority), + }) + } + } + } + if !opts.Rootless { + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + } + return c, nil +} + +func stringToCgroupDeviceRune(s string) (rune, error) { + switch s { + case "a": + return 'a', nil + case "b": + return 'b', nil + case "c": + return 'c', nil + default: + return 0, fmt.Errorf("invalid cgroup device type %q", s) + } +} + +func stringToDeviceRune(s string) (rune, error) { + switch s { + case "p": + return 'p', nil + case "u": + return 'u', nil + case "b": + return 'b', nil + case "c": + return 'c', nil + default: + return 0, fmt.Errorf("invalid device type %q", s) + } +} + +func createDevices(spec *specs.Spec, config *configs.Config) error { + // add whitelisted devices + config.Devices = []*configs.Device{ + { + Type: 'c', + Path: "/dev/null", + Major: 1, + Minor: 3, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/random", + Major: 1, + Minor: 8, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/full", + Major: 1, + Minor: 7, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/tty", + Major: 5, + Minor: 0, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/zero", + Major: 1, + Minor: 5, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/urandom", + Major: 1, + Minor: 9, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + } + // merge in additional devices from the spec + if spec.Linux != nil { + for _, d := range spec.Linux.Devices { + var uid, gid uint32 + var filemode os.FileMode = 0666 + + if d.UID != nil { + uid = *d.UID + } + if d.GID != nil { + gid = *d.GID + } + dt, err := stringToDeviceRune(d.Type) + if err != nil { + return err + } + if d.FileMode != nil { + filemode = *d.FileMode + } + device := &configs.Device{ + Type: dt, + Path: d.Path, + Major: d.Major, + Minor: d.Minor, + FileMode: filemode, + Uid: uid, + Gid: gid, + } + config.Devices = append(config.Devices, device) + } + } + return nil +} + +func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { + create := func(m specs.LinuxIDMapping) configs.IDMap { + return configs.IDMap{ + HostID: int(m.HostID), + ContainerID: int(m.ContainerID), + Size: int(m.Size), + } + } + if spec.Linux != nil { + if len(spec.Linux.UIDMappings) == 0 { + return nil + } + for _, m := range spec.Linux.UIDMappings { + config.UidMappings = append(config.UidMappings, create(m)) + } + for _, m := range spec.Linux.GIDMappings { + config.GidMappings = append(config.GidMappings, create(m)) + } + } + rootUID, err := config.HostRootUID() + if err != nil { + return err + } + rootGID, err := config.HostRootGID() + if err != nil { + return err + } + for _, node := range config.Devices { + node.Uid = uint32(rootUID) + node.Gid = uint32(rootGID) + } + return nil +} + +// parseMountOptions parses the string and returns the flags, propagation +// flags and any mount data that it contains. +func parseMountOptions(options []string) (int, []int, string, int) { + var ( + flag int + pgflag []int + data []string + extFlags int + ) + flags := map[string]struct { + clear bool + flag int + }{ + "acl": {false, unix.MS_POSIXACL}, + "async": {true, unix.MS_SYNCHRONOUS}, + "atime": {true, unix.MS_NOATIME}, + "bind": {false, unix.MS_BIND}, + "defaults": {false, 0}, + "dev": {true, unix.MS_NODEV}, + "diratime": {true, unix.MS_NODIRATIME}, + "dirsync": {false, unix.MS_DIRSYNC}, + "exec": {true, unix.MS_NOEXEC}, + "iversion": {false, unix.MS_I_VERSION}, + "lazytime": {false, unix.MS_LAZYTIME}, + "loud": {true, unix.MS_SILENT}, + "mand": {false, unix.MS_MANDLOCK}, + "noacl": {true, unix.MS_POSIXACL}, + "noatime": {false, unix.MS_NOATIME}, + "nodev": {false, unix.MS_NODEV}, + "nodiratime": {false, unix.MS_NODIRATIME}, + "noexec": {false, unix.MS_NOEXEC}, + "noiversion": {true, unix.MS_I_VERSION}, + "nolazytime": {true, unix.MS_LAZYTIME}, + "nomand": {true, unix.MS_MANDLOCK}, + "norelatime": {true, unix.MS_RELATIME}, + "nostrictatime": {true, unix.MS_STRICTATIME}, + "nosuid": {false, unix.MS_NOSUID}, + "rbind": {false, unix.MS_BIND | unix.MS_REC}, + "relatime": {false, unix.MS_RELATIME}, + "remount": {false, unix.MS_REMOUNT}, + "ro": {false, unix.MS_RDONLY}, + "rw": {true, unix.MS_RDONLY}, + "silent": {false, unix.MS_SILENT}, + "strictatime": {false, unix.MS_STRICTATIME}, + "suid": {true, unix.MS_NOSUID}, + "sync": {false, unix.MS_SYNCHRONOUS}, + } + propagationFlags := map[string]int{ + "private": unix.MS_PRIVATE, + "shared": unix.MS_SHARED, + "slave": unix.MS_SLAVE, + "unbindable": unix.MS_UNBINDABLE, + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "rshared": unix.MS_SHARED | unix.MS_REC, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + } + extensionFlags := map[string]struct { + clear bool + flag int + }{ + "tmpcopyup": {false, configs.EXT_COPYUP}, + } + for _, o := range options { + // If the option does not exist in the flags table or the flag + // is not supported on the platform, + // then it is a data value for a specific fs type + if f, exists := flags[o]; exists && f.flag != 0 { + if f.clear { + flag &= ^f.flag + } else { + flag |= f.flag + } + } else if f, exists := propagationFlags[o]; exists && f != 0 { + pgflag = append(pgflag, f) + } else if f, exists := extensionFlags[o]; exists && f.flag != 0 { + if f.clear { + extFlags &= ^f.flag + } else { + extFlags |= f.flag + } + } else { + data = append(data, o) + } + } + return flag, pgflag, strings.Join(data, ","), extFlags +} + +func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { + if config == nil { + return nil, nil + } + + // No default action specified, no syscalls listed, assume seccomp disabled + if config.DefaultAction == "" && len(config.Syscalls) == 0 { + return nil, nil + } + + newConfig := new(configs.Seccomp) + newConfig.Syscalls = []*configs.Syscall{} + + if len(config.Architectures) > 0 { + newConfig.Architectures = []string{} + for _, arch := range config.Architectures { + newArch, err := seccomp.ConvertStringToArch(string(arch)) + if err != nil { + return nil, err + } + newConfig.Architectures = append(newConfig.Architectures, newArch) + } + } + + // Convert default action from string representation + newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction)) + if err != nil { + return nil, err + } + newConfig.DefaultAction = newDefaultAction + + // Loop through all syscall blocks and convert them to libcontainer format + for _, call := range config.Syscalls { + newAction, err := seccomp.ConvertStringToAction(string(call.Action)) + if err != nil { + return nil, err + } + + for _, name := range call.Names { + newCall := configs.Syscall{ + Name: name, + Action: newAction, + Args: []*configs.Arg{}, + } + // Loop through all the arguments of the syscall and convert them + for _, arg := range call.Args { + newOp, err := seccomp.ConvertStringToOperator(string(arg.Op)) + if err != nil { + return nil, err + } + + newArg := configs.Arg{ + Index: arg.Index, + Value: arg.Value, + ValueTwo: arg.ValueTwo, + Op: newOp, + } + + newCall.Args = append(newCall.Args, &newArg) + } + newConfig.Syscalls = append(newConfig.Syscalls, &newCall) + } + } + + return newConfig, nil +} + +func createHooks(rspec *specs.Spec, config *configs.Config) { + config.Hooks = &configs.Hooks{} + if rspec.Hooks != nil { + + for _, h := range rspec.Hooks.Prestart { + cmd := createCommandHook(h) + config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststart { + cmd := createCommandHook(h) + config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststop { + cmd := createCommandHook(h) + config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd)) + } + } +} + +func createCommandHook(h specs.Hook) configs.Command { + cmd := configs.Command{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + } + if h.Timeout != nil { + d := time.Duration(*h.Timeout) * time.Second + cmd.Timeout = &d + } + return cmd +} diff --git a/vendor/github.com/seccomp/libseccomp-golang/LICENSE b/vendor/github.com/seccomp/libseccomp-golang/LICENSE new file mode 100644 index 0000000000..81cf60de29 --- /dev/null +++ b/vendor/github.com/seccomp/libseccomp-golang/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2015 Matthew Heon +Copyright (c) 2015 Paul Moore +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +- Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/seccomp/libseccomp-golang/seccomp.go b/vendor/github.com/seccomp/libseccomp-golang/seccomp.go new file mode 100644 index 0000000000..53bcb024dc --- /dev/null +++ b/vendor/github.com/seccomp/libseccomp-golang/seccomp.go @@ -0,0 +1,902 @@ +// +build linux + +// Public API specification for libseccomp Go bindings +// Contains public API for the bindings + +// Package seccomp provides bindings for libseccomp, a library wrapping the Linux +// seccomp syscall. Seccomp enables an application to restrict system call use +// for itself and its children. +package seccomp + +import ( + "fmt" + "os" + "runtime" + "strings" + "sync" + "syscall" + "unsafe" +) + +// C wrapping code + +// #cgo pkg-config: libseccomp +// #include +// #include +import "C" + +// Exported types + +// VersionError denotes that the system libseccomp version is incompatible +// with this package. +type VersionError struct { + message string + minimum string +} + +func (e VersionError) Error() string { + format := "Libseccomp version too low: " + if e.message != "" { + format += e.message + ": " + } + format += "minimum supported is " + if e.minimum != "" { + format += e.minimum + ": " + } else { + format += "2.1.0: " + } + format += "detected %d.%d.%d" + return fmt.Sprintf(format, verMajor, verMinor, verMicro) +} + +// ScmpArch represents a CPU architecture. Seccomp can restrict syscalls on a +// per-architecture basis. +type ScmpArch uint + +// ScmpAction represents an action to be taken on a filter rule match in +// libseccomp +type ScmpAction uint + +// ScmpCompareOp represents a comparison operator which can be used in a filter +// rule +type ScmpCompareOp uint + +// ScmpCondition represents a rule in a libseccomp filter context +type ScmpCondition struct { + Argument uint `json:"argument,omitempty"` + Op ScmpCompareOp `json:"operator,omitempty"` + Operand1 uint64 `json:"operand_one,omitempty"` + Operand2 uint64 `json:"operand_two,omitempty"` +} + +// ScmpSyscall represents a Linux System Call +type ScmpSyscall int32 + +// Exported Constants + +const ( + // Valid architectures recognized by libseccomp + // ARM64 and all MIPS architectures are unsupported by versions of the + // library before v2.2 and will return errors if used + + // ArchInvalid is a placeholder to ensure uninitialized ScmpArch + // variables are invalid + ArchInvalid ScmpArch = iota + // ArchNative is the native architecture of the kernel + ArchNative ScmpArch = iota + // ArchX86 represents 32-bit x86 syscalls + ArchX86 ScmpArch = iota + // ArchAMD64 represents 64-bit x86-64 syscalls + ArchAMD64 ScmpArch = iota + // ArchX32 represents 64-bit x86-64 syscalls (32-bit pointers) + ArchX32 ScmpArch = iota + // ArchARM represents 32-bit ARM syscalls + ArchARM ScmpArch = iota + // ArchARM64 represents 64-bit ARM syscalls + ArchARM64 ScmpArch = iota + // ArchMIPS represents 32-bit MIPS syscalls + ArchMIPS ScmpArch = iota + // ArchMIPS64 represents 64-bit MIPS syscalls + ArchMIPS64 ScmpArch = iota + // ArchMIPS64N32 represents 64-bit MIPS syscalls (32-bit pointers) + ArchMIPS64N32 ScmpArch = iota + // ArchMIPSEL represents 32-bit MIPS syscalls (little endian) + ArchMIPSEL ScmpArch = iota + // ArchMIPSEL64 represents 64-bit MIPS syscalls (little endian) + ArchMIPSEL64 ScmpArch = iota + // ArchMIPSEL64N32 represents 64-bit MIPS syscalls (little endian, + // 32-bit pointers) + ArchMIPSEL64N32 ScmpArch = iota + // ArchPPC represents 32-bit POWERPC syscalls + ArchPPC ScmpArch = iota + // ArchPPC64 represents 64-bit POWER syscalls (big endian) + ArchPPC64 ScmpArch = iota + // ArchPPC64LE represents 64-bit POWER syscalls (little endian) + ArchPPC64LE ScmpArch = iota + // ArchS390 represents 31-bit System z/390 syscalls + ArchS390 ScmpArch = iota + // ArchS390X represents 64-bit System z/390 syscalls + ArchS390X ScmpArch = iota +) + +const ( + // Supported actions on filter match + + // ActInvalid is a placeholder to ensure uninitialized ScmpAction + // variables are invalid + ActInvalid ScmpAction = iota + // ActKill kills the process + ActKill ScmpAction = iota + // ActTrap throws SIGSYS + ActTrap ScmpAction = iota + // ActErrno causes the syscall to return a negative error code. This + // code can be set with the SetReturnCode method + ActErrno ScmpAction = iota + // ActTrace causes the syscall to notify tracing processes with the + // given error code. This code can be set with the SetReturnCode method + ActTrace ScmpAction = iota + // ActAllow permits the syscall to continue execution + ActAllow ScmpAction = iota +) + +const ( + // These are comparison operators used in conditional seccomp rules + // They are used to compare the value of a single argument of a syscall + // against a user-defined constant + + // CompareInvalid is a placeholder to ensure uninitialized ScmpCompareOp + // variables are invalid + CompareInvalid ScmpCompareOp = iota + // CompareNotEqual returns true if the argument is not equal to the + // given value + CompareNotEqual ScmpCompareOp = iota + // CompareLess returns true if the argument is less than the given value + CompareLess ScmpCompareOp = iota + // CompareLessOrEqual returns true if the argument is less than or equal + // to the given value + CompareLessOrEqual ScmpCompareOp = iota + // CompareEqual returns true if the argument is equal to the given value + CompareEqual ScmpCompareOp = iota + // CompareGreaterEqual returns true if the argument is greater than or + // equal to the given value + CompareGreaterEqual ScmpCompareOp = iota + // CompareGreater returns true if the argument is greater than the given + // value + CompareGreater ScmpCompareOp = iota + // CompareMaskedEqual returns true if the argument is equal to the given + // value, when masked (bitwise &) against the second given value + CompareMaskedEqual ScmpCompareOp = iota +) + +// Helpers for types + +// GetArchFromString returns an ScmpArch constant from a string representing an +// architecture +func GetArchFromString(arch string) (ScmpArch, error) { + if err := ensureSupportedVersion(); err != nil { + return ArchInvalid, err + } + + switch strings.ToLower(arch) { + case "x86": + return ArchX86, nil + case "amd64", "x86-64", "x86_64", "x64": + return ArchAMD64, nil + case "x32": + return ArchX32, nil + case "arm": + return ArchARM, nil + case "arm64", "aarch64": + return ArchARM64, nil + case "mips": + return ArchMIPS, nil + case "mips64": + return ArchMIPS64, nil + case "mips64n32": + return ArchMIPS64N32, nil + case "mipsel": + return ArchMIPSEL, nil + case "mipsel64": + return ArchMIPSEL64, nil + case "mipsel64n32": + return ArchMIPSEL64N32, nil + case "ppc": + return ArchPPC, nil + case "ppc64": + return ArchPPC64, nil + case "ppc64le": + return ArchPPC64LE, nil + case "s390": + return ArchS390, nil + case "s390x": + return ArchS390X, nil + default: + return ArchInvalid, fmt.Errorf("cannot convert unrecognized string %s", arch) + } +} + +// String returns a string representation of an architecture constant +func (a ScmpArch) String() string { + switch a { + case ArchX86: + return "x86" + case ArchAMD64: + return "amd64" + case ArchX32: + return "x32" + case ArchARM: + return "arm" + case ArchARM64: + return "arm64" + case ArchMIPS: + return "mips" + case ArchMIPS64: + return "mips64" + case ArchMIPS64N32: + return "mips64n32" + case ArchMIPSEL: + return "mipsel" + case ArchMIPSEL64: + return "mipsel64" + case ArchMIPSEL64N32: + return "mipsel64n32" + case ArchPPC: + return "ppc" + case ArchPPC64: + return "ppc64" + case ArchPPC64LE: + return "ppc64le" + case ArchS390: + return "s390" + case ArchS390X: + return "s390x" + case ArchNative: + return "native" + case ArchInvalid: + return "Invalid architecture" + default: + return "Unknown architecture" + } +} + +// String returns a string representation of a comparison operator constant +func (a ScmpCompareOp) String() string { + switch a { + case CompareNotEqual: + return "Not equal" + case CompareLess: + return "Less than" + case CompareLessOrEqual: + return "Less than or equal to" + case CompareEqual: + return "Equal" + case CompareGreaterEqual: + return "Greater than or equal to" + case CompareGreater: + return "Greater than" + case CompareMaskedEqual: + return "Masked equality" + case CompareInvalid: + return "Invalid comparison operator" + default: + return "Unrecognized comparison operator" + } +} + +// String returns a string representation of a seccomp match action +func (a ScmpAction) String() string { + switch a & 0xFFFF { + case ActKill: + return "Action: Kill Process" + case ActTrap: + return "Action: Send SIGSYS" + case ActErrno: + return fmt.Sprintf("Action: Return error code %d", (a >> 16)) + case ActTrace: + return fmt.Sprintf("Action: Notify tracing processes with code %d", + (a >> 16)) + case ActAllow: + return "Action: Allow system call" + default: + return "Unrecognized Action" + } +} + +// SetReturnCode adds a return code to a supporting ScmpAction, clearing any +// existing code Only valid on ActErrno and ActTrace. Takes no action otherwise. +// Accepts 16-bit return code as argument. +// Returns a valid ScmpAction of the original type with the new error code set. +func (a ScmpAction) SetReturnCode(code int16) ScmpAction { + aTmp := a & 0x0000FFFF + if aTmp == ActErrno || aTmp == ActTrace { + return (aTmp | (ScmpAction(code)&0xFFFF)<<16) + } + return a +} + +// GetReturnCode returns the return code of an ScmpAction +func (a ScmpAction) GetReturnCode() int16 { + return int16(a >> 16) +} + +// General utility functions + +// GetLibraryVersion returns the version of the library the bindings are built +// against. +// The version is formatted as follows: Major.Minor.Micro +func GetLibraryVersion() (major, minor, micro int) { + return verMajor, verMinor, verMicro +} + +// Syscall functions + +// GetName retrieves the name of a syscall from its number. +// Acts on any syscall number. +// Returns either a string containing the name of the syscall, or an error. +func (s ScmpSyscall) GetName() (string, error) { + return s.GetNameByArch(ArchNative) +} + +// GetNameByArch retrieves the name of a syscall from its number for a given +// architecture. +// Acts on any syscall number. +// Accepts a valid architecture constant. +// Returns either a string containing the name of the syscall, or an error. +// if the syscall is unrecognized or an issue occurred. +func (s ScmpSyscall) GetNameByArch(arch ScmpArch) (string, error) { + if err := sanitizeArch(arch); err != nil { + return "", err + } + + cString := C.seccomp_syscall_resolve_num_arch(arch.toNative(), C.int(s)) + if cString == nil { + return "", fmt.Errorf("could not resolve syscall name") + } + defer C.free(unsafe.Pointer(cString)) + + finalStr := C.GoString(cString) + return finalStr, nil +} + +// GetSyscallFromName returns the number of a syscall by name on the kernel's +// native architecture. +// Accepts a string containing the name of a syscall. +// Returns the number of the syscall, or an error if no syscall with that name +// was found. +func GetSyscallFromName(name string) (ScmpSyscall, error) { + if err := ensureSupportedVersion(); err != nil { + return 0, err + } + + cString := C.CString(name) + defer C.free(unsafe.Pointer(cString)) + + result := C.seccomp_syscall_resolve_name(cString) + if result == scmpError { + return 0, fmt.Errorf("could not resolve name to syscall") + } + + return ScmpSyscall(result), nil +} + +// GetSyscallFromNameByArch returns the number of a syscall by name for a given +// architecture's ABI. +// Accepts the name of a syscall and an architecture constant. +// Returns the number of the syscall, or an error if an invalid architecture is +// passed or a syscall with that name was not found. +func GetSyscallFromNameByArch(name string, arch ScmpArch) (ScmpSyscall, error) { + if err := ensureSupportedVersion(); err != nil { + return 0, err + } + if err := sanitizeArch(arch); err != nil { + return 0, err + } + + cString := C.CString(name) + defer C.free(unsafe.Pointer(cString)) + + result := C.seccomp_syscall_resolve_name_arch(arch.toNative(), cString) + if result == scmpError { + return 0, fmt.Errorf("could not resolve name to syscall") + } + + return ScmpSyscall(result), nil +} + +// MakeCondition creates and returns a new condition to attach to a filter rule. +// Associated rules will only match if this condition is true. +// Accepts the number the argument we are checking, and a comparison operator +// and value to compare to. +// The rule will match if argument $arg (zero-indexed) of the syscall is +// $COMPARE_OP the provided comparison value. +// Some comparison operators accept two values. Masked equals, for example, +// will mask $arg of the syscall with the second value provided (via bitwise +// AND) and then compare against the first value provided. +// For example, in the less than or equal case, if the syscall argument was +// 0 and the value provided was 1, the condition would match, as 0 is less +// than or equal to 1. +// Return either an error on bad argument or a valid ScmpCondition struct. +func MakeCondition(arg uint, comparison ScmpCompareOp, values ...uint64) (ScmpCondition, error) { + var condStruct ScmpCondition + + if err := ensureSupportedVersion(); err != nil { + return condStruct, err + } + + if comparison == CompareInvalid { + return condStruct, fmt.Errorf("invalid comparison operator") + } else if arg > 5 { + return condStruct, fmt.Errorf("syscalls only have up to 6 arguments") + } else if len(values) > 2 { + return condStruct, fmt.Errorf("conditions can have at most 2 arguments") + } else if len(values) == 0 { + return condStruct, fmt.Errorf("must provide at least one value to compare against") + } + + condStruct.Argument = arg + condStruct.Op = comparison + condStruct.Operand1 = values[0] + if len(values) == 2 { + condStruct.Operand2 = values[1] + } else { + condStruct.Operand2 = 0 // Unused + } + + return condStruct, nil +} + +// Utility Functions + +// GetNativeArch returns architecture token representing the native kernel +// architecture +func GetNativeArch() (ScmpArch, error) { + if err := ensureSupportedVersion(); err != nil { + return ArchInvalid, err + } + + arch := C.seccomp_arch_native() + + return archFromNative(arch) +} + +// Public Filter API + +// ScmpFilter represents a filter context in libseccomp. +// A filter context is initially empty. Rules can be added to it, and it can +// then be loaded into the kernel. +type ScmpFilter struct { + filterCtx C.scmp_filter_ctx + valid bool + lock sync.Mutex +} + +// NewFilter creates and returns a new filter context. +// Accepts a default action to be taken for syscalls which match no rules in +// the filter. +// Returns a reference to a valid filter context, or nil and an error if the +// filter context could not be created or an invalid default action was given. +func NewFilter(defaultAction ScmpAction) (*ScmpFilter, error) { + if err := ensureSupportedVersion(); err != nil { + return nil, err + } + + if err := sanitizeAction(defaultAction); err != nil { + return nil, err + } + + fPtr := C.seccomp_init(defaultAction.toNative()) + if fPtr == nil { + return nil, fmt.Errorf("could not create filter") + } + + filter := new(ScmpFilter) + filter.filterCtx = fPtr + filter.valid = true + runtime.SetFinalizer(filter, filterFinalizer) + + return filter, nil +} + +// IsValid determines whether a filter context is valid to use. +// Some operations (Release and Merge) render filter contexts invalid and +// consequently prevent further use. +func (f *ScmpFilter) IsValid() bool { + f.lock.Lock() + defer f.lock.Unlock() + + return f.valid +} + +// Reset resets a filter context, removing all its existing state. +// Accepts a new default action to be taken for syscalls which do not match. +// Returns an error if the filter or action provided are invalid. +func (f *ScmpFilter) Reset(defaultAction ScmpAction) error { + f.lock.Lock() + defer f.lock.Unlock() + + if err := sanitizeAction(defaultAction); err != nil { + return err + } else if !f.valid { + return errBadFilter + } + + retCode := C.seccomp_reset(f.filterCtx, defaultAction.toNative()) + if retCode != 0 { + return syscall.Errno(-1 * retCode) + } + + return nil +} + +// Release releases a filter context, freeing its memory. Should be called after +// loading into the kernel, when the filter is no longer needed. +// After calling this function, the given filter is no longer valid and cannot +// be used. +// Release() will be invoked automatically when a filter context is garbage +// collected, but can also be called manually to free memory. +func (f *ScmpFilter) Release() { + f.lock.Lock() + defer f.lock.Unlock() + + if !f.valid { + return + } + + f.valid = false + C.seccomp_release(f.filterCtx) +} + +// Merge merges two filter contexts. +// The source filter src will be released as part of the process, and will no +// longer be usable or valid after this call. +// To be merged, filters must NOT share any architectures, and all their +// attributes (Default Action, Bad Arch Action, No New Privs and TSync bools) +// must match. +// The filter src will be merged into the filter this is called on. +// The architectures of the src filter not present in the destination, and all +// associated rules, will be added to the destination. +// Returns an error if merging the filters failed. +func (f *ScmpFilter) Merge(src *ScmpFilter) error { + f.lock.Lock() + defer f.lock.Unlock() + + src.lock.Lock() + defer src.lock.Unlock() + + if !src.valid || !f.valid { + return fmt.Errorf("one or more of the filter contexts is invalid or uninitialized") + } + + // Merge the filters + retCode := C.seccomp_merge(f.filterCtx, src.filterCtx) + if syscall.Errno(-1*retCode) == syscall.EINVAL { + return fmt.Errorf("filters could not be merged due to a mismatch in attributes or invalid filter") + } else if retCode != 0 { + return syscall.Errno(-1 * retCode) + } + + src.valid = false + + return nil +} + +// IsArchPresent checks if an architecture is present in a filter. +// If a filter contains an architecture, it uses its default action for +// syscalls which do not match rules in it, and its rules can match syscalls +// for that ABI. +// If a filter does not contain an architecture, all syscalls made to that +// kernel ABI will fail with the filter's default Bad Architecture Action +// (by default, killing the process). +// Accepts an architecture constant. +// Returns true if the architecture is present in the filter, false otherwise, +// and an error on an invalid filter context, architecture constant, or an +// issue with the call to libseccomp. +func (f *ScmpFilter) IsArchPresent(arch ScmpArch) (bool, error) { + f.lock.Lock() + defer f.lock.Unlock() + + if err := sanitizeArch(arch); err != nil { + return false, err + } else if !f.valid { + return false, errBadFilter + } + + retCode := C.seccomp_arch_exist(f.filterCtx, arch.toNative()) + if syscall.Errno(-1*retCode) == syscall.EEXIST { + // -EEXIST is "arch not present" + return false, nil + } else if retCode != 0 { + return false, syscall.Errno(-1 * retCode) + } + + return true, nil +} + +// AddArch adds an architecture to the filter. +// Accepts an architecture constant. +// Returns an error on invalid filter context or architecture token, or an +// issue with the call to libseccomp. +func (f *ScmpFilter) AddArch(arch ScmpArch) error { + f.lock.Lock() + defer f.lock.Unlock() + + if err := sanitizeArch(arch); err != nil { + return err + } else if !f.valid { + return errBadFilter + } + + // Libseccomp returns -EEXIST if the specified architecture is already + // present. Succeed silently in this case, as it's not fatal, and the + // architecture is present already. + retCode := C.seccomp_arch_add(f.filterCtx, arch.toNative()) + if retCode != 0 && syscall.Errno(-1*retCode) != syscall.EEXIST { + return syscall.Errno(-1 * retCode) + } + + return nil +} + +// RemoveArch removes an architecture from the filter. +// Accepts an architecture constant. +// Returns an error on invalid filter context or architecture token, or an +// issue with the call to libseccomp. +func (f *ScmpFilter) RemoveArch(arch ScmpArch) error { + f.lock.Lock() + defer f.lock.Unlock() + + if err := sanitizeArch(arch); err != nil { + return err + } else if !f.valid { + return errBadFilter + } + + // Similar to AddArch, -EEXIST is returned if the arch is not present + // Succeed silently in that case, this is not fatal and the architecture + // is not present in the filter after RemoveArch + retCode := C.seccomp_arch_remove(f.filterCtx, arch.toNative()) + if retCode != 0 && syscall.Errno(-1*retCode) != syscall.EEXIST { + return syscall.Errno(-1 * retCode) + } + + return nil +} + +// Load loads a filter context into the kernel. +// Returns an error if the filter context is invalid or the syscall failed. +func (f *ScmpFilter) Load() error { + f.lock.Lock() + defer f.lock.Unlock() + + if !f.valid { + return errBadFilter + } + + if retCode := C.seccomp_load(f.filterCtx); retCode != 0 { + return syscall.Errno(-1 * retCode) + } + + return nil +} + +// GetDefaultAction returns the default action taken on a syscall which does not +// match a rule in the filter, or an error if an issue was encountered +// retrieving the value. +func (f *ScmpFilter) GetDefaultAction() (ScmpAction, error) { + action, err := f.getFilterAttr(filterAttrActDefault) + if err != nil { + return 0x0, err + } + + return actionFromNative(action) +} + +// GetBadArchAction returns the default action taken on a syscall for an +// architecture not in the filter, or an error if an issue was encountered +// retrieving the value. +func (f *ScmpFilter) GetBadArchAction() (ScmpAction, error) { + action, err := f.getFilterAttr(filterAttrActBadArch) + if err != nil { + return 0x0, err + } + + return actionFromNative(action) +} + +// GetNoNewPrivsBit returns the current state the No New Privileges bit will be set +// to on the filter being loaded, or an error if an issue was encountered +// retrieving the value. +// The No New Privileges bit tells the kernel that new processes run with exec() +// cannot gain more privileges than the process that ran exec(). +// For example, a process with No New Privileges set would be unable to exec +// setuid/setgid executables. +func (f *ScmpFilter) GetNoNewPrivsBit() (bool, error) { + noNewPrivs, err := f.getFilterAttr(filterAttrNNP) + if err != nil { + return false, err + } + + if noNewPrivs == 0 { + return false, nil + } + + return true, nil +} + +// GetTsyncBit returns whether Thread Synchronization will be enabled on the +// filter being loaded, or an error if an issue was encountered retrieving the +// value. +// Thread Sync ensures that all members of the thread group of the calling +// process will share the same Seccomp filter set. +// Tsync is a fairly recent addition to the Linux kernel and older kernels +// lack support. If the running kernel does not support Tsync and it is +// requested in a filter, Libseccomp will not enable TSync support and will +// proceed as normal. +// This function is unavailable before v2.2 of libseccomp and will return an +// error. +func (f *ScmpFilter) GetTsyncBit() (bool, error) { + tSync, err := f.getFilterAttr(filterAttrTsync) + if err != nil { + return false, err + } + + if tSync == 0 { + return false, nil + } + + return true, nil +} + +// SetBadArchAction sets the default action taken on a syscall for an +// architecture not in the filter, or an error if an issue was encountered +// setting the value. +func (f *ScmpFilter) SetBadArchAction(action ScmpAction) error { + if err := sanitizeAction(action); err != nil { + return err + } + + return f.setFilterAttr(filterAttrActBadArch, action.toNative()) +} + +// SetNoNewPrivsBit sets the state of the No New Privileges bit, which will be +// applied on filter load, or an error if an issue was encountered setting the +// value. +// Filters with No New Privileges set to 0 can only be loaded if the process +// has the CAP_SYS_ADMIN capability. +func (f *ScmpFilter) SetNoNewPrivsBit(state bool) error { + var toSet C.uint32_t = 0x0 + + if state { + toSet = 0x1 + } + + return f.setFilterAttr(filterAttrNNP, toSet) +} + +// SetTsync sets whether Thread Synchronization will be enabled on the filter +// being loaded. Returns an error if setting Tsync failed, or the filter is +// invalid. +// Thread Sync ensures that all members of the thread group of the calling +// process will share the same Seccomp filter set. +// Tsync is a fairly recent addition to the Linux kernel and older kernels +// lack support. If the running kernel does not support Tsync and it is +// requested in a filter, Libseccomp will not enable TSync support and will +// proceed as normal. +// This function is unavailable before v2.2 of libseccomp and will return an +// error. +func (f *ScmpFilter) SetTsync(enable bool) error { + var toSet C.uint32_t = 0x0 + + if enable { + toSet = 0x1 + } + + return f.setFilterAttr(filterAttrTsync, toSet) +} + +// SetSyscallPriority sets a syscall's priority. +// This provides a hint to the filter generator in libseccomp about the +// importance of this syscall. High-priority syscalls are placed +// first in the filter code, and incur less overhead (at the expense of +// lower-priority syscalls). +func (f *ScmpFilter) SetSyscallPriority(call ScmpSyscall, priority uint8) error { + f.lock.Lock() + defer f.lock.Unlock() + + if !f.valid { + return errBadFilter + } + + if retCode := C.seccomp_syscall_priority(f.filterCtx, C.int(call), + C.uint8_t(priority)); retCode != 0 { + return syscall.Errno(-1 * retCode) + } + + return nil +} + +// AddRule adds a single rule for an unconditional action on a syscall. +// Accepts the number of the syscall and the action to be taken on the call +// being made. +// Returns an error if an issue was encountered adding the rule. +func (f *ScmpFilter) AddRule(call ScmpSyscall, action ScmpAction) error { + return f.addRuleGeneric(call, action, false, nil) +} + +// AddRuleExact adds a single rule for an unconditional action on a syscall. +// Accepts the number of the syscall and the action to be taken on the call +// being made. +// No modifications will be made to the rule, and it will fail to add if it +// cannot be applied to the current architecture without modification. +// The rule will function exactly as described, but it may not function identically +// (or be able to be applied to) all architectures. +// Returns an error if an issue was encountered adding the rule. +func (f *ScmpFilter) AddRuleExact(call ScmpSyscall, action ScmpAction) error { + return f.addRuleGeneric(call, action, true, nil) +} + +// AddRuleConditional adds a single rule for a conditional action on a syscall. +// Returns an error if an issue was encountered adding the rule. +// All conditions must match for the rule to match. +// There is a bug in library versions below v2.2.1 which can, in some cases, +// cause conditions to be lost when more than one are used. Consequently, +// AddRuleConditional is disabled on library versions lower than v2.2.1 +func (f *ScmpFilter) AddRuleConditional(call ScmpSyscall, action ScmpAction, conds []ScmpCondition) error { + return f.addRuleGeneric(call, action, false, conds) +} + +// AddRuleConditionalExact adds a single rule for a conditional action on a +// syscall. +// No modifications will be made to the rule, and it will fail to add if it +// cannot be applied to the current architecture without modification. +// The rule will function exactly as described, but it may not function identically +// (or be able to be applied to) all architectures. +// Returns an error if an issue was encountered adding the rule. +// There is a bug in library versions below v2.2.1 which can, in some cases, +// cause conditions to be lost when more than one are used. Consequently, +// AddRuleConditionalExact is disabled on library versions lower than v2.2.1 +func (f *ScmpFilter) AddRuleConditionalExact(call ScmpSyscall, action ScmpAction, conds []ScmpCondition) error { + return f.addRuleGeneric(call, action, true, conds) +} + +// ExportPFC output PFC-formatted, human-readable dump of a filter context's +// rules to a file. +// Accepts file to write to (must be open for writing). +// Returns an error if writing to the file fails. +func (f *ScmpFilter) ExportPFC(file *os.File) error { + f.lock.Lock() + defer f.lock.Unlock() + + fd := file.Fd() + + if !f.valid { + return errBadFilter + } + + if retCode := C.seccomp_export_pfc(f.filterCtx, C.int(fd)); retCode != 0 { + return syscall.Errno(-1 * retCode) + } + + return nil +} + +// ExportBPF outputs Berkeley Packet Filter-formatted, kernel-readable dump of a +// filter context's rules to a file. +// Accepts file to write to (must be open for writing). +// Returns an error if writing to the file fails. +func (f *ScmpFilter) ExportBPF(file *os.File) error { + f.lock.Lock() + defer f.lock.Unlock() + + fd := file.Fd() + + if !f.valid { + return errBadFilter + } + + if retCode := C.seccomp_export_bpf(f.filterCtx, C.int(fd)); retCode != 0 { + return syscall.Errno(-1 * retCode) + } + + return nil +} diff --git a/vendor/github.com/seccomp/libseccomp-golang/seccomp_internal.go b/vendor/github.com/seccomp/libseccomp-golang/seccomp_internal.go new file mode 100644 index 0000000000..b0caac91be --- /dev/null +++ b/vendor/github.com/seccomp/libseccomp-golang/seccomp_internal.go @@ -0,0 +1,514 @@ +// +build linux + +// Internal functions for libseccomp Go bindings +// No exported functions + +package seccomp + +import ( + "fmt" + "syscall" +) + +// Unexported C wrapping code - provides the C-Golang interface +// Get the seccomp header in scope +// Need stdlib.h for free() on cstrings + +// #cgo pkg-config: libseccomp +/* +#include +#include + +#if SCMP_VER_MAJOR < 2 +#error Minimum supported version of Libseccomp is v2.1.0 +#elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 1 +#error Minimum supported version of Libseccomp is v2.1.0 +#endif + +#define ARCH_BAD ~0 + +const uint32_t C_ARCH_BAD = ARCH_BAD; + +#ifndef SCMP_ARCH_AARCH64 +#define SCMP_ARCH_AARCH64 ARCH_BAD +#endif + +#ifndef SCMP_ARCH_MIPS +#define SCMP_ARCH_MIPS ARCH_BAD +#endif + +#ifndef SCMP_ARCH_MIPS64 +#define SCMP_ARCH_MIPS64 ARCH_BAD +#endif + +#ifndef SCMP_ARCH_MIPS64N32 +#define SCMP_ARCH_MIPS64N32 ARCH_BAD +#endif + +#ifndef SCMP_ARCH_MIPSEL +#define SCMP_ARCH_MIPSEL ARCH_BAD +#endif + +#ifndef SCMP_ARCH_MIPSEL64 +#define SCMP_ARCH_MIPSEL64 ARCH_BAD +#endif + +#ifndef SCMP_ARCH_MIPSEL64N32 +#define SCMP_ARCH_MIPSEL64N32 ARCH_BAD +#endif + +#ifndef SCMP_ARCH_PPC +#define SCMP_ARCH_PPC ARCH_BAD +#endif + +#ifndef SCMP_ARCH_PPC64 +#define SCMP_ARCH_PPC64 ARCH_BAD +#endif + +#ifndef SCMP_ARCH_PPC64LE +#define SCMP_ARCH_PPC64LE ARCH_BAD +#endif + +#ifndef SCMP_ARCH_S390 +#define SCMP_ARCH_S390 ARCH_BAD +#endif + +#ifndef SCMP_ARCH_S390X +#define SCMP_ARCH_S390X ARCH_BAD +#endif + +const uint32_t C_ARCH_NATIVE = SCMP_ARCH_NATIVE; +const uint32_t C_ARCH_X86 = SCMP_ARCH_X86; +const uint32_t C_ARCH_X86_64 = SCMP_ARCH_X86_64; +const uint32_t C_ARCH_X32 = SCMP_ARCH_X32; +const uint32_t C_ARCH_ARM = SCMP_ARCH_ARM; +const uint32_t C_ARCH_AARCH64 = SCMP_ARCH_AARCH64; +const uint32_t C_ARCH_MIPS = SCMP_ARCH_MIPS; +const uint32_t C_ARCH_MIPS64 = SCMP_ARCH_MIPS64; +const uint32_t C_ARCH_MIPS64N32 = SCMP_ARCH_MIPS64N32; +const uint32_t C_ARCH_MIPSEL = SCMP_ARCH_MIPSEL; +const uint32_t C_ARCH_MIPSEL64 = SCMP_ARCH_MIPSEL64; +const uint32_t C_ARCH_MIPSEL64N32 = SCMP_ARCH_MIPSEL64N32; +const uint32_t C_ARCH_PPC = SCMP_ARCH_PPC; +const uint32_t C_ARCH_PPC64 = SCMP_ARCH_PPC64; +const uint32_t C_ARCH_PPC64LE = SCMP_ARCH_PPC64LE; +const uint32_t C_ARCH_S390 = SCMP_ARCH_S390; +const uint32_t C_ARCH_S390X = SCMP_ARCH_S390X; + +const uint32_t C_ACT_KILL = SCMP_ACT_KILL; +const uint32_t C_ACT_TRAP = SCMP_ACT_TRAP; +const uint32_t C_ACT_ERRNO = SCMP_ACT_ERRNO(0); +const uint32_t C_ACT_TRACE = SCMP_ACT_TRACE(0); +const uint32_t C_ACT_ALLOW = SCMP_ACT_ALLOW; + +// If TSync is not supported, make sure it doesn't map to a supported filter attribute +// Don't worry about major version < 2, the minimum version checks should catch that case +#if SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 2 +#define SCMP_FLTATR_CTL_TSYNC _SCMP_CMP_MIN +#endif + +const uint32_t C_ATTRIBUTE_DEFAULT = (uint32_t)SCMP_FLTATR_ACT_DEFAULT; +const uint32_t C_ATTRIBUTE_BADARCH = (uint32_t)SCMP_FLTATR_ACT_BADARCH; +const uint32_t C_ATTRIBUTE_NNP = (uint32_t)SCMP_FLTATR_CTL_NNP; +const uint32_t C_ATTRIBUTE_TSYNC = (uint32_t)SCMP_FLTATR_CTL_TSYNC; + +const int C_CMP_NE = (int)SCMP_CMP_NE; +const int C_CMP_LT = (int)SCMP_CMP_LT; +const int C_CMP_LE = (int)SCMP_CMP_LE; +const int C_CMP_EQ = (int)SCMP_CMP_EQ; +const int C_CMP_GE = (int)SCMP_CMP_GE; +const int C_CMP_GT = (int)SCMP_CMP_GT; +const int C_CMP_MASKED_EQ = (int)SCMP_CMP_MASKED_EQ; + +const int C_VERSION_MAJOR = SCMP_VER_MAJOR; +const int C_VERSION_MINOR = SCMP_VER_MINOR; +const int C_VERSION_MICRO = SCMP_VER_MICRO; + +typedef struct scmp_arg_cmp* scmp_cast_t; + +// Wrapper to create an scmp_arg_cmp struct +void* +make_struct_arg_cmp( + unsigned int arg, + int compare, + uint64_t a, + uint64_t b + ) +{ + struct scmp_arg_cmp *s = malloc(sizeof(struct scmp_arg_cmp)); + + s->arg = arg; + s->op = compare; + s->datum_a = a; + s->datum_b = b; + + return s; +} +*/ +import "C" + +// Nonexported types +type scmpFilterAttr uint32 + +// Nonexported constants + +const ( + filterAttrActDefault scmpFilterAttr = iota + filterAttrActBadArch scmpFilterAttr = iota + filterAttrNNP scmpFilterAttr = iota + filterAttrTsync scmpFilterAttr = iota +) + +const ( + // An error return from certain libseccomp functions + scmpError C.int = -1 + // Comparison boundaries to check for architecture validity + archStart ScmpArch = ArchNative + archEnd ScmpArch = ArchS390X + // Comparison boundaries to check for action validity + actionStart ScmpAction = ActKill + actionEnd ScmpAction = ActAllow + // Comparison boundaries to check for comparison operator validity + compareOpStart ScmpCompareOp = CompareNotEqual + compareOpEnd ScmpCompareOp = CompareMaskedEqual +) + +var ( + // Error thrown on bad filter context + errBadFilter = fmt.Errorf("filter is invalid or uninitialized") + // Constants representing library major, minor, and micro versions + verMajor = int(C.C_VERSION_MAJOR) + verMinor = int(C.C_VERSION_MINOR) + verMicro = int(C.C_VERSION_MICRO) +) + +// Nonexported functions + +// Check if library version is greater than or equal to the given one +func checkVersionAbove(major, minor, micro int) bool { + return (verMajor > major) || + (verMajor == major && verMinor > minor) || + (verMajor == major && verMinor == minor && verMicro >= micro) +} + +// Ensure that the library is supported, i.e. >= 2.1.0. +func ensureSupportedVersion() error { + if !checkVersionAbove(2, 1, 0) { + return VersionError{} + } + return nil +} + +// Filter helpers + +// Filter finalizer - ensure that kernel context for filters is freed +func filterFinalizer(f *ScmpFilter) { + f.Release() +} + +// Get a raw filter attribute +func (f *ScmpFilter) getFilterAttr(attr scmpFilterAttr) (C.uint32_t, error) { + f.lock.Lock() + defer f.lock.Unlock() + + if !f.valid { + return 0x0, errBadFilter + } + + if !checkVersionAbove(2, 2, 0) && attr == filterAttrTsync { + return 0x0, VersionError{ + message: "thread synchronization attribute is not supported", + minimum: "2.2.0", + } + } + + var attribute C.uint32_t + + retCode := C.seccomp_attr_get(f.filterCtx, attr.toNative(), &attribute) + if retCode != 0 { + return 0x0, syscall.Errno(-1 * retCode) + } + + return attribute, nil +} + +// Set a raw filter attribute +func (f *ScmpFilter) setFilterAttr(attr scmpFilterAttr, value C.uint32_t) error { + f.lock.Lock() + defer f.lock.Unlock() + + if !f.valid { + return errBadFilter + } + + if !checkVersionAbove(2, 2, 0) && attr == filterAttrTsync { + return VersionError{ + message: "thread synchronization attribute is not supported", + minimum: "2.2.0", + } + } + + retCode := C.seccomp_attr_set(f.filterCtx, attr.toNative(), value) + if retCode != 0 { + return syscall.Errno(-1 * retCode) + } + + return nil +} + +// DOES NOT LOCK OR CHECK VALIDITY +// Assumes caller has already done this +// Wrapper for seccomp_rule_add_... functions +func (f *ScmpFilter) addRuleWrapper(call ScmpSyscall, action ScmpAction, exact bool, cond C.scmp_cast_t) error { + var length C.uint + if cond != nil { + length = 1 + } else { + length = 0 + } + + var retCode C.int + if exact { + retCode = C.seccomp_rule_add_exact_array(f.filterCtx, action.toNative(), C.int(call), length, cond) + } else { + retCode = C.seccomp_rule_add_array(f.filterCtx, action.toNative(), C.int(call), length, cond) + } + + if syscall.Errno(-1*retCode) == syscall.EFAULT { + return fmt.Errorf("unrecognized syscall") + } else if syscall.Errno(-1*retCode) == syscall.EPERM { + return fmt.Errorf("requested action matches default action of filter") + } else if retCode != 0 { + return syscall.Errno(-1 * retCode) + } + + return nil +} + +// Generic add function for filter rules +func (f *ScmpFilter) addRuleGeneric(call ScmpSyscall, action ScmpAction, exact bool, conds []ScmpCondition) error { + f.lock.Lock() + defer f.lock.Unlock() + + if !f.valid { + return errBadFilter + } + + if len(conds) == 0 { + if err := f.addRuleWrapper(call, action, exact, nil); err != nil { + return err + } + } else { + // We don't support conditional filtering in library version v2.1 + if !checkVersionAbove(2, 2, 1) { + return VersionError{ + message: "conditional filtering is not supported", + minimum: "2.2.1", + } + } + + for _, cond := range conds { + cmpStruct := C.make_struct_arg_cmp(C.uint(cond.Argument), cond.Op.toNative(), C.uint64_t(cond.Operand1), C.uint64_t(cond.Operand2)) + defer C.free(cmpStruct) + + if err := f.addRuleWrapper(call, action, exact, C.scmp_cast_t(cmpStruct)); err != nil { + return err + } + } + } + + return nil +} + +// Generic Helpers + +// Helper - Sanitize Arch token input +func sanitizeArch(in ScmpArch) error { + if in < archStart || in > archEnd { + return fmt.Errorf("unrecognized architecture") + } + + if in.toNative() == C.C_ARCH_BAD { + return fmt.Errorf("architecture is not supported on this version of the library") + } + + return nil +} + +func sanitizeAction(in ScmpAction) error { + inTmp := in & 0x0000FFFF + if inTmp < actionStart || inTmp > actionEnd { + return fmt.Errorf("unrecognized action") + } + + if inTmp != ActTrace && inTmp != ActErrno && (in&0xFFFF0000) != 0 { + return fmt.Errorf("highest 16 bits must be zeroed except for Trace and Errno") + } + + return nil +} + +func sanitizeCompareOp(in ScmpCompareOp) error { + if in < compareOpStart || in > compareOpEnd { + return fmt.Errorf("unrecognized comparison operator") + } + + return nil +} + +func archFromNative(a C.uint32_t) (ScmpArch, error) { + switch a { + case C.C_ARCH_X86: + return ArchX86, nil + case C.C_ARCH_X86_64: + return ArchAMD64, nil + case C.C_ARCH_X32: + return ArchX32, nil + case C.C_ARCH_ARM: + return ArchARM, nil + case C.C_ARCH_NATIVE: + return ArchNative, nil + case C.C_ARCH_AARCH64: + return ArchARM64, nil + case C.C_ARCH_MIPS: + return ArchMIPS, nil + case C.C_ARCH_MIPS64: + return ArchMIPS64, nil + case C.C_ARCH_MIPS64N32: + return ArchMIPS64N32, nil + case C.C_ARCH_MIPSEL: + return ArchMIPSEL, nil + case C.C_ARCH_MIPSEL64: + return ArchMIPSEL64, nil + case C.C_ARCH_MIPSEL64N32: + return ArchMIPSEL64N32, nil + case C.C_ARCH_PPC: + return ArchPPC, nil + case C.C_ARCH_PPC64: + return ArchPPC64, nil + case C.C_ARCH_PPC64LE: + return ArchPPC64LE, nil + case C.C_ARCH_S390: + return ArchS390, nil + case C.C_ARCH_S390X: + return ArchS390X, nil + default: + return 0x0, fmt.Errorf("unrecognized architecture") + } +} + +// Only use with sanitized arches, no error handling +func (a ScmpArch) toNative() C.uint32_t { + switch a { + case ArchX86: + return C.C_ARCH_X86 + case ArchAMD64: + return C.C_ARCH_X86_64 + case ArchX32: + return C.C_ARCH_X32 + case ArchARM: + return C.C_ARCH_ARM + case ArchARM64: + return C.C_ARCH_AARCH64 + case ArchMIPS: + return C.C_ARCH_MIPS + case ArchMIPS64: + return C.C_ARCH_MIPS64 + case ArchMIPS64N32: + return C.C_ARCH_MIPS64N32 + case ArchMIPSEL: + return C.C_ARCH_MIPSEL + case ArchMIPSEL64: + return C.C_ARCH_MIPSEL64 + case ArchMIPSEL64N32: + return C.C_ARCH_MIPSEL64N32 + case ArchPPC: + return C.C_ARCH_PPC + case ArchPPC64: + return C.C_ARCH_PPC64 + case ArchPPC64LE: + return C.C_ARCH_PPC64LE + case ArchS390: + return C.C_ARCH_S390 + case ArchS390X: + return C.C_ARCH_S390X + case ArchNative: + return C.C_ARCH_NATIVE + default: + return 0x0 + } +} + +// Only use with sanitized ops, no error handling +func (a ScmpCompareOp) toNative() C.int { + switch a { + case CompareNotEqual: + return C.C_CMP_NE + case CompareLess: + return C.C_CMP_LT + case CompareLessOrEqual: + return C.C_CMP_LE + case CompareEqual: + return C.C_CMP_EQ + case CompareGreaterEqual: + return C.C_CMP_GE + case CompareGreater: + return C.C_CMP_GT + case CompareMaskedEqual: + return C.C_CMP_MASKED_EQ + default: + return 0x0 + } +} + +func actionFromNative(a C.uint32_t) (ScmpAction, error) { + aTmp := a & 0xFFFF + switch a & 0xFFFF0000 { + case C.C_ACT_KILL: + return ActKill, nil + case C.C_ACT_TRAP: + return ActTrap, nil + case C.C_ACT_ERRNO: + return ActErrno.SetReturnCode(int16(aTmp)), nil + case C.C_ACT_TRACE: + return ActTrace.SetReturnCode(int16(aTmp)), nil + case C.C_ACT_ALLOW: + return ActAllow, nil + default: + return 0x0, fmt.Errorf("unrecognized action") + } +} + +// Only use with sanitized actions, no error handling +func (a ScmpAction) toNative() C.uint32_t { + switch a & 0xFFFF { + case ActKill: + return C.C_ACT_KILL + case ActTrap: + return C.C_ACT_TRAP + case ActErrno: + return C.C_ACT_ERRNO | (C.uint32_t(a) >> 16) + case ActTrace: + return C.C_ACT_TRACE | (C.uint32_t(a) >> 16) + case ActAllow: + return C.C_ACT_ALLOW + default: + return 0x0 + } +} + +// Internal only, assumes safe attribute +func (a scmpFilterAttr) toNative() uint32 { + switch a { + case filterAttrActDefault: + return uint32(C.C_ATTRIBUTE_DEFAULT) + case filterAttrActBadArch: + return uint32(C.C_ATTRIBUTE_BADARCH) + case filterAttrNNP: + return uint32(C.C_ATTRIBUTE_NNP) + case filterAttrTsync: + return uint32(C.C_ATTRIBUTE_TSYNC) + default: + return 0x0 + } +}