Merge pull request #196 from caoruidong/add-spec

cli: implement spec command
This commit is contained in:
James O. D. Hunt 2018-04-17 18:07:42 +01:00 committed by GitHub
commit a4b7e20457
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 2963 additions and 1 deletions

10
Gopkg.lock generated
View File

@ -114,6 +114,8 @@
name = "github.com/opencontainers/runc" name = "github.com/opencontainers/runc"
packages = [ packages = [
"libcontainer/configs", "libcontainer/configs",
"libcontainer/seccomp",
"libcontainer/specconv",
"libcontainer/utils" "libcontainer/utils"
] ]
revision = "0351df1c5a66838d0c392b4ac4cf9450de844e2d" revision = "0351df1c5a66838d0c392b4ac4cf9450de844e2d"
@ -134,6 +136,12 @@
packages = ["."] packages = ["."]
revision = "79559b488d8848b53a8e34c330140c3fc37ee246" revision = "79559b488d8848b53a8e34c330140c3fc37ee246"
[[projects]]
name = "github.com/seccomp/libseccomp-golang"
packages = ["."]
revision = "e3496e3a417d1dc9ecdceca5af2513271fed37a0"
version = "v0.9.0"
[[projects]] [[projects]]
name = "github.com/sirupsen/logrus" name = "github.com/sirupsen/logrus"
packages = [ packages = [
@ -249,6 +257,6 @@
[solve-meta] [solve-meta]
analyzer-name = "dep" analyzer-name = "dep"
analyzer-version = 1 analyzer-version = 1
inputs-digest = "cd6f57ab58ff674de73039bb4337cd1f2b24d069199704c510f8950e1121f395" inputs-digest = "dd9532d9bae03bb3dc43414908beecb8b498c14bea354e261341ff81dbf01b4d"
solver-name = "gps-cdcl" solver-name = "gps-cdcl"
solver-version = 1 solver-version = 1

View File

@ -119,6 +119,7 @@ var runtimeCommands = []cli.Command{
psCLICommand, psCLICommand,
resumeCLICommand, resumeCLICommand,
runCLICommand, runCLICommand,
specCLICommand,
startCLICommand, startCLICommand,
stateCLICommand, stateCLICommand,
versionCLICommand, versionCLICommand,

105
cli/spec.go Normal file
View File

@ -0,0 +1,105 @@
// Copyright (c) 2014,2015,2016,2017 Docker, Inc.
// Copyright (c) 2018 Huawei Corporation.
//
// SPDX-License-Identifier: Apache-2.0
//
package main
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"github.com/opencontainers/runc/libcontainer/specconv"
"github.com/urfave/cli"
)
var specCLICommand = cli.Command{
Name: "spec",
Usage: "create a new specification file",
ArgsUsage: "",
Description: `The spec command creates the new specification file named "` + specConfig + `" for
the bundle.
The spec generated is just a starter file. Editing of the spec is required to
achieve desired results. For example, the newly generated spec includes an args
parameter that is initially set to call the "sh" command when the container is
started. Calling "sh" may work for an ubuntu container or busybox, but will not
work for containers that do not include the "sh" program.
EXAMPLE:
To run docker's hello-world container one needs to set the args parameter
in the spec to call hello. This can be done using the sed command or a text
editor. The following commands create a bundle for hello-world, change the
default args parameter in the spec from "sh" to "/hello", then run the hello
command in a new hello-world container named container1:
mkdir hello
cd hello
docker pull hello-world
docker export $(docker create hello-world) > hello-world.tar
mkdir rootfs
tar -C rootfs -xf hello-world.tar
kata-runtime spec
sed -i 's;"sh";"/hello";' ` + specConfig + `
kata-runtime run container1
In the run command above, "container1" is the name for the instance of the
container that you are starting. The name you provide for the container instance
must be unique on your host.
An alternative for generating a customized spec config is to use "oci-runtime-tool", the
sub-command "oci-runtime-tool generate" has lots of options that can be used to do any
customizations as you want, see runtime-tools (https://github.com/opencontainers/runtime-tools)
to get more information.
When starting a container through kata-runtime, kata-runtime needs root privilege. If not
already running as root, you can use sudo to give kata-runtime root privilege. For
example: "sudo kata-runtime start container1" will give kata-runtime root privilege to start the
container on your host.
Alternatively, you can start a rootless container, which has the ability to run
without root privileges. For this to work, the specification file needs to be
adjusted accordingly. You can pass the parameter --rootless to this command to
generate a proper rootless spec file.`,
Flags: []cli.Flag{
cli.StringFlag{
Name: "bundle, b",
Value: "",
Usage: "path to the root of the bundle directory",
},
},
Action: func(context *cli.Context) error {
spec := specconv.Example()
checkNoFile := func(name string) error {
_, err := os.Stat(name)
if err == nil {
return fmt.Errorf("File %s exists. Remove it first", name)
}
if !os.IsNotExist(err) {
return err
}
return nil
}
bundle := context.String("bundle")
if bundle != "" {
if err := os.Chdir(bundle); err != nil {
return err
}
}
if err := checkNoFile(specConfig); err != nil {
return err
}
data, err := json.MarshalIndent(spec, "", "\t")
if err != nil {
return err
}
if err := ioutil.WriteFile(specConfig, data, 0640); err != nil {
return err
}
return nil
},
}

35
cli/spec_test.go Normal file
View File

@ -0,0 +1,35 @@
// Copyright (c) 2018 Huawei Corporation.
//
// SPDX-License-Identifier: Apache-2.0
//
package main
import (
"flag"
"os"
"testing"
"github.com/stretchr/testify/assert"
"github.com/urfave/cli"
)
func TestSpecCliAction(t *testing.T) {
assert := assert.New(t)
actionFunc, ok := specCLICommand.Action.(func(context *cli.Context) error)
assert.True(ok)
flagSet := flag.NewFlagSet("flag", flag.ContinueOnError)
ctx := cli.NewContext(&cli.App{}, flagSet, nil)
defer os.Remove(specConfig)
err := actionFunc(ctx)
assert.NoError(err)
pattern := "gid=5"
patternRootless := "uidMappings"
err = grep(pattern, specConfig)
assert.NoError(err)
err = grep(patternRootless, specConfig)
assert.Error(err)
}

View File

@ -0,0 +1,76 @@
package seccomp
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/configs"
)
var operators = map[string]configs.Operator{
"SCMP_CMP_NE": configs.NotEqualTo,
"SCMP_CMP_LT": configs.LessThan,
"SCMP_CMP_LE": configs.LessThanOrEqualTo,
"SCMP_CMP_EQ": configs.EqualTo,
"SCMP_CMP_GE": configs.GreaterThanOrEqualTo,
"SCMP_CMP_GT": configs.GreaterThan,
"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
var actions = map[string]configs.Action{
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
}
var archs = map[string]string{
"SCMP_ARCH_X86": "x86",
"SCMP_ARCH_X86_64": "amd64",
"SCMP_ARCH_X32": "x32",
"SCMP_ARCH_ARM": "arm",
"SCMP_ARCH_AARCH64": "arm64",
"SCMP_ARCH_MIPS": "mips",
"SCMP_ARCH_MIPS64": "mips64",
"SCMP_ARCH_MIPS64N32": "mips64n32",
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
// Comparison operators use the names they are assigned by Libseccomp's header.
// Attempting to convert a string that is not a valid operator results in an
// error.
func ConvertStringToOperator(in string) (configs.Operator, error) {
if op, ok := operators[in]; ok == true {
return op, nil
}
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
}
// ConvertStringToAction converts a string into a Seccomp rule match action.
// Actions use the names they are assigned in Libseccomp's header, though some
// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
// return errors.
// Attempting to convert a string that is not a valid action results in an
// error.
func ConvertStringToAction(in string) (configs.Action, error) {
if act, ok := actions[in]; ok == true {
return act, nil
}
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
}
// ConvertStringToArch converts a string into a Seccomp comparison arch.
func ConvertStringToArch(in string) (string, error) {
if arch, ok := archs[in]; ok == true {
return arch, nil
}
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
}

View File

@ -0,0 +1,227 @@
// +build linux,cgo,seccomp
package seccomp
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
libseccomp "github.com/seccomp/libseccomp-golang"
"golang.org/x/sys/unix"
)
var (
actAllow = libseccomp.ActAllow
actTrap = libseccomp.ActTrap
actKill = libseccomp.ActKill
actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)
// Filters given syscalls in a container, preventing them from being used
// Started in the container init process, and carried over to all child processes
// Setns calls, however, require a separate invocation, as they are not children
// of the init until they join the namespace
func InitSeccomp(config *configs.Seccomp) error {
if config == nil {
return fmt.Errorf("cannot initialize Seccomp - nil config passed")
}
defaultAction, err := getAction(config.DefaultAction)
if err != nil {
return fmt.Errorf("error initializing seccomp - invalid default action")
}
filter, err := libseccomp.NewFilter(defaultAction)
if err != nil {
return fmt.Errorf("error creating filter: %s", err)
}
// Add extra architectures
for _, arch := range config.Architectures {
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
return err
}
if err := filter.AddArch(scmpArch); err != nil {
return err
}
}
// Unset no new privs bit
if err := filter.SetNoNewPrivsBit(false); err != nil {
return fmt.Errorf("error setting no new privileges: %s", err)
}
// Add a rule for each syscall
for _, call := range config.Syscalls {
if call == nil {
return fmt.Errorf("encountered nil syscall while initializing Seccomp")
}
if err = matchCall(filter, call); err != nil {
return err
}
}
if err = filter.Load(); err != nil {
return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
}
return nil
}
// IsEnabled returns if the kernel has been configured to support seccomp.
func IsEnabled() bool {
// Try to read from /proc/self/status for kernels > 3.8
s, err := parseStatusFile("/proc/self/status")
if err != nil {
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL {
return true
}
}
return false
}
_, ok := s["Seccomp"]
return ok
}
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
switch act {
case configs.Kill:
return actKill, nil
case configs.Errno:
return actErrno, nil
case configs.Trap:
return actTrap, nil
case configs.Allow:
return actAllow, nil
case configs.Trace:
return actTrace, nil
default:
return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
}
}
// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
switch op {
case configs.EqualTo:
return libseccomp.CompareEqual, nil
case configs.NotEqualTo:
return libseccomp.CompareNotEqual, nil
case configs.GreaterThan:
return libseccomp.CompareGreater, nil
case configs.GreaterThanOrEqualTo:
return libseccomp.CompareGreaterEqual, nil
case configs.LessThan:
return libseccomp.CompareLess, nil
case configs.LessThanOrEqualTo:
return libseccomp.CompareLessOrEqual, nil
case configs.MaskEqualTo:
return libseccomp.CompareMaskedEqual, nil
default:
return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
}
}
// Convert Libcontainer Arg to Libseccomp ScmpCondition
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
cond := libseccomp.ScmpCondition{}
if arg == nil {
return cond, fmt.Errorf("cannot convert nil to syscall condition")
}
op, err := getOperator(arg.Op)
if err != nil {
return cond, err
}
return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
}
// Add a rule to match a single syscall
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
if call == nil || filter == nil {
return fmt.Errorf("cannot use nil as syscall to block")
}
if len(call.Name) == 0 {
return fmt.Errorf("empty string is not a valid syscall")
}
// If we can't resolve the syscall, assume it's not supported on this kernel
// Ignore it, don't error out
callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil {
return nil
}
// Convert the call's action to the libseccomp equivalent
callAct, err := getAction(call.Action)
if err != nil {
return err
}
// Unconditional match - just add the rule
if len(call.Args) == 0 {
if err = filter.AddRule(callNum, callAct); err != nil {
return err
}
} else {
// Conditional match - convert the per-arg rules into library format
conditions := []libseccomp.ScmpCondition{}
for _, cond := range call.Args {
newCond, err := getCondition(cond)
if err != nil {
return err
}
conditions = append(conditions, newCond)
}
if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
return err
}
}
return nil
}
func parseStatusFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
status := make(map[string]string)
for s.Scan() {
text := s.Text()
parts := strings.Split(text, ":")
if len(parts) <= 1 {
continue
}
status[parts[0]] = parts[1]
}
if err := s.Err(); err != nil {
return nil, err
}
return status, nil
}

View File

@ -0,0 +1,24 @@
// +build !linux !cgo !seccomp
package seccomp
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error {
if config != nil {
return ErrSeccompNotEnabled
}
return nil
}
// IsEnabled returns false, because it is not supported.
func IsEnabled() bool {
return false
}

View File

@ -0,0 +1,220 @@
package specconv
import (
"os"
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Example returns an example spec file, with many options set so a user can
// see what a standard spec file looks like.
func Example() *specs.Spec {
return &specs.Spec{
Version: specs.Version,
Root: &specs.Root{
Path: "rootfs",
Readonly: true,
},
Process: &specs.Process{
Terminal: true,
User: specs.User{},
Args: []string{
"sh",
},
Env: []string{
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm",
},
Cwd: "/",
NoNewPrivileges: true,
Capabilities: &specs.LinuxCapabilities{
Bounding: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Permitted: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Inheritable: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Ambient: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Effective: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
},
Rlimits: []specs.POSIXRlimit{
{
Type: "RLIMIT_NOFILE",
Hard: uint64(1024),
Soft: uint64(1024),
},
},
},
Hostname: "runc",
Mounts: []specs.Mount{
{
Destination: "/proc",
Type: "proc",
Source: "proc",
Options: nil,
},
{
Destination: "/dev",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
},
{
Destination: "/dev/shm",
Type: "tmpfs",
Source: "shm",
Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
},
{
Destination: "/dev/mqueue",
Type: "mqueue",
Source: "mqueue",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
},
{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
},
},
Linux: &specs.Linux{
MaskedPaths: []string{
"/proc/kcore",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware",
},
ReadonlyPaths: []string{
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger",
},
Resources: &specs.LinuxResources{
Devices: []specs.LinuxDeviceCgroup{
{
Allow: false,
Access: "rwm",
},
},
},
Namespaces: []specs.LinuxNamespace{
{
Type: "pid",
},
{
Type: "network",
},
{
Type: "ipc",
},
{
Type: "uts",
},
{
Type: "mount",
},
},
},
}
}
// ExampleRootless returns an example spec file that works with rootless
// containers. It's essentially a modified version of the specfile from
// Example().
func ToRootless(spec *specs.Spec) {
var namespaces []specs.LinuxNamespace
// Remove networkns from the spec.
for _, ns := range spec.Linux.Namespaces {
switch ns.Type {
case specs.NetworkNamespace, specs.UserNamespace:
// Do nothing.
default:
namespaces = append(namespaces, ns)
}
}
// Add userns to the spec.
namespaces = append(namespaces, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
spec.Linux.Namespaces = namespaces
// Add mappings for the current user.
spec.Linux.UIDMappings = []specs.LinuxIDMapping{{
HostID: uint32(os.Geteuid()),
ContainerID: 0,
Size: 1,
}}
spec.Linux.GIDMappings = []specs.LinuxIDMapping{{
HostID: uint32(os.Getegid()),
ContainerID: 0,
Size: 1,
}}
// Fix up mounts.
var mounts []specs.Mount
for _, mount := range spec.Mounts {
// Ignore all mounts that are under /sys.
if strings.HasPrefix(mount.Destination, "/sys") {
continue
}
// Remove all gid= and uid= mappings.
var options []string
for _, option := range mount.Options {
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
options = append(options, option)
}
}
mount.Options = options
mounts = append(mounts, mount)
}
// Add the sysfs mount as an rbind.
mounts = append(mounts, specs.Mount{
Source: "/sys",
Destination: "/sys",
Type: "none",
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
})
spec.Mounts = mounts
// Remove cgroup settings.
spec.Linux.Resources = nil
}

View File

@ -0,0 +1,828 @@
// +build linux
// Package specconv implements conversion of specifications to libcontainer
// configurations
package specconv
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
const wildcard = -1
var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
specs.PIDNamespace: configs.NEWPID,
specs.NetworkNamespace: configs.NEWNET,
specs.MountNamespace: configs.NEWNS,
specs.UserNamespace: configs.NEWUSER,
specs.IPCNamespace: configs.NEWIPC,
specs.UTSNamespace: configs.NEWUTS,
}
var mountPropagationMapping = map[string]int{
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
"private": unix.MS_PRIVATE,
"rslave": unix.MS_SLAVE | unix.MS_REC,
"slave": unix.MS_SLAVE,
"rshared": unix.MS_SHARED | unix.MS_REC,
"shared": unix.MS_SHARED,
"": 0,
}
var allowedDevices = []*configs.Device{
// allow mknod for any device
{
Type: 'c',
Major: wildcard,
Minor: wildcard,
Permissions: "m",
Allow: true,
},
{
Type: 'b',
Major: wildcard,
Minor: wildcard,
Permissions: "m",
Allow: true,
},
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
Permissions: "rwm",
Allow: true,
},
{
Path: "/dev/console",
Type: 'c',
Major: 5,
Minor: 1,
Permissions: "rwm",
Allow: true,
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Path: "",
Type: 'c',
Major: 136,
Minor: wildcard,
Permissions: "rwm",
Allow: true,
},
{
Path: "",
Type: 'c',
Major: 5,
Minor: 2,
Permissions: "rwm",
Allow: true,
},
// tuntap
{
Path: "",
Type: 'c',
Major: 10,
Minor: 200,
Permissions: "rwm",
Allow: true,
},
}
type CreateOpts struct {
CgroupName string
UseSystemdCgroup bool
NoPivotRoot bool
NoNewKeyring bool
Spec *specs.Spec
Rootless bool
}
// CreateLibcontainerConfig creates a new libcontainer configuration from a
// given specification and a cgroup name
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
// runc's cwd will always be the bundle path
rcwd, err := os.Getwd()
if err != nil {
return nil, err
}
cwd, err := filepath.Abs(rcwd)
if err != nil {
return nil, err
}
spec := opts.Spec
if spec.Root == nil {
return nil, fmt.Errorf("Root must be specified")
}
rootfsPath := spec.Root.Path
if !filepath.IsAbs(rootfsPath) {
rootfsPath = filepath.Join(cwd, rootfsPath)
}
labels := []string{}
for k, v := range spec.Annotations {
labels = append(labels, fmt.Sprintf("%s=%s", k, v))
}
config := &configs.Config{
Rootfs: rootfsPath,
NoPivotRoot: opts.NoPivotRoot,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
NoNewKeyring: opts.NoNewKeyring,
Rootless: opts.Rootless,
}
exists := false
for _, m := range spec.Mounts {
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
}
if err := createDevices(spec, config); err != nil {
return nil, err
}
if err := setupUserNamespace(spec, config); err != nil {
return nil, err
}
c, err := createCgroupConfig(opts)
if err != nil {
return nil, err
}
config.Cgroups = c
// set linux-specific config
if spec.Linux != nil {
if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
}
for _, ns := range spec.Linux.Namespaces {
t, exists := namespaceMapping[ns.Type]
if !exists {
return nil, fmt.Errorf("namespace %q does not exist", ns)
}
if config.Namespaces.Contains(t) {
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.Contains(configs.NEWNET) {
config.Networks = []*configs.Network{
{
Type: "loopback",
},
}
}
config.MaskPaths = spec.Linux.MaskedPaths
config.ReadonlyPaths = spec.Linux.ReadonlyPaths
config.MountLabel = spec.Linux.MountLabel
config.Sysctl = spec.Linux.Sysctl
if spec.Linux.Seccomp != nil {
seccomp, err := setupSeccomp(spec.Linux.Seccomp)
if err != nil {
return nil, err
}
config.Seccomp = seccomp
}
}
if spec.Process.SelinuxLabel != "" {
config.ProcessLabel = spec.Process.SelinuxLabel
}
if spec.Process != nil && spec.Process.OOMScoreAdj != nil {
config.OomScoreAdj = *spec.Process.OOMScoreAdj
}
if spec.Process.Capabilities != nil {
config.Capabilities = &configs.Capabilities{
Bounding: spec.Process.Capabilities.Bounding,
Effective: spec.Process.Capabilities.Effective,
Permitted: spec.Process.Capabilities.Permitted,
Inheritable: spec.Process.Capabilities.Inheritable,
Ambient: spec.Process.Capabilities.Ambient,
}
}
createHooks(spec, config)
config.Version = specs.Version
if spec.Linux.IntelRdt != nil {
config.IntelRdt = &configs.IntelRdt{}
if spec.Linux.IntelRdt.L3CacheSchema != "" {
config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
}
}
return config, nil
}
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
flags, pgflags, data, ext := parseMountOptions(m.Options)
source := m.Source
if m.Type == "bind" {
if !filepath.IsAbs(source) {
source = filepath.Join(cwd, m.Source)
}
}
return &configs.Mount{
Device: m.Type,
Source: source,
Destination: m.Destination,
Data: data,
Flags: flags,
PropagationFlags: pgflags,
Extensions: ext,
}
}
func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
var (
myCgroupPath string
spec = opts.Spec
useSystemdCgroup = opts.UseSystemdCgroup
name = opts.CgroupName
)
c := &configs.Cgroup{
Resources: &configs.Resources{},
}
if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
if useSystemdCgroup {
myCgroupPath = spec.Linux.CgroupsPath
}
}
if useSystemdCgroup {
if myCgroupPath == "" {
c.Parent = "system.slice"
c.ScopePrefix = "runc"
c.Name = name
} else {
// Parse the path from expected "slice:prefix:name"
// for e.g. "system.slice:docker:1234"
parts := strings.Split(myCgroupPath, ":")
if len(parts) != 3 {
return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups")
}
c.Parent = parts[0]
c.ScopePrefix = parts[1]
c.Name = parts[2]
}
} else {
if myCgroupPath == "" {
c.Name = name
}
c.Path = myCgroupPath
}
// In rootless containers, any attempt to make cgroup changes will fail.
// libcontainer will validate this and we shouldn't add any cgroup options
// the user didn't specify.
if !opts.Rootless {
c.Resources.AllowedDevices = allowedDevices
}
if spec.Linux != nil {
r := spec.Linux.Resources
if r == nil {
return c, nil
}
for i, d := range spec.Linux.Resources.Devices {
var (
t = "a"
major = int64(-1)
minor = int64(-1)
)
if d.Type != "" {
t = d.Type
}
if d.Major != nil {
major = *d.Major
}
if d.Minor != nil {
minor = *d.Minor
}
if d.Access == "" {
return nil, fmt.Errorf("device access at %d field cannot be empty", i)
}
dt, err := stringToCgroupDeviceRune(t)
if err != nil {
return nil, err
}
dd := &configs.Device{
Type: dt,
Major: major,
Minor: minor,
Permissions: d.Access,
Allow: d.Allow,
}
c.Resources.Devices = append(c.Resources.Devices, dd)
}
if r.Memory != nil {
if r.Memory.Limit != nil {
c.Resources.Memory = *r.Memory.Limit
}
if r.Memory.Reservation != nil {
c.Resources.MemoryReservation = *r.Memory.Reservation
}
if r.Memory.Swap != nil {
c.Resources.MemorySwap = *r.Memory.Swap
}
if r.Memory.Kernel != nil {
c.Resources.KernelMemory = *r.Memory.Kernel
}
if r.Memory.KernelTCP != nil {
c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
}
if r.Memory.Swappiness != nil {
c.Resources.MemorySwappiness = r.Memory.Swappiness
}
if r.Memory.DisableOOMKiller != nil {
c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
}
}
if r.CPU != nil {
if r.CPU.Shares != nil {
c.Resources.CpuShares = *r.CPU.Shares
}
if r.CPU.Quota != nil {
c.Resources.CpuQuota = *r.CPU.Quota
}
if r.CPU.Period != nil {
c.Resources.CpuPeriod = *r.CPU.Period
}
if r.CPU.RealtimeRuntime != nil {
c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
}
if r.CPU.RealtimePeriod != nil {
c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
}
if r.CPU.Cpus != "" {
c.Resources.CpusetCpus = r.CPU.Cpus
}
if r.CPU.Mems != "" {
c.Resources.CpusetMems = r.CPU.Mems
}
}
if r.Pids != nil {
c.Resources.PidsLimit = r.Pids.Limit
}
if r.BlockIO != nil {
if r.BlockIO.Weight != nil {
c.Resources.BlkioWeight = *r.BlockIO.Weight
}
if r.BlockIO.LeafWeight != nil {
c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
}
if r.BlockIO.WeightDevice != nil {
for _, wd := range r.BlockIO.WeightDevice {
var weight, leafWeight uint16
if wd.Weight != nil {
weight = *wd.Weight
}
if wd.LeafWeight != nil {
leafWeight = *wd.LeafWeight
}
weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
}
}
if r.BlockIO.ThrottleReadBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleReadBpsDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleReadIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
}
}
}
for _, l := range r.HugepageLimits {
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
Pagesize: l.Pagesize,
Limit: l.Limit,
})
}
if r.Network != nil {
if r.Network.ClassID != nil {
c.Resources.NetClsClassid = *r.Network.ClassID
}
for _, m := range r.Network.Priorities {
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
Interface: m.Name,
Priority: int64(m.Priority),
})
}
}
}
if !opts.Rootless {
// append the default allowed devices to the end of the list
c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
}
return c, nil
}
func stringToCgroupDeviceRune(s string) (rune, error) {
switch s {
case "a":
return 'a', nil
case "b":
return 'b', nil
case "c":
return 'c', nil
default:
return 0, fmt.Errorf("invalid cgroup device type %q", s)
}
}
func stringToDeviceRune(s string) (rune, error) {
switch s {
case "p":
return 'p', nil
case "u":
return 'u', nil
case "b":
return 'b', nil
case "c":
return 'c', nil
default:
return 0, fmt.Errorf("invalid device type %q", s)
}
}
func createDevices(spec *specs.Spec, config *configs.Config) error {
// add whitelisted devices
config.Devices = []*configs.Device{
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
}
// merge in additional devices from the spec
if spec.Linux != nil {
for _, d := range spec.Linux.Devices {
var uid, gid uint32
var filemode os.FileMode = 0666
if d.UID != nil {
uid = *d.UID
}
if d.GID != nil {
gid = *d.GID
}
dt, err := stringToDeviceRune(d.Type)
if err != nil {
return err
}
if d.FileMode != nil {
filemode = *d.FileMode
}
device := &configs.Device{
Type: dt,
Path: d.Path,
Major: d.Major,
Minor: d.Minor,
FileMode: filemode,
Uid: uid,
Gid: gid,
}
config.Devices = append(config.Devices, device)
}
}
return nil
}
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
create := func(m specs.LinuxIDMapping) configs.IDMap {
return configs.IDMap{
HostID: int(m.HostID),
ContainerID: int(m.ContainerID),
Size: int(m.Size),
}
}
if spec.Linux != nil {
if len(spec.Linux.UIDMappings) == 0 {
return nil
}
for _, m := range spec.Linux.UIDMappings {
config.UidMappings = append(config.UidMappings, create(m))
}
for _, m := range spec.Linux.GIDMappings {
config.GidMappings = append(config.GidMappings, create(m))
}
}
rootUID, err := config.HostRootUID()
if err != nil {
return err
}
rootGID, err := config.HostRootGID()
if err != nil {
return err
}
for _, node := range config.Devices {
node.Uid = uint32(rootUID)
node.Gid = uint32(rootGID)
}
return nil
}
// parseMountOptions parses the string and returns the flags, propagation
// flags and any mount data that it contains.
func parseMountOptions(options []string) (int, []int, string, int) {
var (
flag int
pgflag []int
data []string
extFlags int
)
flags := map[string]struct {
clear bool
flag int
}{
"acl": {false, unix.MS_POSIXACL},
"async": {true, unix.MS_SYNCHRONOUS},
"atime": {true, unix.MS_NOATIME},
"bind": {false, unix.MS_BIND},
"defaults": {false, 0},
"dev": {true, unix.MS_NODEV},
"diratime": {true, unix.MS_NODIRATIME},
"dirsync": {false, unix.MS_DIRSYNC},
"exec": {true, unix.MS_NOEXEC},
"iversion": {false, unix.MS_I_VERSION},
"lazytime": {false, unix.MS_LAZYTIME},
"loud": {true, unix.MS_SILENT},
"mand": {false, unix.MS_MANDLOCK},
"noacl": {true, unix.MS_POSIXACL},
"noatime": {false, unix.MS_NOATIME},
"nodev": {false, unix.MS_NODEV},
"nodiratime": {false, unix.MS_NODIRATIME},
"noexec": {false, unix.MS_NOEXEC},
"noiversion": {true, unix.MS_I_VERSION},
"nolazytime": {true, unix.MS_LAZYTIME},
"nomand": {true, unix.MS_MANDLOCK},
"norelatime": {true, unix.MS_RELATIME},
"nostrictatime": {true, unix.MS_STRICTATIME},
"nosuid": {false, unix.MS_NOSUID},
"rbind": {false, unix.MS_BIND | unix.MS_REC},
"relatime": {false, unix.MS_RELATIME},
"remount": {false, unix.MS_REMOUNT},
"ro": {false, unix.MS_RDONLY},
"rw": {true, unix.MS_RDONLY},
"silent": {false, unix.MS_SILENT},
"strictatime": {false, unix.MS_STRICTATIME},
"suid": {true, unix.MS_NOSUID},
"sync": {false, unix.MS_SYNCHRONOUS},
}
propagationFlags := map[string]int{
"private": unix.MS_PRIVATE,
"shared": unix.MS_SHARED,
"slave": unix.MS_SLAVE,
"unbindable": unix.MS_UNBINDABLE,
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
"rshared": unix.MS_SHARED | unix.MS_REC,
"rslave": unix.MS_SLAVE | unix.MS_REC,
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
}
extensionFlags := map[string]struct {
clear bool
flag int
}{
"tmpcopyup": {false, configs.EXT_COPYUP},
}
for _, o := range options {
// If the option does not exist in the flags table or the flag
// is not supported on the platform,
// then it is a data value for a specific fs type
if f, exists := flags[o]; exists && f.flag != 0 {
if f.clear {
flag &= ^f.flag
} else {
flag |= f.flag
}
} else if f, exists := propagationFlags[o]; exists && f != 0 {
pgflag = append(pgflag, f)
} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
if f.clear {
extFlags &= ^f.flag
} else {
extFlags |= f.flag
}
} else {
data = append(data, o)
}
}
return flag, pgflag, strings.Join(data, ","), extFlags
}
func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
if config == nil {
return nil, nil
}
// No default action specified, no syscalls listed, assume seccomp disabled
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
return nil, nil
}
newConfig := new(configs.Seccomp)
newConfig.Syscalls = []*configs.Syscall{}
if len(config.Architectures) > 0 {
newConfig.Architectures = []string{}
for _, arch := range config.Architectures {
newArch, err := seccomp.ConvertStringToArch(string(arch))
if err != nil {
return nil, err
}
newConfig.Architectures = append(newConfig.Architectures, newArch)
}
}
// Convert default action from string representation
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
if err != nil {
return nil, err
}
newConfig.DefaultAction = newDefaultAction
// Loop through all syscall blocks and convert them to libcontainer format
for _, call := range config.Syscalls {
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
if err != nil {
return nil, err
}
for _, name := range call.Names {
newCall := configs.Syscall{
Name: name,
Action: newAction,
Args: []*configs.Arg{},
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range call.Args {
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
if err != nil {
return nil, err
}
newArg := configs.Arg{
Index: arg.Index,
Value: arg.Value,
ValueTwo: arg.ValueTwo,
Op: newOp,
}
newCall.Args = append(newCall.Args, &newArg)
}
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
}
}
return newConfig, nil
}
func createHooks(rspec *specs.Spec, config *configs.Config) {
config.Hooks = &configs.Hooks{}
if rspec.Hooks != nil {
for _, h := range rspec.Hooks.Prestart {
cmd := createCommandHook(h)
config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststart {
cmd := createCommandHook(h)
config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststop {
cmd := createCommandHook(h)
config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
}
}
}
func createCommandHook(h specs.Hook) configs.Command {
cmd := configs.Command{
Path: h.Path,
Args: h.Args,
Env: h.Env,
}
if h.Timeout != nil {
d := time.Duration(*h.Timeout) * time.Second
cmd.Timeout = &d
}
return cmd
}

22
vendor/github.com/seccomp/libseccomp-golang/LICENSE generated vendored Normal file
View File

@ -0,0 +1,22 @@
Copyright (c) 2015 Matthew Heon <mheon@redhat.com>
Copyright (c) 2015 Paul Moore <pmoore@redhat.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

902
vendor/github.com/seccomp/libseccomp-golang/seccomp.go generated vendored Normal file
View File

@ -0,0 +1,902 @@
// +build linux
// Public API specification for libseccomp Go bindings
// Contains public API for the bindings
// Package seccomp provides bindings for libseccomp, a library wrapping the Linux
// seccomp syscall. Seccomp enables an application to restrict system call use
// for itself and its children.
package seccomp
import (
"fmt"
"os"
"runtime"
"strings"
"sync"
"syscall"
"unsafe"
)
// C wrapping code
// #cgo pkg-config: libseccomp
// #include <stdlib.h>
// #include <seccomp.h>
import "C"
// Exported types
// VersionError denotes that the system libseccomp version is incompatible
// with this package.
type VersionError struct {
message string
minimum string
}
func (e VersionError) Error() string {
format := "Libseccomp version too low: "
if e.message != "" {
format += e.message + ": "
}
format += "minimum supported is "
if e.minimum != "" {
format += e.minimum + ": "
} else {
format += "2.1.0: "
}
format += "detected %d.%d.%d"
return fmt.Sprintf(format, verMajor, verMinor, verMicro)
}
// ScmpArch represents a CPU architecture. Seccomp can restrict syscalls on a
// per-architecture basis.
type ScmpArch uint
// ScmpAction represents an action to be taken on a filter rule match in
// libseccomp
type ScmpAction uint
// ScmpCompareOp represents a comparison operator which can be used in a filter
// rule
type ScmpCompareOp uint
// ScmpCondition represents a rule in a libseccomp filter context
type ScmpCondition struct {
Argument uint `json:"argument,omitempty"`
Op ScmpCompareOp `json:"operator,omitempty"`
Operand1 uint64 `json:"operand_one,omitempty"`
Operand2 uint64 `json:"operand_two,omitempty"`
}
// ScmpSyscall represents a Linux System Call
type ScmpSyscall int32
// Exported Constants
const (
// Valid architectures recognized by libseccomp
// ARM64 and all MIPS architectures are unsupported by versions of the
// library before v2.2 and will return errors if used
// ArchInvalid is a placeholder to ensure uninitialized ScmpArch
// variables are invalid
ArchInvalid ScmpArch = iota
// ArchNative is the native architecture of the kernel
ArchNative ScmpArch = iota
// ArchX86 represents 32-bit x86 syscalls
ArchX86 ScmpArch = iota
// ArchAMD64 represents 64-bit x86-64 syscalls
ArchAMD64 ScmpArch = iota
// ArchX32 represents 64-bit x86-64 syscalls (32-bit pointers)
ArchX32 ScmpArch = iota
// ArchARM represents 32-bit ARM syscalls
ArchARM ScmpArch = iota
// ArchARM64 represents 64-bit ARM syscalls
ArchARM64 ScmpArch = iota
// ArchMIPS represents 32-bit MIPS syscalls
ArchMIPS ScmpArch = iota
// ArchMIPS64 represents 64-bit MIPS syscalls
ArchMIPS64 ScmpArch = iota
// ArchMIPS64N32 represents 64-bit MIPS syscalls (32-bit pointers)
ArchMIPS64N32 ScmpArch = iota
// ArchMIPSEL represents 32-bit MIPS syscalls (little endian)
ArchMIPSEL ScmpArch = iota
// ArchMIPSEL64 represents 64-bit MIPS syscalls (little endian)
ArchMIPSEL64 ScmpArch = iota
// ArchMIPSEL64N32 represents 64-bit MIPS syscalls (little endian,
// 32-bit pointers)
ArchMIPSEL64N32 ScmpArch = iota
// ArchPPC represents 32-bit POWERPC syscalls
ArchPPC ScmpArch = iota
// ArchPPC64 represents 64-bit POWER syscalls (big endian)
ArchPPC64 ScmpArch = iota
// ArchPPC64LE represents 64-bit POWER syscalls (little endian)
ArchPPC64LE ScmpArch = iota
// ArchS390 represents 31-bit System z/390 syscalls
ArchS390 ScmpArch = iota
// ArchS390X represents 64-bit System z/390 syscalls
ArchS390X ScmpArch = iota
)
const (
// Supported actions on filter match
// ActInvalid is a placeholder to ensure uninitialized ScmpAction
// variables are invalid
ActInvalid ScmpAction = iota
// ActKill kills the process
ActKill ScmpAction = iota
// ActTrap throws SIGSYS
ActTrap ScmpAction = iota
// ActErrno causes the syscall to return a negative error code. This
// code can be set with the SetReturnCode method
ActErrno ScmpAction = iota
// ActTrace causes the syscall to notify tracing processes with the
// given error code. This code can be set with the SetReturnCode method
ActTrace ScmpAction = iota
// ActAllow permits the syscall to continue execution
ActAllow ScmpAction = iota
)
const (
// These are comparison operators used in conditional seccomp rules
// They are used to compare the value of a single argument of a syscall
// against a user-defined constant
// CompareInvalid is a placeholder to ensure uninitialized ScmpCompareOp
// variables are invalid
CompareInvalid ScmpCompareOp = iota
// CompareNotEqual returns true if the argument is not equal to the
// given value
CompareNotEqual ScmpCompareOp = iota
// CompareLess returns true if the argument is less than the given value
CompareLess ScmpCompareOp = iota
// CompareLessOrEqual returns true if the argument is less than or equal
// to the given value
CompareLessOrEqual ScmpCompareOp = iota
// CompareEqual returns true if the argument is equal to the given value
CompareEqual ScmpCompareOp = iota
// CompareGreaterEqual returns true if the argument is greater than or
// equal to the given value
CompareGreaterEqual ScmpCompareOp = iota
// CompareGreater returns true if the argument is greater than the given
// value
CompareGreater ScmpCompareOp = iota
// CompareMaskedEqual returns true if the argument is equal to the given
// value, when masked (bitwise &) against the second given value
CompareMaskedEqual ScmpCompareOp = iota
)
// Helpers for types
// GetArchFromString returns an ScmpArch constant from a string representing an
// architecture
func GetArchFromString(arch string) (ScmpArch, error) {
if err := ensureSupportedVersion(); err != nil {
return ArchInvalid, err
}
switch strings.ToLower(arch) {
case "x86":
return ArchX86, nil
case "amd64", "x86-64", "x86_64", "x64":
return ArchAMD64, nil
case "x32":
return ArchX32, nil
case "arm":
return ArchARM, nil
case "arm64", "aarch64":
return ArchARM64, nil
case "mips":
return ArchMIPS, nil
case "mips64":
return ArchMIPS64, nil
case "mips64n32":
return ArchMIPS64N32, nil
case "mipsel":
return ArchMIPSEL, nil
case "mipsel64":
return ArchMIPSEL64, nil
case "mipsel64n32":
return ArchMIPSEL64N32, nil
case "ppc":
return ArchPPC, nil
case "ppc64":
return ArchPPC64, nil
case "ppc64le":
return ArchPPC64LE, nil
case "s390":
return ArchS390, nil
case "s390x":
return ArchS390X, nil
default:
return ArchInvalid, fmt.Errorf("cannot convert unrecognized string %s", arch)
}
}
// String returns a string representation of an architecture constant
func (a ScmpArch) String() string {
switch a {
case ArchX86:
return "x86"
case ArchAMD64:
return "amd64"
case ArchX32:
return "x32"
case ArchARM:
return "arm"
case ArchARM64:
return "arm64"
case ArchMIPS:
return "mips"
case ArchMIPS64:
return "mips64"
case ArchMIPS64N32:
return "mips64n32"
case ArchMIPSEL:
return "mipsel"
case ArchMIPSEL64:
return "mipsel64"
case ArchMIPSEL64N32:
return "mipsel64n32"
case ArchPPC:
return "ppc"
case ArchPPC64:
return "ppc64"
case ArchPPC64LE:
return "ppc64le"
case ArchS390:
return "s390"
case ArchS390X:
return "s390x"
case ArchNative:
return "native"
case ArchInvalid:
return "Invalid architecture"
default:
return "Unknown architecture"
}
}
// String returns a string representation of a comparison operator constant
func (a ScmpCompareOp) String() string {
switch a {
case CompareNotEqual:
return "Not equal"
case CompareLess:
return "Less than"
case CompareLessOrEqual:
return "Less than or equal to"
case CompareEqual:
return "Equal"
case CompareGreaterEqual:
return "Greater than or equal to"
case CompareGreater:
return "Greater than"
case CompareMaskedEqual:
return "Masked equality"
case CompareInvalid:
return "Invalid comparison operator"
default:
return "Unrecognized comparison operator"
}
}
// String returns a string representation of a seccomp match action
func (a ScmpAction) String() string {
switch a & 0xFFFF {
case ActKill:
return "Action: Kill Process"
case ActTrap:
return "Action: Send SIGSYS"
case ActErrno:
return fmt.Sprintf("Action: Return error code %d", (a >> 16))
case ActTrace:
return fmt.Sprintf("Action: Notify tracing processes with code %d",
(a >> 16))
case ActAllow:
return "Action: Allow system call"
default:
return "Unrecognized Action"
}
}
// SetReturnCode adds a return code to a supporting ScmpAction, clearing any
// existing code Only valid on ActErrno and ActTrace. Takes no action otherwise.
// Accepts 16-bit return code as argument.
// Returns a valid ScmpAction of the original type with the new error code set.
func (a ScmpAction) SetReturnCode(code int16) ScmpAction {
aTmp := a & 0x0000FFFF
if aTmp == ActErrno || aTmp == ActTrace {
return (aTmp | (ScmpAction(code)&0xFFFF)<<16)
}
return a
}
// GetReturnCode returns the return code of an ScmpAction
func (a ScmpAction) GetReturnCode() int16 {
return int16(a >> 16)
}
// General utility functions
// GetLibraryVersion returns the version of the library the bindings are built
// against.
// The version is formatted as follows: Major.Minor.Micro
func GetLibraryVersion() (major, minor, micro int) {
return verMajor, verMinor, verMicro
}
// Syscall functions
// GetName retrieves the name of a syscall from its number.
// Acts on any syscall number.
// Returns either a string containing the name of the syscall, or an error.
func (s ScmpSyscall) GetName() (string, error) {
return s.GetNameByArch(ArchNative)
}
// GetNameByArch retrieves the name of a syscall from its number for a given
// architecture.
// Acts on any syscall number.
// Accepts a valid architecture constant.
// Returns either a string containing the name of the syscall, or an error.
// if the syscall is unrecognized or an issue occurred.
func (s ScmpSyscall) GetNameByArch(arch ScmpArch) (string, error) {
if err := sanitizeArch(arch); err != nil {
return "", err
}
cString := C.seccomp_syscall_resolve_num_arch(arch.toNative(), C.int(s))
if cString == nil {
return "", fmt.Errorf("could not resolve syscall name")
}
defer C.free(unsafe.Pointer(cString))
finalStr := C.GoString(cString)
return finalStr, nil
}
// GetSyscallFromName returns the number of a syscall by name on the kernel's
// native architecture.
// Accepts a string containing the name of a syscall.
// Returns the number of the syscall, or an error if no syscall with that name
// was found.
func GetSyscallFromName(name string) (ScmpSyscall, error) {
if err := ensureSupportedVersion(); err != nil {
return 0, err
}
cString := C.CString(name)
defer C.free(unsafe.Pointer(cString))
result := C.seccomp_syscall_resolve_name(cString)
if result == scmpError {
return 0, fmt.Errorf("could not resolve name to syscall")
}
return ScmpSyscall(result), nil
}
// GetSyscallFromNameByArch returns the number of a syscall by name for a given
// architecture's ABI.
// Accepts the name of a syscall and an architecture constant.
// Returns the number of the syscall, or an error if an invalid architecture is
// passed or a syscall with that name was not found.
func GetSyscallFromNameByArch(name string, arch ScmpArch) (ScmpSyscall, error) {
if err := ensureSupportedVersion(); err != nil {
return 0, err
}
if err := sanitizeArch(arch); err != nil {
return 0, err
}
cString := C.CString(name)
defer C.free(unsafe.Pointer(cString))
result := C.seccomp_syscall_resolve_name_arch(arch.toNative(), cString)
if result == scmpError {
return 0, fmt.Errorf("could not resolve name to syscall")
}
return ScmpSyscall(result), nil
}
// MakeCondition creates and returns a new condition to attach to a filter rule.
// Associated rules will only match if this condition is true.
// Accepts the number the argument we are checking, and a comparison operator
// and value to compare to.
// The rule will match if argument $arg (zero-indexed) of the syscall is
// $COMPARE_OP the provided comparison value.
// Some comparison operators accept two values. Masked equals, for example,
// will mask $arg of the syscall with the second value provided (via bitwise
// AND) and then compare against the first value provided.
// For example, in the less than or equal case, if the syscall argument was
// 0 and the value provided was 1, the condition would match, as 0 is less
// than or equal to 1.
// Return either an error on bad argument or a valid ScmpCondition struct.
func MakeCondition(arg uint, comparison ScmpCompareOp, values ...uint64) (ScmpCondition, error) {
var condStruct ScmpCondition
if err := ensureSupportedVersion(); err != nil {
return condStruct, err
}
if comparison == CompareInvalid {
return condStruct, fmt.Errorf("invalid comparison operator")
} else if arg > 5 {
return condStruct, fmt.Errorf("syscalls only have up to 6 arguments")
} else if len(values) > 2 {
return condStruct, fmt.Errorf("conditions can have at most 2 arguments")
} else if len(values) == 0 {
return condStruct, fmt.Errorf("must provide at least one value to compare against")
}
condStruct.Argument = arg
condStruct.Op = comparison
condStruct.Operand1 = values[0]
if len(values) == 2 {
condStruct.Operand2 = values[1]
} else {
condStruct.Operand2 = 0 // Unused
}
return condStruct, nil
}
// Utility Functions
// GetNativeArch returns architecture token representing the native kernel
// architecture
func GetNativeArch() (ScmpArch, error) {
if err := ensureSupportedVersion(); err != nil {
return ArchInvalid, err
}
arch := C.seccomp_arch_native()
return archFromNative(arch)
}
// Public Filter API
// ScmpFilter represents a filter context in libseccomp.
// A filter context is initially empty. Rules can be added to it, and it can
// then be loaded into the kernel.
type ScmpFilter struct {
filterCtx C.scmp_filter_ctx
valid bool
lock sync.Mutex
}
// NewFilter creates and returns a new filter context.
// Accepts a default action to be taken for syscalls which match no rules in
// the filter.
// Returns a reference to a valid filter context, or nil and an error if the
// filter context could not be created or an invalid default action was given.
func NewFilter(defaultAction ScmpAction) (*ScmpFilter, error) {
if err := ensureSupportedVersion(); err != nil {
return nil, err
}
if err := sanitizeAction(defaultAction); err != nil {
return nil, err
}
fPtr := C.seccomp_init(defaultAction.toNative())
if fPtr == nil {
return nil, fmt.Errorf("could not create filter")
}
filter := new(ScmpFilter)
filter.filterCtx = fPtr
filter.valid = true
runtime.SetFinalizer(filter, filterFinalizer)
return filter, nil
}
// IsValid determines whether a filter context is valid to use.
// Some operations (Release and Merge) render filter contexts invalid and
// consequently prevent further use.
func (f *ScmpFilter) IsValid() bool {
f.lock.Lock()
defer f.lock.Unlock()
return f.valid
}
// Reset resets a filter context, removing all its existing state.
// Accepts a new default action to be taken for syscalls which do not match.
// Returns an error if the filter or action provided are invalid.
func (f *ScmpFilter) Reset(defaultAction ScmpAction) error {
f.lock.Lock()
defer f.lock.Unlock()
if err := sanitizeAction(defaultAction); err != nil {
return err
} else if !f.valid {
return errBadFilter
}
retCode := C.seccomp_reset(f.filterCtx, defaultAction.toNative())
if retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// Release releases a filter context, freeing its memory. Should be called after
// loading into the kernel, when the filter is no longer needed.
// After calling this function, the given filter is no longer valid and cannot
// be used.
// Release() will be invoked automatically when a filter context is garbage
// collected, but can also be called manually to free memory.
func (f *ScmpFilter) Release() {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return
}
f.valid = false
C.seccomp_release(f.filterCtx)
}
// Merge merges two filter contexts.
// The source filter src will be released as part of the process, and will no
// longer be usable or valid after this call.
// To be merged, filters must NOT share any architectures, and all their
// attributes (Default Action, Bad Arch Action, No New Privs and TSync bools)
// must match.
// The filter src will be merged into the filter this is called on.
// The architectures of the src filter not present in the destination, and all
// associated rules, will be added to the destination.
// Returns an error if merging the filters failed.
func (f *ScmpFilter) Merge(src *ScmpFilter) error {
f.lock.Lock()
defer f.lock.Unlock()
src.lock.Lock()
defer src.lock.Unlock()
if !src.valid || !f.valid {
return fmt.Errorf("one or more of the filter contexts is invalid or uninitialized")
}
// Merge the filters
retCode := C.seccomp_merge(f.filterCtx, src.filterCtx)
if syscall.Errno(-1*retCode) == syscall.EINVAL {
return fmt.Errorf("filters could not be merged due to a mismatch in attributes or invalid filter")
} else if retCode != 0 {
return syscall.Errno(-1 * retCode)
}
src.valid = false
return nil
}
// IsArchPresent checks if an architecture is present in a filter.
// If a filter contains an architecture, it uses its default action for
// syscalls which do not match rules in it, and its rules can match syscalls
// for that ABI.
// If a filter does not contain an architecture, all syscalls made to that
// kernel ABI will fail with the filter's default Bad Architecture Action
// (by default, killing the process).
// Accepts an architecture constant.
// Returns true if the architecture is present in the filter, false otherwise,
// and an error on an invalid filter context, architecture constant, or an
// issue with the call to libseccomp.
func (f *ScmpFilter) IsArchPresent(arch ScmpArch) (bool, error) {
f.lock.Lock()
defer f.lock.Unlock()
if err := sanitizeArch(arch); err != nil {
return false, err
} else if !f.valid {
return false, errBadFilter
}
retCode := C.seccomp_arch_exist(f.filterCtx, arch.toNative())
if syscall.Errno(-1*retCode) == syscall.EEXIST {
// -EEXIST is "arch not present"
return false, nil
} else if retCode != 0 {
return false, syscall.Errno(-1 * retCode)
}
return true, nil
}
// AddArch adds an architecture to the filter.
// Accepts an architecture constant.
// Returns an error on invalid filter context or architecture token, or an
// issue with the call to libseccomp.
func (f *ScmpFilter) AddArch(arch ScmpArch) error {
f.lock.Lock()
defer f.lock.Unlock()
if err := sanitizeArch(arch); err != nil {
return err
} else if !f.valid {
return errBadFilter
}
// Libseccomp returns -EEXIST if the specified architecture is already
// present. Succeed silently in this case, as it's not fatal, and the
// architecture is present already.
retCode := C.seccomp_arch_add(f.filterCtx, arch.toNative())
if retCode != 0 && syscall.Errno(-1*retCode) != syscall.EEXIST {
return syscall.Errno(-1 * retCode)
}
return nil
}
// RemoveArch removes an architecture from the filter.
// Accepts an architecture constant.
// Returns an error on invalid filter context or architecture token, or an
// issue with the call to libseccomp.
func (f *ScmpFilter) RemoveArch(arch ScmpArch) error {
f.lock.Lock()
defer f.lock.Unlock()
if err := sanitizeArch(arch); err != nil {
return err
} else if !f.valid {
return errBadFilter
}
// Similar to AddArch, -EEXIST is returned if the arch is not present
// Succeed silently in that case, this is not fatal and the architecture
// is not present in the filter after RemoveArch
retCode := C.seccomp_arch_remove(f.filterCtx, arch.toNative())
if retCode != 0 && syscall.Errno(-1*retCode) != syscall.EEXIST {
return syscall.Errno(-1 * retCode)
}
return nil
}
// Load loads a filter context into the kernel.
// Returns an error if the filter context is invalid or the syscall failed.
func (f *ScmpFilter) Load() error {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return errBadFilter
}
if retCode := C.seccomp_load(f.filterCtx); retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// GetDefaultAction returns the default action taken on a syscall which does not
// match a rule in the filter, or an error if an issue was encountered
// retrieving the value.
func (f *ScmpFilter) GetDefaultAction() (ScmpAction, error) {
action, err := f.getFilterAttr(filterAttrActDefault)
if err != nil {
return 0x0, err
}
return actionFromNative(action)
}
// GetBadArchAction returns the default action taken on a syscall for an
// architecture not in the filter, or an error if an issue was encountered
// retrieving the value.
func (f *ScmpFilter) GetBadArchAction() (ScmpAction, error) {
action, err := f.getFilterAttr(filterAttrActBadArch)
if err != nil {
return 0x0, err
}
return actionFromNative(action)
}
// GetNoNewPrivsBit returns the current state the No New Privileges bit will be set
// to on the filter being loaded, or an error if an issue was encountered
// retrieving the value.
// The No New Privileges bit tells the kernel that new processes run with exec()
// cannot gain more privileges than the process that ran exec().
// For example, a process with No New Privileges set would be unable to exec
// setuid/setgid executables.
func (f *ScmpFilter) GetNoNewPrivsBit() (bool, error) {
noNewPrivs, err := f.getFilterAttr(filterAttrNNP)
if err != nil {
return false, err
}
if noNewPrivs == 0 {
return false, nil
}
return true, nil
}
// GetTsyncBit returns whether Thread Synchronization will be enabled on the
// filter being loaded, or an error if an issue was encountered retrieving the
// value.
// Thread Sync ensures that all members of the thread group of the calling
// process will share the same Seccomp filter set.
// Tsync is a fairly recent addition to the Linux kernel and older kernels
// lack support. If the running kernel does not support Tsync and it is
// requested in a filter, Libseccomp will not enable TSync support and will
// proceed as normal.
// This function is unavailable before v2.2 of libseccomp and will return an
// error.
func (f *ScmpFilter) GetTsyncBit() (bool, error) {
tSync, err := f.getFilterAttr(filterAttrTsync)
if err != nil {
return false, err
}
if tSync == 0 {
return false, nil
}
return true, nil
}
// SetBadArchAction sets the default action taken on a syscall for an
// architecture not in the filter, or an error if an issue was encountered
// setting the value.
func (f *ScmpFilter) SetBadArchAction(action ScmpAction) error {
if err := sanitizeAction(action); err != nil {
return err
}
return f.setFilterAttr(filterAttrActBadArch, action.toNative())
}
// SetNoNewPrivsBit sets the state of the No New Privileges bit, which will be
// applied on filter load, or an error if an issue was encountered setting the
// value.
// Filters with No New Privileges set to 0 can only be loaded if the process
// has the CAP_SYS_ADMIN capability.
func (f *ScmpFilter) SetNoNewPrivsBit(state bool) error {
var toSet C.uint32_t = 0x0
if state {
toSet = 0x1
}
return f.setFilterAttr(filterAttrNNP, toSet)
}
// SetTsync sets whether Thread Synchronization will be enabled on the filter
// being loaded. Returns an error if setting Tsync failed, or the filter is
// invalid.
// Thread Sync ensures that all members of the thread group of the calling
// process will share the same Seccomp filter set.
// Tsync is a fairly recent addition to the Linux kernel and older kernels
// lack support. If the running kernel does not support Tsync and it is
// requested in a filter, Libseccomp will not enable TSync support and will
// proceed as normal.
// This function is unavailable before v2.2 of libseccomp and will return an
// error.
func (f *ScmpFilter) SetTsync(enable bool) error {
var toSet C.uint32_t = 0x0
if enable {
toSet = 0x1
}
return f.setFilterAttr(filterAttrTsync, toSet)
}
// SetSyscallPriority sets a syscall's priority.
// This provides a hint to the filter generator in libseccomp about the
// importance of this syscall. High-priority syscalls are placed
// first in the filter code, and incur less overhead (at the expense of
// lower-priority syscalls).
func (f *ScmpFilter) SetSyscallPriority(call ScmpSyscall, priority uint8) error {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return errBadFilter
}
if retCode := C.seccomp_syscall_priority(f.filterCtx, C.int(call),
C.uint8_t(priority)); retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// AddRule adds a single rule for an unconditional action on a syscall.
// Accepts the number of the syscall and the action to be taken on the call
// being made.
// Returns an error if an issue was encountered adding the rule.
func (f *ScmpFilter) AddRule(call ScmpSyscall, action ScmpAction) error {
return f.addRuleGeneric(call, action, false, nil)
}
// AddRuleExact adds a single rule for an unconditional action on a syscall.
// Accepts the number of the syscall and the action to be taken on the call
// being made.
// No modifications will be made to the rule, and it will fail to add if it
// cannot be applied to the current architecture without modification.
// The rule will function exactly as described, but it may not function identically
// (or be able to be applied to) all architectures.
// Returns an error if an issue was encountered adding the rule.
func (f *ScmpFilter) AddRuleExact(call ScmpSyscall, action ScmpAction) error {
return f.addRuleGeneric(call, action, true, nil)
}
// AddRuleConditional adds a single rule for a conditional action on a syscall.
// Returns an error if an issue was encountered adding the rule.
// All conditions must match for the rule to match.
// There is a bug in library versions below v2.2.1 which can, in some cases,
// cause conditions to be lost when more than one are used. Consequently,
// AddRuleConditional is disabled on library versions lower than v2.2.1
func (f *ScmpFilter) AddRuleConditional(call ScmpSyscall, action ScmpAction, conds []ScmpCondition) error {
return f.addRuleGeneric(call, action, false, conds)
}
// AddRuleConditionalExact adds a single rule for a conditional action on a
// syscall.
// No modifications will be made to the rule, and it will fail to add if it
// cannot be applied to the current architecture without modification.
// The rule will function exactly as described, but it may not function identically
// (or be able to be applied to) all architectures.
// Returns an error if an issue was encountered adding the rule.
// There is a bug in library versions below v2.2.1 which can, in some cases,
// cause conditions to be lost when more than one are used. Consequently,
// AddRuleConditionalExact is disabled on library versions lower than v2.2.1
func (f *ScmpFilter) AddRuleConditionalExact(call ScmpSyscall, action ScmpAction, conds []ScmpCondition) error {
return f.addRuleGeneric(call, action, true, conds)
}
// ExportPFC output PFC-formatted, human-readable dump of a filter context's
// rules to a file.
// Accepts file to write to (must be open for writing).
// Returns an error if writing to the file fails.
func (f *ScmpFilter) ExportPFC(file *os.File) error {
f.lock.Lock()
defer f.lock.Unlock()
fd := file.Fd()
if !f.valid {
return errBadFilter
}
if retCode := C.seccomp_export_pfc(f.filterCtx, C.int(fd)); retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// ExportBPF outputs Berkeley Packet Filter-formatted, kernel-readable dump of a
// filter context's rules to a file.
// Accepts file to write to (must be open for writing).
// Returns an error if writing to the file fails.
func (f *ScmpFilter) ExportBPF(file *os.File) error {
f.lock.Lock()
defer f.lock.Unlock()
fd := file.Fd()
if !f.valid {
return errBadFilter
}
if retCode := C.seccomp_export_bpf(f.filterCtx, C.int(fd)); retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}

View File

@ -0,0 +1,514 @@
// +build linux
// Internal functions for libseccomp Go bindings
// No exported functions
package seccomp
import (
"fmt"
"syscall"
)
// Unexported C wrapping code - provides the C-Golang interface
// Get the seccomp header in scope
// Need stdlib.h for free() on cstrings
// #cgo pkg-config: libseccomp
/*
#include <stdlib.h>
#include <seccomp.h>
#if SCMP_VER_MAJOR < 2
#error Minimum supported version of Libseccomp is v2.1.0
#elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 1
#error Minimum supported version of Libseccomp is v2.1.0
#endif
#define ARCH_BAD ~0
const uint32_t C_ARCH_BAD = ARCH_BAD;
#ifndef SCMP_ARCH_AARCH64
#define SCMP_ARCH_AARCH64 ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPS
#define SCMP_ARCH_MIPS ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPS64
#define SCMP_ARCH_MIPS64 ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPS64N32
#define SCMP_ARCH_MIPS64N32 ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPSEL
#define SCMP_ARCH_MIPSEL ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPSEL64
#define SCMP_ARCH_MIPSEL64 ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPSEL64N32
#define SCMP_ARCH_MIPSEL64N32 ARCH_BAD
#endif
#ifndef SCMP_ARCH_PPC
#define SCMP_ARCH_PPC ARCH_BAD
#endif
#ifndef SCMP_ARCH_PPC64
#define SCMP_ARCH_PPC64 ARCH_BAD
#endif
#ifndef SCMP_ARCH_PPC64LE
#define SCMP_ARCH_PPC64LE ARCH_BAD
#endif
#ifndef SCMP_ARCH_S390
#define SCMP_ARCH_S390 ARCH_BAD
#endif
#ifndef SCMP_ARCH_S390X
#define SCMP_ARCH_S390X ARCH_BAD
#endif
const uint32_t C_ARCH_NATIVE = SCMP_ARCH_NATIVE;
const uint32_t C_ARCH_X86 = SCMP_ARCH_X86;
const uint32_t C_ARCH_X86_64 = SCMP_ARCH_X86_64;
const uint32_t C_ARCH_X32 = SCMP_ARCH_X32;
const uint32_t C_ARCH_ARM = SCMP_ARCH_ARM;
const uint32_t C_ARCH_AARCH64 = SCMP_ARCH_AARCH64;
const uint32_t C_ARCH_MIPS = SCMP_ARCH_MIPS;
const uint32_t C_ARCH_MIPS64 = SCMP_ARCH_MIPS64;
const uint32_t C_ARCH_MIPS64N32 = SCMP_ARCH_MIPS64N32;
const uint32_t C_ARCH_MIPSEL = SCMP_ARCH_MIPSEL;
const uint32_t C_ARCH_MIPSEL64 = SCMP_ARCH_MIPSEL64;
const uint32_t C_ARCH_MIPSEL64N32 = SCMP_ARCH_MIPSEL64N32;
const uint32_t C_ARCH_PPC = SCMP_ARCH_PPC;
const uint32_t C_ARCH_PPC64 = SCMP_ARCH_PPC64;
const uint32_t C_ARCH_PPC64LE = SCMP_ARCH_PPC64LE;
const uint32_t C_ARCH_S390 = SCMP_ARCH_S390;
const uint32_t C_ARCH_S390X = SCMP_ARCH_S390X;
const uint32_t C_ACT_KILL = SCMP_ACT_KILL;
const uint32_t C_ACT_TRAP = SCMP_ACT_TRAP;
const uint32_t C_ACT_ERRNO = SCMP_ACT_ERRNO(0);
const uint32_t C_ACT_TRACE = SCMP_ACT_TRACE(0);
const uint32_t C_ACT_ALLOW = SCMP_ACT_ALLOW;
// If TSync is not supported, make sure it doesn't map to a supported filter attribute
// Don't worry about major version < 2, the minimum version checks should catch that case
#if SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 2
#define SCMP_FLTATR_CTL_TSYNC _SCMP_CMP_MIN
#endif
const uint32_t C_ATTRIBUTE_DEFAULT = (uint32_t)SCMP_FLTATR_ACT_DEFAULT;
const uint32_t C_ATTRIBUTE_BADARCH = (uint32_t)SCMP_FLTATR_ACT_BADARCH;
const uint32_t C_ATTRIBUTE_NNP = (uint32_t)SCMP_FLTATR_CTL_NNP;
const uint32_t C_ATTRIBUTE_TSYNC = (uint32_t)SCMP_FLTATR_CTL_TSYNC;
const int C_CMP_NE = (int)SCMP_CMP_NE;
const int C_CMP_LT = (int)SCMP_CMP_LT;
const int C_CMP_LE = (int)SCMP_CMP_LE;
const int C_CMP_EQ = (int)SCMP_CMP_EQ;
const int C_CMP_GE = (int)SCMP_CMP_GE;
const int C_CMP_GT = (int)SCMP_CMP_GT;
const int C_CMP_MASKED_EQ = (int)SCMP_CMP_MASKED_EQ;
const int C_VERSION_MAJOR = SCMP_VER_MAJOR;
const int C_VERSION_MINOR = SCMP_VER_MINOR;
const int C_VERSION_MICRO = SCMP_VER_MICRO;
typedef struct scmp_arg_cmp* scmp_cast_t;
// Wrapper to create an scmp_arg_cmp struct
void*
make_struct_arg_cmp(
unsigned int arg,
int compare,
uint64_t a,
uint64_t b
)
{
struct scmp_arg_cmp *s = malloc(sizeof(struct scmp_arg_cmp));
s->arg = arg;
s->op = compare;
s->datum_a = a;
s->datum_b = b;
return s;
}
*/
import "C"
// Nonexported types
type scmpFilterAttr uint32
// Nonexported constants
const (
filterAttrActDefault scmpFilterAttr = iota
filterAttrActBadArch scmpFilterAttr = iota
filterAttrNNP scmpFilterAttr = iota
filterAttrTsync scmpFilterAttr = iota
)
const (
// An error return from certain libseccomp functions
scmpError C.int = -1
// Comparison boundaries to check for architecture validity
archStart ScmpArch = ArchNative
archEnd ScmpArch = ArchS390X
// Comparison boundaries to check for action validity
actionStart ScmpAction = ActKill
actionEnd ScmpAction = ActAllow
// Comparison boundaries to check for comparison operator validity
compareOpStart ScmpCompareOp = CompareNotEqual
compareOpEnd ScmpCompareOp = CompareMaskedEqual
)
var (
// Error thrown on bad filter context
errBadFilter = fmt.Errorf("filter is invalid or uninitialized")
// Constants representing library major, minor, and micro versions
verMajor = int(C.C_VERSION_MAJOR)
verMinor = int(C.C_VERSION_MINOR)
verMicro = int(C.C_VERSION_MICRO)
)
// Nonexported functions
// Check if library version is greater than or equal to the given one
func checkVersionAbove(major, minor, micro int) bool {
return (verMajor > major) ||
(verMajor == major && verMinor > minor) ||
(verMajor == major && verMinor == minor && verMicro >= micro)
}
// Ensure that the library is supported, i.e. >= 2.1.0.
func ensureSupportedVersion() error {
if !checkVersionAbove(2, 1, 0) {
return VersionError{}
}
return nil
}
// Filter helpers
// Filter finalizer - ensure that kernel context for filters is freed
func filterFinalizer(f *ScmpFilter) {
f.Release()
}
// Get a raw filter attribute
func (f *ScmpFilter) getFilterAttr(attr scmpFilterAttr) (C.uint32_t, error) {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return 0x0, errBadFilter
}
if !checkVersionAbove(2, 2, 0) && attr == filterAttrTsync {
return 0x0, VersionError{
message: "thread synchronization attribute is not supported",
minimum: "2.2.0",
}
}
var attribute C.uint32_t
retCode := C.seccomp_attr_get(f.filterCtx, attr.toNative(), &attribute)
if retCode != 0 {
return 0x0, syscall.Errno(-1 * retCode)
}
return attribute, nil
}
// Set a raw filter attribute
func (f *ScmpFilter) setFilterAttr(attr scmpFilterAttr, value C.uint32_t) error {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return errBadFilter
}
if !checkVersionAbove(2, 2, 0) && attr == filterAttrTsync {
return VersionError{
message: "thread synchronization attribute is not supported",
minimum: "2.2.0",
}
}
retCode := C.seccomp_attr_set(f.filterCtx, attr.toNative(), value)
if retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// DOES NOT LOCK OR CHECK VALIDITY
// Assumes caller has already done this
// Wrapper for seccomp_rule_add_... functions
func (f *ScmpFilter) addRuleWrapper(call ScmpSyscall, action ScmpAction, exact bool, cond C.scmp_cast_t) error {
var length C.uint
if cond != nil {
length = 1
} else {
length = 0
}
var retCode C.int
if exact {
retCode = C.seccomp_rule_add_exact_array(f.filterCtx, action.toNative(), C.int(call), length, cond)
} else {
retCode = C.seccomp_rule_add_array(f.filterCtx, action.toNative(), C.int(call), length, cond)
}
if syscall.Errno(-1*retCode) == syscall.EFAULT {
return fmt.Errorf("unrecognized syscall")
} else if syscall.Errno(-1*retCode) == syscall.EPERM {
return fmt.Errorf("requested action matches default action of filter")
} else if retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// Generic add function for filter rules
func (f *ScmpFilter) addRuleGeneric(call ScmpSyscall, action ScmpAction, exact bool, conds []ScmpCondition) error {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return errBadFilter
}
if len(conds) == 0 {
if err := f.addRuleWrapper(call, action, exact, nil); err != nil {
return err
}
} else {
// We don't support conditional filtering in library version v2.1
if !checkVersionAbove(2, 2, 1) {
return VersionError{
message: "conditional filtering is not supported",
minimum: "2.2.1",
}
}
for _, cond := range conds {
cmpStruct := C.make_struct_arg_cmp(C.uint(cond.Argument), cond.Op.toNative(), C.uint64_t(cond.Operand1), C.uint64_t(cond.Operand2))
defer C.free(cmpStruct)
if err := f.addRuleWrapper(call, action, exact, C.scmp_cast_t(cmpStruct)); err != nil {
return err
}
}
}
return nil
}
// Generic Helpers
// Helper - Sanitize Arch token input
func sanitizeArch(in ScmpArch) error {
if in < archStart || in > archEnd {
return fmt.Errorf("unrecognized architecture")
}
if in.toNative() == C.C_ARCH_BAD {
return fmt.Errorf("architecture is not supported on this version of the library")
}
return nil
}
func sanitizeAction(in ScmpAction) error {
inTmp := in & 0x0000FFFF
if inTmp < actionStart || inTmp > actionEnd {
return fmt.Errorf("unrecognized action")
}
if inTmp != ActTrace && inTmp != ActErrno && (in&0xFFFF0000) != 0 {
return fmt.Errorf("highest 16 bits must be zeroed except for Trace and Errno")
}
return nil
}
func sanitizeCompareOp(in ScmpCompareOp) error {
if in < compareOpStart || in > compareOpEnd {
return fmt.Errorf("unrecognized comparison operator")
}
return nil
}
func archFromNative(a C.uint32_t) (ScmpArch, error) {
switch a {
case C.C_ARCH_X86:
return ArchX86, nil
case C.C_ARCH_X86_64:
return ArchAMD64, nil
case C.C_ARCH_X32:
return ArchX32, nil
case C.C_ARCH_ARM:
return ArchARM, nil
case C.C_ARCH_NATIVE:
return ArchNative, nil
case C.C_ARCH_AARCH64:
return ArchARM64, nil
case C.C_ARCH_MIPS:
return ArchMIPS, nil
case C.C_ARCH_MIPS64:
return ArchMIPS64, nil
case C.C_ARCH_MIPS64N32:
return ArchMIPS64N32, nil
case C.C_ARCH_MIPSEL:
return ArchMIPSEL, nil
case C.C_ARCH_MIPSEL64:
return ArchMIPSEL64, nil
case C.C_ARCH_MIPSEL64N32:
return ArchMIPSEL64N32, nil
case C.C_ARCH_PPC:
return ArchPPC, nil
case C.C_ARCH_PPC64:
return ArchPPC64, nil
case C.C_ARCH_PPC64LE:
return ArchPPC64LE, nil
case C.C_ARCH_S390:
return ArchS390, nil
case C.C_ARCH_S390X:
return ArchS390X, nil
default:
return 0x0, fmt.Errorf("unrecognized architecture")
}
}
// Only use with sanitized arches, no error handling
func (a ScmpArch) toNative() C.uint32_t {
switch a {
case ArchX86:
return C.C_ARCH_X86
case ArchAMD64:
return C.C_ARCH_X86_64
case ArchX32:
return C.C_ARCH_X32
case ArchARM:
return C.C_ARCH_ARM
case ArchARM64:
return C.C_ARCH_AARCH64
case ArchMIPS:
return C.C_ARCH_MIPS
case ArchMIPS64:
return C.C_ARCH_MIPS64
case ArchMIPS64N32:
return C.C_ARCH_MIPS64N32
case ArchMIPSEL:
return C.C_ARCH_MIPSEL
case ArchMIPSEL64:
return C.C_ARCH_MIPSEL64
case ArchMIPSEL64N32:
return C.C_ARCH_MIPSEL64N32
case ArchPPC:
return C.C_ARCH_PPC
case ArchPPC64:
return C.C_ARCH_PPC64
case ArchPPC64LE:
return C.C_ARCH_PPC64LE
case ArchS390:
return C.C_ARCH_S390
case ArchS390X:
return C.C_ARCH_S390X
case ArchNative:
return C.C_ARCH_NATIVE
default:
return 0x0
}
}
// Only use with sanitized ops, no error handling
func (a ScmpCompareOp) toNative() C.int {
switch a {
case CompareNotEqual:
return C.C_CMP_NE
case CompareLess:
return C.C_CMP_LT
case CompareLessOrEqual:
return C.C_CMP_LE
case CompareEqual:
return C.C_CMP_EQ
case CompareGreaterEqual:
return C.C_CMP_GE
case CompareGreater:
return C.C_CMP_GT
case CompareMaskedEqual:
return C.C_CMP_MASKED_EQ
default:
return 0x0
}
}
func actionFromNative(a C.uint32_t) (ScmpAction, error) {
aTmp := a & 0xFFFF
switch a & 0xFFFF0000 {
case C.C_ACT_KILL:
return ActKill, nil
case C.C_ACT_TRAP:
return ActTrap, nil
case C.C_ACT_ERRNO:
return ActErrno.SetReturnCode(int16(aTmp)), nil
case C.C_ACT_TRACE:
return ActTrace.SetReturnCode(int16(aTmp)), nil
case C.C_ACT_ALLOW:
return ActAllow, nil
default:
return 0x0, fmt.Errorf("unrecognized action")
}
}
// Only use with sanitized actions, no error handling
func (a ScmpAction) toNative() C.uint32_t {
switch a & 0xFFFF {
case ActKill:
return C.C_ACT_KILL
case ActTrap:
return C.C_ACT_TRAP
case ActErrno:
return C.C_ACT_ERRNO | (C.uint32_t(a) >> 16)
case ActTrace:
return C.C_ACT_TRACE | (C.uint32_t(a) >> 16)
case ActAllow:
return C.C_ACT_ALLOW
default:
return 0x0
}
}
// Internal only, assumes safe attribute
func (a scmpFilterAttr) toNative() uint32 {
switch a {
case filterAttrActDefault:
return uint32(C.C_ATTRIBUTE_DEFAULT)
case filterAttrActBadArch:
return uint32(C.C_ATTRIBUTE_BADARCH)
case filterAttrNNP:
return uint32(C.C_ATTRIBUTE_NNP)
case filterAttrTsync:
return uint32(C.C_ATTRIBUTE_TSYNC)
default:
return 0x0
}
}