mirror of
https://github.com/linuxkit/linuxkit.git
synced 2025-07-22 02:21:34 +00:00
330 lines
10 KiB
Go
330 lines
10 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
log "github.com/sirupsen/logrus"
|
|
"github.com/vishvananda/netlink"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// Note: these definitions are from src/moby/config.go and should be kept in sync with it.
|
|
|
|
// Runtime is the type of config processed at runtime, not used to build the OCI spec
|
|
type Runtime struct {
|
|
Cgroups []string `yaml:"cgroups" json:"cgroups,omitempty"`
|
|
Mounts []specs.Mount `yaml:"mounts" json:"mounts,omitempty"`
|
|
Mkdir []string `yaml:"mkdir" json:"mkdir,omitempty"`
|
|
Interfaces []Interface `yaml:"interfaces" json:"interfaces,omitempty"`
|
|
BindNS Namespaces `yaml:"bindNS" json:"bindNS,omitempty"`
|
|
Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"`
|
|
}
|
|
|
|
// Namespaces holds, per namespace kind, the filesystem path at which that
// namespace of the container process should be bind mounted. An empty path
// means the namespace is not bound (bindNS returns early for it).
type Namespaces struct {
	// Cgroup is the bind path for the cgroup namespace.
	Cgroup string `yaml:"cgroup" json:"cgroup,omitempty"`
	// Ipc is the bind path for the ipc namespace.
	Ipc string `yaml:"ipc" json:"ipc,omitempty"`
	// Mnt is the bind path for the mnt namespace.
	Mnt string `yaml:"mnt" json:"mnt,omitempty"`
	// Net is the bind path for the net namespace.
	Net string `yaml:"net" json:"net,omitempty"`
	// Pid is the bind path for the pid namespace.
	Pid string `yaml:"pid" json:"pid,omitempty"`
	// User is the bind path for the user namespace.
	User string `yaml:"user" json:"user,omitempty"`
	// Uts is the bind path for the uts namespace.
	Uts string `yaml:"uts" json:"uts,omitempty"`
}
|
|
|
|
// Interface is the runtime configuration of a single network interface that
// prepareProcess sets up for the container process.
type Interface struct {
	// Name is the interface name; prepareProcess rejects an empty name.
	Name string `yaml:"name" json:"name,omitempty"`
	// Add is the link type to create. When empty (and no Peer is given) an
	// existing interface called Name is looked up and moved instead.
	Add string `yaml:"add" json:"add,omitempty"`
	// Peer is the peer name of a veth pair; setting it with an empty Add
	// implies the type "veth".
	Peer string `yaml:"peer" json:"peer,omitempty"`
	// CreateInRoot makes the link get created in the root namespace first and
	// then moved into the process's namespace.
	CreateInRoot bool `yaml:"createInRoot" json:"createInRoot"`
}
|
|
|
|
func getRuntimeConfig(path string) Runtime {
|
|
var runtime Runtime
|
|
conf, err := os.ReadFile(filepath.Join(path, "runtime.json"))
|
|
if err != nil {
|
|
// if it does not exist it is fine to return an empty runtime, to not do anything
|
|
if os.IsNotExist(err) {
|
|
return runtime
|
|
}
|
|
log.Fatalf("Cannot read runtime config: %v", err)
|
|
}
|
|
if err := json.Unmarshal(conf, &runtime); err != nil {
|
|
log.Fatalf("Cannot parse runtime config: %v", err)
|
|
}
|
|
return runtime
|
|
}
|
|
|
|
// parseMountOptions takes fstab style mount options and parses them for
|
|
// use with a standard mount() syscall
|
|
func parseMountOptions(options []string) (int, string) {
|
|
var (
|
|
flag int
|
|
data []string
|
|
)
|
|
flags := map[string]struct {
|
|
clear bool
|
|
flag int
|
|
}{
|
|
"async": {true, unix.MS_SYNCHRONOUS},
|
|
"atime": {true, unix.MS_NOATIME},
|
|
"bind": {false, unix.MS_BIND},
|
|
"defaults": {false, 0},
|
|
"dev": {true, unix.MS_NODEV},
|
|
"diratime": {true, unix.MS_NODIRATIME},
|
|
"dirsync": {false, unix.MS_DIRSYNC},
|
|
"exec": {true, unix.MS_NOEXEC},
|
|
"mand": {false, unix.MS_MANDLOCK},
|
|
"noatime": {false, unix.MS_NOATIME},
|
|
"nodev": {false, unix.MS_NODEV},
|
|
"nodiratime": {false, unix.MS_NODIRATIME},
|
|
"noexec": {false, unix.MS_NOEXEC},
|
|
"nomand": {true, unix.MS_MANDLOCK},
|
|
"norelatime": {true, unix.MS_RELATIME},
|
|
"nostrictatime": {true, unix.MS_STRICTATIME},
|
|
"nosuid": {false, unix.MS_NOSUID},
|
|
"private": {false, unix.MS_PRIVATE},
|
|
"rbind": {false, unix.MS_BIND | unix.MS_REC},
|
|
"relatime": {false, unix.MS_RELATIME},
|
|
"remount": {false, unix.MS_REMOUNT},
|
|
"ro": {false, unix.MS_RDONLY},
|
|
"rw": {true, unix.MS_RDONLY},
|
|
"shared": {false, unix.MS_SHARED},
|
|
"slave": {false, unix.MS_SLAVE},
|
|
"strictatime": {false, unix.MS_STRICTATIME},
|
|
"suid": {true, unix.MS_NOSUID},
|
|
"sync": {false, unix.MS_SYNCHRONOUS},
|
|
"unbindable": {false, unix.MS_UNBINDABLE},
|
|
}
|
|
for _, o := range options {
|
|
// If the option does not exist in the flags table or the flag
|
|
// is not supported on the platform,
|
|
// then it is a data value for a specific fs type
|
|
if f, exists := flags[o]; exists && f.flag != 0 {
|
|
if f.clear {
|
|
flag &^= f.flag
|
|
} else {
|
|
flag |= f.flag
|
|
}
|
|
} else {
|
|
data = append(data, o)
|
|
}
|
|
}
|
|
return flag, strings.Join(data, ",")
|
|
}
|
|
|
|
// newCgroup creates a cgroup (ie directory)
|
|
// we could use github.com/containerd/cgroups but it has a lot of deps and this is just a sugary mkdir
|
|
func newCgroup(cgroup string) error {
|
|
v2, err := isCgroupV2()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if v2 {
|
|
// a cgroupv2 cgroup is a single directory
|
|
if err := os.MkdirAll(filepath.Join("/sys/fs/cgroup", cgroup), 0755); err != nil {
|
|
log.Printf("cgroup error: %v", err)
|
|
}
|
|
return nil
|
|
}
|
|
// a cgroupv1 cgroup is a directory under all directories in /sys/fs/cgroup
|
|
dirs, err := os.ReadDir("/sys/fs/cgroup")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, dir := range dirs {
|
|
if !dir.IsDir() {
|
|
continue
|
|
}
|
|
if err := os.MkdirAll(filepath.Join("/sys/fs/cgroup", dir.Name(), cgroup), 0755); err != nil {
|
|
log.Printf("cgroup error: %v", err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// isCgroupV2 reports whether /sys/fs/cgroup/cgroup.controllers exists, which
// this file takes as the sign of a cgroup v2 hierarchy. A missing file yields
// (false, nil); any other stat failure is returned as the error.
func isCgroupV2() (bool, error) {
	switch _, err := os.Stat("/sys/fs/cgroup/cgroup.controllers"); {
	case err == nil:
		return true, nil
	case os.IsNotExist(err):
		return false, nil
	default:
		return false, err
	}
}
|
|
|
|
// prepareFilesystem sets up the mounts and cgroups, before the container is created
|
|
func prepareFilesystem(path string, runtime Runtime) error {
|
|
// execute the runtime config that should be done up front
|
|
// we execute Mounts before Mkdir so you can make a directory under a mount
|
|
// but we do mkdir of the destination path in case missing
|
|
rootfs := filepath.Join(path, "rootfs")
|
|
makeAbsolute := func(dir string) string {
|
|
if filepath.IsAbs(dir) {
|
|
return dir
|
|
}
|
|
// relative paths are relative to rootfs of container
|
|
return filepath.Join(rootfs, dir)
|
|
}
|
|
|
|
for _, mount := range runtime.Mounts {
|
|
const mode os.FileMode = 0755
|
|
dir := makeAbsolute(mount.Destination)
|
|
err := os.MkdirAll(dir, mode)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot create directory for mount destination %s: %v", dir, err)
|
|
}
|
|
// also mkdir upper and work directories on overlay
|
|
for _, o := range mount.Options {
|
|
eq := strings.SplitN(o, "=", 2)
|
|
if len(eq) == 2 && (eq[0] == "upperdir" || eq[0] == "workdir") {
|
|
err := os.MkdirAll(eq[1], mode)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot create directory for overlay %s=%s: %v", eq[0], eq[1], err)
|
|
}
|
|
}
|
|
}
|
|
opts, data := parseMountOptions(mount.Options)
|
|
if err := unix.Mount(mount.Source, dir, mount.Type, uintptr(opts), data); err != nil {
|
|
return fmt.Errorf("Failed to mount %s: %v", mount.Source, err)
|
|
}
|
|
}
|
|
for _, dir := range runtime.Mkdir {
|
|
// in future we may need to change the structure to set mode, ownership
|
|
const mode os.FileMode = 0755
|
|
dir = makeAbsolute(dir)
|
|
err := os.MkdirAll(dir, mode)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot create directory %s: %v", dir, err)
|
|
}
|
|
}
|
|
|
|
for _, cgroup := range runtime.Cgroups {
|
|
// currently no way to specify resource limits on new cgroups at creation time
|
|
if err := newCgroup(cgroup); err != nil {
|
|
return fmt.Errorf("Cannot create cgroup %s: %v", cgroup, err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// bind mount a namespace file
|
|
func bindNS(ns string, path string, pid int) error {
|
|
if path == "" {
|
|
return nil
|
|
}
|
|
// the path and file need to exist for the bind to succeed, so try to create
|
|
dir := filepath.Dir(path)
|
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
|
return fmt.Errorf("Cannot create leading directories %s for bind mount destination: %v", dir, err)
|
|
}
|
|
fi, err := os.Create(path)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot create a mount point for namespace bind at %s: %v", path, err)
|
|
}
|
|
if err := fi.Close(); err != nil {
|
|
return err
|
|
}
|
|
if err := unix.Mount(fmt.Sprintf("/proc/%d/ns/%s", pid, ns), path, "", unix.MS_BIND, ""); err != nil {
|
|
return fmt.Errorf("Failed to bind %s namespace at %s: %v", ns, path, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// prepareProcess sets up anything that needs to be done after the container process is created, but before it runs
|
|
// for example networking
|
|
func prepareProcess(pid int, runtime Runtime) error {
|
|
for _, iface := range runtime.Interfaces {
|
|
if iface.Name == "" {
|
|
return fmt.Errorf("Interface requires a name")
|
|
}
|
|
|
|
var link netlink.Link
|
|
var ns interface{} = netlink.NsPid(pid)
|
|
var move bool
|
|
var err error
|
|
|
|
if iface.Peer != "" && iface.Add == "" {
|
|
// must be a veth if specify peer
|
|
iface.Add = "veth"
|
|
}
|
|
|
|
// if create in root is set, create in root namespace first, then move
|
|
// also do the same for a veth pair
|
|
if iface.CreateInRoot || iface.Add == "veth" {
|
|
ns = nil
|
|
move = true
|
|
}
|
|
|
|
if iface.Add != "" {
|
|
switch iface.Add {
|
|
case "veth":
|
|
if iface.Peer == "" {
|
|
return fmt.Errorf("Creating a veth pair %s requires a peer to be set", iface.Name)
|
|
}
|
|
la := netlink.LinkAttrs{Name: iface.Name, Namespace: ns}
|
|
link = &netlink.Veth{LinkAttrs: la, PeerName: iface.Peer}
|
|
default:
|
|
// no special creation options needed
|
|
la := netlink.LinkAttrs{Name: iface.Name, Namespace: ns}
|
|
link = &netlink.GenericLink{LinkAttrs: la, LinkType: iface.Add}
|
|
}
|
|
if err := netlink.LinkAdd(link); err != nil {
|
|
return fmt.Errorf("Link add %s of type %s failed: %v", iface.Name, iface.Add, err)
|
|
}
|
|
fmt.Fprintf(os.Stderr, "Created interface %s type %s\n", iface.Name, iface.Add)
|
|
} else {
|
|
// find existing interface
|
|
link, err = netlink.LinkByName(iface.Name)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot find interface %s: %v", iface.Name, err)
|
|
}
|
|
// then move into namespace
|
|
move = true
|
|
}
|
|
if move {
|
|
if err := netlink.LinkSetNsPid(link, pid); err != nil {
|
|
return fmt.Errorf("Cannot move interface %s into namespace: %v", iface.Name, err)
|
|
}
|
|
fmt.Fprintf(os.Stderr, "Moved interface %s to pid %d\n", iface.Name, pid)
|
|
}
|
|
}
|
|
|
|
binds := []struct {
|
|
ns string
|
|
path string
|
|
}{
|
|
{"cgroup", runtime.BindNS.Cgroup},
|
|
{"ipc", runtime.BindNS.Ipc},
|
|
{"mnt", runtime.BindNS.Mnt},
|
|
{"net", runtime.BindNS.Net},
|
|
{"pid", runtime.BindNS.Pid},
|
|
{"user", runtime.BindNS.User},
|
|
{"uts", runtime.BindNS.Uts},
|
|
}
|
|
|
|
for _, b := range binds {
|
|
if err := bindNS(b.ns, b.path, pid); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// cleanup functions are best efforts only, mainly for rw onboot containers
|
|
func cleanup(path string) {
|
|
// remove the root mount
|
|
rootfs := filepath.Join(path, "rootfs")
|
|
_ = unix.Unmount(rootfs, 0)
|
|
}
|