mirror of
https://github.com/linuxkit/linuxkit.git
synced 2025-09-29 01:09:29 +00:00
Implements https://github.com/moby/tool/pull/181 Design for things like Kubernetes setup that requires some cgroups to exist when the service starts but it is not running in these, other services are, so there would be a race if they are not created in each. Essentially it is just a sugared `mkdir` in all the cgroup dirs. Signed-off-by: Justin Cormack <justin.cormack@docker.com>
296 lines
9.2 KiB
Go
296 lines
9.2 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/vishvananda/netlink"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// Note these definitions are from moby/tool/src/moby/config.go and should be kept in sync
|
|
|
|
// Runtime is the type of config processed at runtime, not used to build the OCI spec.
// It is decoded from runtime.json by getRuntimeConfig; prepareFilesystem applies
// Mounts, Mkdir and Cgroups, and prepareProcess applies Interfaces and BindNS.
type Runtime struct {
	// Cgroups lists cgroup names; each is created as a directory under every
	// directory found in /sys/fs/cgroup.
	Cgroups []string `yaml:"cgroups" json:"cgroups,omitempty"`
	// Mounts are mounted in order, before Mkdir is processed; the destination
	// directory of each mount is created first if it is missing.
	Mounts []specs.Mount `yaml:"mounts" json:"mounts,omitempty"`
	// Mkdir lists directories to create (with parents) using mode 0755.
	Mkdir []string `yaml:"mkdir" json:"mkdir,omitempty"`
	// Interfaces are network interfaces to create and/or move to the target pid.
	Interfaces []Interface `yaml:"interfaces" json:"interfaces,omitempty"`
	// BindNS holds paths at which the target pid's namespace files are bind mounted.
	BindNS Namespaces `yaml:"bindNS" json:"bindNS,omitempty"`
}
|
|
|
|
// Namespaces is the type for configuring paths to bind namespaces.
// Each field is the path at which /proc/<pid>/ns/<name> is bind mounted by
// bindNS; an empty path means that namespace is not bound.
type Namespaces struct {
	// Cgroup is the bind path for the "cgroup" namespace file.
	Cgroup string `yaml:"cgroup" json:"cgroup,omitempty"`
	// Ipc is the bind path for the "ipc" namespace file.
	Ipc string `yaml:"ipc" json:"ipc,omitempty"`
	// Mnt is the bind path for the "mnt" namespace file.
	Mnt string `yaml:"mnt" json:"mnt,omitempty"`
	// Net is the bind path for the "net" namespace file.
	Net string `yaml:"net" json:"net,omitempty"`
	// Pid is the bind path for the "pid" namespace file.
	Pid string `yaml:"pid" json:"pid,omitempty"`
	// User is the bind path for the "user" namespace file.
	User string `yaml:"user" json:"user,omitempty"`
	// Uts is the bind path for the "uts" namespace file.
	Uts string `yaml:"uts" json:"uts,omitempty"`
}
|
|
|
|
// Interface is the runtime config for network interfaces.
// It is consumed by prepareProcess, once per entry in Runtime.Interfaces.
type Interface struct {
	// Name is the interface name; it is required.
	Name string `yaml:"name" json:"name,omitempty"`
	// Add is the link type to create (for example "veth"). When empty, an
	// existing interface called Name is looked up and moved to the target pid.
	Add string `yaml:"add" json:"add,omitempty"`
	// Peer is the peer name of a veth pair. Setting Peer with an empty Add
	// implies Add "veth"; a veth without a Peer is rejected.
	Peer string `yaml:"peer" json:"peer,omitempty"`
	// CreateInRoot makes a newly added link get created in the root namespace
	// first and then moved to the target pid (veth links always do this).
	CreateInRoot bool `yaml:"createInRoot" json:"createInRoot"`
}
|
|
|
|
func getRuntimeConfig(path string) Runtime {
|
|
var runtime Runtime
|
|
conf, err := ioutil.ReadFile(filepath.Join(path, "runtime.json"))
|
|
if err != nil {
|
|
// if it does not exist it is fine to return an empty runtime, to not do anything
|
|
if os.IsNotExist(err) {
|
|
return runtime
|
|
}
|
|
log.Fatalf("Cannot read runtime config: %v", err)
|
|
}
|
|
if err := json.Unmarshal(conf, &runtime); err != nil {
|
|
log.Fatalf("Cannot parse runtime config: %v", err)
|
|
}
|
|
return runtime
|
|
}
|
|
|
|
// parseMountOptions takes fstab style mount options and parses them for
|
|
// use with a standard mount() syscall
|
|
func parseMountOptions(options []string) (int, string) {
|
|
var (
|
|
flag int
|
|
data []string
|
|
)
|
|
flags := map[string]struct {
|
|
clear bool
|
|
flag int
|
|
}{
|
|
"async": {true, unix.MS_SYNCHRONOUS},
|
|
"atime": {true, unix.MS_NOATIME},
|
|
"bind": {false, unix.MS_BIND},
|
|
"defaults": {false, 0},
|
|
"dev": {true, unix.MS_NODEV},
|
|
"diratime": {true, unix.MS_NODIRATIME},
|
|
"dirsync": {false, unix.MS_DIRSYNC},
|
|
"exec": {true, unix.MS_NOEXEC},
|
|
"mand": {false, unix.MS_MANDLOCK},
|
|
"noatime": {false, unix.MS_NOATIME},
|
|
"nodev": {false, unix.MS_NODEV},
|
|
"nodiratime": {false, unix.MS_NODIRATIME},
|
|
"noexec": {false, unix.MS_NOEXEC},
|
|
"nomand": {true, unix.MS_MANDLOCK},
|
|
"norelatime": {true, unix.MS_RELATIME},
|
|
"nostrictatime": {true, unix.MS_STRICTATIME},
|
|
"nosuid": {false, unix.MS_NOSUID},
|
|
"private": {false, unix.MS_PRIVATE},
|
|
"rbind": {false, unix.MS_BIND | unix.MS_REC},
|
|
"relatime": {false, unix.MS_RELATIME},
|
|
"remount": {false, unix.MS_REMOUNT},
|
|
"ro": {false, unix.MS_RDONLY},
|
|
"rw": {true, unix.MS_RDONLY},
|
|
"shared": {false, unix.MS_SHARED},
|
|
"slave": {false, unix.MS_SLAVE},
|
|
"strictatime": {false, unix.MS_STRICTATIME},
|
|
"suid": {true, unix.MS_NOSUID},
|
|
"sync": {false, unix.MS_SYNCHRONOUS},
|
|
"unbindable": {false, unix.MS_UNBINDABLE},
|
|
}
|
|
for _, o := range options {
|
|
// If the option does not exist in the flags table or the flag
|
|
// is not supported on the platform,
|
|
// then it is a data value for a specific fs type
|
|
if f, exists := flags[o]; exists && f.flag != 0 {
|
|
if f.clear {
|
|
flag &^= f.flag
|
|
} else {
|
|
flag |= f.flag
|
|
}
|
|
} else {
|
|
data = append(data, o)
|
|
}
|
|
}
|
|
return flag, strings.Join(data, ",")
|
|
}
|
|
|
|
// newCgroup creates a cgroup (ie directory) under all directories in /sys/fs/cgroup
// we could use github.com/containerd/cgroups but it has a lot of deps and this is just a sugary mkdir
//
// Only a failure to list /sys/fs/cgroup is returned. A failure to create the
// directory under an individual entry is logged and the remaining entries are
// still processed. Entries that are not directories are skipped.
func newCgroup(cgroup string) error {
	const cgroupRoot = "/sys/fs/cgroup"

	entries, err := ioutil.ReadDir(cgroupRoot)
	if err != nil {
		return err
	}

	for _, entry := range entries {
		if entry.IsDir() {
			target := filepath.Join(cgroupRoot, entry.Name(), cgroup)
			if mkErr := os.MkdirAll(target, 0755); mkErr != nil {
				log.Printf("cgroup error: %v", mkErr)
			}
		}
	}

	return nil
}
|
|
|
|
// prepareFilesystem sets up the mounts and cgroups, before the container is created
|
|
func prepareFilesystem(path string, runtime Runtime) error {
|
|
// execute the runtime config that should be done up front
|
|
// we execute Mounts before Mkdir so you can make a directory under a mount
|
|
// but we do mkdir of the destination path in case missing
|
|
for _, mount := range runtime.Mounts {
|
|
const mode os.FileMode = 0755
|
|
err := os.MkdirAll(mount.Destination, mode)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot create directory for mount destination %s: %v", mount.Destination, err)
|
|
}
|
|
// also mkdir upper and work directories on overlay
|
|
for _, o := range mount.Options {
|
|
eq := strings.SplitN(o, "=", 2)
|
|
if len(eq) == 2 && (eq[0] == "upperdir" || eq[0] == "workdir") {
|
|
err := os.MkdirAll(eq[1], mode)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot create directory for overlay %s=%s: %v", eq[0], eq[1], err)
|
|
}
|
|
}
|
|
}
|
|
opts, data := parseMountOptions(mount.Options)
|
|
if err := unix.Mount(mount.Source, mount.Destination, mount.Type, uintptr(opts), data); err != nil {
|
|
return fmt.Errorf("Failed to mount %s: %v", mount.Source, err)
|
|
}
|
|
}
|
|
for _, dir := range runtime.Mkdir {
|
|
// in future we may need to change the structure to set mode, ownership
|
|
const mode os.FileMode = 0755
|
|
err := os.MkdirAll(dir, mode)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot create directory %s: %v", dir, err)
|
|
}
|
|
}
|
|
|
|
for _, cgroup := range runtime.Cgroups {
|
|
// currently no way to specify resource limits on new cgroups at creation time
|
|
if err := newCgroup(cgroup); err != nil {
|
|
return fmt.Errorf("Cannot create cgroup %s: %v", cgroup, err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// bind mount a namespace file
|
|
func bindNS(ns string, path string, pid int) error {
|
|
if path == "" {
|
|
return nil
|
|
}
|
|
// the path and file need to exist for the bind to succeed, so try to create
|
|
dir := filepath.Dir(path)
|
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
|
return fmt.Errorf("Cannot create leading directories %s for bind mount destination: %v", dir, err)
|
|
}
|
|
fi, err := os.Create(path)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot create a mount point for namespace bind at %s: %v", path, err)
|
|
}
|
|
if err := fi.Close(); err != nil {
|
|
return err
|
|
}
|
|
if err := unix.Mount(fmt.Sprintf("/proc/%d/ns/%s", pid, ns), path, "", unix.MS_BIND, ""); err != nil {
|
|
return fmt.Errorf("Failed to bind %s namespace at %s: %v", ns, path, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// prepareProcess sets up anything that needs to be done after the container process is created, but before it runs
|
|
// for example networking
|
|
func prepareProcess(pid int, runtime Runtime) error {
|
|
for _, iface := range runtime.Interfaces {
|
|
if iface.Name == "" {
|
|
return fmt.Errorf("Interface requires a name")
|
|
}
|
|
|
|
var link netlink.Link
|
|
var ns interface{} = netlink.NsPid(pid)
|
|
var move bool
|
|
var err error
|
|
|
|
if iface.Peer != "" && iface.Add == "" {
|
|
// must be a veth if specify peer
|
|
iface.Add = "veth"
|
|
}
|
|
|
|
// if create in root is set, create in root namespace first, then move
|
|
// also do the same for a veth pair
|
|
if iface.CreateInRoot || iface.Add == "veth" {
|
|
ns = nil
|
|
move = true
|
|
}
|
|
|
|
if iface.Add != "" {
|
|
switch iface.Add {
|
|
case "veth":
|
|
if iface.Peer == "" {
|
|
return fmt.Errorf("Creating a veth pair %s requires a peer to be set", iface.Name)
|
|
}
|
|
la := netlink.LinkAttrs{Name: iface.Name, Namespace: ns}
|
|
link = &netlink.Veth{LinkAttrs: la, PeerName: iface.Peer}
|
|
default:
|
|
// no special creation options needed
|
|
la := netlink.LinkAttrs{Name: iface.Name, Namespace: ns}
|
|
link = &netlink.GenericLink{la, iface.Add}
|
|
}
|
|
if err := netlink.LinkAdd(link); err != nil {
|
|
return fmt.Errorf("Link add %s of type %s failed: %v", iface.Name, iface.Add, err)
|
|
}
|
|
fmt.Fprintf(os.Stderr, "Created interface %s type %s\n", iface.Name, iface.Add)
|
|
} else {
|
|
// find existing interface
|
|
link, err = netlink.LinkByName(iface.Name)
|
|
if err != nil {
|
|
return fmt.Errorf("Cannot find interface %s: %v", iface.Name, err)
|
|
}
|
|
// then move into namespace
|
|
move = true
|
|
}
|
|
if move {
|
|
if err := netlink.LinkSetNsPid(link, int(pid)); err != nil {
|
|
return fmt.Errorf("Cannot move interface %s into namespace: %v", iface.Name, err)
|
|
}
|
|
fmt.Fprintf(os.Stderr, "Moved interface %s to pid %d\n", iface.Name, pid)
|
|
}
|
|
}
|
|
|
|
binds := []struct {
|
|
ns string
|
|
path string
|
|
}{
|
|
{"cgroup", runtime.BindNS.Cgroup},
|
|
{"ipc", runtime.BindNS.Ipc},
|
|
{"mnt", runtime.BindNS.Mnt},
|
|
{"net", runtime.BindNS.Net},
|
|
{"pid", runtime.BindNS.Pid},
|
|
{"user", runtime.BindNS.User},
|
|
{"uts", runtime.BindNS.Uts},
|
|
}
|
|
|
|
for _, b := range binds {
|
|
if err := bindNS(b.ns, b.path, pid); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// cleanup functions are best efforts only, mainly for rw onboot containers
|
|
func cleanup(path string) {
|
|
// remove the root mount
|
|
rootfs := filepath.Join(path, "rootfs")
|
|
_ = unix.Unmount(rootfs, 0)
|
|
}
|