mirror of
				https://github.com/linuxkit/linuxkit.git
				synced 2025-11-04 09:25:58 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			296 lines
		
	
	
		
			9.2 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			296 lines
		
	
	
		
			9.2 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
package main
 | 
						|
 | 
						|
import (
 | 
						|
	"encoding/json"
 | 
						|
	"fmt"
 | 
						|
	"io/ioutil"
 | 
						|
	"os"
 | 
						|
	"path/filepath"
 | 
						|
	"strings"
 | 
						|
 | 
						|
	"github.com/opencontainers/runtime-spec/specs-go"
 | 
						|
	log "github.com/sirupsen/logrus"
 | 
						|
	"github.com/vishvananda/netlink"
 | 
						|
	"golang.org/x/sys/unix"
 | 
						|
)
 | 
						|
 | 
						|
// Note these definitions are from moby/tool/src/moby/config.go and should be kept in sync
 | 
						|
 | 
						|
// Runtime is the type of config processed at runtime, not used to build the OCI spec
 | 
						|
type Runtime struct {
 | 
						|
	Cgroups    []string      `yaml:"cgroups" json:"cgroups,omitempty"`
 | 
						|
	Mounts     []specs.Mount `yaml:"mounts" json:"mounts,omitempty"`
 | 
						|
	Mkdir      []string      `yaml:"mkdir" json:"mkdir,omitempty"`
 | 
						|
	Interfaces []Interface   `yaml:"interfaces" json:"interfaces,omitempty"`
 | 
						|
	BindNS     Namespaces    `yaml:"bindNS" json:"bindNS,omitempty"`
 | 
						|
}
 | 
						|
 | 
						|
// Namespaces is the type for configuring paths to bind namespaces
 | 
						|
type Namespaces struct {
 | 
						|
	Cgroup string `yaml:"cgroup" json:"cgroup,omitempty"`
 | 
						|
	Ipc    string `yaml:"ipc" json:"ipc,omitempty"`
 | 
						|
	Mnt    string `yaml:"mnt" json:"mnt,omitempty"`
 | 
						|
	Net    string `yaml:"net" json:"net,omitempty"`
 | 
						|
	Pid    string `yaml:"pid" json:"pid,omitempty"`
 | 
						|
	User   string `yaml:"user" json:"user,omitempty"`
 | 
						|
	Uts    string `yaml:"uts" json:"uts,omitempty"`
 | 
						|
}
 | 
						|
 | 
						|
// Interface is the runtime config for network interfaces
 | 
						|
type Interface struct {
 | 
						|
	Name         string `yaml:"name" json:"name,omitempty"`
 | 
						|
	Add          string `yaml:"add" json:"add,omitempty"`
 | 
						|
	Peer         string `yaml:"peer" json:"peer,omitempty"`
 | 
						|
	CreateInRoot bool   `yaml:"createInRoot" json:"createInRoot"`
 | 
						|
}
 | 
						|
 | 
						|
func getRuntimeConfig(path string) Runtime {
 | 
						|
	var runtime Runtime
 | 
						|
	conf, err := ioutil.ReadFile(filepath.Join(path, "runtime.json"))
 | 
						|
	if err != nil {
 | 
						|
		// if it does not exist it is fine to return an empty runtime, to not do anything
 | 
						|
		if os.IsNotExist(err) {
 | 
						|
			return runtime
 | 
						|
		}
 | 
						|
		log.Fatalf("Cannot read runtime config: %v", err)
 | 
						|
	}
 | 
						|
	if err := json.Unmarshal(conf, &runtime); err != nil {
 | 
						|
		log.Fatalf("Cannot parse runtime config: %v", err)
 | 
						|
	}
 | 
						|
	return runtime
 | 
						|
}
 | 
						|
 | 
						|
// parseMountOptions takes fstab style mount options and parses them for
 | 
						|
// use with a standard mount() syscall
 | 
						|
func parseMountOptions(options []string) (int, string) {
 | 
						|
	var (
 | 
						|
		flag int
 | 
						|
		data []string
 | 
						|
	)
 | 
						|
	flags := map[string]struct {
 | 
						|
		clear bool
 | 
						|
		flag  int
 | 
						|
	}{
 | 
						|
		"async":         {true, unix.MS_SYNCHRONOUS},
 | 
						|
		"atime":         {true, unix.MS_NOATIME},
 | 
						|
		"bind":          {false, unix.MS_BIND},
 | 
						|
		"defaults":      {false, 0},
 | 
						|
		"dev":           {true, unix.MS_NODEV},
 | 
						|
		"diratime":      {true, unix.MS_NODIRATIME},
 | 
						|
		"dirsync":       {false, unix.MS_DIRSYNC},
 | 
						|
		"exec":          {true, unix.MS_NOEXEC},
 | 
						|
		"mand":          {false, unix.MS_MANDLOCK},
 | 
						|
		"noatime":       {false, unix.MS_NOATIME},
 | 
						|
		"nodev":         {false, unix.MS_NODEV},
 | 
						|
		"nodiratime":    {false, unix.MS_NODIRATIME},
 | 
						|
		"noexec":        {false, unix.MS_NOEXEC},
 | 
						|
		"nomand":        {true, unix.MS_MANDLOCK},
 | 
						|
		"norelatime":    {true, unix.MS_RELATIME},
 | 
						|
		"nostrictatime": {true, unix.MS_STRICTATIME},
 | 
						|
		"nosuid":        {false, unix.MS_NOSUID},
 | 
						|
		"private":       {false, unix.MS_PRIVATE},
 | 
						|
		"rbind":         {false, unix.MS_BIND | unix.MS_REC},
 | 
						|
		"relatime":      {false, unix.MS_RELATIME},
 | 
						|
		"remount":       {false, unix.MS_REMOUNT},
 | 
						|
		"ro":            {false, unix.MS_RDONLY},
 | 
						|
		"rw":            {true, unix.MS_RDONLY},
 | 
						|
		"shared":        {false, unix.MS_SHARED},
 | 
						|
		"slave":         {false, unix.MS_SLAVE},
 | 
						|
		"strictatime":   {false, unix.MS_STRICTATIME},
 | 
						|
		"suid":          {true, unix.MS_NOSUID},
 | 
						|
		"sync":          {false, unix.MS_SYNCHRONOUS},
 | 
						|
		"unbindable":    {false, unix.MS_UNBINDABLE},
 | 
						|
	}
 | 
						|
	for _, o := range options {
 | 
						|
		// If the option does not exist in the flags table or the flag
 | 
						|
		// is not supported on the platform,
 | 
						|
		// then it is a data value for a specific fs type
 | 
						|
		if f, exists := flags[o]; exists && f.flag != 0 {
 | 
						|
			if f.clear {
 | 
						|
				flag &^= f.flag
 | 
						|
			} else {
 | 
						|
				flag |= f.flag
 | 
						|
			}
 | 
						|
		} else {
 | 
						|
			data = append(data, o)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return flag, strings.Join(data, ",")
 | 
						|
}
 | 
						|
 | 
						|
// newCgroup creates a cgroup (ie directory) under all directories in /sys/fs/cgroup
 | 
						|
// we could use github.com/containerd/cgroups but it has a lot of deps and this is just a sugary mkdir
 | 
						|
func newCgroup(cgroup string) error {
 | 
						|
	dirs, err := ioutil.ReadDir("/sys/fs/cgroup")
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	for _, dir := range dirs {
 | 
						|
		if !dir.IsDir() {
 | 
						|
			continue
 | 
						|
		}
 | 
						|
		if err := os.MkdirAll(filepath.Join("/sys/fs/cgroup", dir.Name(), cgroup), 0755); err != nil {
 | 
						|
			log.Printf("cgroup error: %v", err)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// prepareFilesystem sets up the mounts and cgroups, before the container is created
 | 
						|
func prepareFilesystem(path string, runtime Runtime) error {
 | 
						|
	// execute the runtime config that should be done up front
 | 
						|
	// we execute Mounts before Mkdir so you can make a directory under a mount
 | 
						|
	// but we do mkdir of the destination path in case missing
 | 
						|
	for _, mount := range runtime.Mounts {
 | 
						|
		const mode os.FileMode = 0755
 | 
						|
		err := os.MkdirAll(mount.Destination, mode)
 | 
						|
		if err != nil {
 | 
						|
			return fmt.Errorf("Cannot create directory for mount destination %s: %v", mount.Destination, err)
 | 
						|
		}
 | 
						|
		// also mkdir upper and work directories on overlay
 | 
						|
		for _, o := range mount.Options {
 | 
						|
			eq := strings.SplitN(o, "=", 2)
 | 
						|
			if len(eq) == 2 && (eq[0] == "upperdir" || eq[0] == "workdir") {
 | 
						|
				err := os.MkdirAll(eq[1], mode)
 | 
						|
				if err != nil {
 | 
						|
					return fmt.Errorf("Cannot create directory for overlay %s=%s: %v", eq[0], eq[1], err)
 | 
						|
				}
 | 
						|
			}
 | 
						|
		}
 | 
						|
		opts, data := parseMountOptions(mount.Options)
 | 
						|
		if err := unix.Mount(mount.Source, mount.Destination, mount.Type, uintptr(opts), data); err != nil {
 | 
						|
			return fmt.Errorf("Failed to mount %s: %v", mount.Source, err)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	for _, dir := range runtime.Mkdir {
 | 
						|
		// in future we may need to change the structure to set mode, ownership
 | 
						|
		const mode os.FileMode = 0755
 | 
						|
		err := os.MkdirAll(dir, mode)
 | 
						|
		if err != nil {
 | 
						|
			return fmt.Errorf("Cannot create directory %s: %v", dir, err)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	for _, cgroup := range runtime.Cgroups {
 | 
						|
		// currently no way to specify resource limits on new cgroups at creation time
 | 
						|
		if err := newCgroup(cgroup); err != nil {
 | 
						|
			return fmt.Errorf("Cannot create cgroup %s: %v", cgroup, err)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// bind mount a namespace file
 | 
						|
func bindNS(ns string, path string, pid int) error {
 | 
						|
	if path == "" {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	// the path and file need to exist for the bind to succeed, so try to create
 | 
						|
	dir := filepath.Dir(path)
 | 
						|
	if err := os.MkdirAll(dir, 0755); err != nil {
 | 
						|
		return fmt.Errorf("Cannot create leading directories %s for bind mount destination: %v", dir, err)
 | 
						|
	}
 | 
						|
	fi, err := os.Create(path)
 | 
						|
	if err != nil {
 | 
						|
		return fmt.Errorf("Cannot create a mount point for namespace bind at %s: %v", path, err)
 | 
						|
	}
 | 
						|
	if err := fi.Close(); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if err := unix.Mount(fmt.Sprintf("/proc/%d/ns/%s", pid, ns), path, "", unix.MS_BIND, ""); err != nil {
 | 
						|
		return fmt.Errorf("Failed to bind %s namespace at %s: %v", ns, path, err)
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// prepareProcess sets up anything that needs to be done after the container process is created, but before it runs
 | 
						|
// for example networking
 | 
						|
func prepareProcess(pid int, runtime Runtime) error {
 | 
						|
	for _, iface := range runtime.Interfaces {
 | 
						|
		if iface.Name == "" {
 | 
						|
			return fmt.Errorf("Interface requires a name")
 | 
						|
		}
 | 
						|
 | 
						|
		var link netlink.Link
 | 
						|
		var ns interface{} = netlink.NsPid(pid)
 | 
						|
		var move bool
 | 
						|
		var err error
 | 
						|
 | 
						|
		if iface.Peer != "" && iface.Add == "" {
 | 
						|
			// must be a veth if specify peer
 | 
						|
			iface.Add = "veth"
 | 
						|
		}
 | 
						|
 | 
						|
		// if create in root is set, create in root namespace first, then move
 | 
						|
		// also do the same for a veth pair
 | 
						|
		if iface.CreateInRoot || iface.Add == "veth" {
 | 
						|
			ns = nil
 | 
						|
			move = true
 | 
						|
		}
 | 
						|
 | 
						|
		if iface.Add != "" {
 | 
						|
			switch iface.Add {
 | 
						|
			case "veth":
 | 
						|
				if iface.Peer == "" {
 | 
						|
					return fmt.Errorf("Creating a veth pair %s requires a peer to be set", iface.Name)
 | 
						|
				}
 | 
						|
				la := netlink.LinkAttrs{Name: iface.Name, Namespace: ns}
 | 
						|
				link = &netlink.Veth{LinkAttrs: la, PeerName: iface.Peer}
 | 
						|
			default:
 | 
						|
				// no special creation options needed
 | 
						|
				la := netlink.LinkAttrs{Name: iface.Name, Namespace: ns}
 | 
						|
				link = &netlink.GenericLink{la, iface.Add}
 | 
						|
			}
 | 
						|
			if err := netlink.LinkAdd(link); err != nil {
 | 
						|
				return fmt.Errorf("Link add %s of type %s failed: %v", iface.Name, iface.Add, err)
 | 
						|
			}
 | 
						|
			fmt.Fprintf(os.Stderr, "Created interface %s type %s\n", iface.Name, iface.Add)
 | 
						|
		} else {
 | 
						|
			// find existing interface
 | 
						|
			link, err = netlink.LinkByName(iface.Name)
 | 
						|
			if err != nil {
 | 
						|
				return fmt.Errorf("Cannot find interface %s: %v", iface.Name, err)
 | 
						|
			}
 | 
						|
			// then move into namespace
 | 
						|
			move = true
 | 
						|
		}
 | 
						|
		if move {
 | 
						|
			if err := netlink.LinkSetNsPid(link, int(pid)); err != nil {
 | 
						|
				return fmt.Errorf("Cannot move interface %s into namespace: %v", iface.Name, err)
 | 
						|
			}
 | 
						|
			fmt.Fprintf(os.Stderr, "Moved interface %s to pid %d\n", iface.Name, pid)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	binds := []struct {
 | 
						|
		ns   string
 | 
						|
		path string
 | 
						|
	}{
 | 
						|
		{"cgroup", runtime.BindNS.Cgroup},
 | 
						|
		{"ipc", runtime.BindNS.Ipc},
 | 
						|
		{"mnt", runtime.BindNS.Mnt},
 | 
						|
		{"net", runtime.BindNS.Net},
 | 
						|
		{"pid", runtime.BindNS.Pid},
 | 
						|
		{"user", runtime.BindNS.User},
 | 
						|
		{"uts", runtime.BindNS.Uts},
 | 
						|
	}
 | 
						|
 | 
						|
	for _, b := range binds {
 | 
						|
		if err := bindNS(b.ns, b.path, pid); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// cleanup functions are best efforts only, mainly for rw onboot containers
 | 
						|
func cleanup(path string) {
 | 
						|
	// remove the root mount
 | 
						|
	rootfs := filepath.Join(path, "rootfs")
 | 
						|
	_ = unix.Unmount(rootfs, 0)
 | 
						|
}
 |