mirror of
https://github.com/linuxkit/linuxkit.git
synced 2025-07-27 04:28:20 +00:00
cgroups v2 has been out since 2015. Not having to set a kernel parameter helps improve the user experience by not requiring it when it is required by services in a build. Making this the default was discussed back in 2021. Signed-off-by: Jacob Weinstock <jakobweinstock@gmail.com>
471 lines
12 KiB
Go
471 lines
12 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/csv"
|
|
"log"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
const (
|
|
nodev = unix.MS_NODEV
|
|
noexec = unix.MS_NOEXEC
|
|
nosuid = unix.MS_NOSUID
|
|
readonly = unix.MS_RDONLY
|
|
rec = unix.MS_REC
|
|
relatime = unix.MS_RELATIME
|
|
remount = unix.MS_REMOUNT
|
|
shared = unix.MS_SHARED
|
|
)
|
|
|
|
var (
|
|
infinity = uint64(unix.RLIM_INFINITY)
|
|
)
|
|
|
|
// set as a subreaper
|
|
func subreaper() {
|
|
err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(1), 0, 0, 0)
|
|
if err != nil {
|
|
log.Printf("error setting as a subreaper: %v", err)
|
|
}
|
|
}
|
|
|
|
// nothing really to error to, so just warn
|
|
func mount(source string, target string, fstype string, flags uintptr, data string) {
|
|
err := unix.Mount(source, target, fstype, flags, data)
|
|
if err != nil {
|
|
log.Printf("error mounting %s to %s: %v", source, target, err)
|
|
}
|
|
}
|
|
|
|
// in some cases, do not even log an error
|
|
func mountSilent(source string, target string, fstype string, flags uintptr, data string) {
|
|
_ = unix.Mount(source, target, fstype, flags, data)
|
|
}
|
|
|
|
// make a character device
|
|
func mkchar(path string, mode, major, minor uint32) {
|
|
// unix.Mknod only supports int dev numbers; this is ok for us
|
|
dev := int(unix.Mkdev(major, minor))
|
|
err := unix.Mknod(path, mode, dev)
|
|
if err != nil {
|
|
if err.Error() == "file exists" {
|
|
return
|
|
}
|
|
log.Printf("error making device %s: %v", path, err)
|
|
}
|
|
}
|
|
|
|
// symlink with error warning
|
|
func symlink(oldpath string, newpath string) {
|
|
err := unix.Symlink(oldpath, newpath)
|
|
if err != nil {
|
|
log.Printf("error making symlink %s: %v", newpath, err)
|
|
}
|
|
}
|
|
|
|
// mkdir creates path and any missing parents with os.MkdirAll, logging a
// warning on failure instead of returning the error.
func mkdir(path string, perm os.FileMode) {
	if err := os.MkdirAll(path, perm); err != nil {
		log.Printf("error making directory %s: %v", path, err)
	}
}
|
|
|
|
// cgroupList returns the names of all enabled cgroup controllers.
//
// It parses /proc/cgroups as tab-separated records of exactly four fields
// and returns the first field of every record whose fourth field is "1".
// If the file cannot be opened or parsed the problem is logged and a nil
// slice is returned.
func cgroupList() []string {
	f, err := os.Open("/proc/cgroups")
	if err != nil {
		log.Printf("cannot open /proc/cgroups: %v", err)
		return nil
	}
	defer f.Close()

	r := csv.NewReader(f)
	r.Comma = '\t'        // tab delimited
	r.FieldsPerRecord = 4 // four fields
	rows, err := r.ReadAll()
	if err != nil {
		log.Printf("cannot parse /proc/cgroups: %v", err)
		return nil
	}

	var enabled []string
	for _, row := range rows {
		if name, on := row[0], row[3]; on == "1" {
			enabled = append(enabled, name)
		}
	}
	return enabled
}
|
|
|
|
// write stores value in the file at path (for example a sysfs or procfs
// entry), creating it with mode 0600 if needed. Failures are logged only.
func write(path string, value string) {
	if err := os.WriteFile(path, []byte(value), 0600); err != nil {
		log.Printf("cannot write to %s: %v", path, err)
	}
}
|
|
|
|
// read returns the contents of the file at path (for example a sysfs entry)
// with leading and trailing whitespace removed. Any read error, including a
// missing file, yields the empty string.
func read(path string) string {
	raw, err := os.ReadFile(path)
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
|
|
|
// readdir returns the names of the entries in the directory at path, in the
// order os.ReadDir reports them. If the directory cannot be read the error
// is logged and a nil slice is returned.
func readdir(path string) []string {
	entries, err := os.ReadDir(path)
	if err != nil {
		log.Printf("cannot read directory %s: %v", path, err)
		return nil
	}
	var names []string
	for i := range entries {
		names = append(names, entries[i].Name())
	}
	return names
}
|
|
|
|
// glob returns the paths matching pattern as reported by filepath.Glob.
// If filepath.Glob returns an error it is logged and an empty, non-nil
// slice is returned.
func glob(pattern string) []string {
	matches, err := filepath.Glob(pattern)
	if err == nil {
		return matches
	}
	log.Printf("error in glob %s: %v", pattern, err)
	return []string{}
}
|
|
|
|
// exists reports whether os.Stat succeeds on path. Any stat error, not just
// "not found", is treated as the path not existing.
func exists(path string) bool {
	if _, err := os.Stat(path); err != nil {
		return false
	}
	return true
}
|
|
|
|
// modalias runs modprobe on the modalias file contents
|
|
func modalias(path string) {
|
|
alias := read(path)
|
|
cmd := exec.Command("/sbin/modprobe", "-abq", alias)
|
|
// many of these error so do not report
|
|
_ = cmd.Run()
|
|
}
|
|
|
|
func doMounts() {
|
|
// mount proc filesystem
|
|
mount("proc", "/proc", "proc", nodev|nosuid|noexec|relatime, "")
|
|
|
|
// remount rootfs read only if it is not already
|
|
mountSilent("", "/", "", remount|readonly, "")
|
|
|
|
// mount tmpfs for /tmp and /run
|
|
mount("tmpfs", "/run", "tmpfs", nodev|nosuid|noexec|relatime, "size=10%,mode=755")
|
|
mount("tmpfs", "/tmp", "tmpfs", nodev|nosuid|noexec|relatime, "size=10%,mode=1777")
|
|
|
|
// mount tmpfs for /var. This may be overmounted with a persistent filesystem later
|
|
mount("tmpfs", "/var", "tmpfs", nodev|nosuid|noexec|relatime, "size=50%,mode=755")
|
|
// add standard directories in /var
|
|
mkdir("/var/cache", 0755)
|
|
mkdir("/var/empty", 0555)
|
|
mkdir("/var/lib", 0755)
|
|
mkdir("/var/local", 0755)
|
|
mkdir("/var/lock", 0755)
|
|
mkdir("/var/log", 0755)
|
|
mkdir("/var/opt", 0755)
|
|
mkdir("/var/spool", 0755)
|
|
mkdir("/var/tmp", 01777)
|
|
symlink("/run", "/var/run")
|
|
|
|
// mount devfs
|
|
mount("dev", "/dev", "devtmpfs", nosuid|noexec|relatime, "size=10m,nr_inodes=248418,mode=755")
|
|
// make minimum necessary devices
|
|
mkchar("/dev/console", 0600, 5, 1)
|
|
mkchar("/dev/tty1", 0620, 4, 1)
|
|
mkchar("/dev/tty", 0666, 5, 0)
|
|
mkchar("/dev/null", 0666, 1, 3)
|
|
mkchar("/dev/kmsg", 0660, 1, 11)
|
|
// make standard symlinks
|
|
symlink("/proc/self/fd", "/dev/fd")
|
|
symlink("/proc/self/fd/0", "/dev/stdin")
|
|
symlink("/proc/self/fd/1", "/dev/stdout")
|
|
symlink("/proc/self/fd/2", "/dev/stderr")
|
|
symlink("/proc/kcore", "/dev/kcore")
|
|
// dev mountpoints
|
|
mkdir("/dev/mqueue", 01777)
|
|
mkdir("/dev/shm", 01777)
|
|
mkdir("/dev/pts", 0755)
|
|
// mounts on /dev
|
|
mount("mqueue", "/dev/mqueue", "mqueue", noexec|nosuid|nodev, "")
|
|
mount("shm", "/dev/shm", "tmpfs", noexec|nosuid|nodev, "mode=1777")
|
|
mount("devpts", "/dev/pts", "devpts", noexec|nosuid, "gid=5,mode=0620")
|
|
|
|
// sysfs
|
|
mount("sysfs", "/sys", "sysfs", noexec|nosuid|nodev, "")
|
|
// some of the subsystems may not exist, so ignore errors
|
|
mountSilent("securityfs", "/sys/kernel/security", "securityfs", noexec|nosuid|nodev, "")
|
|
mountSilent("debugfs", "/sys/kernel/debug", "debugfs", noexec|nosuid|nodev, "")
|
|
mountSilent("configfs", "/sys/kernel/config", "configfs", noexec|nosuid|nodev, "")
|
|
mountSilent("fusectl", "/sys/fs/fuse/connections", "fusectl", noexec|nosuid|nodev, "")
|
|
mountSilent("selinuxfs", "/sys/fs/selinux", "selinuxfs", noexec|nosuid, "")
|
|
mountSilent("pstore", "/sys/fs/pstore", "pstore", noexec|nosuid|nodev, "")
|
|
mountSilent("bpffs", "/sys/fs/bpf", "bpf", nodev, "")
|
|
|
|
mountSilent("efivarfs", "/sys/firmware/efi/efivars", "efivarfs", noexec|nosuid|nodev, "")
|
|
|
|
// misc /proc mounted fs
|
|
mountSilent("binfmt_misc", "/proc/sys/fs/binfmt_misc", "binfmt_misc", noexec|nosuid|nodev, "")
|
|
|
|
if isCgroupV1() {
|
|
// mount cgroup root tmpfs
|
|
mount("cgroup_root", "/sys/fs/cgroup", "tmpfs", nodev|noexec|nosuid, "mode=755,size=10m")
|
|
// mount cgroups filesystems for all enabled cgroups
|
|
for _, cg := range cgroupList() {
|
|
path := filepath.Join("/sys/fs/cgroup", cg)
|
|
mkdir(path, 0555)
|
|
mount(cg, path, "cgroup", noexec|nosuid|nodev, cg)
|
|
}
|
|
|
|
// use hierarchy for memory
|
|
write("/sys/fs/cgroup/memory/memory.use_hierarchy", "1")
|
|
|
|
// many things assume systemd
|
|
mkdir("/sys/fs/cgroup/systemd", 0555)
|
|
mount("cgroup", "/sys/fs/cgroup/systemd", "cgroup", 0, "none,name=systemd")
|
|
} else {
|
|
mount("cgroup2", "/sys/fs/cgroup", "cgroup2", noexec|nosuid|nodev, "")
|
|
}
|
|
|
|
// make / rshared
|
|
mount("", "/", "", rec|shared, "")
|
|
}
|
|
|
|
func doHotplug() {
|
|
mdev := "/sbin/mdev"
|
|
// start mdev for hotplug
|
|
write("/proc/sys/kernel/hotplug", mdev)
|
|
|
|
devices := "/sys/devices"
|
|
files := readdir(devices)
|
|
for _, f := range files {
|
|
uevent := filepath.Join(devices, f, "uevent")
|
|
if strings.HasPrefix(f, "usb") && exists(uevent) {
|
|
write(uevent, "add")
|
|
}
|
|
}
|
|
|
|
cmd := exec.Command(mdev, "-s")
|
|
if err := cmd.Run(); err != nil {
|
|
log.Printf("Failed to run %s -s: %v", mdev, err)
|
|
}
|
|
|
|
// mdev only supports hot plug, so also add all existing cold plug devices
|
|
for _, df := range glob("/sys/bus/*/devices/*/modalias") {
|
|
modalias(df)
|
|
}
|
|
}
|
|
|
|
// doClock runs "/sbin/hwclock --hctosys --utc" and logs a warning if the
// command fails.
func doClock() {
	err := exec.Command("/sbin/hwclock", "--hctosys", "--utc").Run()
	if err == nil {
		return
	}
	log.Printf("Failed to run hwclock: %v", err)
}
|
|
|
|
func rlimit(resource int, cur, max uint64) {
|
|
lim := unix.Rlimit{Cur: cur, Max: max}
|
|
err := unix.Setrlimit(resource, &lim)
|
|
if err != nil {
|
|
log.Printf("Failed to set rlimit %d: %v", resource, err)
|
|
}
|
|
}
|
|
|
|
func doLimits() {
|
|
rlimit(unix.RLIMIT_NOFILE, 1048576, 1048576)
|
|
rlimit(unix.RLIMIT_NPROC, infinity, infinity)
|
|
}
|
|
|
|
func doHostname() {
|
|
hostname := read("/etc/hostname")
|
|
if hostname != "" {
|
|
if err := unix.Sethostname([]byte(hostname)); err != nil {
|
|
log.Printf("Setting hostname failed: %v", err)
|
|
}
|
|
}
|
|
hostname, err := os.Hostname()
|
|
if err != nil {
|
|
log.Printf("Cannot read hostname: %v", err)
|
|
return
|
|
}
|
|
|
|
if hostname != "(none)" && hostname != "" {
|
|
return
|
|
}
|
|
|
|
mac := read("/sys/class/net/eth0/address")
|
|
if mac == "" {
|
|
return
|
|
}
|
|
|
|
mac = strings.Replace(mac, ":", "", -1)
|
|
if err := unix.Sethostname([]byte("linuxkit-" + mac)); err != nil {
|
|
log.Printf("Setting hostname failed: %v", err)
|
|
}
|
|
}
|
|
|
|
func doResolvConf() {
|
|
// for containerizing dhcpcd and other containers that need writable /etc/resolv.conf
|
|
// if it is a symlink (usually to /run) make the directory and empty file
|
|
link, err := os.Readlink("/etc/resolv.conf")
|
|
if err != nil {
|
|
return
|
|
}
|
|
mkdir(filepath.Dir(link), 0755)
|
|
write(link, "")
|
|
}
|
|
|
|
// doLoopback configures the loopback interface by running three /sbin/ip
// commands in order: add 127.0.0.1/8 to lo, add the 127.0.0.0/8 route, and
// bring lo up. The result of each command is ignored.
func doLoopback() {
	// TODO use netlink instead
	steps := [][]string{
		{"addr", "add", "127.0.0.1/8", "dev", "lo", "brd", "+", "scope", "host"},
		{"route", "add", "127.0.0.0/8", "dev", "lo", "scope", "host"},
		{"link", "set", "lo", "up"},
	}
	for _, args := range steps {
		_ = exec.Command("/sbin/ip", args...).Run()
	}
}
|
|
|
|
// execute scripts in /etc/init.d/ or /etc/shutdown.d. These should not block.
|
|
func runInit(path string) {
|
|
for _, f := range readdir(path) {
|
|
file := filepath.Join(path, f)
|
|
fi, err := os.Stat(file)
|
|
if err != nil {
|
|
log.Printf("Cannot stat %s: %v", file, err)
|
|
continue
|
|
}
|
|
if !fi.Mode().IsRegular() {
|
|
continue
|
|
}
|
|
cmd := exec.Command(file)
|
|
cmd.Stdout = os.Stdout
|
|
cmd.Stderr = os.Stderr
|
|
_ = cmd.Run()
|
|
}
|
|
}
|
|
|
|
func doReap() {
|
|
// now reap all children
|
|
// if we are running in a real system, init does this, but in a container it would terminate when we exit
|
|
for {
|
|
_, err := unix.Wait4(-1, nil, 0, nil)
|
|
if err != nil {
|
|
// ECHILD means no children left
|
|
if e, ok := err.(*os.SyscallError); ok && e.Err == unix.ECHILD {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func unmountAll() {
|
|
mounts, err := os.Open("/proc/mounts")
|
|
if err != nil {
|
|
log.Printf("Cannot open /proc/mounts: %v", err)
|
|
return
|
|
}
|
|
defer mounts.Close()
|
|
scanner := bufio.NewScanner(mounts)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
parts := strings.Split(line, " ")
|
|
if len(parts) > 3 {
|
|
dest := parts[1]
|
|
tp := parts[2]
|
|
switch tp {
|
|
// do not unmount tmpfs or virtual filesystems, just ones that need to write data
|
|
case "ext2", "ext3", "ext4", "btrfs", "xfs", "vfat", "msdos", "overlay":
|
|
if err := unix.Unmount(dest, 0); err != nil {
|
|
log.Printf("error unmounting %s: %v", dest, err)
|
|
}
|
|
case "nfs", "nfs4", "cifs":
|
|
// lazy unmount as we do not want to block on network mounts
|
|
if err := unix.Unmount(dest, unix.MNT_DETACH); err != nil {
|
|
log.Printf("error unmounting %s: %v", dest, err)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func doShutdown(action string) {
|
|
runInit("/etc/shutdown.d")
|
|
_ = unix.Kill(-1, unix.SIGTERM)
|
|
time.Sleep(5 * time.Second)
|
|
_ = unix.Kill(-1, unix.SIGKILL)
|
|
unix.Sync()
|
|
unmountAll()
|
|
switch action {
|
|
case "poweroff":
|
|
_ = unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF)
|
|
case "reboot":
|
|
_ = unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
|
|
}
|
|
// if this failed, init will try again
|
|
os.Exit(0)
|
|
}
|
|
|
|
// isCgroupV1 reports whether the kernel command line in /proc/cmdline
// contains the exact whitespace-separated token
// "linuxkit.unified_cgroup_hierarchy=0". If the file cannot be read the
// error is logged and false is returned.
func isCgroupV1() bool {
	const v1Flag = "linuxkit.unified_cgroup_hierarchy=0"

	cmdline, err := os.ReadFile("/proc/cmdline")
	if err != nil {
		log.Printf("error reading /proc/cmdline: %v", err)
		return false
	}

	found := false
	for _, arg := range strings.Fields(string(cmdline)) {
		if arg == v1Flag {
			found = true
			break
		}
	}
	return found
}
|
|
|
|
func main() {
|
|
if filepath.Base(os.Args[0]) == "rc.shutdown" {
|
|
action := "poweroff"
|
|
if len(os.Args) > 1 {
|
|
action = os.Args[1]
|
|
}
|
|
doShutdown(action)
|
|
}
|
|
// see if we are on a real system or in userspace
|
|
// assume in userspace if /proc already mounted
|
|
userspace := exists("/proc/self")
|
|
|
|
if userspace {
|
|
subreaper()
|
|
} else {
|
|
doMounts()
|
|
doHotplug()
|
|
doClock()
|
|
doLoopback()
|
|
}
|
|
|
|
doLimits()
|
|
doHostname()
|
|
doResolvConf()
|
|
|
|
runInit("/etc/init.d")
|
|
|
|
if userspace {
|
|
doReap()
|
|
}
|
|
}
|