linuxkit/pkg/init/cmd/rc.init/main.go
Jacob Weinstock 803747f01a Make cgroups v2 the default:
cgroups v2 has been out since 2015. Not having
to set a kernel parameter helps improve the user
experience by not requiring it when it is required
by services in a build. Making this the default was
discussed back in 2021.

Signed-off-by: Jacob Weinstock <jakobweinstock@gmail.com>
2024-04-27 15:40:00 -06:00

471 lines
12 KiB
Go

package main
import (
"bufio"
"encoding/csv"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"golang.org/x/sys/unix"
)
const (
nodev = unix.MS_NODEV
noexec = unix.MS_NOEXEC
nosuid = unix.MS_NOSUID
readonly = unix.MS_RDONLY
rec = unix.MS_REC
relatime = unix.MS_RELATIME
remount = unix.MS_REMOUNT
shared = unix.MS_SHARED
)
var (
infinity = uint64(unix.RLIM_INFINITY)
)
// set as a subreaper
func subreaper() {
err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(1), 0, 0, 0)
if err != nil {
log.Printf("error setting as a subreaper: %v", err)
}
}
// nothing really to error to, so just warn
func mount(source string, target string, fstype string, flags uintptr, data string) {
err := unix.Mount(source, target, fstype, flags, data)
if err != nil {
log.Printf("error mounting %s to %s: %v", source, target, err)
}
}
// in some cases, do not even log an error
func mountSilent(source string, target string, fstype string, flags uintptr, data string) {
_ = unix.Mount(source, target, fstype, flags, data)
}
// make a character device
func mkchar(path string, mode, major, minor uint32) {
// unix.Mknod only supports int dev numbers; this is ok for us
dev := int(unix.Mkdev(major, minor))
err := unix.Mknod(path, mode, dev)
if err != nil {
if err.Error() == "file exists" {
return
}
log.Printf("error making device %s: %v", path, err)
}
}
// symlink with error warning
func symlink(oldpath string, newpath string) {
err := unix.Symlink(oldpath, newpath)
if err != nil {
log.Printf("error making symlink %s: %v", newpath, err)
}
}
// mkdirall with warning
func mkdir(path string, perm os.FileMode) {
err := os.MkdirAll(path, perm)
if err != nil {
log.Printf("error making directory %s: %v", path, err)
}
}
// list of all enabled cgroups
func cgroupList() []string {
var list []string
f, err := os.Open("/proc/cgroups")
if err != nil {
log.Printf("cannot open /proc/cgroups: %v", err)
return list
}
defer f.Close()
reader := csv.NewReader(f)
// tab delimited
reader.Comma = '\t'
// four fields
reader.FieldsPerRecord = 4
cgroups, err := reader.ReadAll()
if err != nil {
log.Printf("cannot parse /proc/cgroups: %v", err)
return list
}
for _, cg := range cgroups {
// see if enabled
if cg[3] == "1" {
list = append(list, cg[0])
}
}
return list
}
// write a file, eg sysfs
func write(path string, value string) {
err := os.WriteFile(path, []byte(value), 0600)
if err != nil {
log.Printf("cannot write to %s: %v", path, err)
}
}
// read a file, eg sysfs, strip whitespace, empty string if does not exist
func read(path string) string {
data, err := os.ReadFile(path)
if err != nil {
return ""
}
return strings.TrimSpace(string(data))
}
// read a directory
func readdir(path string) []string {
var names []string
files, err := os.ReadDir(path)
if err != nil {
log.Printf("cannot read directory %s: %v", path, err)
return names
}
for _, f := range files {
names = append(names, f.Name())
}
return names
}
// glob logging errors
func glob(pattern string) []string {
files, err := filepath.Glob(pattern)
if err != nil {
log.Printf("error in glob %s: %v", pattern, err)
return []string{}
}
return files
}
// test if a file exists
func exists(path string) bool {
_, err := os.Stat(path)
return err == nil
}
// modalias runs modprobe on the modalias file contents
func modalias(path string) {
alias := read(path)
cmd := exec.Command("/sbin/modprobe", "-abq", alias)
// many of these error so do not report
_ = cmd.Run()
}
func doMounts() {
// mount proc filesystem
mount("proc", "/proc", "proc", nodev|nosuid|noexec|relatime, "")
// remount rootfs read only if it is not already
mountSilent("", "/", "", remount|readonly, "")
// mount tmpfs for /tmp and /run
mount("tmpfs", "/run", "tmpfs", nodev|nosuid|noexec|relatime, "size=10%,mode=755")
mount("tmpfs", "/tmp", "tmpfs", nodev|nosuid|noexec|relatime, "size=10%,mode=1777")
// mount tmpfs for /var. This may be overmounted with a persistent filesystem later
mount("tmpfs", "/var", "tmpfs", nodev|nosuid|noexec|relatime, "size=50%,mode=755")
// add standard directories in /var
mkdir("/var/cache", 0755)
mkdir("/var/empty", 0555)
mkdir("/var/lib", 0755)
mkdir("/var/local", 0755)
mkdir("/var/lock", 0755)
mkdir("/var/log", 0755)
mkdir("/var/opt", 0755)
mkdir("/var/spool", 0755)
mkdir("/var/tmp", 01777)
symlink("/run", "/var/run")
// mount devfs
mount("dev", "/dev", "devtmpfs", nosuid|noexec|relatime, "size=10m,nr_inodes=248418,mode=755")
// make minimum necessary devices
mkchar("/dev/console", 0600, 5, 1)
mkchar("/dev/tty1", 0620, 4, 1)
mkchar("/dev/tty", 0666, 5, 0)
mkchar("/dev/null", 0666, 1, 3)
mkchar("/dev/kmsg", 0660, 1, 11)
// make standard symlinks
symlink("/proc/self/fd", "/dev/fd")
symlink("/proc/self/fd/0", "/dev/stdin")
symlink("/proc/self/fd/1", "/dev/stdout")
symlink("/proc/self/fd/2", "/dev/stderr")
symlink("/proc/kcore", "/dev/kcore")
// dev mountpoints
mkdir("/dev/mqueue", 01777)
mkdir("/dev/shm", 01777)
mkdir("/dev/pts", 0755)
// mounts on /dev
mount("mqueue", "/dev/mqueue", "mqueue", noexec|nosuid|nodev, "")
mount("shm", "/dev/shm", "tmpfs", noexec|nosuid|nodev, "mode=1777")
mount("devpts", "/dev/pts", "devpts", noexec|nosuid, "gid=5,mode=0620")
// sysfs
mount("sysfs", "/sys", "sysfs", noexec|nosuid|nodev, "")
// some of the subsystems may not exist, so ignore errors
mountSilent("securityfs", "/sys/kernel/security", "securityfs", noexec|nosuid|nodev, "")
mountSilent("debugfs", "/sys/kernel/debug", "debugfs", noexec|nosuid|nodev, "")
mountSilent("configfs", "/sys/kernel/config", "configfs", noexec|nosuid|nodev, "")
mountSilent("fusectl", "/sys/fs/fuse/connections", "fusectl", noexec|nosuid|nodev, "")
mountSilent("selinuxfs", "/sys/fs/selinux", "selinuxfs", noexec|nosuid, "")
mountSilent("pstore", "/sys/fs/pstore", "pstore", noexec|nosuid|nodev, "")
mountSilent("bpffs", "/sys/fs/bpf", "bpf", nodev, "")
mountSilent("efivarfs", "/sys/firmware/efi/efivars", "efivarfs", noexec|nosuid|nodev, "")
// misc /proc mounted fs
mountSilent("binfmt_misc", "/proc/sys/fs/binfmt_misc", "binfmt_misc", noexec|nosuid|nodev, "")
if isCgroupV1() {
// mount cgroup root tmpfs
mount("cgroup_root", "/sys/fs/cgroup", "tmpfs", nodev|noexec|nosuid, "mode=755,size=10m")
// mount cgroups filesystems for all enabled cgroups
for _, cg := range cgroupList() {
path := filepath.Join("/sys/fs/cgroup", cg)
mkdir(path, 0555)
mount(cg, path, "cgroup", noexec|nosuid|nodev, cg)
}
// use hierarchy for memory
write("/sys/fs/cgroup/memory/memory.use_hierarchy", "1")
// many things assume systemd
mkdir("/sys/fs/cgroup/systemd", 0555)
mount("cgroup", "/sys/fs/cgroup/systemd", "cgroup", 0, "none,name=systemd")
} else {
mount("cgroup2", "/sys/fs/cgroup", "cgroup2", noexec|nosuid|nodev, "")
}
// make / rshared
mount("", "/", "", rec|shared, "")
}
func doHotplug() {
mdev := "/sbin/mdev"
// start mdev for hotplug
write("/proc/sys/kernel/hotplug", mdev)
devices := "/sys/devices"
files := readdir(devices)
for _, f := range files {
uevent := filepath.Join(devices, f, "uevent")
if strings.HasPrefix(f, "usb") && exists(uevent) {
write(uevent, "add")
}
}
cmd := exec.Command(mdev, "-s")
if err := cmd.Run(); err != nil {
log.Printf("Failed to run %s -s: %v", mdev, err)
}
// mdev only supports hot plug, so also add all existing cold plug devices
for _, df := range glob("/sys/bus/*/devices/*/modalias") {
modalias(df)
}
}
func doClock() {
cmd := exec.Command("/sbin/hwclock", "--hctosys", "--utc")
if err := cmd.Run(); err != nil {
log.Printf("Failed to run hwclock: %v", err)
}
}
func rlimit(resource int, cur, max uint64) {
lim := unix.Rlimit{Cur: cur, Max: max}
err := unix.Setrlimit(resource, &lim)
if err != nil {
log.Printf("Failed to set rlimit %d: %v", resource, err)
}
}
func doLimits() {
rlimit(unix.RLIMIT_NOFILE, 1048576, 1048576)
rlimit(unix.RLIMIT_NPROC, infinity, infinity)
}
func doHostname() {
hostname := read("/etc/hostname")
if hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
log.Printf("Setting hostname failed: %v", err)
}
}
hostname, err := os.Hostname()
if err != nil {
log.Printf("Cannot read hostname: %v", err)
return
}
if hostname != "(none)" && hostname != "" {
return
}
mac := read("/sys/class/net/eth0/address")
if mac == "" {
return
}
mac = strings.Replace(mac, ":", "", -1)
if err := unix.Sethostname([]byte("linuxkit-" + mac)); err != nil {
log.Printf("Setting hostname failed: %v", err)
}
}
func doResolvConf() {
// for containerizing dhcpcd and other containers that need writable /etc/resolv.conf
// if it is a symlink (usually to /run) make the directory and empty file
link, err := os.Readlink("/etc/resolv.conf")
if err != nil {
return
}
mkdir(filepath.Dir(link), 0755)
write(link, "")
}
func doLoopback() {
// TODO use netlink instead
cmd := exec.Command("/sbin/ip", "addr", "add", "127.0.0.1/8", "dev", "lo", "brd", "+", "scope", "host")
_ = cmd.Run()
cmd = exec.Command("/sbin/ip", "route", "add", "127.0.0.0/8", "dev", "lo", "scope", "host")
_ = cmd.Run()
cmd = exec.Command("/sbin/ip", "link", "set", "lo", "up")
_ = cmd.Run()
}
// execute scripts in /etc/init.d/ or /etc/shutdown.d. These should not block.
func runInit(path string) {
for _, f := range readdir(path) {
file := filepath.Join(path, f)
fi, err := os.Stat(file)
if err != nil {
log.Printf("Cannot stat %s: %v", file, err)
continue
}
if !fi.Mode().IsRegular() {
continue
}
cmd := exec.Command(file)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
_ = cmd.Run()
}
}
func doReap() {
// now reap all children
// if we are running in a real system, init does this, but in a container it would terminate when we exit
for {
_, err := unix.Wait4(-1, nil, 0, nil)
if err != nil {
// ECHILD means no children left
if e, ok := err.(*os.SyscallError); ok && e.Err == unix.ECHILD {
return
}
}
}
}
func unmountAll() {
mounts, err := os.Open("/proc/mounts")
if err != nil {
log.Printf("Cannot open /proc/mounts: %v", err)
return
}
defer mounts.Close()
scanner := bufio.NewScanner(mounts)
for scanner.Scan() {
line := scanner.Text()
parts := strings.Split(line, " ")
if len(parts) > 3 {
dest := parts[1]
tp := parts[2]
switch tp {
// do not unmount tmpfs or virtual filesystems, just ones that need to write data
case "ext2", "ext3", "ext4", "btrfs", "xfs", "vfat", "msdos", "overlay":
if err := unix.Unmount(dest, 0); err != nil {
log.Printf("error unmounting %s: %v", dest, err)
}
case "nfs", "nfs4", "cifs":
// lazy unmount as we do not want to block on network mounts
if err := unix.Unmount(dest, unix.MNT_DETACH); err != nil {
log.Printf("error unmounting %s: %v", dest, err)
}
}
}
}
}
func doShutdown(action string) {
runInit("/etc/shutdown.d")
_ = unix.Kill(-1, unix.SIGTERM)
time.Sleep(5 * time.Second)
_ = unix.Kill(-1, unix.SIGKILL)
unix.Sync()
unmountAll()
switch action {
case "poweroff":
_ = unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF)
case "reboot":
_ = unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
}
// if this failed, init will try again
os.Exit(0)
}
func isCgroupV1() bool {
dt, err := os.ReadFile("/proc/cmdline")
if err != nil {
log.Printf("error reading /proc/cmdline: %v", err)
return false
}
for _, s := range strings.Fields(string(dt)) {
if s == "linuxkit.unified_cgroup_hierarchy=0" {
return true
}
}
return false
}
func main() {
if filepath.Base(os.Args[0]) == "rc.shutdown" {
action := "poweroff"
if len(os.Args) > 1 {
action = os.Args[1]
}
doShutdown(action)
}
// see if we are on a real system or in userspace
// assume in userspace if /proc already mounted
userspace := exists("/proc/self")
if userspace {
subreaper()
} else {
doMounts()
doHotplug()
doClock()
doLoopback()
}
doLimits()
doHostname()
doResolvConf()
runInit("/etc/init.d")
if userspace {
doReap()
}
}