diff --git a/pkg/init/Dockerfile b/pkg/init/Dockerfile index 011fc06d8..7740d863d 100644 --- a/pkg/init/Dockerfile +++ b/pkg/init/Dockerfile @@ -9,6 +9,9 @@ ENV GOPATH=/go PATH=$PATH:/go/bin COPY cmd /go/src/cmd RUN go-compile.sh /go/src/cmd/init +RUN go-compile.sh /go/src/cmd/rc.init +# this makes sure that the multi stage build copies as a symlink +RUN mkdir /tmp/bin && cd /tmp/bin/ && cp /go/bin/rc.init . && ln -s rc.init rc.shutdown RUN cd /go/src/cmd/service && ./skanky-vendor.sh $GOPATH/src/github.com/containerd/containerd RUN go-compile.sh /go/src/cmd/service @@ -28,8 +31,8 @@ ENTRYPOINT [] CMD [] WORKDIR / COPY --from=build /go/bin/init / +COPY --from=build /tmp/bin /bin/ COPY --from=build /go/bin/service /usr/bin/ COPY --from=build usermode-helper /sbin/ COPY --from=mirror /out/ / COPY etc etc/ -COPY bin bin/ diff --git a/pkg/init/bin/rc.init b/pkg/init/bin/rc.init deleted file mode 100755 index 730d56154..000000000 --- a/pkg/init/bin/rc.init +++ /dev/null @@ -1,134 +0,0 @@ -#!/bin/sh - -# mount proc filesystem -mount -n -t proc proc /proc -o nodev,nosuid,noexec,relatime - -# remount rootfs as readonly -mount -o remount,ro / - -# mount tmpfs for /tmp and /run -mount -n -t tmpfs tmpfs /run -o nodev,nosuid,noexec,relatime,size=10%,mode=755 -mount -n -t tmpfs tmpfs /tmp -o nodev,nosuid,noexec,relatime,size=10%,mode=1777 - -# mount tmpfs for /var. This may be overmounted with a persistent filesystem later -mount -n -t tmpfs tmpfs /var -o nodev,nosuid,noexec,relatime,size=50%,mode=755 -# add standard directories in /var -mkdir -m 755 /var/cache -mkdir -m 555 /var/empty -mkdir -m 755 /var/lib -mkdir -m 755 /var/local -mkdir -m 755 /var/lock -mkdir -m 755 /var/log -mkdir -m 755 /var/opt -ln -s /run /var/run -mkdir -m 755 /var/spool -mkdir -m 1777 /var/tmp - -# mount devfs -mount -n -t devtmpfs dev /dev -o nosuid,noexec,relatime,size=10m,nr_inodes=248418,mode=755 -# devices -[ -c /dev/console ] || mknod -m 600 /dev/console c 5 1 -[ -c /dev/tty1 ] || mknod -m 620 /dev/tty1 c 4 1 -[ -c /dev/tty ] || mknod -m 666 /dev/tty c 5 0 - -[ -c /dev/null ] || mknod -m 666 /dev/null c 1 3 -[ -c /dev/kmsg ] || mknod -m 660 /dev/kmsg c 1 11 - -# extra symbolic links not provided by default -[ -e /dev/fd ] || ln -snf /proc/self/fd /dev/fd -[ -e /dev/stdin ] || ln -snf /proc/self/fd/0 /dev/stdin -[ -e /dev/stdout ] || ln -snf /proc/self/fd/1 /dev/stdout -[ -e /dev/stderr ] || ln -snf /proc/self/fd/2 /dev/stderr -[ -e /proc/kcore ] && ln -snf /proc/kcore /dev/core - -# devfs filesystems -mkdir -p -m 1777 /dev/mqueue -mkdir -p -m 1777 /dev/shm -mkdir -p -m 0755 /dev/pts -mount -n -t mqueue -o noexec,nosuid,nodev mqueue /dev/mqueue -mount -n -t tmpfs -o noexec,nosuid,nodev,mode=1777 shm /dev/shm -mount -n -t devpts -o noexec,nosuid,gid=5,mode=0620 devpts /dev/pts - -# mount sysfs -sysfs_opts=nodev,noexec,nosuid -mount -n -t sysfs -o ${sysfs_opts} sysfs /sys -[ -d /sys/kernel/security ] && mount -n -t securityfs -o ${sysfs_opts} securityfs /sys/kernel/security -[ -d /sys/kernel/debug ] && mount -n -t debugfs -o ${sysfs_opts} debugfs /sys/kernel/debug -[ -d /sys/kernel/config ] && mount -n -t configfs -o ${sysfs_opts} configfs /sys/kernel/config -[ -d /sys/fs/fuse/connections ] && mount -n -t fusectl -o ${sysfs_opts} fusectl /sys/fs/fuse/connections -[ -d /sys/fs/selinux ] && mount -n -t selinuxfs -o nosuid,noexec selinuxfs /sys/fs/selinux -[ -d /sys/fs/pstore ] && mount -n -t pstore pstore -o ${sysfs_opts} /sys/fs/pstore -[ -d /sys/firmware/efi/efivars ] && mount -n -t efivarfs -o ro,${sysfs_opts} efivarfs /sys/firmware/efi/efivars - -# misc /proc mounted fs -[ -d /proc/sys/fs/binfmt_misc ] && mount -t binfmt_misc -o nodev,noexec,nosuid binfmt_misc /proc/sys/fs/binfmt_misc - -# mount cgroups -mount -n -t tmpfs -o nodev,noexec,nosuid,mode=755,size=10m cgroup_root /sys/fs/cgroup - -while read name hier groups enabled rest -do - case "${enabled}" in - 1) mkdir -p /sys/fs/cgroup/${name} - mount -n -t cgroup -o ${sysfs_opts},${name} ${name} /sys/fs/cgroup/${name} - ;; - esac -done < /proc/cgroups - -# use hierarchy for memory -echo 1 > /sys/fs/cgroup/memory/memory.use_hierarchy - -# for compatibility -mkdir -p /sys/fs/cgroup/systemd -mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd - -# start mdev for hotplug -echo "/sbin/mdev" > /proc/sys/kernel/hotplug - -# mdev -s will not create /dev/usb[1-9] devices with recent kernels -# so we trigger hotplug events for usb for now -for i in $(find /sys/devices -name 'usb[0-9]*'); do - [ -e $i/uevent ] && echo add > $i/uevent -done - -mdev -s - -# Load modules for cold-plugged devices (ie devices present on boot) -grep -h MODALIAS /sys/bus/*/devices/*/uevent | cut -d= -f2 | xargs modprobe -abq 2> /dev/null - -# set hostname -if [ -s /etc/hostname ] -then - hostname -F /etc/hostname -fi - -if [ $(hostname) = "(none)" -a -f /sys/class/net/eth0/address ] -then - mac=$(cat /sys/class/net/eth0/address) - hostname linuxkit-$(echo $mac | sed 's/://g') -fi - -# set system clock from hwclock -hwclock --hctosys --utc - -# bring up loopback interface -ip addr add 127.0.0.1/8 dev lo brd + scope host -ip route add 127.0.0.0/8 dev lo scope host -ip link set lo up - -# for containerizing dhcpcd and other containers that need writable /etc/resolv.conf -[ -L /etc/resolv.conf ] && mkdir -p $(dirname $(readlink -n /etc/resolv.conf)) && touch /etc/resolv.conf - -# make / rshared -mount --make-rshared / - -# set global ulimits TODO move to /etc/limits.conf? -ulimit -n 1048576 -ulimit -p unlimited - -# execute other init processes -INITS="$(find /etc/init.d ! -type d 2>/dev/null | sort)" -for f in $INITS -do - $f -done diff --git a/pkg/init/bin/rc.shutdown b/pkg/init/bin/rc.shutdown deleted file mode 100755 index 6d3880145..000000000 --- a/pkg/init/bin/rc.shutdown +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh - -# execute other shutdown processes -SHUTS="$(find /etc/shutdown.d ! -type d 2>/dev/null | sort)" -for f in $SHUTS -do - $f -done - -# kill all processes and unmount filesystems -/usr/sbin/killall5 -15 -/bin/sleep 5 -/usr/sbin/killall5 -9 -/sbin/swapoff -a -/bin/echo "Unmounting filesystems" -/bin/umount -a -r - -# shutdown or reboot -[ "$1" = "reboot" ] && exec /sbin/reboot -f -/sbin/poweroff -f diff --git a/pkg/init/cmd/rc.init/main.go b/pkg/init/cmd/rc.init/main.go new file mode 100644 index 000000000..82e8986d2 --- /dev/null +++ b/pkg/init/cmd/rc.init/main.go @@ -0,0 +1,456 @@ +package main + +import ( + "bufio" + "encoding/csv" + "io/ioutil" + "log" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "golang.org/x/sys/unix" +) + +const ( + nodev = unix.MS_NODEV + noexec = unix.MS_NOEXEC + nosuid = unix.MS_NOSUID + readonly = unix.MS_RDONLY + rec = unix.MS_REC + relatime = unix.MS_RELATIME + remount = unix.MS_REMOUNT + shared = unix.MS_SHARED +) + +var ( + rliminf = unix.RLIM_INFINITY + infinity = uint64(rliminf) +) + +// set as a subreaper +func subreaper() { + err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(1), 0, 0, 0) + if err != nil { + log.Printf("error setting as a subreaper: %v", err) + } +} + +// nothing really to error to, so just warn +func mount(source string, target string, fstype string, flags uintptr, data string) { + err := unix.Mount(source, target, fstype, flags, data) + if err != nil { + log.Printf("error mounting %s to %s: %v", source, target, err) + } +} + +// in some cases, do not even log an error +func mountSilent(source string, target string, fstype string, flags uintptr, data string) { + _ = unix.Mount(source, target, fstype, flags, data) +} + +// make a character device +func mkchar(path string, mode, major, minor uint32) { + // unix.Mknod only supports int dev numbers; this is ok for us + dev := int(unix.Mkdev(major, minor)) + err := unix.Mknod(path, mode, dev) + if err != nil { + if err.Error() == "file exists" { + return + } + log.Printf("error making device %s: %v", path, err) + } +} + +// symlink with error warning +func symlink(oldpath string, newpath string) { + err := unix.Symlink(oldpath, newpath) + if err != nil { + log.Printf("error making symlink %s: %v", newpath, err) + } +} + +// mkdirall with warning +func mkdir(path string, perm os.FileMode) { + err := os.MkdirAll(path, perm) + if err != nil { + log.Printf("error making directory %s: %v", path, err) + } +} + +// list of all enabled cgroups +func cgroupList() []string { + list := []string{} + f, err := os.Open("/proc/cgroups") + if err != nil { + log.Printf("cannot open /proc/cgroups: %v", err) + return list + } + defer f.Close() + reader := csv.NewReader(f) + // tab delimited + reader.Comma = '\t' + // four fields + reader.FieldsPerRecord = 4 + cgroups, err := reader.ReadAll() + if err != nil { + log.Printf("cannot parse /proc/cgroups: %v", err) + return list + } + for _, cg := range cgroups { + // see if enabled + if cg[3] == "1" { + list = append(list, cg[0]) + } + } + return list +} + +// write a file, eg sysfs +func write(path string, value string) { + err := ioutil.WriteFile(path, []byte(value), 0600) + if err != nil { + log.Printf("cannot write to %s: %v", path, err) + } +} + +// read a file, eg sysfs, strip whitespace, empty string if does not exist +func read(path string) string { + data, err := ioutil.ReadFile(path) + if err != nil { + return "" + } + return strings.TrimSpace(string(data)) +} + +// read a directory +func readdir(path string) []string { + names := []string{} + files, err := ioutil.ReadDir(path) + if err != nil { + log.Printf("cannot read directory %s: %v", path, err) + return names + } + for _, f := range files { + names = append(names, f.Name()) + } + return names +} + +// glob logging errors +func glob(pattern string) []string { + files, err := filepath.Glob(pattern) + if err != nil { + log.Printf("error in glob %s: %v", pattern, err) + return []string{} + } + return files +} + +// test if a file exists +func exists(path string) bool { + _, err := os.Stat(path) + return err == nil +} + +// modalias runs modprobe on the modalias file contents +func modalias(path string) { + alias := read(path) + cmd := exec.Command("/sbin/modprobe", "-abq", alias) + // many of these error so do not report + _ = cmd.Run() +} + +func doMounts() { + // mount proc filesystem + mount("proc", "/proc", "proc", nodev|nosuid|noexec|relatime, "") + + // remount rootfs read only if it is not already + mountSilent("", "/", "", remount|readonly, "") + + // mount tmpfs for /tmp and /run + mount("tmpfs", "/run", "tmpfs", nodev|nosuid|noexec|relatime, "size=10%,mode=755") + mount("tmpfs", "/tmp", "tmpfs", nodev|nosuid|noexec|relatime, "size=10%,mode=1777") + + // mount tmpfs for /var. This may be overmounted with a persistent filesystem later + mount("tmpfs", "/var", "tmpfs", nodev|nosuid|noexec|relatime, "size=50%,mode=755") + // add standard directories in /var + mkdir("/var/cache", 0755) + mkdir("/var/empty", 0555) + mkdir("/var/lib", 0755) + mkdir("/var/local", 0755) + mkdir("/var/lock", 0755) + mkdir("/var/log", 0755) + mkdir("/var/opt", 0755) + mkdir("/var/spool", 0755) + mkdir("/var/tmp", 01777) + symlink("/run", "/var/run") + + // mount devfs + mount("dev", "/dev", "devtmpfs", nosuid|noexec|relatime, "size=10m,nr_inodes=248418,mode=755") + // make minimum necessary devices + mkchar("/dev/console", 0600, 5, 1) + mkchar("/dev/tty1", 0620, 4, 1) + mkchar("/dev/tty", 0666, 5, 0) + mkchar("/dev/null", 0666, 1, 3) + mkchar("/dev/kmsg", 0660, 1, 11) + // make standard symlinks + symlink("/proc/self/fd", "/dev/fd") + symlink("/proc/self/fd/0", "/dev/stdin") + symlink("/proc/self/fd/1", "/dev/stdout") + symlink("/proc/self/fd/2", "/dev/stderr") + symlink("/proc/kcore", "/dev/kcore") + // dev mountpoints + mkdir("/dev/mqueue", 01777) + mkdir("/dev/shm", 01777) + mkdir("/dev/pts", 0755) + // mounts on /dev + mount("mqueue", "/dev/mqueue", "mqueue", noexec|nosuid|nodev, "") + mount("shm", "/dev/shm", "tmpfs", noexec|nosuid|nodev, "mode=1777") + mount("devpts", "/dev/pts", "devpts", noexec|nosuid, "gid=5,mode=0620") + + // sysfs + mount("sysfs", "/sys", "sysfs", noexec|nosuid|nodev, "") + // some of the subsystems may not exist, so ignore errors + mountSilent("securityfs", "/sys/kernel/security", "securityfs", noexec|nosuid|nodev, "") + mountSilent("debugfs", "/sys/kernel/debug", "debugfs", noexec|nosuid|nodev, "") + mountSilent("configfs", "/sys/kernel/config", "configfs", noexec|nosuid|nodev, "") + mountSilent("fusectl", "/sys/fs/fuse/connections", "fusectl", noexec|nosuid|nodev, "") + mountSilent("selinuxfs", "/sys/fs/selinux", "selinuxfs", noexec|nosuid, "") + mountSilent("pstore", "/sys/fs/pstore", "pstore", noexec|nosuid|nodev, "") + mountSilent("efivarfs", "/sys/firmware/efi/efivars", "efivarfs", noexec|nosuid|nodev, "") + + // misc /proc mounted fs + mountSilent("binfmt_misc", "/proc/sys/fs/binfmt_misc", "binfmt_misc", noexec|nosuid|nodev, "") + + // mount cgroup root tmpfs + mount("cgroup_root", "/sys/fs/cgroup", "tmpfs", nodev|noexec|nosuid, "mode=755,size=10m") + // mount cgroups filesystems for all enabled cgroups + for _, cg := range cgroupList() { + path := filepath.Join("/sys/fs/cgroup", cg) + mkdir(path, 0555) + mount(cg, path, "cgroup", noexec|nosuid|nodev, cg) + } + + // use hierarchy for memory + write("/sys/fs/cgroup/memory/memory.use_hierarchy", "1") + + // many things assume systemd + mkdir("/sys/fs/cgroup/systemd", 0555) + mount("cgroup", "/sys/fs/cgroup/systemd", "cgroup", 0, "none,name=systemd") + + // make / rshared + mount("", "/", "", rec|shared, "") +} + +func doHotplug() { + mdev := "/sbin/mdev" + // start mdev for hotplug + write("/proc/sys/kernel/hotplug", mdev) + + devices := "/sys/devices" + files := readdir(devices) + for _, f := range files { + uevent := filepath.Join(devices, f, "uevent") + if strings.HasPrefix(f, "usb") && exists(uevent) { + write(uevent, "add") + } + } + + cmd := exec.Command(mdev, "-s") + if err := cmd.Run(); err != nil { + log.Printf("Failed to run %s -s: %v", mdev, err) + } + + // mdev only supports hot plug, so also add all existing cold plug devices + for _, df := range glob("/sys/bus/*/devices/*/modalias") { + modalias(df) + } +} + +func doClock() { + cmd := exec.Command("/sbin/hwclock", "--hctosys", "--utc") + if err := cmd.Run(); err != nil { + log.Printf("Failed to run hwclock: %v", err) + } +} + +func rlimit(resource int, cur, max uint64) { + lim := unix.Rlimit{Cur: cur, Max: max} + err := unix.Setrlimit(resource, &lim) + if err != nil { + log.Printf("Failed to set rlimit %s: %v", resource, err) + } +} + +func doLimits() { + rlimit(unix.RLIMIT_NOFILE, 1048576, 1048576) + rlimit(unix.RLIMIT_NPROC, infinity, infinity) +} + +func doHostname() { + hostname := read("/etc/hostname") + if hostname != "" { + if err := unix.Sethostname([]byte(hostname)); err != nil { + log.Printf("Setting hostname failed: %v", err) + } + } + hostname, err := os.Hostname() + if err != nil { + log.Printf("Cannot read hostname: %v", err) + return + } + + if hostname != "(none)" && hostname != "" { + return + } + + mac := read("/sys/class/net/eth0/address") + if mac == "" { + return + } + + mac = strings.Replace(mac, ":", "", -1) + if err := unix.Sethostname([]byte("linuxkit-" + mac)); err != nil { + log.Printf("Setting hostname failed: %v", err) + } +} + +func doResolvConf() { + // for containerizing dhcpcd and other containers that need writable /etc/resolv.conf + // if it is a symlink (usually to /run) make the directory and empty file + link, err := os.Readlink("/etc/resolv.conf") + if err != nil { + return + } + mkdir(filepath.Dir(link), 0755) + write(link, "") +} + +func doLoopback() { + // TODO use netlink instead + cmd := exec.Command("/sbin/ip", "addr", "add", "127.0.0.1/8", "dev", "lo", "brd", "+", "scope", "host") + _ = cmd.Run() + cmd = exec.Command("/sbin/ip", "route", "add", "127.0.0.0/8", "dev", "lo", "scope", "host") + _ = cmd.Run() + cmd = exec.Command("/sbin/ip", "link", "set", "lo", "up") + _ = cmd.Run() +} + +// execute scripts in /etc/init.d/ or /etc/shutdown.d. These should not block. +func runInit(path string) { + for _, f := range readdir(path) { + file := filepath.Join(path, f) + fi, err := os.Stat(file) + if err != nil { + log.Printf("Cannot stat %s: %v", file, err) + continue + } + if !fi.Mode().IsRegular() { + continue + } + cmd := exec.Command(file) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + _ = cmd.Run() + } +} + +func doReap() { + // now reap all children + // if we are running in a real system, init does this, but in a container it would terminate when we exit + for { + _, err := unix.Wait4(-1, nil, 0, nil) + if err != nil { + // ECHILD means no children left + if e, ok := err.(*os.SyscallError); ok && e.Err == unix.ECHILD { + return + } + } + } +} + +func unmountAll() { + mounts, err := os.Open("/proc/mounts") + if err != nil { + log.Printf("Cannot open /proc/mounts: %v", err) + return + } + defer mounts.Close() + scanner := bufio.NewScanner(mounts) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Split(line, " ") + if len(parts) > 3 { + dest := parts[1] + tp := parts[2] + switch tp { + // do not unmount tmpfs or virtual filesystems, just ones that need to write data + case "ext2", "ext3", "ext4", "btrfs", "xfs", "vfat", "msdos", "overlay": + if err := unix.Unmount(dest, 0); err != nil { + log.Printf("error unmounting %s: %v", dest, err) + } + case "nfs", "nfs4", "cifs": + // lazy unmount as we do not want to block on network mounts + if err := unix.Unmount(dest, unix.MNT_DETACH); err != nil { + log.Printf("error unmounting %s: %v", dest, err) + } + } + } + } +} + +func doShutdown(action string) { + runInit("/etc/shutdown.d") + _ = unix.Kill(-1, unix.SIGTERM) + time.Sleep(5 * time.Second) + _ = unix.Kill(-1, unix.SIGKILL) + unix.Sync() + unmountAll() + switch action { + case "poweroff": + // TODO use syscall + cmd := exec.Command("/sbin/poweroff", "-f") + _ = cmd.Run() + case "reboot": + // TODO use syscall + cmd := exec.Command("/sbin/reboot", "-f") + _ = cmd.Run() + } + // if this failed, init will try again + os.Exit(0) +} + +func main() { + if filepath.Base(os.Args[0]) == "rc.shutdown" { + action := "poweroff" + if len(os.Args) > 1 { + action = os.Args[1] + } + doShutdown(action) + } + // see if we are on a real system or in userspace + // assume in userspace if /proc already mounted + userspace := exists("/proc/self") + + if userspace { + subreaper() + } else { + doMounts() + doHotplug() + doClock() + doLoopback() + } + + doLimits() + doHostname() + doResolvConf() + + runInit("/etc/init.d") + + if userspace { + doReap() + } +}