mirror of
				https://github.com/k3s-io/kubernetes.git
				synced 2025-10-30 21:30:16 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			537 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			537 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // +build linux
 | |
| 
 | |
| package libcontainer
 | |
| 
 | |
| import (
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"io/ioutil"
 | |
| 	"net"
 | |
| 	"os"
 | |
| 	"strings"
 | |
| 	"syscall" // only for Errno
 | |
| 	"unsafe"
 | |
| 
 | |
| 	"golang.org/x/sys/unix"
 | |
| 
 | |
| 	"github.com/containerd/console"
 | |
| 	"github.com/opencontainers/runc/libcontainer/cgroups"
 | |
| 	"github.com/opencontainers/runc/libcontainer/configs"
 | |
| 	"github.com/opencontainers/runc/libcontainer/system"
 | |
| 	"github.com/opencontainers/runc/libcontainer/user"
 | |
| 	"github.com/opencontainers/runc/libcontainer/utils"
 | |
| 	"github.com/pkg/errors"
 | |
| 	"github.com/sirupsen/logrus"
 | |
| 	"github.com/vishvananda/netlink"
 | |
| )
 | |
| 
 | |
// initType names the kind of init process that newContainerInit should build.
type initType string

// Known init kinds: newContainerInit maps initSetns to linuxSetnsInit and
// initStandard to linuxStandardInit; any other value is rejected.
const (
	initSetns    initType = "setns"
	initStandard initType = "standard"
)
 | |
| 
 | |
// pid holds a pair of process IDs and is (de)serialised as JSON under the
// "pid" and "pid_first" keys.
type pid struct {
	Pid           int `json:"pid"`
	PidFirstChild int `json:"pid_first"`
}
 | |
| 
 | |
// network is an internal struct used to set up container networks. It embeds
// the user-facing configs.Network and adds state that only the init process
// needs.
type network struct {
	configs.Network

	// TempVethPeerName is a unique temporary veth peer name that was placed into
	// the container's namespace.
	TempVethPeerName string `json:"temp_veth_peer_name"`
}
 | |
| 
 | |
// initConfig is used for transferring parameters from Exec() to Init().
// It travels as JSON: newContainerInit decodes it from the init pipe, so the
// struct tags below are the wire format and must stay in sync with the
// encoding side.
type initConfig struct {
	Args             []string              `json:"args"`
	Env              []string              `json:"env"`
	Cwd              string                `json:"cwd"`
	Capabilities     *configs.Capabilities `json:"capabilities"`
	ProcessLabel     string                `json:"process_label"`
	AppArmorProfile  string                `json:"apparmor_profile"`
	NoNewPrivileges  bool                  `json:"no_new_privileges"`
	User             string                `json:"user"`
	AdditionalGroups []string              `json:"additional_groups"`
	Config           *configs.Config       `json:"config"`
	Networks         []*network            `json:"network"`
	PassedFilesCount int                   `json:"passed_files_count"`
	ContainerId      string                `json:"containerid"`
	Rlimits          []configs.Rlimit      `json:"rlimits"`
	CreateConsole    bool                  `json:"create_console"`
	ConsoleWidth     uint16                `json:"console_width"`
	ConsoleHeight    uint16                `json:"console_height"`
	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
}
 | |
| 
 | |
// initer is the common interface of the init implementations returned by
// newContainerInit.
type initer interface {
	// Init runs the implementation's initialisation and reports any failure.
	Init() error
}
 | |
| 
 | |
| func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
 | |
| 	var config *initConfig
 | |
| 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	if err := populateProcessEnvironment(config.Env); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	switch t {
 | |
| 	case initSetns:
 | |
| 		return &linuxSetnsInit{
 | |
| 			pipe:          pipe,
 | |
| 			consoleSocket: consoleSocket,
 | |
| 			config:        config,
 | |
| 		}, nil
 | |
| 	case initStandard:
 | |
| 		return &linuxStandardInit{
 | |
| 			pipe:          pipe,
 | |
| 			consoleSocket: consoleSocket,
 | |
| 			parentPid:     unix.Getppid(),
 | |
| 			config:        config,
 | |
| 			fifoFd:        fifoFd,
 | |
| 		}, nil
 | |
| 	}
 | |
| 	return nil, fmt.Errorf("unknown init type %q", t)
 | |
| }
 | |
| 
 | |
// populateProcessEnvironment exports every "NAME=value" entry of env into the
// environment of the current process.
//
// Entries are applied in order. Only the first '=' separates name from value,
// so values may themselves contain '='. Processing stops at the first entry
// that has no '=' or that os.Setenv rejects; entries before it stay applied.
func populateProcessEnvironment(env []string) error {
	for _, entry := range env {
		sep := strings.IndexByte(entry, '=')
		if sep < 0 {
			return fmt.Errorf("invalid environment '%v'", entry)
		}
		name, value := entry[:sep], entry[sep+1:]
		if err := os.Setenv(name, value); err != nil {
			return err
		}
	}
	return nil
}
 | |
| 
 | |
| // finalizeNamespace drops the caps, sets the correct user
 | |
| // and working dir, and closes any leaked file descriptors
 | |
| // before executing the command inside the namespace
 | |
| func finalizeNamespace(config *initConfig) error {
 | |
| 	// Ensure that all unwanted fds we may have accidentally
 | |
| 	// inherited are marked close-on-exec so they stay out of the
 | |
| 	// container
 | |
| 	if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
 | |
| 		return errors.Wrap(err, "close exec fds")
 | |
| 	}
 | |
| 
 | |
| 	capabilities := &configs.Capabilities{}
 | |
| 	if config.Capabilities != nil {
 | |
| 		capabilities = config.Capabilities
 | |
| 	} else if config.Config.Capabilities != nil {
 | |
| 		capabilities = config.Config.Capabilities
 | |
| 	}
 | |
| 	w, err := newContainerCapList(capabilities)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	// drop capabilities in bounding set before changing user
 | |
| 	if err := w.ApplyBoundingSet(); err != nil {
 | |
| 		return errors.Wrap(err, "apply bounding set")
 | |
| 	}
 | |
| 	// preserve existing capabilities while we change users
 | |
| 	if err := system.SetKeepCaps(); err != nil {
 | |
| 		return errors.Wrap(err, "set keep caps")
 | |
| 	}
 | |
| 	if err := setupUser(config); err != nil {
 | |
| 		return errors.Wrap(err, "setup user")
 | |
| 	}
 | |
| 	if err := system.ClearKeepCaps(); err != nil {
 | |
| 		return errors.Wrap(err, "clear keep caps")
 | |
| 	}
 | |
| 	if err := w.ApplyCaps(); err != nil {
 | |
| 		return errors.Wrap(err, "apply caps")
 | |
| 	}
 | |
| 	if config.Cwd != "" {
 | |
| 		if err := unix.Chdir(config.Cwd); err != nil {
 | |
| 			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // setupConsole sets up the console from inside the container, and sends the
 | |
| // master pty fd to the config.Pipe (using cmsg). This is done to ensure that
 | |
| // consoles are scoped to a container properly (see runc#814 and the many
 | |
| // issues related to that). This has to be run *after* we've pivoted to the new
 | |
| // rootfs (and the users' configuration is entirely set up).
 | |
| func setupConsole(socket *os.File, config *initConfig, mount bool) error {
 | |
| 	defer socket.Close()
 | |
| 	// At this point, /dev/ptmx points to something that we would expect. We
 | |
| 	// used to change the owner of the slave path, but since the /dev/pts mount
 | |
| 	// can have gid=X set (at the users' option). So touching the owner of the
 | |
| 	// slave PTY is not necessary, as the kernel will handle that for us. Note
 | |
| 	// however, that setupUser (specifically fixStdioPermissions) *will* change
 | |
| 	// the UID owner of the console to be the user the process will run as (so
 | |
| 	// they can actually control their console).
 | |
| 
 | |
| 	pty, slavePath, err := console.NewPty()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
 | |
| 		err = pty.Resize(console.WinSize{
 | |
| 			Height: config.ConsoleHeight,
 | |
| 			Width:  config.ConsoleWidth,
 | |
| 		})
 | |
| 
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// After we return from here, we don't need the console anymore.
 | |
| 	defer pty.Close()
 | |
| 
 | |
| 	// Mount the console inside our rootfs.
 | |
| 	if mount {
 | |
| 		if err := mountConsole(slavePath); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	// While we can access console.master, using the API is a good idea.
 | |
| 	if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	// Now, dup over all the things.
 | |
| 	return dupStdio(slavePath)
 | |
| }
 | |
| 
 | |
| // syncParentReady sends to the given pipe a JSON payload which indicates that
 | |
| // the init is ready to Exec the child process. It then waits for the parent to
 | |
| // indicate that it is cleared to Exec.
 | |
| func syncParentReady(pipe io.ReadWriter) error {
 | |
| 	// Tell parent.
 | |
| 	if err := writeSync(pipe, procReady); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// Wait for parent to give the all-clear.
 | |
| 	return readSync(pipe, procRun)
 | |
| }
 | |
| 
 | |
| // syncParentHooks sends to the given pipe a JSON payload which indicates that
 | |
| // the parent should execute pre-start hooks. It then waits for the parent to
 | |
| // indicate that it is cleared to resume.
 | |
| func syncParentHooks(pipe io.ReadWriter) error {
 | |
| 	// Tell parent.
 | |
| 	if err := writeSync(pipe, procHooks); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// Wait for parent to give the all-clear.
 | |
| 	return readSync(pipe, procResume)
 | |
| }
 | |
| 
 | |
| // setupUser changes the groups, gid, and uid for the user inside the container
 | |
| func setupUser(config *initConfig) error {
 | |
| 	// Set up defaults.
 | |
| 	defaultExecUser := user.ExecUser{
 | |
| 		Uid:  0,
 | |
| 		Gid:  0,
 | |
| 		Home: "/",
 | |
| 	}
 | |
| 
 | |
| 	passwdPath, err := user.GetPasswdPath()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	groupPath, err := user.GetGroupPath()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	var addGroups []int
 | |
| 	if len(config.AdditionalGroups) > 0 {
 | |
| 		addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Rather than just erroring out later in setuid(2) and setgid(2), check
 | |
| 	// that the user is mapped here.
 | |
| 	if _, err := config.Config.HostUID(execUser.Uid); err != nil {
 | |
| 		return fmt.Errorf("cannot set uid to unmapped user in user namespace")
 | |
| 	}
 | |
| 	if _, err := config.Config.HostGID(execUser.Gid); err != nil {
 | |
| 		return fmt.Errorf("cannot set gid to unmapped user in user namespace")
 | |
| 	}
 | |
| 
 | |
| 	if config.RootlessEUID {
 | |
| 		// We cannot set any additional groups in a rootless container and thus
 | |
| 		// we bail if the user asked us to do so. TODO: We currently can't do
 | |
| 		// this check earlier, but if libcontainer.Process.User was typesafe
 | |
| 		// this might work.
 | |
| 		if len(addGroups) > 0 {
 | |
| 			return fmt.Errorf("cannot set any additional groups in a rootless container")
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Before we change to the container's user make sure that the processes
 | |
| 	// STDIO is correctly owned by the user that we are switching to.
 | |
| 	if err := fixStdioPermissions(config, execUser); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	setgroups, err := ioutil.ReadFile("/proc/self/setgroups")
 | |
| 	if err != nil && !os.IsNotExist(err) {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// This isn't allowed in an unprivileged user namespace since Linux 3.19.
 | |
| 	// There's nothing we can do about /etc/group entries, so we silently
 | |
| 	// ignore setting groups here (since the user didn't explicitly ask us to
 | |
| 	// set the group).
 | |
| 	allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny"
 | |
| 
 | |
| 	if allowSupGroups {
 | |
| 		suppGroups := append(execUser.Sgids, addGroups...)
 | |
| 		if err := unix.Setgroups(suppGroups); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if err := system.Setgid(execUser.Gid); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if err := system.Setuid(execUser.Uid); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// if we didn't get HOME already, set it based on the user's HOME
 | |
| 	if envHome := os.Getenv("HOME"); envHome == "" {
 | |
| 		if err := os.Setenv("HOME", execUser.Home); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
 | |
| // The ownership needs to match because it is created outside of the container and needs to be
 | |
| // localized.
 | |
| func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
 | |
| 	var null unix.Stat_t
 | |
| 	if err := unix.Stat("/dev/null", &null); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	for _, fd := range []uintptr{
 | |
| 		os.Stdin.Fd(),
 | |
| 		os.Stderr.Fd(),
 | |
| 		os.Stdout.Fd(),
 | |
| 	} {
 | |
| 		var s unix.Stat_t
 | |
| 		if err := unix.Fstat(int(fd), &s); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 
 | |
| 		// Skip chown of /dev/null if it was used as one of the STDIO fds.
 | |
| 		if s.Rdev == null.Rdev {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		// We only change the uid owner (as it is possible for the mount to
 | |
| 		// prefer a different gid, and there's no reason for us to change it).
 | |
| 		// The reason why we don't just leave the default uid=X mount setup is
 | |
| 		// that users expect to be able to actually use their console. Without
 | |
| 		// this code, you couldn't effectively run as a non-root user inside a
 | |
| 		// container and also have a console set up.
 | |
| 		if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
 | |
| 			// If we've hit an EINVAL then s.Gid isn't mapped in the user
 | |
| 			// namespace. If we've hit an EPERM then the inode's current owner
 | |
| 			// is not mapped in our user namespace (in particular,
 | |
| 			// privileged_wrt_inode_uidgid() has failed). In either case, we
 | |
| 			// are in a configuration where it's better for us to just not
 | |
| 			// touch the stdio rather than bail at this point.
 | |
| 			if err == unix.EINVAL || err == unix.EPERM {
 | |
| 				continue
 | |
| 			}
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // setupNetwork sets up and initializes any network interface inside the container.
 | |
| func setupNetwork(config *initConfig) error {
 | |
| 	for _, config := range config.Networks {
 | |
| 		strategy, err := getStrategy(config.Type)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		if err := strategy.initialize(config); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func setupRoute(config *configs.Config) error {
 | |
| 	for _, config := range config.Routes {
 | |
| 		_, dst, err := net.ParseCIDR(config.Destination)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		src := net.ParseIP(config.Source)
 | |
| 		if src == nil {
 | |
| 			return fmt.Errorf("Invalid source for route: %s", config.Source)
 | |
| 		}
 | |
| 		gw := net.ParseIP(config.Gateway)
 | |
| 		if gw == nil {
 | |
| 			return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
 | |
| 		}
 | |
| 		l, err := netlink.LinkByName(config.InterfaceName)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		route := &netlink.Route{
 | |
| 			Scope:     netlink.SCOPE_UNIVERSE,
 | |
| 			Dst:       dst,
 | |
| 			Src:       src,
 | |
| 			Gw:        gw,
 | |
| 			LinkIndex: l.Attrs().Index,
 | |
| 		}
 | |
| 		if err := netlink.RouteAdd(route); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func setupRlimits(limits []configs.Rlimit, pid int) error {
 | |
| 	for _, rlimit := range limits {
 | |
| 		if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
 | |
| 			return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
// _P_PID is the waitid(2) idtype value that selects a single process by PID.
const _P_PID = 1

// siginfo is the buffer handed to the raw waitid(2) system call. It mirrors
// the start of the kernel's siginfo_t: three ints followed by a union whose
// first member for SIGCHLD-style results is the pid.
type siginfo struct {
	si_signo int32
	si_errno int32
	si_code  int32
	// The kernel's union is aligned to pointer size, so on 64-bit platforms
	// it starts at offset 16 (after 4 bytes of padding), not at offset 12.
	// This zero-length array occupies no space but forces the same
	// alignment, keeping si_pid at the offset the kernel actually writes.
	_ [0]uintptr
	// below here is a union; si_pid is the only field we use
	si_pid int32
	// Pad so that the struct is at least 128 bytes (the size of siginfo_t)
	// on both 32-bit and 64-bit platforms.
	pad [112]byte
}
 | |
| 
 | |
| // isWaitable returns true if the process has exited false otherwise.
 | |
| // Its based off blockUntilWaitable in src/os/wait_waitid.go
 | |
| func isWaitable(pid int) (bool, error) {
 | |
| 	si := &siginfo{}
 | |
| 	_, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
 | |
| 	if e != 0 {
 | |
| 		return false, os.NewSyscallError("waitid", e)
 | |
| 	}
 | |
| 
 | |
| 	return si.si_pid != 0, nil
 | |
| }
 | |
| 
 | |
| // isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise
 | |
| func isNoChildren(err error) bool {
 | |
| 	switch err := err.(type) {
 | |
| 	case syscall.Errno:
 | |
| 		if err == unix.ECHILD {
 | |
| 			return true
 | |
| 		}
 | |
| 	case *os.SyscallError:
 | |
| 		if err.Err == unix.ECHILD {
 | |
| 			return true
 | |
| 		}
 | |
| 	}
 | |
| 	return false
 | |
| }
 | |
| 
 | |
| // signalAllProcesses freezes then iterates over all the processes inside the
 | |
| // manager's cgroups sending the signal s to them.
 | |
| // If s is SIGKILL then it will wait for each process to exit.
 | |
| // For all other signals it will check if the process is ready to report its
 | |
| // exit status and only if it is will a wait be performed.
 | |
| func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
 | |
| 	var procs []*os.Process
 | |
| 	if err := m.Freeze(configs.Frozen); err != nil {
 | |
| 		logrus.Warn(err)
 | |
| 	}
 | |
| 	pids, err := m.GetAllPids()
 | |
| 	if err != nil {
 | |
| 		m.Freeze(configs.Thawed)
 | |
| 		return err
 | |
| 	}
 | |
| 	for _, pid := range pids {
 | |
| 		p, err := os.FindProcess(pid)
 | |
| 		if err != nil {
 | |
| 			logrus.Warn(err)
 | |
| 			continue
 | |
| 		}
 | |
| 		procs = append(procs, p)
 | |
| 		if err := p.Signal(s); err != nil {
 | |
| 			logrus.Warn(err)
 | |
| 		}
 | |
| 	}
 | |
| 	if err := m.Freeze(configs.Thawed); err != nil {
 | |
| 		logrus.Warn(err)
 | |
| 	}
 | |
| 
 | |
| 	subreaper, err := system.GetSubreaper()
 | |
| 	if err != nil {
 | |
| 		// The error here means that PR_GET_CHILD_SUBREAPER is not
 | |
| 		// supported because this code might run on a kernel older
 | |
| 		// than 3.4. We don't want to throw an error in that case,
 | |
| 		// and we simplify things, considering there is no subreaper
 | |
| 		// set.
 | |
| 		subreaper = 0
 | |
| 	}
 | |
| 
 | |
| 	for _, p := range procs {
 | |
| 		if s != unix.SIGKILL {
 | |
| 			if ok, err := isWaitable(p.Pid); err != nil {
 | |
| 				if !isNoChildren(err) {
 | |
| 					logrus.Warn("signalAllProcesses: ", p.Pid, err)
 | |
| 				}
 | |
| 				continue
 | |
| 			} else if !ok {
 | |
| 				// Not ready to report so don't wait
 | |
| 				continue
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		// In case a subreaper has been setup, this code must not
 | |
| 		// wait for the process. Otherwise, we cannot be sure the
 | |
| 		// current process will be reaped by the subreaper, while
 | |
| 		// the subreaper might be waiting for this process in order
 | |
| 		// to retrieve its exit code.
 | |
| 		if subreaper == 0 {
 | |
| 			if _, err := p.Wait(); err != nil {
 | |
| 				if !isNoChildren(err) {
 | |
| 					logrus.Warn("wait: ", err)
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 |