mirror of
				https://github.com/k3s-io/kubernetes.git
				synced 2025-11-03 23:40:03 +00:00 
			
		
		
		
	Release notes: https://github.com/opencontainers/runc/releases/tag/v1.1.1 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
		
			
				
	
	
		
			2272 lines
		
	
	
		
			65 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			2272 lines
		
	
	
		
			65 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
package libcontainer
 | 
						|
 | 
						|
import (
 | 
						|
	"bytes"
 | 
						|
	"encoding/json"
 | 
						|
	"errors"
 | 
						|
	"fmt"
 | 
						|
	"io"
 | 
						|
	"net"
 | 
						|
	"os"
 | 
						|
	"os/exec"
 | 
						|
	"path"
 | 
						|
	"path/filepath"
 | 
						|
	"reflect"
 | 
						|
	"strconv"
 | 
						|
	"strings"
 | 
						|
	"sync"
 | 
						|
	"time"
 | 
						|
 | 
						|
	"github.com/checkpoint-restore/go-criu/v5"
 | 
						|
	criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
 | 
						|
	securejoin "github.com/cyphar/filepath-securejoin"
 | 
						|
	"github.com/opencontainers/runtime-spec/specs-go"
 | 
						|
	"github.com/sirupsen/logrus"
 | 
						|
	"github.com/vishvananda/netlink/nl"
 | 
						|
	"golang.org/x/sys/unix"
 | 
						|
	"google.golang.org/protobuf/proto"
 | 
						|
 | 
						|
	"github.com/opencontainers/runc/libcontainer/cgroups"
 | 
						|
	"github.com/opencontainers/runc/libcontainer/configs"
 | 
						|
	"github.com/opencontainers/runc/libcontainer/intelrdt"
 | 
						|
	"github.com/opencontainers/runc/libcontainer/system"
 | 
						|
	"github.com/opencontainers/runc/libcontainer/utils"
 | 
						|
)
 | 
						|
 | 
						|
const stdioFdCount = 3
 | 
						|
 | 
						|
type linuxContainer struct {
 | 
						|
	id                   string
 | 
						|
	root                 string
 | 
						|
	config               *configs.Config
 | 
						|
	cgroupManager        cgroups.Manager
 | 
						|
	intelRdtManager      intelrdt.Manager
 | 
						|
	initPath             string
 | 
						|
	initArgs             []string
 | 
						|
	initProcess          parentProcess
 | 
						|
	initProcessStartTime uint64
 | 
						|
	criuPath             string
 | 
						|
	newuidmapPath        string
 | 
						|
	newgidmapPath        string
 | 
						|
	m                    sync.Mutex
 | 
						|
	criuVersion          int
 | 
						|
	state                containerState
 | 
						|
	created              time.Time
 | 
						|
	fifo                 *os.File
 | 
						|
}
 | 
						|
 | 
						|
// State represents a running container's state
 | 
						|
type State struct {
 | 
						|
	BaseState
 | 
						|
 | 
						|
	// Platform specific fields below here
 | 
						|
 | 
						|
	// Specified if the container was started under the rootless mode.
 | 
						|
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
 | 
						|
	Rootless bool `json:"rootless"`
 | 
						|
 | 
						|
	// Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths
 | 
						|
	//
 | 
						|
	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
 | 
						|
	// to the cgroup for this subsystem.
 | 
						|
	//
 | 
						|
	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
 | 
						|
	CgroupPaths map[string]string `json:"cgroup_paths"`
 | 
						|
 | 
						|
	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
 | 
						|
	// with the value as the path.
 | 
						|
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
 | 
						|
 | 
						|
	// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
 | 
						|
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`
 | 
						|
 | 
						|
	// Intel RDT "resource control" filesystem path
 | 
						|
	IntelRdtPath string `json:"intel_rdt_path"`
 | 
						|
}
 | 
						|
 | 
						|
// Container is a libcontainer container object.
 | 
						|
//
 | 
						|
// Each container is thread-safe within the same process. Since a container can
 | 
						|
// be destroyed by a separate process, any function may return that the container
 | 
						|
// was not found.
 | 
						|
type Container interface {
 | 
						|
	BaseContainer
 | 
						|
 | 
						|
	// Methods below here are platform specific
 | 
						|
 | 
						|
	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
 | 
						|
	Checkpoint(criuOpts *CriuOpts) error
 | 
						|
 | 
						|
	// Restore restores the checkpointed container to a running state using the criu(8) utility.
 | 
						|
	Restore(process *Process, criuOpts *CriuOpts) error
 | 
						|
 | 
						|
	// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
 | 
						|
	// the execution of any user processes. Asynchronously, when the container finished being paused the
 | 
						|
	// state is changed to PAUSED.
 | 
						|
	// If the Container state is PAUSED, do nothing.
 | 
						|
	Pause() error
 | 
						|
 | 
						|
	// If the Container state is PAUSED, resumes the execution of any user processes in the
 | 
						|
	// Container before setting the Container state to RUNNING.
 | 
						|
	// If the Container state is RUNNING, do nothing.
 | 
						|
	Resume() error
 | 
						|
 | 
						|
	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
 | 
						|
	NotifyOOM() (<-chan struct{}, error)
 | 
						|
 | 
						|
	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
 | 
						|
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
 | 
						|
}
 | 
						|
 | 
						|
// ID returns the container's unique ID
 | 
						|
func (c *linuxContainer) ID() string {
 | 
						|
	return c.id
 | 
						|
}
 | 
						|
 | 
						|
// Config returns the container's configuration
 | 
						|
func (c *linuxContainer) Config() configs.Config {
 | 
						|
	return *c.config
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Status() (Status, error) {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	return c.currentStatus()
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) State() (*State, error) {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	return c.currentState()
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) OCIState() (*specs.State, error) {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	return c.currentOCIState()
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Processes() ([]int, error) {
 | 
						|
	var pids []int
 | 
						|
	status, err := c.currentStatus()
 | 
						|
	if err != nil {
 | 
						|
		return pids, err
 | 
						|
	}
 | 
						|
	// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
 | 
						|
	if status == Stopped && !c.cgroupManager.Exists() {
 | 
						|
		return pids, nil
 | 
						|
	}
 | 
						|
 | 
						|
	pids, err = c.cgroupManager.GetAllPids()
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("unable to get all container pids: %w", err)
 | 
						|
	}
 | 
						|
	return pids, nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Stats() (*Stats, error) {
 | 
						|
	var (
 | 
						|
		err   error
 | 
						|
		stats = &Stats{}
 | 
						|
	)
 | 
						|
	if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
 | 
						|
		return stats, fmt.Errorf("unable to get container cgroup stats: %w", err)
 | 
						|
	}
 | 
						|
	if c.intelRdtManager != nil {
 | 
						|
		if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
 | 
						|
			return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	for _, iface := range c.config.Networks {
 | 
						|
		switch iface.Type {
 | 
						|
		case "veth":
 | 
						|
			istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
 | 
						|
			if err != nil {
 | 
						|
				return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err)
 | 
						|
			}
 | 
						|
			stats.Interfaces = append(stats.Interfaces, istats)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return stats, nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Set(config configs.Config) error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	status, err := c.currentStatus()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if status == Stopped {
 | 
						|
		return ErrNotRunning
 | 
						|
	}
 | 
						|
	if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
 | 
						|
		// Set configs back
 | 
						|
		if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
 | 
						|
			logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
 | 
						|
		}
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if c.intelRdtManager != nil {
 | 
						|
		if err := c.intelRdtManager.Set(&config); err != nil {
 | 
						|
			// Set configs back
 | 
						|
			if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
 | 
						|
				logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
 | 
						|
			}
 | 
						|
			if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
 | 
						|
				logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
 | 
						|
			}
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	}
 | 
						|
	// After config setting succeed, update config and states
 | 
						|
	c.config = &config
 | 
						|
	_, err = c.updateState(nil)
 | 
						|
	return err
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Start(process *Process) error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	if c.config.Cgroups.Resources.SkipDevices {
 | 
						|
		return errors.New("can't start container with SkipDevices set")
 | 
						|
	}
 | 
						|
	if process.Init {
 | 
						|
		if err := c.createExecFifo(); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	}
 | 
						|
	if err := c.start(process); err != nil {
 | 
						|
		if process.Init {
 | 
						|
			c.deleteExecFifo()
 | 
						|
		}
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Run(process *Process) error {
 | 
						|
	if err := c.Start(process); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if process.Init {
 | 
						|
		return c.exec()
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Exec() error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	return c.exec()
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) exec() error {
 | 
						|
	path := filepath.Join(c.root, execFifoFilename)
 | 
						|
	pid := c.initProcess.pid()
 | 
						|
	blockingFifoOpenCh := awaitFifoOpen(path)
 | 
						|
	for {
 | 
						|
		select {
 | 
						|
		case result := <-blockingFifoOpenCh:
 | 
						|
			return handleFifoResult(result)
 | 
						|
 | 
						|
		case <-time.After(time.Millisecond * 100):
 | 
						|
			stat, err := system.Stat(pid)
 | 
						|
			if err != nil || stat.State == system.Zombie {
 | 
						|
				// could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
 | 
						|
				// see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
 | 
						|
				if err := handleFifoResult(fifoOpen(path, false)); err != nil {
 | 
						|
					return errors.New("container process is already dead")
 | 
						|
				}
 | 
						|
				return nil
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
func readFromExecFifo(execFifo io.Reader) error {
 | 
						|
	data, err := io.ReadAll(execFifo)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if len(data) <= 0 {
 | 
						|
		return errors.New("cannot start an already running container")
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func awaitFifoOpen(path string) <-chan openResult {
 | 
						|
	fifoOpened := make(chan openResult)
 | 
						|
	go func() {
 | 
						|
		result := fifoOpen(path, true)
 | 
						|
		fifoOpened <- result
 | 
						|
	}()
 | 
						|
	return fifoOpened
 | 
						|
}
 | 
						|
 | 
						|
func fifoOpen(path string, block bool) openResult {
 | 
						|
	flags := os.O_RDONLY
 | 
						|
	if !block {
 | 
						|
		flags |= unix.O_NONBLOCK
 | 
						|
	}
 | 
						|
	f, err := os.OpenFile(path, flags, 0)
 | 
						|
	if err != nil {
 | 
						|
		return openResult{err: fmt.Errorf("exec fifo: %w", err)}
 | 
						|
	}
 | 
						|
	return openResult{file: f}
 | 
						|
}
 | 
						|
 | 
						|
func handleFifoResult(result openResult) error {
 | 
						|
	if result.err != nil {
 | 
						|
		return result.err
 | 
						|
	}
 | 
						|
	f := result.file
 | 
						|
	defer f.Close()
 | 
						|
	if err := readFromExecFifo(f); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	return os.Remove(f.Name())
 | 
						|
}
 | 
						|
 | 
						|
type openResult struct {
 | 
						|
	file *os.File
 | 
						|
	err  error
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) start(process *Process) (retErr error) {
 | 
						|
	parent, err := c.newParentProcess(process)
 | 
						|
	if err != nil {
 | 
						|
		return fmt.Errorf("unable to create new parent process: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	logsDone := parent.forwardChildLogs()
 | 
						|
	if logsDone != nil {
 | 
						|
		defer func() {
 | 
						|
			// Wait for log forwarder to finish. This depends on
 | 
						|
			// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
 | 
						|
			err := <-logsDone
 | 
						|
			if err != nil && retErr == nil {
 | 
						|
				retErr = fmt.Errorf("unable to forward init logs: %w", err)
 | 
						|
			}
 | 
						|
		}()
 | 
						|
	}
 | 
						|
 | 
						|
	if err := parent.start(); err != nil {
 | 
						|
		return fmt.Errorf("unable to start container process: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	if process.Init {
 | 
						|
		c.fifo.Close()
 | 
						|
		if c.config.Hooks != nil {
 | 
						|
			s, err := c.currentOCIState()
 | 
						|
			if err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
 | 
						|
			if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
 | 
						|
				if err := ignoreTerminateErrors(parent.terminate()); err != nil {
 | 
						|
					logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
 | 
						|
				}
 | 
						|
				return err
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Signal(s os.Signal, all bool) error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	status, err := c.currentStatus()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if all {
 | 
						|
		// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
 | 
						|
		if status == Stopped && !c.cgroupManager.Exists() {
 | 
						|
			return nil
 | 
						|
		}
 | 
						|
		return signalAllProcesses(c.cgroupManager, s)
 | 
						|
	}
 | 
						|
	// to avoid a PID reuse attack
 | 
						|
	if status == Running || status == Created || status == Paused {
 | 
						|
		if err := c.initProcess.signal(s); err != nil {
 | 
						|
			return fmt.Errorf("unable to signal init: %w", err)
 | 
						|
		}
 | 
						|
		if status == Paused {
 | 
						|
			// For cgroup v1, killing a process in a frozen cgroup
 | 
						|
			// does nothing until it's thawed. Only thaw the cgroup
 | 
						|
			// for SIGKILL.
 | 
						|
			if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
 | 
						|
				_ = c.cgroupManager.Freeze(configs.Thawed)
 | 
						|
			}
 | 
						|
		}
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	return ErrNotRunning
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) createExecFifo() error {
 | 
						|
	rootuid, err := c.Config().HostRootUID()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	rootgid, err := c.Config().HostRootGID()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	fifoName := filepath.Join(c.root, execFifoFilename)
 | 
						|
	if _, err := os.Stat(fifoName); err == nil {
 | 
						|
		return fmt.Errorf("exec fifo %s already exists", fifoName)
 | 
						|
	}
 | 
						|
	oldMask := unix.Umask(0o000)
 | 
						|
	if err := unix.Mkfifo(fifoName, 0o622); err != nil {
 | 
						|
		unix.Umask(oldMask)
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	unix.Umask(oldMask)
 | 
						|
	return os.Chown(fifoName, rootuid, rootgid)
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) deleteExecFifo() {
 | 
						|
	fifoName := filepath.Join(c.root, execFifoFilename)
 | 
						|
	os.Remove(fifoName)
 | 
						|
}
 | 
						|
 | 
						|
// includeExecFifo opens the container's execfifo as a pathfd, so that the
 | 
						|
// container cannot access the statedir (and the FIFO itself remains
 | 
						|
// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
 | 
						|
// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
 | 
						|
func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
 | 
						|
	fifoName := filepath.Join(c.root, execFifoFilename)
 | 
						|
	fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	c.fifo = fifo
 | 
						|
 | 
						|
	cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
 | 
						|
	cmd.Env = append(cmd.Env,
 | 
						|
		"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
 | 
						|
	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("unable to create init pipe: %w", err)
 | 
						|
	}
 | 
						|
	messageSockPair := filePair{parentInitPipe, childInitPipe}
 | 
						|
 | 
						|
	parentLogPipe, childLogPipe, err := os.Pipe()
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("unable to create log pipe: %w", err)
 | 
						|
	}
 | 
						|
	logFilePair := filePair{parentLogPipe, childLogPipe}
 | 
						|
 | 
						|
	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
 | 
						|
	if !p.Init {
 | 
						|
		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
 | 
						|
	}
 | 
						|
 | 
						|
	// We only set up fifoFd if we're not doing a `runc exec`. The historic
 | 
						|
	// reason for this is that previously we would pass a dirfd that allowed
 | 
						|
	// for container rootfs escape (and not doing it in `runc exec` avoided
 | 
						|
	// that problem), but we no longer do that. However, there's no need to do
 | 
						|
	// this for `runc exec` so we just keep it this way to be safe.
 | 
						|
	if err := c.includeExecFifo(cmd); err != nil {
 | 
						|
		return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
 | 
						|
	}
 | 
						|
	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
 | 
						|
	cmd := exec.Command(c.initPath, c.initArgs[1:]...)
 | 
						|
	cmd.Args[0] = c.initArgs[0]
 | 
						|
	cmd.Stdin = p.Stdin
 | 
						|
	cmd.Stdout = p.Stdout
 | 
						|
	cmd.Stderr = p.Stderr
 | 
						|
	cmd.Dir = c.config.Rootfs
 | 
						|
	if cmd.SysProcAttr == nil {
 | 
						|
		cmd.SysProcAttr = &unix.SysProcAttr{}
 | 
						|
	}
 | 
						|
	cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
 | 
						|
	cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
 | 
						|
	if p.ConsoleSocket != nil {
 | 
						|
		cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
 | 
						|
		cmd.Env = append(cmd.Env,
 | 
						|
			"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
 | 
						|
		)
 | 
						|
	}
 | 
						|
	cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
 | 
						|
	cmd.Env = append(cmd.Env,
 | 
						|
		"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
 | 
						|
		"_LIBCONTAINER_STATEDIR="+c.root,
 | 
						|
	)
 | 
						|
 | 
						|
	cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
 | 
						|
	cmd.Env = append(cmd.Env,
 | 
						|
		"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
 | 
						|
		"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
 | 
						|
	)
 | 
						|
 | 
						|
	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
 | 
						|
	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
 | 
						|
	// even with the parent still running.
 | 
						|
	if c.config.ParentDeathSignal > 0 {
 | 
						|
		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
 | 
						|
	}
 | 
						|
	return cmd
 | 
						|
}
 | 
						|
 | 
						|
// shouldSendMountSources says whether the child process must setup bind mounts with
 | 
						|
// the source pre-opened (O_PATH) in the host user namespace.
 | 
						|
// See https://github.com/opencontainers/runc/issues/2484
 | 
						|
func (c *linuxContainer) shouldSendMountSources() bool {
 | 
						|
	// Passing the mount sources via SCM_RIGHTS is only necessary when
 | 
						|
	// both userns and mntns are active.
 | 
						|
	if !c.config.Namespaces.Contains(configs.NEWUSER) ||
 | 
						|
		!c.config.Namespaces.Contains(configs.NEWNS) {
 | 
						|
		return false
 | 
						|
	}
 | 
						|
 | 
						|
	// nsexec.c send_mountsources() requires setns(mntns) capabilities
 | 
						|
	// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
 | 
						|
	if c.config.RootlessEUID {
 | 
						|
		return false
 | 
						|
	}
 | 
						|
 | 
						|
	// We need to send sources if there are bind-mounts.
 | 
						|
	for _, m := range c.config.Mounts {
 | 
						|
		if m.IsBind() {
 | 
						|
			return true
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return false
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
 | 
						|
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
 | 
						|
	nsMaps := make(map[configs.NamespaceType]string)
 | 
						|
	for _, ns := range c.config.Namespaces {
 | 
						|
		if ns.Path != "" {
 | 
						|
			nsMaps[ns.Type] = ns.Path
 | 
						|
		}
 | 
						|
	}
 | 
						|
	_, sharePidns := nsMaps[configs.NEWPID]
 | 
						|
	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
 | 
						|
	if c.shouldSendMountSources() {
 | 
						|
		// Elements on this slice will be paired with mounts (see StartInitialization() and
 | 
						|
		// prepareRootfs()). This slice MUST have the same size as c.config.Mounts.
 | 
						|
		mountFds := make([]int, len(c.config.Mounts))
 | 
						|
		for i, m := range c.config.Mounts {
 | 
						|
			if !m.IsBind() {
 | 
						|
				// Non bind-mounts do not use an fd.
 | 
						|
				mountFds[i] = -1
 | 
						|
				continue
 | 
						|
			}
 | 
						|
 | 
						|
			// The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
 | 
						|
			// to allocate a fd so that we know the number to pass in the environment variable. The fd
 | 
						|
			// must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
 | 
						|
			// lifecycle of that fd is already taken care of.
 | 
						|
			cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
 | 
						|
			mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
 | 
						|
		}
 | 
						|
 | 
						|
		mountFdsJson, err := json.Marshal(mountFds)
 | 
						|
		if err != nil {
 | 
						|
			return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err)
 | 
						|
		}
 | 
						|
 | 
						|
		cmd.Env = append(cmd.Env,
 | 
						|
			"_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson),
 | 
						|
		)
 | 
						|
	}
 | 
						|
 | 
						|
	init := &initProcess{
 | 
						|
		cmd:             cmd,
 | 
						|
		messageSockPair: messageSockPair,
 | 
						|
		logFilePair:     logFilePair,
 | 
						|
		manager:         c.cgroupManager,
 | 
						|
		intelRdtManager: c.intelRdtManager,
 | 
						|
		config:          c.newInitConfig(p),
 | 
						|
		container:       c,
 | 
						|
		process:         p,
 | 
						|
		bootstrapData:   data,
 | 
						|
		sharePidns:      sharePidns,
 | 
						|
	}
 | 
						|
	c.initProcess = init
 | 
						|
	return init, nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
 | 
						|
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
 | 
						|
	state, err := c.currentState()
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("unable to get container state: %w", err)
 | 
						|
	}
 | 
						|
	// for setns process, we don't have to set cloneflags as the process namespaces
 | 
						|
	// will only be set via setns syscall
 | 
						|
	data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	proc := &setnsProcess{
 | 
						|
		cmd:             cmd,
 | 
						|
		cgroupPaths:     state.CgroupPaths,
 | 
						|
		rootlessCgroups: c.config.RootlessCgroups,
 | 
						|
		intelRdtPath:    state.IntelRdtPath,
 | 
						|
		messageSockPair: messageSockPair,
 | 
						|
		logFilePair:     logFilePair,
 | 
						|
		manager:         c.cgroupManager,
 | 
						|
		config:          c.newInitConfig(p),
 | 
						|
		process:         p,
 | 
						|
		bootstrapData:   data,
 | 
						|
		initProcessPid:  state.InitProcessPid,
 | 
						|
	}
 | 
						|
	if len(p.SubCgroupPaths) > 0 {
 | 
						|
		if add, ok := p.SubCgroupPaths[""]; ok {
 | 
						|
			// cgroup v1: using the same path for all controllers.
 | 
						|
			// cgroup v2: the only possible way.
 | 
						|
			for k := range proc.cgroupPaths {
 | 
						|
				subPath := path.Join(proc.cgroupPaths[k], add)
 | 
						|
				if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
 | 
						|
					return nil, fmt.Errorf("%s is not a sub cgroup path", add)
 | 
						|
				}
 | 
						|
				proc.cgroupPaths[k] = subPath
 | 
						|
			}
 | 
						|
			// cgroup v2: do not try to join init process's cgroup
 | 
						|
			// as a fallback (see (*setnsProcess).start).
 | 
						|
			proc.initProcessPid = 0
 | 
						|
		} else {
 | 
						|
			// Per-controller paths.
 | 
						|
			for ctrl, add := range p.SubCgroupPaths {
 | 
						|
				if val, ok := proc.cgroupPaths[ctrl]; ok {
 | 
						|
					subPath := path.Join(val, add)
 | 
						|
					if !strings.HasPrefix(subPath, val) {
 | 
						|
						return nil, fmt.Errorf("%s is not a sub cgroup path", add)
 | 
						|
					}
 | 
						|
					proc.cgroupPaths[ctrl] = subPath
 | 
						|
				} else {
 | 
						|
					return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
 | 
						|
				}
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return proc, nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
 | 
						|
	cfg := &initConfig{
 | 
						|
		Config:           c.config,
 | 
						|
		Args:             process.Args,
 | 
						|
		Env:              process.Env,
 | 
						|
		User:             process.User,
 | 
						|
		AdditionalGroups: process.AdditionalGroups,
 | 
						|
		Cwd:              process.Cwd,
 | 
						|
		Capabilities:     process.Capabilities,
 | 
						|
		PassedFilesCount: len(process.ExtraFiles),
 | 
						|
		ContainerId:      c.ID(),
 | 
						|
		NoNewPrivileges:  c.config.NoNewPrivileges,
 | 
						|
		RootlessEUID:     c.config.RootlessEUID,
 | 
						|
		RootlessCgroups:  c.config.RootlessCgroups,
 | 
						|
		AppArmorProfile:  c.config.AppArmorProfile,
 | 
						|
		ProcessLabel:     c.config.ProcessLabel,
 | 
						|
		Rlimits:          c.config.Rlimits,
 | 
						|
		CreateConsole:    process.ConsoleSocket != nil,
 | 
						|
		ConsoleWidth:     process.ConsoleWidth,
 | 
						|
		ConsoleHeight:    process.ConsoleHeight,
 | 
						|
	}
 | 
						|
	if process.NoNewPrivileges != nil {
 | 
						|
		cfg.NoNewPrivileges = *process.NoNewPrivileges
 | 
						|
	}
 | 
						|
	if process.AppArmorProfile != "" {
 | 
						|
		cfg.AppArmorProfile = process.AppArmorProfile
 | 
						|
	}
 | 
						|
	if process.Label != "" {
 | 
						|
		cfg.ProcessLabel = process.Label
 | 
						|
	}
 | 
						|
	if len(process.Rlimits) > 0 {
 | 
						|
		cfg.Rlimits = process.Rlimits
 | 
						|
	}
 | 
						|
	if cgroups.IsCgroup2UnifiedMode() {
 | 
						|
		cfg.Cgroup2Path = c.cgroupManager.Path("")
 | 
						|
	}
 | 
						|
 | 
						|
	return cfg
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Destroy() error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	return c.state.destroy()
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Pause() error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	status, err := c.currentStatus()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	switch status {
 | 
						|
	case Running, Created:
 | 
						|
		if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		return c.state.transition(&pausedState{
 | 
						|
			c: c,
 | 
						|
		})
 | 
						|
	}
 | 
						|
	return ErrNotRunning
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Resume() error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
	status, err := c.currentStatus()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if status != Paused {
 | 
						|
		return ErrNotPaused
 | 
						|
	}
 | 
						|
	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	return c.state.transition(&runningState{
 | 
						|
		c: c,
 | 
						|
	})
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
 | 
						|
	// XXX(cyphar): This requires cgroups.
 | 
						|
	if c.config.RootlessCgroups {
 | 
						|
		logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
 | 
						|
	}
 | 
						|
	path := c.cgroupManager.Path("memory")
 | 
						|
	if cgroups.IsCgroup2UnifiedMode() {
 | 
						|
		return notifyOnOOMV2(path)
 | 
						|
	}
 | 
						|
	return notifyOnOOM(path)
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
 | 
						|
	// XXX(cyphar): This requires cgroups.
 | 
						|
	if c.config.RootlessCgroups {
 | 
						|
		logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
 | 
						|
	}
 | 
						|
	return notifyMemoryPressure(c.cgroupManager.Path("memory"), level)
 | 
						|
}
 | 
						|
 | 
						|
var criuFeatures *criurpc.CriuFeatures
 | 
						|
 | 
						|
func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
 | 
						|
	t := criurpc.CriuReqType_FEATURE_CHECK
 | 
						|
 | 
						|
	// make sure the features we are looking for are really not from
 | 
						|
	// some previous check
 | 
						|
	criuFeatures = nil
 | 
						|
 | 
						|
	req := &criurpc.CriuReq{
 | 
						|
		Type: &t,
 | 
						|
		// Theoretically this should not be necessary but CRIU
 | 
						|
		// segfaults if Opts is empty.
 | 
						|
		// Fixed in CRIU  2.12
 | 
						|
		Opts:     rpcOpts,
 | 
						|
		Features: criuFeat,
 | 
						|
	}
 | 
						|
 | 
						|
	err := c.criuSwrk(nil, req, criuOpts, nil)
 | 
						|
	if err != nil {
 | 
						|
		logrus.Debugf("%s", err)
 | 
						|
		return errors.New("CRIU feature check failed")
 | 
						|
	}
 | 
						|
 | 
						|
	missingFeatures := false
 | 
						|
 | 
						|
	// The outer if checks if the fields actually exist
 | 
						|
	if (criuFeat.MemTrack != nil) &&
 | 
						|
		(criuFeatures.MemTrack != nil) {
 | 
						|
		// The inner if checks if they are set to true
 | 
						|
		if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
 | 
						|
			missingFeatures = true
 | 
						|
			logrus.Debugf("CRIU does not support MemTrack")
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	// This needs to be repeated for every new feature check.
 | 
						|
	// Is there a way to put this in a function. Reflection?
 | 
						|
	if (criuFeat.LazyPages != nil) &&
 | 
						|
		(criuFeatures.LazyPages != nil) {
 | 
						|
		if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
 | 
						|
			missingFeatures = true
 | 
						|
			logrus.Debugf("CRIU does not support LazyPages")
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	if missingFeatures {
 | 
						|
		return errors.New("CRIU is missing features")
 | 
						|
	}
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func compareCriuVersion(criuVersion int, minVersion int) error {
 | 
						|
	// simple function to perform the actual version compare
 | 
						|
	if criuVersion < minVersion {
 | 
						|
		return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
 | 
						|
	}
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// checkCriuVersion checks Criu version greater than or equal to minVersion
 | 
						|
func (c *linuxContainer) checkCriuVersion(minVersion int) error {
 | 
						|
	// If the version of criu has already been determined there is no need
 | 
						|
	// to ask criu for the version again. Use the value from c.criuVersion.
 | 
						|
	if c.criuVersion != 0 {
 | 
						|
		return compareCriuVersion(c.criuVersion, minVersion)
 | 
						|
	}
 | 
						|
 | 
						|
	criu := criu.MakeCriu()
 | 
						|
	criu.SetCriuPath(c.criuPath)
 | 
						|
	var err error
 | 
						|
	c.criuVersion, err = criu.GetCriuVersion()
 | 
						|
	if err != nil {
 | 
						|
		return fmt.Errorf("CRIU version check failed: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	return compareCriuVersion(c.criuVersion, minVersion)
 | 
						|
}
 | 
						|
 | 
						|
const descriptorsFilename = "descriptors.json"
 | 
						|
 | 
						|
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
 | 
						|
	mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
 | 
						|
	if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
 | 
						|
		mountDest = dest[len(c.config.Rootfs):]
 | 
						|
	}
 | 
						|
	extMnt := &criurpc.ExtMountMap{
 | 
						|
		Key: proto.String(mountDest),
 | 
						|
		Val: proto.String(mountDest),
 | 
						|
	}
 | 
						|
	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
 | 
						|
	for _, path := range c.config.MaskPaths {
 | 
						|
		fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
 | 
						|
		if err != nil {
 | 
						|
			if os.IsNotExist(err) {
 | 
						|
				continue
 | 
						|
			}
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		if fi.IsDir() {
 | 
						|
			continue
 | 
						|
		}
 | 
						|
 | 
						|
		extMnt := &criurpc.ExtMountMap{
 | 
						|
			Key: proto.String(path),
 | 
						|
			Val: proto.String("/dev/null"),
 | 
						|
		}
 | 
						|
		req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
 | 
						|
	// CRIU will evaluate a configuration starting with release 3.11.
 | 
						|
	// Settings in the configuration file will overwrite RPC settings.
 | 
						|
	// Look for annotations. The annotation 'org.criu.config'
 | 
						|
	// specifies if CRIU should use a different, container specific
 | 
						|
	// configuration file.
 | 
						|
	_, annotations := utils.Annotations(c.config.Labels)
 | 
						|
	configFile, exists := annotations["org.criu.config"]
 | 
						|
	if exists {
 | 
						|
		// If the annotation 'org.criu.config' exists and is set
 | 
						|
		// to a non-empty string, tell CRIU to use that as a
 | 
						|
		// configuration file. If the file does not exist, CRIU
 | 
						|
		// will just ignore it.
 | 
						|
		if configFile != "" {
 | 
						|
			rpcOpts.ConfigFile = proto.String(configFile)
 | 
						|
		}
 | 
						|
		// If 'org.criu.config' exists and is set to an empty
 | 
						|
		// string, a runc specific CRIU configuration file will
 | 
						|
		// be not set at all.
 | 
						|
	} else {
 | 
						|
		// If the mentioned annotation has not been found, specify
 | 
						|
		// a default CRIU configuration file.
 | 
						|
		rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
 | 
						|
	var minVersion int
 | 
						|
	switch t {
 | 
						|
	case configs.NEWNET:
 | 
						|
		// CRIU supports different external namespace with different released CRIU versions.
 | 
						|
		// For network namespaces to work we need at least criu 3.11.0 => 31100.
 | 
						|
		minVersion = 31100
 | 
						|
	case configs.NEWPID:
 | 
						|
		// For PID namespaces criu 31500 is needed.
 | 
						|
		minVersion = 31500
 | 
						|
	default:
 | 
						|
		return false
 | 
						|
	}
 | 
						|
	return c.checkCriuVersion(minVersion) == nil
 | 
						|
}
 | 
						|
 | 
						|
func criuNsToKey(t configs.NamespaceType) string {
 | 
						|
	return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
 | 
						|
	if !c.criuSupportsExtNS(t) {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
 | 
						|
	nsPath := c.config.Namespaces.PathOf(t)
 | 
						|
	if nsPath == "" {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	// CRIU expects the information about an external namespace
 | 
						|
	// like this: --external <TYPE>[<inode>]:<key>
 | 
						|
	// This <key> is always 'extRoot<TYPE>NS'.
 | 
						|
	var ns unix.Stat_t
 | 
						|
	if err := unix.Stat(nsPath, &ns); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
 | 
						|
	rpcOpts.External = append(rpcOpts.External, criuExternal)
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
 | 
						|
	for _, ns := range c.config.Namespaces {
 | 
						|
		switch ns.Type {
 | 
						|
		case configs.NEWNET, configs.NEWPID:
 | 
						|
			// If the container is running in a network or PID namespace and has
 | 
						|
			// a path to the network or PID namespace configured, we will dump
 | 
						|
			// that network or PID namespace as an external namespace and we
 | 
						|
			// will expect that the namespace exists during restore.
 | 
						|
			// This basically means that CRIU will ignore the namespace
 | 
						|
			// and expect it to be setup correctly.
 | 
						|
			if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
		default:
 | 
						|
			// For all other namespaces except NET and PID CRIU has
 | 
						|
			// a simpler way of joining the existing namespace if set
 | 
						|
			nsPath := c.config.Namespaces.PathOf(ns.Type)
 | 
						|
			if nsPath == "" {
 | 
						|
				continue
 | 
						|
			}
 | 
						|
			if ns.Type == configs.NEWCGROUP {
 | 
						|
				// CRIU has no code to handle NEWCGROUP
 | 
						|
				return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
 | 
						|
			}
 | 
						|
			// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
 | 
						|
 | 
						|
			// CRIU will issue a warning for NEWUSER:
 | 
						|
			// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
 | 
						|
			rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
 | 
						|
				Ns:     proto.String(configs.NsName(ns.Type)),
 | 
						|
				NsFile: proto.String(nsPath),
 | 
						|
			})
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
 | 
						|
	if !c.criuSupportsExtNS(t) {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
 | 
						|
	nsPath := c.config.Namespaces.PathOf(t)
 | 
						|
	if nsPath == "" {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	// CRIU wants the information about an existing namespace
 | 
						|
	// like this: --inherit-fd fd[<fd>]:<key>
 | 
						|
	// The <key> needs to be the same as during checkpointing.
 | 
						|
	// We are always using 'extRoot<TYPE>NS' as the key in this.
 | 
						|
	nsFd, err := os.Open(nsPath)
 | 
						|
	if err != nil {
 | 
						|
		logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
 | 
						|
		return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
 | 
						|
	}
 | 
						|
	inheritFd := &criurpc.InheritFd{
 | 
						|
		Key: proto.String(criuNsToKey(t)),
 | 
						|
		// The offset of four is necessary because 0, 1, 2 and 3 are
 | 
						|
		// already used by stdin, stdout, stderr, 'criu swrk' socket.
 | 
						|
		Fd: proto.Int32(int32(4 + len(*extraFiles))),
 | 
						|
	}
 | 
						|
	rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
 | 
						|
	// All open FDs need to be transferred to CRIU via extraFiles
 | 
						|
	*extraFiles = append(*extraFiles, nsFd)
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
 | 
						|
	// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
 | 
						|
	// (CLI prints a warning)
 | 
						|
	// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
 | 
						|
	//               support for doing unprivileged dumps, but the setup of
 | 
						|
	//               rootless containers might make this complicated.
 | 
						|
 | 
						|
	// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
 | 
						|
	if err := c.checkCriuVersion(30000); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	if criuOpts.ImagesDirectory == "" {
 | 
						|
		return errors.New("invalid directory to save checkpoint")
 | 
						|
	}
 | 
						|
 | 
						|
	// Since a container can be C/R'ed multiple times,
 | 
						|
	// the checkpoint directory may already exist.
 | 
						|
	if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	imageDir, err := os.Open(criuOpts.ImagesDirectory)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	defer imageDir.Close()
 | 
						|
 | 
						|
	rpcOpts := criurpc.CriuOpts{
 | 
						|
		ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
 | 
						|
		LogLevel:        proto.Int32(4),
 | 
						|
		LogFile:         proto.String("dump.log"),
 | 
						|
		Root:            proto.String(c.config.Rootfs),
 | 
						|
		ManageCgroups:   proto.Bool(true),
 | 
						|
		NotifyScripts:   proto.Bool(true),
 | 
						|
		Pid:             proto.Int32(int32(c.initProcess.pid())),
 | 
						|
		ShellJob:        proto.Bool(criuOpts.ShellJob),
 | 
						|
		LeaveRunning:    proto.Bool(criuOpts.LeaveRunning),
 | 
						|
		TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
 | 
						|
		ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
 | 
						|
		FileLocks:       proto.Bool(criuOpts.FileLocks),
 | 
						|
		EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
 | 
						|
		OrphanPtsMaster: proto.Bool(true),
 | 
						|
		AutoDedup:       proto.Bool(criuOpts.AutoDedup),
 | 
						|
		LazyPages:       proto.Bool(criuOpts.LazyPages),
 | 
						|
	}
 | 
						|
 | 
						|
	// if criuOpts.WorkDirectory is not set, criu default is used.
 | 
						|
	if criuOpts.WorkDirectory != "" {
 | 
						|
		if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		workDir, err := os.Open(criuOpts.WorkDirectory)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		defer workDir.Close()
 | 
						|
		rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
 | 
						|
	}
 | 
						|
 | 
						|
	c.handleCriuConfigurationFile(&rpcOpts)
 | 
						|
 | 
						|
	// If the container is running in a network namespace and has
 | 
						|
	// a path to the network namespace configured, we will dump
 | 
						|
	// that network namespace as an external namespace and we
 | 
						|
	// will expect that the namespace exists during restore.
 | 
						|
	// This basically means that CRIU will ignore the namespace
 | 
						|
	// and expect to be setup correctly.
 | 
						|
	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	// Same for possible external PID namespaces
 | 
						|
	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	// CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
 | 
						|
	// is not set, CRIU uses ptrace() to pause the processes.
 | 
						|
	// Note cgroup v2 freezer is only supported since CRIU release 3.14.
 | 
						|
	if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil {
 | 
						|
		if fcg := c.cgroupManager.Path("freezer"); fcg != "" {
 | 
						|
			rpcOpts.FreezeCgroup = proto.String(fcg)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	// append optional criu opts, e.g., page-server and port
 | 
						|
	if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
 | 
						|
		rpcOpts.Ps = &criurpc.CriuPageServerInfo{
 | 
						|
			Address: proto.String(criuOpts.PageServer.Address),
 | 
						|
			Port:    proto.Int32(criuOpts.PageServer.Port),
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	// pre-dump may need parentImage param to complete iterative migration
 | 
						|
	if criuOpts.ParentImage != "" {
 | 
						|
		rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
 | 
						|
		rpcOpts.TrackMem = proto.Bool(true)
 | 
						|
	}
 | 
						|
 | 
						|
	// append optional manage cgroups mode
 | 
						|
	if criuOpts.ManageCgroupsMode != 0 {
 | 
						|
		mode := criuOpts.ManageCgroupsMode
 | 
						|
		rpcOpts.ManageCgroupsMode = &mode
 | 
						|
	}
 | 
						|
 | 
						|
	var t criurpc.CriuReqType
 | 
						|
	if criuOpts.PreDump {
 | 
						|
		feat := criurpc.CriuFeatures{
 | 
						|
			MemTrack: proto.Bool(true),
 | 
						|
		}
 | 
						|
 | 
						|
		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
 | 
						|
		t = criurpc.CriuReqType_PRE_DUMP
 | 
						|
	} else {
 | 
						|
		t = criurpc.CriuReqType_DUMP
 | 
						|
	}
 | 
						|
 | 
						|
	if criuOpts.LazyPages {
 | 
						|
		// lazy migration requested; check if criu supports it
 | 
						|
		feat := criurpc.CriuFeatures{
 | 
						|
			LazyPages: proto.Bool(true),
 | 
						|
		}
 | 
						|
		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
 | 
						|
		if fd := criuOpts.StatusFd; fd != -1 {
 | 
						|
			// check that the FD is valid
 | 
						|
			flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
 | 
						|
			if err != nil {
 | 
						|
				return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
 | 
						|
			}
 | 
						|
			// and writable
 | 
						|
			if flags&unix.O_WRONLY == 0 {
 | 
						|
				return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
 | 
						|
			}
 | 
						|
 | 
						|
			if c.checkCriuVersion(31500) != nil {
 | 
						|
				// For criu 3.15+, use notifications (see case "status-ready"
 | 
						|
				// in criuNotifications). Otherwise, rely on criu status fd.
 | 
						|
				rpcOpts.StatusFd = proto.Int32(int32(fd))
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	req := &criurpc.CriuReq{
 | 
						|
		Type: &t,
 | 
						|
		Opts: &rpcOpts,
 | 
						|
	}
 | 
						|
 | 
						|
	// no need to dump all this in pre-dump
 | 
						|
	if !criuOpts.PreDump {
 | 
						|
		hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
 | 
						|
		for _, m := range c.config.Mounts {
 | 
						|
			switch m.Device {
 | 
						|
			case "bind":
 | 
						|
				c.addCriuDumpMount(req, m)
 | 
						|
			case "cgroup":
 | 
						|
				if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
 | 
						|
					// real mount(s)
 | 
						|
					continue
 | 
						|
				}
 | 
						|
				// a set of "external" bind mounts
 | 
						|
				binds, err := getCgroupMounts(m)
 | 
						|
				if err != nil {
 | 
						|
					return err
 | 
						|
				}
 | 
						|
				for _, b := range binds {
 | 
						|
					c.addCriuDumpMount(req, b)
 | 
						|
				}
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		if err := c.addMaskPaths(req); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
 | 
						|
		for _, node := range c.config.Devices {
 | 
						|
			m := &configs.Mount{Destination: node.Path, Source: node.Path}
 | 
						|
			c.addCriuDumpMount(req, m)
 | 
						|
		}
 | 
						|
 | 
						|
		// Write the FD info to a file in the image directory
 | 
						|
		fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
 | 
						|
		err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	err = c.criuSwrk(nil, req, criuOpts, nil)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
 | 
						|
	mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
 | 
						|
	if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
 | 
						|
		mountDest = dest[len(c.config.Rootfs):]
 | 
						|
	}
 | 
						|
	extMnt := &criurpc.ExtMountMap{
 | 
						|
		Key: proto.String(mountDest),
 | 
						|
		Val: proto.String(m.Source),
 | 
						|
	}
 | 
						|
	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
 | 
						|
	for _, iface := range c.config.Networks {
 | 
						|
		switch iface.Type {
 | 
						|
		case "veth":
 | 
						|
			veth := new(criurpc.CriuVethPair)
 | 
						|
			veth.IfOut = proto.String(iface.HostInterfaceName)
 | 
						|
			veth.IfIn = proto.String(iface.Name)
 | 
						|
			req.Opts.Veths = append(req.Opts.Veths, veth)
 | 
						|
		case "loopback":
 | 
						|
			// Do nothing
 | 
						|
		}
 | 
						|
	}
 | 
						|
	for _, i := range criuOpts.VethPairs {
 | 
						|
		veth := new(criurpc.CriuVethPair)
 | 
						|
		veth.IfOut = proto.String(i.HostInterfaceName)
 | 
						|
		veth.IfIn = proto.String(i.ContainerInterfaceName)
 | 
						|
		req.Opts.Veths = append(req.Opts.Veths, veth)
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// makeCriuRestoreMountpoints makes the actual mountpoints for the
 | 
						|
// restore using CRIU. This function is inspired from the code in
 | 
						|
// rootfs_linux.go
 | 
						|
func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
 | 
						|
	switch m.Device {
 | 
						|
	case "cgroup":
 | 
						|
		// No mount point(s) need to be created:
 | 
						|
		//
 | 
						|
		// * for v1, mount points are saved by CRIU because
 | 
						|
		//   /sys/fs/cgroup is a tmpfs mount
 | 
						|
		//
 | 
						|
		// * for v2, /sys/fs/cgroup is a real mount, but
 | 
						|
		//   the mountpoint appears as soon as /sys is mounted
 | 
						|
		return nil
 | 
						|
	case "bind":
 | 
						|
		// The prepareBindMount() function checks if source
 | 
						|
		// exists. So it cannot be used for other filesystem types.
 | 
						|
		// TODO: pass something else than nil? Not sure if criu is
 | 
						|
		// impacted by issue #2484
 | 
						|
		if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	default:
 | 
						|
		// for all other filesystems just create the mountpoints
 | 
						|
		dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		if err := os.MkdirAll(dest, 0o755); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// isPathInPrefixList is a small function for CRIU restore to make sure
 | 
						|
// mountpoints, which are on a tmpfs, are not created in the roofs
 | 
						|
func isPathInPrefixList(path string, prefix []string) bool {
 | 
						|
	for _, p := range prefix {
 | 
						|
		if strings.HasPrefix(path, p+"/") {
 | 
						|
			return true
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return false
 | 
						|
}
 | 
						|
 | 
						|
// prepareCriuRestoreMounts tries to set up the rootfs of the
 | 
						|
// container to be restored in the same way runc does it for
 | 
						|
// initial container creation. Even for a read-only rootfs container
 | 
						|
// runc modifies the rootfs to add mountpoints which do not exist.
 | 
						|
// This function also creates missing mountpoints as long as they
 | 
						|
// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
 | 
						|
func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
 | 
						|
	// First get a list of a all tmpfs mounts
 | 
						|
	tmpfs := []string{}
 | 
						|
	for _, m := range mounts {
 | 
						|
		switch m.Device {
 | 
						|
		case "tmpfs":
 | 
						|
			tmpfs = append(tmpfs, m.Destination)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	// Now go through all mounts and create the mountpoints
 | 
						|
	// if the mountpoints are not on a tmpfs, as CRIU will
 | 
						|
	// restore the complete tmpfs content from its checkpoint.
 | 
						|
	umounts := []string{}
 | 
						|
	defer func() {
 | 
						|
		for _, u := range umounts {
 | 
						|
			_ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
 | 
						|
				if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
 | 
						|
					if e != unix.EINVAL { //nolint:errorlint // unix errors are bare
 | 
						|
						// Ignore EINVAL as it means 'target is not a mount point.'
 | 
						|
						// It probably has already been unmounted.
 | 
						|
						logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
 | 
						|
					}
 | 
						|
				}
 | 
						|
				return nil
 | 
						|
			})
 | 
						|
		}
 | 
						|
	}()
 | 
						|
	for _, m := range mounts {
 | 
						|
		if !isPathInPrefixList(m.Destination, tmpfs) {
 | 
						|
			if err := c.makeCriuRestoreMountpoints(m); err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
			// If the mount point is a bind mount, we need to mount
 | 
						|
			// it now so that runc can create the necessary mount
 | 
						|
			// points for mounts in bind mounts.
 | 
						|
			// This also happens during initial container creation.
 | 
						|
			// Without this CRIU restore will fail
 | 
						|
			// See: https://github.com/opencontainers/runc/issues/2748
 | 
						|
			// It is also not necessary to order the mount points
 | 
						|
			// because during initial container creation mounts are
 | 
						|
			// set up in the order they are configured.
 | 
						|
			if m.Device == "bind" {
 | 
						|
				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error {
 | 
						|
					if err := mount(m.Source, m.Destination, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
 | 
						|
						return err
 | 
						|
					}
 | 
						|
					return nil
 | 
						|
				}); err != nil {
 | 
						|
					return err
 | 
						|
				}
 | 
						|
				umounts = append(umounts, m.Destination)
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 | 
						|
	c.m.Lock()
 | 
						|
	defer c.m.Unlock()
 | 
						|
 | 
						|
	var extraFiles []*os.File
 | 
						|
 | 
						|
	// Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
 | 
						|
	// (CLI prints a warning)
 | 
						|
	// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
 | 
						|
	//               support for unprivileged restore at the moment.
 | 
						|
 | 
						|
	// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
 | 
						|
	if err := c.checkCriuVersion(30000); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if criuOpts.ImagesDirectory == "" {
 | 
						|
		return errors.New("invalid directory to restore checkpoint")
 | 
						|
	}
 | 
						|
	imageDir, err := os.Open(criuOpts.ImagesDirectory)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	defer imageDir.Close()
 | 
						|
	// CRIU has a few requirements for a root directory:
 | 
						|
	// * it must be a mount point
 | 
						|
	// * its parent must not be overmounted
 | 
						|
	// c.config.Rootfs is bind-mounted to a temporary directory
 | 
						|
	// to satisfy these requirements.
 | 
						|
	root := filepath.Join(c.root, "criu-root")
 | 
						|
	if err := os.Mkdir(root, 0o755); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	defer os.Remove(root)
 | 
						|
	root, err = filepath.EvalSymlinks(root)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	err = mount(c.config.Rootfs, root, "", "", unix.MS_BIND|unix.MS_REC, "")
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck
 | 
						|
	t := criurpc.CriuReqType_RESTORE
 | 
						|
	req := &criurpc.CriuReq{
 | 
						|
		Type: &t,
 | 
						|
		Opts: &criurpc.CriuOpts{
 | 
						|
			ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
 | 
						|
			EvasiveDevices:  proto.Bool(true),
 | 
						|
			LogLevel:        proto.Int32(4),
 | 
						|
			LogFile:         proto.String("restore.log"),
 | 
						|
			RstSibling:      proto.Bool(true),
 | 
						|
			Root:            proto.String(root),
 | 
						|
			ManageCgroups:   proto.Bool(true),
 | 
						|
			NotifyScripts:   proto.Bool(true),
 | 
						|
			ShellJob:        proto.Bool(criuOpts.ShellJob),
 | 
						|
			ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
 | 
						|
			TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
 | 
						|
			FileLocks:       proto.Bool(criuOpts.FileLocks),
 | 
						|
			EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
 | 
						|
			OrphanPtsMaster: proto.Bool(true),
 | 
						|
			AutoDedup:       proto.Bool(criuOpts.AutoDedup),
 | 
						|
			LazyPages:       proto.Bool(criuOpts.LazyPages),
 | 
						|
		},
 | 
						|
	}
 | 
						|
 | 
						|
	if criuOpts.LsmProfile != "" {
 | 
						|
		// CRIU older than 3.16 has a bug which breaks the possibility
 | 
						|
		// to set a different LSM profile.
 | 
						|
		if err := c.checkCriuVersion(31600); err != nil {
 | 
						|
			return errors.New("--lsm-profile requires at least CRIU 3.16")
 | 
						|
		}
 | 
						|
		req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
 | 
						|
	}
 | 
						|
	if criuOpts.LsmMountContext != "" {
 | 
						|
		if err := c.checkCriuVersion(31600); err != nil {
 | 
						|
			return errors.New("--lsm-mount-context requires at least CRIU 3.16")
 | 
						|
		}
 | 
						|
		req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext)
 | 
						|
	}
 | 
						|
 | 
						|
	if criuOpts.WorkDirectory != "" {
 | 
						|
		// Since a container can be C/R'ed multiple times,
 | 
						|
		// the work directory may already exist.
 | 
						|
		if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		workDir, err := os.Open(criuOpts.WorkDirectory)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		defer workDir.Close()
 | 
						|
		req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
 | 
						|
	}
 | 
						|
	c.handleCriuConfigurationFile(req.Opts)
 | 
						|
 | 
						|
	if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	// This will modify the rootfs of the container in the same way runc
 | 
						|
	// modifies the container during initial creation.
 | 
						|
	if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
 | 
						|
	for _, m := range c.config.Mounts {
 | 
						|
		switch m.Device {
 | 
						|
		case "bind":
 | 
						|
			c.addCriuRestoreMount(req, m)
 | 
						|
		case "cgroup":
 | 
						|
			if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
 | 
						|
				continue
 | 
						|
			}
 | 
						|
			// cgroup v1 is a set of bind mounts, unless cgroupns is used
 | 
						|
			binds, err := getCgroupMounts(m)
 | 
						|
			if err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
			for _, b := range binds {
 | 
						|
				c.addCriuRestoreMount(req, b)
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	if len(c.config.MaskPaths) > 0 {
 | 
						|
		m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
 | 
						|
		c.addCriuRestoreMount(req, m)
 | 
						|
	}
 | 
						|
 | 
						|
	for _, node := range c.config.Devices {
 | 
						|
		m := &configs.Mount{Destination: node.Path, Source: node.Path}
 | 
						|
		c.addCriuRestoreMount(req, m)
 | 
						|
	}
 | 
						|
 | 
						|
	if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
 | 
						|
		c.restoreNetwork(req, criuOpts)
 | 
						|
	}
 | 
						|
 | 
						|
	// append optional manage cgroups mode
 | 
						|
	if criuOpts.ManageCgroupsMode != 0 {
 | 
						|
		mode := criuOpts.ManageCgroupsMode
 | 
						|
		req.Opts.ManageCgroupsMode = &mode
 | 
						|
	}
 | 
						|
 | 
						|
	var (
 | 
						|
		fds    []string
 | 
						|
		fdJSON []byte
 | 
						|
	)
 | 
						|
	if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	if err := json.Unmarshal(fdJSON, &fds); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	for i := range fds {
 | 
						|
		if s := fds[i]; strings.Contains(s, "pipe:") {
 | 
						|
			inheritFd := new(criurpc.InheritFd)
 | 
						|
			inheritFd.Key = proto.String(s)
 | 
						|
			inheritFd.Fd = proto.Int32(int32(i))
 | 
						|
			req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
 | 
						|
		}
 | 
						|
	}
 | 
						|
	err = c.criuSwrk(process, req, criuOpts, extraFiles)
 | 
						|
 | 
						|
	// Now that CRIU is done let's close all opened FDs CRIU needed.
 | 
						|
	for _, fd := range extraFiles {
 | 
						|
		fd.Close()
 | 
						|
	}
 | 
						|
 | 
						|
	return err
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
 | 
						|
	// need to apply cgroups only on restore
 | 
						|
	if req.GetType() != criurpc.CriuReqType_RESTORE {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
 | 
						|
	// XXX: Do we need to deal with this case? AFAIK criu still requires root.
 | 
						|
	if err := c.cgroupManager.Apply(pid); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	if cgroups.IsCgroup2UnifiedMode() {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	// the stuff below is cgroupv1-specific
 | 
						|
 | 
						|
	path := fmt.Sprintf("/proc/%d/cgroup", pid)
 | 
						|
	cgroupsPaths, err := cgroups.ParseCgroupFile(path)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	for c, p := range cgroupsPaths {
 | 
						|
		cgroupRoot := &criurpc.CgroupRoot{
 | 
						|
			Ctrl: proto.String(c),
 | 
						|
			Path: proto.String(p),
 | 
						|
		}
 | 
						|
		req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
 | 
						|
	}
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
 | 
						|
	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	var logPath string
 | 
						|
	if opts != nil {
 | 
						|
		logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
 | 
						|
	} else {
 | 
						|
		// For the VERSION RPC 'opts' is set to 'nil' and therefore
 | 
						|
		// opts.WorkDirectory does not exist. Set logPath to "".
 | 
						|
		logPath = ""
 | 
						|
	}
 | 
						|
	criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
 | 
						|
	criuClientFileCon, err := net.FileConn(criuClient)
 | 
						|
	criuClient.Close()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	criuClientCon := criuClientFileCon.(*net.UnixConn)
 | 
						|
	defer criuClientCon.Close()
 | 
						|
 | 
						|
	criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
 | 
						|
	defer criuServer.Close()
 | 
						|
 | 
						|
	args := []string{"swrk", "3"}
 | 
						|
	if c.criuVersion != 0 {
 | 
						|
		// If the CRIU Version is still '0' then this is probably
 | 
						|
		// the initial CRIU run to detect the version. Skip it.
 | 
						|
		logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
 | 
						|
	}
 | 
						|
	cmd := exec.Command(c.criuPath, args...)
 | 
						|
	if process != nil {
 | 
						|
		cmd.Stdin = process.Stdin
 | 
						|
		cmd.Stdout = process.Stdout
 | 
						|
		cmd.Stderr = process.Stderr
 | 
						|
	}
 | 
						|
	cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
 | 
						|
	if extraFiles != nil {
 | 
						|
		cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
 | 
						|
	}
 | 
						|
 | 
						|
	if err := cmd.Start(); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	// we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang.
 | 
						|
	criuServer.Close()
 | 
						|
	// cmd.Process will be replaced by a restored init.
 | 
						|
	criuProcess := cmd.Process
 | 
						|
 | 
						|
	var criuProcessState *os.ProcessState
 | 
						|
	defer func() {
 | 
						|
		if criuProcessState == nil {
 | 
						|
			criuClientCon.Close()
 | 
						|
			_, err := criuProcess.Wait()
 | 
						|
			if err != nil {
 | 
						|
				logrus.Warnf("wait on criuProcess returned %v", err)
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}()
 | 
						|
 | 
						|
	if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	var extFds []string
 | 
						|
	if process != nil {
 | 
						|
		extFds, err = getPipeFds(criuProcess.Pid)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
 | 
						|
	// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
 | 
						|
	// should be empty. For older CRIU versions it still will be
 | 
						|
	// available but empty. criurpc.CriuReqType_VERSION actually
 | 
						|
	// has no req.GetOpts().
 | 
						|
	if logrus.GetLevel() >= logrus.DebugLevel &&
 | 
						|
		!(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
 | 
						|
			req.GetType() == criurpc.CriuReqType_VERSION) {
 | 
						|
 | 
						|
		val := reflect.ValueOf(req.GetOpts())
 | 
						|
		v := reflect.Indirect(val)
 | 
						|
		for i := 0; i < v.NumField(); i++ {
 | 
						|
			st := v.Type()
 | 
						|
			name := st.Field(i).Name
 | 
						|
			if 'A' <= name[0] && name[0] <= 'Z' {
 | 
						|
				value := val.MethodByName("Get" + name).Call([]reflect.Value{})
 | 
						|
				logrus.Debugf("CRIU option %s with value %v", name, value[0])
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
	data, err := proto.Marshal(req)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	_, err = criuClientCon.Write(data)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	buf := make([]byte, 10*4096)
 | 
						|
	oob := make([]byte, 4096)
 | 
						|
	for {
 | 
						|
		n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
 | 
						|
		if req.Opts != nil && req.Opts.StatusFd != nil {
 | 
						|
			// Close status_fd as soon as we got something back from criu,
 | 
						|
			// assuming it has consumed (reopened) it by this time.
 | 
						|
			// Otherwise it will might be left open forever and whoever
 | 
						|
			// is waiting on it will wait forever.
 | 
						|
			fd := int(*req.Opts.StatusFd)
 | 
						|
			_ = unix.Close(fd)
 | 
						|
			req.Opts.StatusFd = nil
 | 
						|
		}
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		if n == 0 {
 | 
						|
			return errors.New("unexpected EOF")
 | 
						|
		}
 | 
						|
		if n == len(buf) {
 | 
						|
			return errors.New("buffer is too small")
 | 
						|
		}
 | 
						|
 | 
						|
		resp := new(criurpc.CriuResp)
 | 
						|
		err = proto.Unmarshal(buf[:n], resp)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		if !resp.GetSuccess() {
 | 
						|
			typeString := req.GetType().String()
 | 
						|
			return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
 | 
						|
		}
 | 
						|
 | 
						|
		t := resp.GetType()
 | 
						|
		switch {
 | 
						|
		case t == criurpc.CriuReqType_FEATURE_CHECK:
 | 
						|
			logrus.Debugf("Feature check says: %s", resp)
 | 
						|
			criuFeatures = resp.GetFeatures()
 | 
						|
		case t == criurpc.CriuReqType_NOTIFY:
 | 
						|
			if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
			t = criurpc.CriuReqType_NOTIFY
 | 
						|
			req = &criurpc.CriuReq{
 | 
						|
				Type:          &t,
 | 
						|
				NotifySuccess: proto.Bool(true),
 | 
						|
			}
 | 
						|
			data, err = proto.Marshal(req)
 | 
						|
			if err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
			_, err = criuClientCon.Write(data)
 | 
						|
			if err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
			continue
 | 
						|
		case t == criurpc.CriuReqType_RESTORE:
 | 
						|
		case t == criurpc.CriuReqType_DUMP:
 | 
						|
		case t == criurpc.CriuReqType_PRE_DUMP:
 | 
						|
		default:
 | 
						|
			return fmt.Errorf("unable to parse the response %s", resp.String())
 | 
						|
		}
 | 
						|
 | 
						|
		break
 | 
						|
	}
 | 
						|
 | 
						|
	_ = criuClientCon.CloseWrite()
 | 
						|
	// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
 | 
						|
	// Here we want to wait only the CRIU process.
 | 
						|
	criuProcessState, err = criuProcess.Wait()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	// In pre-dump mode CRIU is in a loop and waits for
 | 
						|
	// the final DUMP command.
 | 
						|
	// The current runc pre-dump approach, however, is
 | 
						|
	// start criu in PRE_DUMP once for a single pre-dump
 | 
						|
	// and not the whole series of pre-dump, pre-dump, ...m, dump
 | 
						|
	// If we got the message CriuReqType_PRE_DUMP it means
 | 
						|
	// CRIU was successful and we need to forcefully stop CRIU
 | 
						|
	if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
 | 
						|
		return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath)
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// block any external network activity
 | 
						|
func lockNetwork(config *configs.Config) error {
 | 
						|
	for _, config := range config.Networks {
 | 
						|
		strategy, err := getStrategy(config.Type)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
 | 
						|
		if err := strategy.detach(config); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func unlockNetwork(config *configs.Config) error {
 | 
						|
	for _, config := range config.Networks {
 | 
						|
		strategy, err := getStrategy(config.Type)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		if err = strategy.attach(config); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
 | 
						|
	notify := resp.GetNotify()
 | 
						|
	if notify == nil {
 | 
						|
		return fmt.Errorf("invalid response: %s", resp.String())
 | 
						|
	}
 | 
						|
	script := notify.GetScript()
 | 
						|
	logrus.Debugf("notify: %s\n", script)
 | 
						|
	switch script {
 | 
						|
	case "post-dump":
 | 
						|
		f, err := os.Create(filepath.Join(c.root, "checkpoint"))
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		f.Close()
 | 
						|
	case "network-unlock":
 | 
						|
		if err := unlockNetwork(c.config); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	case "network-lock":
 | 
						|
		if err := lockNetwork(c.config); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	case "setup-namespaces":
 | 
						|
		if c.config.Hooks != nil {
 | 
						|
			s, err := c.currentOCIState()
 | 
						|
			if err != nil {
 | 
						|
				return nil
 | 
						|
			}
 | 
						|
			s.Pid = int(notify.GetPid())
 | 
						|
 | 
						|
			if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
			if err := c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil {
 | 
						|
				return err
 | 
						|
			}
 | 
						|
		}
 | 
						|
	case "post-restore":
 | 
						|
		pid := notify.GetPid()
 | 
						|
 | 
						|
		p, err := os.FindProcess(int(pid))
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		cmd.Process = p
 | 
						|
 | 
						|
		r, err := newRestoredProcess(cmd, fds)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		process.ops = r
 | 
						|
		if err := c.state.transition(&restoredState{
 | 
						|
			imageDir: opts.ImagesDirectory,
 | 
						|
			c:        c,
 | 
						|
		}); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		// create a timestamp indicating when the restored checkpoint was started
 | 
						|
		c.created = time.Now().UTC()
 | 
						|
		if _, err := c.updateState(r); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
 | 
						|
			if !os.IsNotExist(err) {
 | 
						|
				logrus.Error(err)
 | 
						|
			}
 | 
						|
		}
 | 
						|
	case "orphan-pts-master":
 | 
						|
		scm, err := unix.ParseSocketControlMessage(oob)
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
		fds, err := unix.ParseUnixRights(&scm[0])
 | 
						|
		if err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
 | 
						|
		master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
 | 
						|
		defer master.Close()
 | 
						|
 | 
						|
		// While we can access console.master, using the API is a good idea.
 | 
						|
		if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
 | 
						|
			return err
 | 
						|
		}
 | 
						|
	case "status-ready":
 | 
						|
		if opts.StatusFd != -1 {
 | 
						|
			// write \0 to status fd to notify that lazy page server is ready
 | 
						|
			_, err := unix.Write(opts.StatusFd, []byte{0})
 | 
						|
			if err != nil {
 | 
						|
				logrus.Warnf("can't write \\0 to status fd: %v", err)
 | 
						|
			}
 | 
						|
			_ = unix.Close(opts.StatusFd)
 | 
						|
			opts.StatusFd = -1
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
 | 
						|
	if process != nil {
 | 
						|
		c.initProcess = process
 | 
						|
	}
 | 
						|
	state, err := c.currentState()
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	err = c.saveState(state)
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	return state, nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) saveState(s *State) (retErr error) {
 | 
						|
	tmpFile, err := os.CreateTemp(c.root, "state-")
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	defer func() {
 | 
						|
		if retErr != nil {
 | 
						|
			tmpFile.Close()
 | 
						|
			os.Remove(tmpFile.Name())
 | 
						|
		}
 | 
						|
	}()
 | 
						|
 | 
						|
	err = utils.WriteJSON(tmpFile, s)
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	err = tmpFile.Close()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
 | 
						|
	stateFilePath := filepath.Join(c.root, stateFilename)
 | 
						|
	return os.Rename(tmpFile.Name(), stateFilePath)
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) currentStatus() (Status, error) {
 | 
						|
	if err := c.refreshState(); err != nil {
 | 
						|
		return -1, err
 | 
						|
	}
 | 
						|
	return c.state.status(), nil
 | 
						|
}
 | 
						|
 | 
						|
// refreshState needs to be called to verify that the current state on the
 | 
						|
// container is what is true.  Because consumers of libcontainer can use it
 | 
						|
// out of process we need to verify the container's status based on runtime
 | 
						|
// information and not rely on our in process info.
 | 
						|
func (c *linuxContainer) refreshState() error {
 | 
						|
	paused, err := c.isPaused()
 | 
						|
	if err != nil {
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	if paused {
 | 
						|
		return c.state.transition(&pausedState{c: c})
 | 
						|
	}
 | 
						|
	t := c.runType()
 | 
						|
	switch t {
 | 
						|
	case Created:
 | 
						|
		return c.state.transition(&createdState{c: c})
 | 
						|
	case Running:
 | 
						|
		return c.state.transition(&runningState{c: c})
 | 
						|
	}
 | 
						|
	return c.state.transition(&stoppedState{c: c})
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) runType() Status {
 | 
						|
	if c.initProcess == nil {
 | 
						|
		return Stopped
 | 
						|
	}
 | 
						|
	pid := c.initProcess.pid()
 | 
						|
	stat, err := system.Stat(pid)
 | 
						|
	if err != nil {
 | 
						|
		return Stopped
 | 
						|
	}
 | 
						|
	if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
 | 
						|
		return Stopped
 | 
						|
	}
 | 
						|
	// We'll create exec fifo and blocking on it after container is created,
 | 
						|
	// and delete it after start container.
 | 
						|
	if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
 | 
						|
		return Created
 | 
						|
	}
 | 
						|
	return Running
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) isPaused() (bool, error) {
 | 
						|
	state, err := c.cgroupManager.GetFreezerState()
 | 
						|
	if err != nil {
 | 
						|
		return false, err
 | 
						|
	}
 | 
						|
	return state == configs.Frozen, nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) currentState() (*State, error) {
 | 
						|
	var (
 | 
						|
		startTime           uint64
 | 
						|
		externalDescriptors []string
 | 
						|
		pid                 = -1
 | 
						|
	)
 | 
						|
	if c.initProcess != nil {
 | 
						|
		pid = c.initProcess.pid()
 | 
						|
		startTime, _ = c.initProcess.startTime()
 | 
						|
		externalDescriptors = c.initProcess.externalDescriptors()
 | 
						|
	}
 | 
						|
 | 
						|
	intelRdtPath := ""
 | 
						|
	if c.intelRdtManager != nil {
 | 
						|
		intelRdtPath = c.intelRdtManager.GetPath()
 | 
						|
	}
 | 
						|
	state := &State{
 | 
						|
		BaseState: BaseState{
 | 
						|
			ID:                   c.ID(),
 | 
						|
			Config:               *c.config,
 | 
						|
			InitProcessPid:       pid,
 | 
						|
			InitProcessStartTime: startTime,
 | 
						|
			Created:              c.created,
 | 
						|
		},
 | 
						|
		Rootless:            c.config.RootlessEUID && c.config.RootlessCgroups,
 | 
						|
		CgroupPaths:         c.cgroupManager.GetPaths(),
 | 
						|
		IntelRdtPath:        intelRdtPath,
 | 
						|
		NamespacePaths:      make(map[configs.NamespaceType]string),
 | 
						|
		ExternalDescriptors: externalDescriptors,
 | 
						|
	}
 | 
						|
	if pid > 0 {
 | 
						|
		for _, ns := range c.config.Namespaces {
 | 
						|
			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
 | 
						|
		}
 | 
						|
		for _, nsType := range configs.NamespaceTypes() {
 | 
						|
			if !configs.IsNamespaceSupported(nsType) {
 | 
						|
				continue
 | 
						|
			}
 | 
						|
			if _, ok := state.NamespacePaths[nsType]; !ok {
 | 
						|
				ns := configs.Namespace{Type: nsType}
 | 
						|
				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return state, nil
 | 
						|
}
 | 
						|
 | 
						|
func (c *linuxContainer) currentOCIState() (*specs.State, error) {
 | 
						|
	bundle, annotations := utils.Annotations(c.config.Labels)
 | 
						|
	state := &specs.State{
 | 
						|
		Version:     specs.Version,
 | 
						|
		ID:          c.ID(),
 | 
						|
		Bundle:      bundle,
 | 
						|
		Annotations: annotations,
 | 
						|
	}
 | 
						|
	status, err := c.currentStatus()
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	state.Status = specs.ContainerState(status.String())
 | 
						|
	if status != Stopped {
 | 
						|
		if c.initProcess != nil {
 | 
						|
			state.Pid = c.initProcess.pid()
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return state, nil
 | 
						|
}
 | 
						|
 | 
						|
// orderNamespacePaths sorts namespace paths into a list of paths that we
 | 
						|
// can setns in order.
 | 
						|
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
 | 
						|
	paths := []string{}
 | 
						|
	for _, ns := range configs.NamespaceTypes() {
 | 
						|
 | 
						|
		// Remove namespaces that we don't need to join.
 | 
						|
		if !c.config.Namespaces.Contains(ns) {
 | 
						|
			continue
 | 
						|
		}
 | 
						|
 | 
						|
		if p, ok := namespaces[ns]; ok && p != "" {
 | 
						|
			// check if the requested namespace is supported
 | 
						|
			if !configs.IsNamespaceSupported(ns) {
 | 
						|
				return nil, fmt.Errorf("namespace %s is not supported", ns)
 | 
						|
			}
 | 
						|
			// only set to join this namespace if it exists
 | 
						|
			if _, err := os.Lstat(p); err != nil {
 | 
						|
				return nil, fmt.Errorf("namespace path: %w", err)
 | 
						|
			}
 | 
						|
			// do not allow namespace path with comma as we use it to separate
 | 
						|
			// the namespace paths
 | 
						|
			if strings.ContainsRune(p, ',') {
 | 
						|
				return nil, fmt.Errorf("invalid namespace path %s", p)
 | 
						|
			}
 | 
						|
			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
 | 
						|
		}
 | 
						|
 | 
						|
	}
 | 
						|
 | 
						|
	return paths, nil
 | 
						|
}
 | 
						|
 | 
						|
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
 | 
						|
	data := bytes.NewBuffer(nil)
 | 
						|
	for _, im := range idMap {
 | 
						|
		line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
 | 
						|
		if _, err := data.WriteString(line); err != nil {
 | 
						|
			return nil, err
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return data.Bytes(), nil
 | 
						|
}
 | 
						|
 | 
						|
// netlinkError is an error wrapper type for use by custom netlink message
 | 
						|
// types. Panics with errors are wrapped in netlinkError so that the recover
 | 
						|
// in bootstrapData can distinguish intentional panics.
 | 
						|
type netlinkError struct{ error }
 | 
						|
 | 
						|
// bootstrapData encodes the necessary data in netlink binary format
 | 
						|
// as a io.Reader.
 | 
						|
// Consumer can write the data to a bootstrap program
 | 
						|
// such as one that uses nsenter package to bootstrap the container's
 | 
						|
// init process correctly, i.e. with correct namespaces, uid/gid
 | 
						|
// mapping etc.
 | 
						|
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
 | 
						|
	// create the netlink message
 | 
						|
	r := nl.NewNetlinkRequest(int(InitMsg), 0)
 | 
						|
 | 
						|
	// Our custom messages cannot bubble up an error using returns, instead
 | 
						|
	// they will panic with the specific error type, netlinkError. In that
 | 
						|
	// case, recover from the panic and return that as an error.
 | 
						|
	defer func() {
 | 
						|
		if r := recover(); r != nil {
 | 
						|
			if e, ok := r.(netlinkError); ok {
 | 
						|
				Err = e.error
 | 
						|
			} else {
 | 
						|
				panic(r)
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}()
 | 
						|
 | 
						|
	// write cloneFlags
 | 
						|
	r.AddData(&Int32msg{
 | 
						|
		Type:  CloneFlagsAttr,
 | 
						|
		Value: uint32(cloneFlags),
 | 
						|
	})
 | 
						|
 | 
						|
	// write custom namespace paths
 | 
						|
	if len(nsMaps) > 0 {
 | 
						|
		nsPaths, err := c.orderNamespacePaths(nsMaps)
 | 
						|
		if err != nil {
 | 
						|
			return nil, err
 | 
						|
		}
 | 
						|
		r.AddData(&Bytemsg{
 | 
						|
			Type:  NsPathsAttr,
 | 
						|
			Value: []byte(strings.Join(nsPaths, ",")),
 | 
						|
		})
 | 
						|
	}
 | 
						|
 | 
						|
	// write namespace paths only when we are not joining an existing user ns
 | 
						|
	_, joinExistingUser := nsMaps[configs.NEWUSER]
 | 
						|
	if !joinExistingUser {
 | 
						|
		// write uid mappings
 | 
						|
		if len(c.config.UidMappings) > 0 {
 | 
						|
			if c.config.RootlessEUID && c.newuidmapPath != "" {
 | 
						|
				r.AddData(&Bytemsg{
 | 
						|
					Type:  UidmapPathAttr,
 | 
						|
					Value: []byte(c.newuidmapPath),
 | 
						|
				})
 | 
						|
			}
 | 
						|
			b, err := encodeIDMapping(c.config.UidMappings)
 | 
						|
			if err != nil {
 | 
						|
				return nil, err
 | 
						|
			}
 | 
						|
			r.AddData(&Bytemsg{
 | 
						|
				Type:  UidmapAttr,
 | 
						|
				Value: b,
 | 
						|
			})
 | 
						|
		}
 | 
						|
 | 
						|
		// write gid mappings
 | 
						|
		if len(c.config.GidMappings) > 0 {
 | 
						|
			b, err := encodeIDMapping(c.config.GidMappings)
 | 
						|
			if err != nil {
 | 
						|
				return nil, err
 | 
						|
			}
 | 
						|
			r.AddData(&Bytemsg{
 | 
						|
				Type:  GidmapAttr,
 | 
						|
				Value: b,
 | 
						|
			})
 | 
						|
			if c.config.RootlessEUID && c.newgidmapPath != "" {
 | 
						|
				r.AddData(&Bytemsg{
 | 
						|
					Type:  GidmapPathAttr,
 | 
						|
					Value: []byte(c.newgidmapPath),
 | 
						|
				})
 | 
						|
			}
 | 
						|
			if requiresRootOrMappingTool(c.config) {
 | 
						|
				r.AddData(&Boolmsg{
 | 
						|
					Type:  SetgroupAttr,
 | 
						|
					Value: true,
 | 
						|
				})
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	if c.config.OomScoreAdj != nil {
 | 
						|
		// write oom_score_adj
 | 
						|
		r.AddData(&Bytemsg{
 | 
						|
			Type:  OomScoreAdjAttr,
 | 
						|
			Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
 | 
						|
		})
 | 
						|
	}
 | 
						|
 | 
						|
	// write rootless
 | 
						|
	r.AddData(&Boolmsg{
 | 
						|
		Type:  RootlessEUIDAttr,
 | 
						|
		Value: c.config.RootlessEUID,
 | 
						|
	})
 | 
						|
 | 
						|
	// Bind mount source to open.
 | 
						|
	if it == initStandard && c.shouldSendMountSources() {
 | 
						|
		var mounts []byte
 | 
						|
		for _, m := range c.config.Mounts {
 | 
						|
			if m.IsBind() {
 | 
						|
				if strings.IndexByte(m.Source, 0) >= 0 {
 | 
						|
					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
 | 
						|
				}
 | 
						|
				mounts = append(mounts, []byte(m.Source)...)
 | 
						|
			}
 | 
						|
			mounts = append(mounts, byte(0))
 | 
						|
		}
 | 
						|
 | 
						|
		r.AddData(&Bytemsg{
 | 
						|
			Type:  MountSourcesAttr,
 | 
						|
			Value: mounts,
 | 
						|
		})
 | 
						|
	}
 | 
						|
 | 
						|
	return bytes.NewReader(r.Serialize()), nil
 | 
						|
}
 | 
						|
 | 
						|
// ignoreTerminateErrors returns nil if the given err matches an error known
 | 
						|
// to indicate that the terminate occurred successfully or err was nil, otherwise
 | 
						|
// err is returned unaltered.
 | 
						|
func ignoreTerminateErrors(err error) error {
 | 
						|
	if err == nil {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	// terminate() might return an error from either Kill or Wait.
 | 
						|
	// The (*Cmd).Wait documentation says: "If the command fails to run
 | 
						|
	// or doesn't complete successfully, the error is of type *ExitError".
 | 
						|
	// Filter out such errors (like "exit status 1" or "signal: killed").
 | 
						|
	var exitErr *exec.ExitError
 | 
						|
	if errors.As(err, &exitErr) {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	if errors.Is(err, os.ErrProcessDone) {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	s := err.Error()
 | 
						|
	if strings.Contains(s, "Wait was already called") {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	return err
 | 
						|
}
 | 
						|
 | 
						|
func requiresRootOrMappingTool(c *configs.Config) bool {
 | 
						|
	gidMap := []configs.IDMap{
 | 
						|
		{ContainerID: 0, HostID: os.Getegid(), Size: 1},
 | 
						|
	}
 | 
						|
	return !reflect.DeepEqual(c.GidMappings, gidMap)
 | 
						|
}
 |