// +build linux

// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2014,2015,2016,2017 Docker, Inc.
// SPDX-License-Identifier: Apache-2.0
//

package virtcontainers

import (
	"context"
	"encoding/hex"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"syscall"
	"time"

	"github.com/containerd/cgroups"
	vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
	"github.com/kata-containers/runtime/virtcontainers/types"
	"github.com/kata-containers/runtime/virtcontainers/utils"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	opentracing "github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/kata-containers/runtime/pkg/rootless"
	"github.com/kata-containers/runtime/virtcontainers/device/config"
	"github.com/kata-containers/runtime/virtcontainers/device/manager"
	"github.com/kata-containers/runtime/virtcontainers/store"
)
|
|
|
|
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
|
|
// This file has definitions for major device numbers.
|
|
var cdromMajors = map[int64]string{
|
|
11: "SCSI_CDROM_MAJOR",
|
|
15: "CDU31A_CDROM_MAJOR",
|
|
16: "GOLDSTAR_CDROM_MAJOR",
|
|
17: "OPTICS_CDROM_MAJOR",
|
|
18: "SANYO_CDROM_MAJOR",
|
|
20: "MITSUMI_X_CDROM_MAJOR",
|
|
23: "MITSUMI_CDROM_MAJOR",
|
|
24: "CDU535_CDROM_MAJOR",
|
|
25: "MATSUSHITA_CDROM_MAJOR",
|
|
26: "MATSUSHITA_CDROM2_MAJOR",
|
|
27: "MATSUSHITA_CDROM3_MAJOR",
|
|
28: "MATSUSHITA_CDROM4_MAJOR",
|
|
29: "AZTECH_CDROM_MAJOR",
|
|
32: "CM206_CDROM_MAJOR",
|
|
}
|
|
|
|
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
|
|
// #define FLOPPY_MAJOR 2
|
|
const floppyMajor = int64(2)
|
|
|
|
// Process gathers data related to a container process.
|
|
type Process struct {
|
|
// Token is the process execution context ID. It must be
|
|
// unique per sandbox.
|
|
// Token is used to manipulate processes for containers
|
|
// that have not started yet, and later identify them
|
|
// uniquely within a sandbox.
|
|
Token string
|
|
|
|
// Pid is the process ID as seen by the host software
|
|
// stack, e.g. CRI-O, containerd. This is typically the
|
|
// shim PID.
|
|
Pid int
|
|
|
|
StartTime time.Time
|
|
}
|
|
|
|
// ContainerStatus describes a container status.
|
|
type ContainerStatus struct {
|
|
ID string
|
|
State types.ContainerState
|
|
PID int
|
|
StartTime time.Time
|
|
RootFs string
|
|
Spec *specs.Spec
|
|
|
|
// Annotations allow clients to store arbitrary values,
|
|
// for example to add additional status values required
|
|
// to support particular specifications.
|
|
Annotations map[string]string
|
|
}
|
|
|
|
// ThrottlingData gather the date related to container cpu throttling.
|
|
type ThrottlingData struct {
|
|
// Number of periods with throttling active
|
|
Periods uint64 `json:"periods,omitempty"`
|
|
// Number of periods when the container hit its throttling limit.
|
|
ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
|
|
// Aggregate time the container was throttled for in nanoseconds.
|
|
ThrottledTime uint64 `json:"throttled_time,omitempty"`
|
|
}
|
|
|
|
// CPUUsage denotes the usage of a CPU.
|
|
// All CPU stats are aggregate since container inception.
|
|
type CPUUsage struct {
|
|
// Total CPU time consumed.
|
|
// Units: nanoseconds.
|
|
TotalUsage uint64 `json:"total_usage,omitempty"`
|
|
// Total CPU time consumed per core.
|
|
// Units: nanoseconds.
|
|
PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
|
|
// Time spent by tasks of the cgroup in kernel mode.
|
|
// Units: nanoseconds.
|
|
UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
|
|
// Time spent by tasks of the cgroup in user mode.
|
|
// Units: nanoseconds.
|
|
UsageInUsermode uint64 `json:"usage_in_usermode"`
|
|
}
|
|
|
|
// CPUStats describes the cpu stats
|
|
type CPUStats struct {
|
|
CPUUsage CPUUsage `json:"cpu_usage,omitempty"`
|
|
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
|
|
}
|
|
|
|
// MemoryData gather the data related to memory
|
|
type MemoryData struct {
|
|
Usage uint64 `json:"usage,omitempty"`
|
|
MaxUsage uint64 `json:"max_usage,omitempty"`
|
|
Failcnt uint64 `json:"failcnt"`
|
|
Limit uint64 `json:"limit"`
|
|
}
|
|
|
|
// MemoryStats describes the memory stats
|
|
type MemoryStats struct {
|
|
// memory used for cache
|
|
Cache uint64 `json:"cache,omitempty"`
|
|
// usage of memory
|
|
Usage MemoryData `json:"usage,omitempty"`
|
|
// usage of memory swap
|
|
SwapUsage MemoryData `json:"swap_usage,omitempty"`
|
|
// usage of kernel memory
|
|
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
|
|
// usage of kernel TCP memory
|
|
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
|
|
// if true, memory usage is accounted for throughout a hierarchy of cgroups.
|
|
UseHierarchy bool `json:"use_hierarchy"`
|
|
|
|
Stats map[string]uint64 `json:"stats,omitempty"`
|
|
}
|
|
|
|
// PidsStats describes the pids stats
|
|
type PidsStats struct {
|
|
// number of pids in the cgroup
|
|
Current uint64 `json:"current,omitempty"`
|
|
// active pids hard limit
|
|
Limit uint64 `json:"limit,omitempty"`
|
|
}
|
|
|
|
// BlkioStatEntry gather date related to a block device
|
|
type BlkioStatEntry struct {
|
|
Major uint64 `json:"major,omitempty"`
|
|
Minor uint64 `json:"minor,omitempty"`
|
|
Op string `json:"op,omitempty"`
|
|
Value uint64 `json:"value,omitempty"`
|
|
}
|
|
|
|
// BlkioStats describes block io stats
|
|
type BlkioStats struct {
|
|
// number of bytes tranferred to and from the block device
|
|
IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
|
|
IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
|
|
IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
|
|
IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
|
|
IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
|
|
IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
|
|
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
|
|
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
|
|
}
|
|
|
|
// HugetlbStats describes hugetable memory stats
|
|
type HugetlbStats struct {
|
|
// current res_counter usage for hugetlb
|
|
Usage uint64 `json:"usage,omitempty"`
|
|
// maximum usage ever recorded.
|
|
MaxUsage uint64 `json:"max_usage,omitempty"`
|
|
// number of times hugetlb usage allocation failure.
|
|
Failcnt uint64 `json:"failcnt"`
|
|
}
|
|
|
|
// CgroupStats describes all cgroup subsystem stats
|
|
type CgroupStats struct {
|
|
CPUStats CPUStats `json:"cpu_stats,omitempty"`
|
|
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
|
|
PidsStats PidsStats `json:"pids_stats,omitempty"`
|
|
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
|
|
// the map is in the format "size of hugepage: stats of the hugepage"
|
|
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
|
|
}
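
// As an illustration of the hugetlb map shape (values below are hypothetical,
// not taken from a real workload), a marshaled CgroupStats could contain:
//
//	"hugetlb_stats": {
//		"2MB": {"usage": 2097152, "max_usage": 4194304, "failcnt": 0}
//	}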

// NetworkStats describes all network stats.
type NetworkStats struct {
	// Name is the name of the network interface.
	Name string `json:"name,omitempty"`

	RxBytes   uint64 `json:"rx_bytes,omitempty"`
	RxPackets uint64 `json:"rx_packets,omitempty"`
	RxErrors  uint64 `json:"rx_errors,omitempty"`
	RxDropped uint64 `json:"rx_dropped,omitempty"`
	TxBytes   uint64 `json:"tx_bytes,omitempty"`
	TxPackets uint64 `json:"tx_packets,omitempty"`
	TxErrors  uint64 `json:"tx_errors,omitempty"`
	TxDropped uint64 `json:"tx_dropped,omitempty"`
}

// ContainerStats describes a container's stats.
type ContainerStats struct {
	CgroupStats  *CgroupStats
	NetworkStats []*NetworkStats
}

// ContainerResources describes container resources.
type ContainerResources struct {
	// VCPUs is the number of vCPUs that are being used by the container.
	VCPUs uint32

	// MemByte is the memory, in bytes, that is being used by the container.
	MemByte int64
}

// ContainerConfig describes one container runtime configuration.
type ContainerConfig struct {
	ID string

	// RootFs is the container workload image on the host.
	RootFs RootFs

	// ReadonlyRootfs indicates if the rootfs should be mounted readonly.
	ReadonlyRootfs bool

	// Cmd specifies the command to run on a container.
	Cmd types.Cmd

	// Annotations allow clients to store arbitrary values,
	// for example to add additional status values required
	// to support particular specifications.
	Annotations map[string]string

	Mounts []Mount

	// Device configuration for devices that must be available within the container.
	DeviceInfos []config.DeviceInfo

	// Resources container resources
	Resources specs.LinuxResources

	// Raw OCI specification, it won't be saved to disk.
	Spec *specs.Spec `json:"-"`
}

// valid checks that the container configuration is valid.
func (c *ContainerConfig) valid() bool {
	if c == nil {
		return false
	}

	if c.ID == "" {
		return false
	}

	return true
}
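
// A minimal sketch of a configuration that passes valid() — illustrative
// only, with a hypothetical ID; real callers populate many more fields:
//
//	cfg := ContainerConfig{
//		ID:  "container-0",
//		Cmd: types.Cmd{Args: []string{"/bin/sh"}},
//	}
//	ok := cfg.valid() // true: only a nil receiver or an empty ID fails here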

// SystemMountsInfo describes additional information for system mounts that the agent
// needs to handle.
type SystemMountsInfo struct {
	// Indicates if /dev has been passed as a bind mount for the host /dev
	BindMountDev bool

	// Size of /dev/shm assigned on the host.
	DevShmSize uint
}

// ContainerDevice describes a device associated with a container.
type ContainerDevice struct {
	// ID is the device ID referencing the device from the sandbox's device manager.
	ID string

	// ContainerPath is the device path displayed inside the container.
	ContainerPath string

	// FileMode holds the permission bits for the device.
	FileMode os.FileMode

	// UID is the user ID in the container namespace.
	UID uint32

	// GID is the group ID in the container namespace.
	GID uint32
}

// RootFs describes the container's rootfs.
type RootFs struct {
	// Source specifies the BlockDevice path.
	Source string
	// Target specifies where the rootfs is mounted, if it has been mounted.
	Target string
	// Type specifies the type of filesystem to mount.
	Type string
	// Options specifies zero or more fstab style mount options.
	Options []string
	// Mounted specifies whether the rootfs has been mounted or not.
	Mounted bool
}
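
// An illustrative value (paths are hypothetical): a block-backed, not yet
// mounted rootfs might look like
//
//	RootFs{Source: "/dev/mapper/my-container", Type: "ext4", Mounted: false}
//
// whereas a directory rootfs already mounted on the host sets Target and
// Mounted: true instead (see hotplugDrive below for how both cases are used).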

// Container represents a single container within a sandbox, together with
// its runtime environment.
// A Container can be created, deleted, started, stopped, listed, entered, paused and restored.
type Container struct {
	id        string
	sandboxID string

	rootFs RootFs

	config *ContainerConfig

	sandbox *Sandbox

	runPath       string
	configPath    string
	containerPath string
	rootfsSuffix  string

	state types.ContainerState

	process Process

	mounts []Mount

	devices []ContainerDevice

	systemMountsInfo SystemMountsInfo

	ctx context.Context

	store *store.VCStore
}

// ID returns the container identifier string.
func (c *Container) ID() string {
	return c.id
}

// Logger returns a logrus logger appropriate for logging Container messages.
func (c *Container) Logger() *logrus.Entry {
	return virtLog.WithFields(logrus.Fields{
		"subsystem": "container",
		"sandbox":   c.sandboxID,
	})
}

func (c *Container) trace(name string) (opentracing.Span, context.Context) {
	if c.ctx == nil {
		c.Logger().WithField("type", "bug").Error("trace called before context set")
		c.ctx = context.Background()
	}

	span, ctx := opentracing.StartSpanFromContext(c.ctx, name)

	span.SetTag("subsystem", "container")

	return span, ctx
}

// Sandbox returns the sandbox handler related to this container.
func (c *Container) Sandbox() VCSandbox {
	return c.sandbox
}

// Process returns the container process.
func (c *Container) Process() Process {
	return c.process
}

// GetToken returns the token related to this container's process.
func (c *Container) GetToken() string {
	return c.process.Token
}

// GetPid returns the pid related to this container's process.
func (c *Container) GetPid() int {
	return c.process.Pid
}

func (c *Container) setStateFstype(fstype string) error {
	c.state.Fstype = fstype

	if !c.sandbox.supportNewStore() {
		// The experimental runtime uses "persist.json", which doesn't need "state.json" anymore.
		if err := c.storeState(); err != nil {
			return err
		}
	}

	return nil
}

// GetAnnotations returns the container's annotations.
func (c *Container) GetAnnotations() map[string]string {
	return c.config.Annotations
}

// GetOCISpec returns the container's OCI specification.
func (c *Container) GetOCISpec() *specs.Spec {
	return c.config.Spec
}

// storeContainer stores a container config.
func (c *Container) storeContainer() error {
	if c.sandbox.supportNewStore() {
		if err := c.sandbox.Save(); err != nil {
			return err
		}
	}
	return c.store.Store(store.Configuration, *(c.config))
}

func (c *Container) storeProcess() error {
	return c.store.Store(store.Process, c.process)
}

func (c *Container) storeMounts() error {
	return c.store.Store(store.Mounts, c.mounts)
}

func (c *Container) storeDevices() error {
	return c.store.Store(store.DeviceIDs, c.devices)
}

func (c *Container) storeState() error {
	return c.store.Store(store.State, c.state)
}

func (c *Container) loadMounts() ([]Mount, error) {
	var mounts []Mount
	if err := c.store.Load(store.Mounts, &mounts); err != nil {
		return []Mount{}, err
	}

	return mounts, nil
}

func (c *Container) loadDevices() ([]ContainerDevice, error) {
	var devices []ContainerDevice

	if err := c.store.Load(store.DeviceIDs, &devices); err != nil {
		return []ContainerDevice{}, err
	}

	return devices, nil
}

// setContainerState sets both the in-memory and on-disk state of the
// container.
func (c *Container) setContainerState(state types.StateString) error {
	if state == "" {
		return vcTypes.ErrNeedState
	}

	c.Logger().Debugf("Setting container state from %v to %v", c.state.State, state)
	// update in-memory state
	c.state.State = state

	if c.sandbox.supportNewStore() {
		// flush data to storage
		if err := c.sandbox.Save(); err != nil {
			return err
		}
	} else {
		// The experimental runtime uses "persist.json", which doesn't need "state.json" anymore.
		// update on-disk state
		if err := c.store.Store(store.State, c.state); err != nil {
			return err
		}
	}

	return nil
}
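
// setContainerState is the single point where the lifecycle state changes.
// The states used in this file are types.StateReady, types.StateRunning,
// types.StatePaused and types.StateStopped; start() and stop() additionally
// validate the transition with c.state.ValidTransition before applying it.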

func (c *Container) shareFiles(m Mount, idx int, hostSharedDir, guestSharedDir string) (string, bool, error) {
	randBytes, err := utils.GenerateRandomBytes(8)
	if err != nil {
		return "", false, err
	}

	filename := fmt.Sprintf("%s-%s-%s", c.id, hex.EncodeToString(randBytes), filepath.Base(m.Destination))
	guestDest := filepath.Join(guestSharedDir, filename)

	// Copy the file to the container's rootfs if filesystem sharing is not supported,
	// otherwise bind mount it in the shared directory.
	caps := c.sandbox.hypervisor.capabilities()
	if !caps.IsFsSharingSupported() {
		c.Logger().Debug("filesystem sharing is not supported, files will be copied")

		fileInfo, err := os.Stat(m.Source)
		if err != nil {
			return "", false, err
		}

		// Ignore the mount if this is not a regular file (excludes
		// directory, socket, device, ...) as it cannot be handled by
		// a simple copy. But this should not be treated as an error,
		// only as a limitation.
		if !fileInfo.Mode().IsRegular() {
			c.Logger().WithField("ignored-file", m.Source).Debug("Ignoring non-regular file as FS sharing not supported")
			return "", true, nil
		}

		if err := c.sandbox.agent.copyFile(m.Source, guestDest); err != nil {
			return "", false, err
		}
	} else {
		// These mounts are created in the shared dir
		mountDest := filepath.Join(hostSharedDir, c.sandbox.id, filename)
		if err := bindMount(c.ctx, m.Source, mountDest, false); err != nil {
			return "", false, err
		}
		// Save HostPath mount value into the mount list of the container.
		c.mounts[idx].HostPath = mountDest
	}

	return guestDest, false, nil
}
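
// For illustration, with a hypothetical container ID "abc" and a mount whose
// Destination is "/etc/resolv.conf", the generated share name looks like
// "abc-<16 hex chars>-resolv.conf" (8 random bytes hex-encoded). It is placed
// under guestSharedDir in the guest and, when FS sharing is available,
// bind-mounted under hostSharedDir/<sandbox-id>/ on the host.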

// mountSharedDirMounts handles bind-mounts by bind-mounting them to the host shared
// directory, which is mounted through 9pfs in the VM.
// It also updates the container mount list with the HostPath info, and stores
// the container mounts to storage. This way, the HostPath info will be
// available when those mounts need to be unmounted.
func (c *Container) mountSharedDirMounts(hostSharedDir, guestSharedDir string) ([]Mount, []Mount, error) {
	var sharedDirMounts []Mount
	var ignoredMounts []Mount
	for idx, m := range c.mounts {
		// Skip mounting certain system paths from the source on the host side
		// into the container as it does not make sense to do so.
		// Example sources could be /sys/fs/cgroup etc.
		if isSystemMount(m.Source) {
			continue
		}

		if m.Type != "bind" {
			continue
		}

		// We need to treat /dev/shm as a special case. This is passed as a bind mount in the spec,
		// but it does not make sense to pass this as a 9p mount from the host side.
		// This needs to be handled purely in the guest, by allocating memory for this inside the VM.
		if m.Destination == "/dev/shm" {
			continue
		}

		// Check if the mount is a block device file. If it is, the block device will be attached
		// to the host instead of passing this as a shared mount.
		if len(m.BlockDeviceID) > 0 {
			// Attach this block device, all other devices passed in the config have been attached at this point
			if err := c.sandbox.devManager.AttachDevice(m.BlockDeviceID, c.sandbox); err != nil {
				return nil, nil, err
			}

			if !c.sandbox.supportNewStore() {
				if err := c.sandbox.storeSandboxDevices(); err != nil {
					//TODO: roll back?
					return nil, nil, err
				}
			}
			continue
		}

		// Ignore /dev, directories and all other device files. We handle
		// only regular files in /dev. It does not make sense to pass the host
		// device nodes to the guest.
		if isHostDevice(m.Destination) {
			continue
		}

		guestDest, ignore, err := c.shareFiles(m, idx, hostSharedDir, guestSharedDir)
		if err != nil {
			return nil, nil, err
		}

		// Expand the list of mounts to ignore.
		if ignore {
			ignoredMounts = append(ignoredMounts, Mount{Source: m.Source})
			continue
		}

		// Check if the mount is readonly, and let the agent handle the readonly
		// mount within the VM.
		readonly := false
		for _, flag := range m.Options {
			if flag == "ro" {
				readonly = true
			}
		}

		sharedDirMount := Mount{
			Source:      guestDest,
			Destination: m.Destination,
			Type:        m.Type,
			Options:     m.Options,
			ReadOnly:    readonly,
		}

		sharedDirMounts = append(sharedDirMounts, sharedDirMount)
	}

	if !c.sandbox.supportNewStore() {
		if err := c.storeMounts(); err != nil {
			return nil, nil, err
		}
	}

	return sharedDirMounts, ignoredMounts, nil
}

func (c *Container) unmountHostMounts() error {
	var span opentracing.Span
	span, c.ctx = c.trace("unmountHostMounts")
	defer span.Finish()

	for _, m := range c.mounts {
		if m.HostPath != "" {
			span, _ := c.trace("unmount")
			span.SetTag("host-path", m.HostPath)

			if err := syscall.Unmount(m.HostPath, syscall.MNT_DETACH); err != nil {
				c.Logger().WithFields(logrus.Fields{
					"host-path": m.HostPath,
					"error":     err,
				}).Warn("Could not umount")
				return err
			}

			if m.Type == "bind" {
				s, err := os.Stat(m.HostPath)
				if err != nil {
					return errors.Wrapf(err, "Could not stat host-path %v", m.HostPath)
				}
				// Remove the empty file or directory
				if s.Mode().IsRegular() && s.Size() == 0 {
					os.Remove(m.HostPath)
				}
				if s.Mode().IsDir() {
					syscall.Rmdir(m.HostPath)
				}
			}

			span.Finish()
		}
	}

	return nil
}

func filterDevices(c *Container, devices []ContainerDevice) (ret []ContainerDevice) {
	for _, dev := range devices {
		major, _ := c.sandbox.devManager.GetDeviceByID(dev.ID).GetMajorMinor()
		if _, ok := cdromMajors[major]; ok {
			c.Logger().WithFields(logrus.Fields{
				"device": dev.ContainerPath,
			}).Info("Not attaching device because it is a CDROM")
			continue
		}

		if major == floppyMajor {
			c.Logger().WithFields(logrus.Fields{
				"device": dev.ContainerPath,
			}).Info("Not attaching device because it is a floppy drive")
			continue
		}

		ret = append(ret, dev)
	}
	return
}
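
// For example, a host device node such as /dev/sr0 (major 11, listed as
// SCSI_CDROM_MAJOR in cdromMajors above) or /dev/fd0 (major 2, floppyMajor)
// would be dropped here, while a device like /dev/vdb would be kept and
// attached. The device names are illustrative; any device whose major is not
// a CD-ROM or floppy major passes through.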

func (c *Container) createBlockDevices() error {
	// Iterate over all mounts and create a block device if the mount is block-based.
	for i, m := range c.mounts {
		if len(m.BlockDeviceID) > 0 || m.Type != "bind" {
			// A non-empty m.BlockDeviceID indicates there's already a device
			// associated with the mount, so there is no need to create a new
			// device for it; block devices are only created for bind mounts.
			continue
		}

		var stat unix.Stat_t
		if err := unix.Stat(m.Source, &stat); err != nil {
			return fmt.Errorf("stat %q failed: %v", m.Source, err)
		}

		// Check if the mount is a block device file. If it is, the block device will be
		// attached to the host instead of passing this as a shared mount.
		if c.checkBlockDeviceSupport() && stat.Mode&unix.S_IFBLK == unix.S_IFBLK {
			b, err := c.sandbox.devManager.NewDevice(config.DeviceInfo{
				HostPath:      m.Source,
				ContainerPath: m.Destination,
				DevType:       "b",
				Major:         int64(unix.Major(stat.Rdev)),
				Minor:         int64(unix.Minor(stat.Rdev)),
			})
			if err != nil {
				return fmt.Errorf("device manager failed to create new device for %q: %v", m.Source, err)
			}

			c.mounts[i].BlockDeviceID = b.DeviceID()
		}
	}

	return nil
}
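
// As a sketch of what triggers the branch above: a bind mount whose source is
// a block special file, e.g. an OCI mount entry like
//
//	{"destination": "/data", "type": "bind", "source": "/dev/vdb"}
//
// satisfies stat.Mode&unix.S_IFBLK, so a "b"-type device is created and the
// mount is remembered via BlockDeviceID instead of being shared through 9p.
// The /dev/vdb path is hypothetical.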

// newContainer creates a Container structure from a sandbox and a container configuration.
func newContainer(sandbox *Sandbox, contConfig ContainerConfig) (*Container, error) {
	span, _ := sandbox.trace("newContainer")
	defer span.Finish()

	if !contConfig.valid() {
		return &Container{}, fmt.Errorf("Invalid container configuration")
	}

	c := &Container{
		id:            contConfig.ID,
		sandboxID:     sandbox.id,
		rootFs:        contConfig.RootFs,
		config:        &contConfig,
		sandbox:       sandbox,
		runPath:       store.ContainerRuntimeRootPath(sandbox.id, contConfig.ID),
		configPath:    store.ContainerConfigurationRootPath(sandbox.id, contConfig.ID),
		containerPath: filepath.Join(sandbox.id, contConfig.ID),
		rootfsSuffix:  "rootfs",
		state:         types.ContainerState{},
		process:       Process{},
		mounts:        contConfig.Mounts,
		ctx:           sandbox.ctx,
	}

	storeAlreadyExists := store.VCContainerStoreExists(sandbox.ctx, c.sandboxID, c.id)
	ctrStore, err := store.NewVCContainerStore(sandbox.ctx, c.sandboxID, c.id)
	if err != nil {
		return nil, err
	}
	defer func() {
		if err != nil && !storeAlreadyExists {
			if delerr := c.store.Delete(); delerr != nil {
				c.Logger().WithError(delerr).WithField("cid", c.id).Error("delete store failed")
			}
		}
	}()

	c.store = ctrStore

	// The experimental runtime uses "persist.json" instead of the legacy "state.json" as storage.
	if c.sandbox.supportNewStore() {
		err := c.Restore()
		if err == nil {
			// container restored
			return c, nil
		}

		// Unexpected error
		if !os.IsNotExist(err) && err != errContainerPersistNotExist {
			return nil, err
		}
		// Go to the next step for a newly created container.
	} else {
		state, err := c.store.LoadContainerState()
		if err == nil {
			c.state = state
		}

		var process Process
		if err := c.store.Load(store.Process, &process); err == nil {
			c.process = process
		}
	}

	if err = c.createMounts(); err != nil {
		return nil, err
	}

	if err = c.createDevices(contConfig); err != nil {
		return nil, err
	}

	return c, nil
}
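
// A minimal usage sketch, assuming an already-created sandbox value "sb"
// (how the sandbox is obtained is outside this file, and the ID below is
// hypothetical):
//
//	cfg := ContainerConfig{
//		ID:  "container-0",
//		Cmd: types.Cmd{Args: []string{"/bin/sh"}},
//	}
//	c, err := newContainer(sb, cfg)
//	if err == nil {
//		err = c.create() // attaches devices, then asks the agent to create the container
//	}
//
// Callers normally go through the Sandbox API rather than calling these
// unexported functions directly.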

func (c *Container) createMounts() error {
	// If the sandbox supports the "newstore", only a newly created container can reach
	// this function, so we don't call restore when `supportNewStore` is true.
	if !c.sandbox.supportNewStore() {
		mounts, err := c.loadMounts()
		if err == nil {
			// restore mounts from disk
			c.mounts = mounts
			return nil
		}
	}

	// Create block devices for the newly created container.
	if err := c.createBlockDevices(); err != nil {
		return err
	}

	return nil
}

func (c *Container) createDevices(contConfig ContainerConfig) error {
	// If the sandbox supports the "newstore", only a newly created container can reach
	// this function, so we don't call restore when `supportNewStore` is true.
	if !c.sandbox.supportNewStore() {
		// Devices will be found in storage after the create stage has completed.
		// We load devices from storage at all other stages.
		storedDevices, err := c.loadDevices()
		if err == nil {
			c.devices = storedDevices
			return nil
		}
	}

	// If devices were not found in storage, create Device implementations
	// from the configuration. This should happen at create.
	var storedDevices []ContainerDevice
	for _, info := range contConfig.DeviceInfos {
		dev, err := c.sandbox.devManager.NewDevice(info)
		if err != nil {
			return err
		}

		storedDevices = append(storedDevices, ContainerDevice{
			ID:            dev.DeviceID(),
			ContainerPath: info.ContainerPath,
			FileMode:      info.FileMode,
			UID:           info.UID,
			GID:           info.GID,
		})
	}
	c.devices = filterDevices(c, storedDevices)
	return nil
}

// rollbackFailingContainerCreation rolls back important steps that might have
// been performed before the container creation failed.
// - Unplug CPU and memory resources from the VM.
// - Unplug devices from the VM.
func (c *Container) rollbackFailingContainerCreation() {
	if err := c.detachDevices(); err != nil {
		c.Logger().WithError(err).Error("rollback failed detachDevices()")
	}
	if err := c.removeDrive(); err != nil {
		c.Logger().WithError(err).Error("rollback failed removeDrive()")
	}
}

func (c *Container) checkBlockDeviceSupport() bool {
	if !c.sandbox.config.HypervisorConfig.DisableBlockDeviceUse {
		agentCaps := c.sandbox.agent.capabilities()
		hypervisorCaps := c.sandbox.hypervisor.capabilities()

		if agentCaps.IsBlockDeviceSupported() && hypervisorCaps.IsBlockDeviceHotplugSupported() {
			return true
		}
	}

	return false
}

// create creates and starts a container inside a Sandbox. It has to be
// called only when a new container, not known by the sandbox, has to be created.
func (c *Container) create() (err error) {
	// In case the container creation fails, the following takes care
	// of rolling back all the actions previously performed.
	defer func() {
		if err != nil {
			c.rollbackFailingContainerCreation()
		}
	}()

	if c.checkBlockDeviceSupport() {
		if err = c.hotplugDrive(); err != nil {
			return
		}
	}

	// Attach devices
	if err = c.attachDevices(); err != nil {
		return
	}

	// Deduce additional system mount info that should be handled by the agent
	// inside the VM.
	c.getSystemMountInfo()

	if !c.sandbox.supportNewStore() {
		if err = c.storeDevices(); err != nil {
			return
		}
	}

	process, err := c.sandbox.agent.createContainer(c.sandbox, c)
	if err != nil {
		return err
	}
	c.process = *process

	// Rootless execution does not yet support cgroups, so skip the cgroup
	// creation when running rootlessly.
	if !c.sandbox.config.SandboxCgroupOnly || !rootless.IsRootless() {
		if err = c.cgroupsCreate(); err != nil {
			return
		}
	}

	if !c.sandbox.supportNewStore() {
		// Store the container process returned by the agent.
		if err = c.storeProcess(); err != nil {
			return
		}
	}

	if err = c.setContainerState(types.StateReady); err != nil {
		return
	}

	return nil
}

func (c *Container) delete() error {
	if c.state.State != types.StateReady &&
		c.state.State != types.StateStopped {
		return fmt.Errorf("Container not ready or stopped, impossible to delete")
	}

	// Remove the container from the sandbox structure.
	if err := c.sandbox.removeContainer(c.id); err != nil {
		return err
	}

	// If running rootless, there are no cgroups to remove.
	if !c.sandbox.config.SandboxCgroupOnly || !rootless.IsRootless() {
		if err := c.cgroupsDelete(); err != nil {
			return err
		}
	}

	return c.store.Delete()
}

// checkSandboxRunning validates that the sandbox this container belongs to
// is running.
//
// cmd specifies the operation (or verb) that the check is performed for,
// and is only used to make the returned error as descriptive as possible.
func (c *Container) checkSandboxRunning(cmd string) error {
	if cmd == "" {
		return fmt.Errorf("Cmd cannot be empty")
	}

	if c.sandbox.state.State != types.StateRunning {
		return fmt.Errorf("Sandbox not running, impossible to %s the container", cmd)
	}

	return nil
}

func (c *Container) getSystemMountInfo() {
	// check if /dev needs to be bind mounted from the host /dev
	c.systemMountsInfo.BindMountDev = false

	for _, m := range c.mounts {
		if m.Source == "/dev" && m.Destination == "/dev" && m.Type == "bind" {
			c.systemMountsInfo.BindMountDev = true
		}
	}

	// TODO Deduce /dev/shm size. See https://github.com/clearcontainers/runtime/issues/138
}

func (c *Container) start() error {
	if err := c.checkSandboxRunning("start"); err != nil {
		return err
	}

	if c.state.State != types.StateReady &&
		c.state.State != types.StateStopped {
		return fmt.Errorf("Container not ready or stopped, impossible to start")
	}

	if err := c.state.ValidTransition(c.state.State, types.StateRunning); err != nil {
		return err
	}

	if err := c.sandbox.agent.startContainer(c.sandbox, c); err != nil {
		c.Logger().WithError(err).Error("Failed to start container")

		if err := c.stop(true); err != nil {
			c.Logger().WithError(err).Warn("Failed to stop container")
		}
		return err
	}

	return c.setContainerState(types.StateRunning)
}

func (c *Container) stop(force bool) error {
	span, _ := c.trace("stop")
	defer span.Finish()

	// In case the container status has been updated implicitly because
	// the container process has terminated, it is possible that someone
	// tries to stop the container, and we don't want to issue an error
	// in that case. This should be a no-op.
	//
	// This has to be handled before the transition validation since this
	// is an exception.
	if c.state.State == types.StateStopped {
		c.Logger().Info("Container already stopped")
		return nil
	}

	if err := c.state.ValidTransition(c.state.State, types.StateStopped); err != nil {
		return err
	}

	defer func() {
		span, _ := c.trace("stopShim")
		defer span.Finish()

		// If the shim is still running, something went wrong.
		// Make sure we stop the shim process.
		if running, _ := isShimRunning(c.process.Pid); running {
			l := c.Logger()
			l.Error("Failed to stop container so stopping dangling shim")
			if err := stopShim(c.process.Pid); err != nil {
				l.WithError(err).Warn("failed to stop shim")
			}
		}
	}()

	// Here we expect that stop() has been called because the container
	// process returned or because it received a signal. In case of a
	// signal, we want to give it some time to end the container process.
	// However, if the signal didn't reach its goal, the caller still
	// expects this container to be stopped, that's why we should not
	// return an error, but instead try to kill it forcefully.
	if err := waitForShim(c.process.Pid); err != nil {
		// Force the container to be killed.
		if err := c.kill(syscall.SIGKILL, true); err != nil && !force {
			return err
		}

		// Wait for the end of the container process. We expect this call
		// to succeed. Indeed, we have already given a second chance
		// to the container by trying to kill it with SIGKILL, there
		// is no reason to try to go further if we got an error.
		if err := waitForShim(c.process.Pid); err != nil && !force {
			return err
		}
	}

	// Force the container to be killed. For most of the cases, this
	// should not matter and it should return an error that will be
	// ignored.
	// But for the specific case where the shim has been SIGKILL'ed,
	// the container is still running inside the VM. And this is why
	// this signal will ensure the container will get killed to match
	// the state of the shim. This will allow the following call to
	// stopContainer() to succeed in such particular case.
	c.kill(syscall.SIGKILL, true)

	// Since the agent supports MultiWaitProcess, it's better to wait for
	// the process here, to make sure it has exited before issuing
	// stopContainer; otherwise the RemoveContainerRequest inside it will
	// fail if the process hasn't exited.
	c.sandbox.agent.waitProcess(c, c.id)

	// The container was killed by force; it MUST change its state as soon
	// as possible, just in case one of the operations below fails and
	// leaves the container in a bad state.
	if err := c.setContainerState(types.StateStopped); err != nil {
		return err
	}

	defer func() {
		// Save device and drive data.
		// TODO: can we merge this saving with setContainerState()?
		if c.sandbox.supportNewStore() {
			if err := c.sandbox.Save(); err != nil {
				c.Logger().WithError(err).Info("save container state failed")
			}
		}
	}()

	if err := c.sandbox.agent.stopContainer(c.sandbox, *c); err != nil && !force {
		return err
	}

	if err := c.unmountHostMounts(); err != nil && !force {
		return err
	}

	if err := bindUnmountContainerRootfs(c.ctx, kataHostSharedDir(), c.sandbox.id, c.id); err != nil && !force {
		return err
	}

	if err := c.detachDevices(); err != nil && !force {
		return err
	}

	if err := c.removeDrive(); err != nil && !force {
		return err
	}

	return nil
}

func (c *Container) enter(cmd types.Cmd) (*Process, error) {
	if err := c.checkSandboxRunning("enter"); err != nil {
		return nil, err
	}

	if c.state.State != types.StateReady &&
		c.state.State != types.StateRunning {
		return nil, fmt.Errorf("Container not ready or running, " +
			"impossible to enter")
	}

	process, err := c.sandbox.agent.exec(c.sandbox, *c, cmd)
	if err != nil {
		return nil, err
	}

	return process, nil
}

func (c *Container) wait(processID string) (int32, error) {
	if c.state.State != types.StateReady &&
		c.state.State != types.StateRunning {
		return 0, fmt.Errorf("Container not ready or running, " +
			"impossible to wait")
	}

	return c.sandbox.agent.waitProcess(c, processID)
}

func (c *Container) kill(signal syscall.Signal, all bool) error {
	return c.signalProcess(c.process.Token, signal, all)
}

func (c *Container) signalProcess(processID string, signal syscall.Signal, all bool) error {
	if c.sandbox.state.State != types.StateReady && c.sandbox.state.State != types.StateRunning {
		return fmt.Errorf("Sandbox not ready or running, impossible to signal the container")
	}

	if c.state.State != types.StateReady && c.state.State != types.StateRunning && c.state.State != types.StatePaused {
		return fmt.Errorf("Container not ready, running or paused, impossible to signal the container")
	}

	return c.sandbox.agent.signalProcess(c, processID, signal, all)
}

func (c *Container) winsizeProcess(processID string, height, width uint32) error {
	if c.state.State != types.StateReady && c.state.State != types.StateRunning {
		return fmt.Errorf("Container not ready or running, impossible to resize the process terminal")
	}

	return c.sandbox.agent.winsizeProcess(c, processID, height, width)
}

func (c *Container) ioStream(processID string) (io.WriteCloser, io.Reader, io.Reader, error) {
	if c.state.State != types.StateReady && c.state.State != types.StateRunning {
		return nil, nil, nil, fmt.Errorf("Container not ready or running, impossible to get the I/O streams")
	}

	stream := newIOStream(c.sandbox, c, processID)

	return stream.stdin(), stream.stdout(), stream.stderr(), nil
}

func (c *Container) processList(options ProcessListOptions) (ProcessList, error) {
	if err := c.checkSandboxRunning("ps"); err != nil {
		return nil, err
	}

	if c.state.State != types.StateRunning {
		return nil, fmt.Errorf("Container not running, impossible to list processes")
	}

	return c.sandbox.agent.processListContainer(c.sandbox, *c, options)
}

func (c *Container) stats() (*ContainerStats, error) {
	if err := c.checkSandboxRunning("stats"); err != nil {
		return nil, err
	}
	return c.sandbox.agent.statsContainer(c.sandbox, *c)
}

func (c *Container) update(resources specs.LinuxResources) error {
	if err := c.checkSandboxRunning("update"); err != nil {
		return err
	}

	if state := c.state.State; !(state == types.StateRunning || state == types.StateReady) {
		return fmt.Errorf("Container(%s) not running or ready, impossible to update", state)
	}

	if c.config.Resources.CPU == nil {
		c.config.Resources.CPU = &specs.LinuxCPU{}
	}

	if cpu := resources.CPU; cpu != nil {
		if p := cpu.Period; p != nil && *p != 0 {
			c.config.Resources.CPU.Period = p
		}
		if q := cpu.Quota; q != nil && *q != 0 {
			c.config.Resources.CPU.Quota = q
		}
	}

	if c.config.Resources.Memory == nil {
		c.config.Resources.Memory = &specs.LinuxMemory{}
	}

	if mem := resources.Memory; mem != nil && mem.Limit != nil {
		c.config.Resources.Memory.Limit = mem.Limit
	}

	if err := c.sandbox.updateResources(); err != nil {
		return err
	}

	if !c.sandbox.config.SandboxCgroupOnly {
		if err := c.cgroupsUpdate(resources); err != nil {
			return err
		}
	}

	return c.sandbox.agent.updateContainer(c.sandbox, *c, resources)
}
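
// A sketch of an update request, assuming a caller that wants to cap the
// container at two CPUs worth of time and 512 MiB of memory (the values are
// illustrative):
//
//	period := uint64(100000) // microseconds
//	quota := int64(200000)   // 2 CPUs: quota = 2 * period
//	limit := int64(512 * 1024 * 1024)
//	err := c.update(specs.LinuxResources{
//		CPU:    &specs.LinuxCPU{Period: &period, Quota: &quota},
//		Memory: &specs.LinuxMemory{Limit: &limit},
//	})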

func (c *Container) pause() error {
	if err := c.checkSandboxRunning("pause"); err != nil {
		return err
	}

	if c.state.State != types.StateRunning {
		return fmt.Errorf("Container not running, impossible to pause")
	}

	if err := c.sandbox.agent.pauseContainer(c.sandbox, *c); err != nil {
		return err
	}

	return c.setContainerState(types.StatePaused)
}

func (c *Container) resume() error {
	if err := c.checkSandboxRunning("resume"); err != nil {
		return err
	}

	if c.state.State != types.StatePaused {
		return fmt.Errorf("Container not paused, impossible to resume")
	}

	if err := c.sandbox.agent.resumeContainer(c.sandbox, *c); err != nil {
		return err
	}

	return c.setContainerState(types.StateRunning)
}

func (c *Container) hotplugDrive() error {
	var dev device
	var err error

	// The container rootfs is block-device backed and isn't mounted.
	if !c.rootFs.Mounted {
		dev, err = getDeviceForPath(c.rootFs.Source)
		// there is no "rootfs" dir on a block device backed rootfs
		c.rootfsSuffix = ""
	} else {
		dev, err = getDeviceForPath(c.rootFs.Target)
	}

	if err == errMountPointNotFound {
		return nil
	}

	if err != nil {
		return err
	}

	c.Logger().WithFields(logrus.Fields{
		"device-major": dev.major,
		"device-minor": dev.minor,
		"mount-point":  dev.mountPoint,
	}).Info("device details")

	isDM, err := checkStorageDriver(dev.major, dev.minor)
	if err != nil {
		return err
	}

	if !isDM {
		return nil
	}

	devicePath := c.rootFs.Source
	fsType := c.rootFs.Type
	if c.rootFs.Mounted {
		if dev.mountPoint == c.rootFs.Target {
			c.rootfsSuffix = ""
		}
		// If it is a device mapper device, then fetch the full path of the device.
		devicePath, fsType, err = GetDevicePathAndFsType(dev.mountPoint)
		if err != nil {
			return err
		}
	}

	devicePath, err = filepath.EvalSymlinks(devicePath)
	if err != nil {
		return err
	}

	c.Logger().WithFields(logrus.Fields{
		"device-path": devicePath,
		"fs-type":     fsType,
	}).Info("Block device detected")

	if err = c.plugDevice(devicePath); err != nil {
		return err
	}

	return c.setStateFstype(fsType)
}

func (c *Container) plugDevice(devicePath string) error {
	var stat unix.Stat_t
	if err := unix.Stat(devicePath, &stat); err != nil {
		return fmt.Errorf("stat %q failed: %v", devicePath, err)
	}

	if c.checkBlockDeviceSupport() && stat.Mode&unix.S_IFBLK == unix.S_IFBLK {
		b, err := c.sandbox.devManager.NewDevice(config.DeviceInfo{
			HostPath:      devicePath,
			ContainerPath: filepath.Join(kataGuestSharedDir(), c.id),
			DevType:       "b",
			Major:         int64(unix.Major(stat.Rdev)),
			Minor:         int64(unix.Minor(stat.Rdev)),
		})
		if err != nil {
			return fmt.Errorf("device manager failed to create rootfs device for %q: %v", devicePath, err)
		}

		c.state.BlockDeviceID = b.DeviceID()

		// attach rootfs device
		if err := c.sandbox.devManager.AttachDevice(b.DeviceID(), c.sandbox); err != nil {
			return err
		}

		if !c.sandbox.supportNewStore() {
			if err := c.sandbox.storeSandboxDevices(); err != nil {
				return err
			}
		}
	}
	return nil
}

// isDriveUsed checks if a drive has been used for the container rootfs.
func (c *Container) isDriveUsed() bool {
	return c.state.Fstype != ""
}

func (c *Container) removeDrive() (err error) {
	if c.isDriveUsed() {
		c.Logger().Info("unplugging block device")

		devID := c.state.BlockDeviceID
		err := c.sandbox.devManager.DetachDevice(devID, c.sandbox)
		if err != nil && err != manager.ErrDeviceNotAttached {
			return err
		}

		if err = c.sandbox.devManager.RemoveDevice(devID); err != nil {
			c.Logger().WithFields(logrus.Fields{
				"container": c.id,
				"device-id": devID,
			}).WithError(err).Error("remove device failed")

			// ignore the device not exist error
			if err != manager.ErrDeviceNotExist {
				return err
			}
		}

		if !c.sandbox.supportNewStore() {
			if err := c.sandbox.storeSandboxDevices(); err != nil {
				return err
			}
		}
	}

	return nil
}

func (c *Container) attachDevices() error {
	// There's no need to roll back when an error happens here,
	// because if attachDevices fails, container creation will fail too,
	// and rollbackFailingContainerCreation will do all the rollbacks.
	for _, dev := range c.devices {
		if err := c.sandbox.devManager.AttachDevice(dev.ID, c.sandbox); err != nil {
			return err
		}
	}

	if !c.sandbox.supportNewStore() {
		if err := c.sandbox.storeSandboxDevices(); err != nil {
			return err
		}
	}
	return nil
}

func (c *Container) detachDevices() error {
	for _, dev := range c.devices {
		err := c.sandbox.devManager.DetachDevice(dev.ID, c.sandbox)
		if err != nil && err != manager.ErrDeviceNotAttached {
			return err
		}

		if err = c.sandbox.devManager.RemoveDevice(dev.ID); err != nil {
			c.Logger().WithFields(logrus.Fields{
				"container": c.id,
				"device-id": dev.ID,
			}).WithError(err).Error("remove device failed")

			// ignore the device not exist error
			if err != manager.ErrDeviceNotExist {
				return err
			}
		}
	}

	if !c.sandbox.supportNewStore() {
		if err := c.sandbox.storeSandboxDevices(); err != nil {
			return err
		}
	}
	return nil
}

// cgroupsCreate creates cgroups on the host for the associated container.
func (c *Container) cgroupsCreate() (err error) {
	spec := c.GetOCISpec()
	if spec == nil {
		return errorMissingOCISpec
	}

	// https://github.com/kata-containers/runtime/issues/168
	resources := specs.LinuxResources{
		CPU: nil,
	}

	if spec.Linux != nil && spec.Linux.Resources != nil {
		resources.CPU = validCPUResources(spec.Linux.Resources.CPU)
	}

	cgroupPath := utils.ValidCgroupPath(spec.Linux.CgroupsPath)
	c.state.CgroupPath, err = renameCgroupPath(cgroupPath)
	if err != nil {
		return err
	}

	cgroup, err := cgroupsNewFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath), &resources)
	if err != nil {
		return fmt.Errorf("Could not create cgroup for %v: %v", c.state.CgroupPath, err)
	}

	c.config.Resources = resources

	// Add the shim into the cgroup.
	if c.process.Pid > 0 {
		if err := cgroup.Add(cgroups.Process{Pid: c.process.Pid}); err != nil {
			return fmt.Errorf("Could not add PID %d to cgroup %v: %v", c.process.Pid, spec.Linux.CgroupsPath, err)
		}
	}

	return nil
}
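
// cgroupsNewFunc is presumably a test-replaceable wrapper around cgroups.New
// from github.com/containerd/cgroups (an assumption — it is defined elsewhere
// in this package). A stand-alone sketch of the same call pattern, with a
// hypothetical path:
//
//	cg, err := cgroups.New(cgroups.V1,
//		cgroups.StaticPath("/kata/container-0"), &specs.LinuxResources{})
//	if err == nil {
//		_ = cg.Add(cgroups.Process{Pid: shimPid})
//	}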

// cgroupsDelete deletes the cgroups on the host for the associated container.
func (c *Container) cgroupsDelete() error {
	if c.state.CgroupPath == "" {
		c.Logger().Debug("container does not have host cgroups: nothing to delete")
		return nil
	}

	cgroup, err := cgroupsLoadFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath))

	if err == cgroups.ErrCgroupDeleted {
		// cgroup already deleted
		return nil
	}

	if err != nil {
		return fmt.Errorf("Could not load container cgroup %v: %v", c.state.CgroupPath, err)
	}

	// Move any running processes to the parent cgroup, so that this cgroup can be removed.
	parent, err := parentCgroup(cgroups.V1, c.state.CgroupPath)
	if err != nil {
		// The parent cgroup doesn't exist, which means there are no processes
		// running and the container cgroup was already removed.
		c.Logger().WithError(err).Warn("Container cgroup doesn't exist")
		return nil
	}

	if err := cgroup.MoveTo(parent); err != nil {
		// Don't fail, the cgroup can still be deleted.
		c.Logger().WithError(err).Warn("Could not move container process into parent cgroup")
	}

	if err := cgroup.Delete(); err != nil {
		return fmt.Errorf("Could not delete container cgroup path='%v': error='%v'", c.state.CgroupPath, err)
	}

	return nil
}

// cgroupsUpdate updates cgroups on the host for the associated container.
func (c *Container) cgroupsUpdate(resources specs.LinuxResources) error {
	if c.state.CgroupPath == "" {
		c.Logger().Debug("container does not have host cgroups: nothing to update")
		return nil
	}
	cgroup, err := cgroupsLoadFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath))
	if err != nil {
		return fmt.Errorf("Could not load cgroup %v: %v", c.state.CgroupPath, err)
	}

	// Issue: https://github.com/kata-containers/runtime/issues/168
	r := specs.LinuxResources{
		CPU: validCPUResources(resources.CPU),
	}

	// update cgroup
	if err := cgroup.Update(&r); err != nil {
		return fmt.Errorf("Could not update container cgroup path='%v': error='%v'", c.state.CgroupPath, err)
	}

	// store new resources
	c.config.Resources = r
	if err := c.storeContainer(); err != nil {
		return err
	}

	return nil
}