mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-02 07:02:16 +00:00
The NVIDIA BF3 SR-IOV device plugin injects the VF BDF only as a PCIDEVICE_* environment variable; it does not add the VFIO char device to linux.devices in the OCI spec. As a result the agent's container_has_vfio_device() gate stays closed and expose_guest_infiniband_devices() is never triggered — leaving /dev/infiniband absent from the container even though the guest kernel created the IB devices (mlx5_core.rdma.0 probes successfully). Add appendPhysicalEndpointDevices() which runs after appendDevices() in createContainer(). It walks the sandbox network endpoints; for each PhysicalEndpoint with a resolved guest PCI path it derives the VFIO group char path from sysfs (iommu_group symlink) and synthesises a vfio-pci-gk Device entry. Both legacy group paths (/dev/vfio/N) and iommufd cdev paths (/dev/vfio/devices/vfioN) are supported by reading the iommu_group sysfs symlink. Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com> Assisted-by: Cursor <cursoragent@cursor.com>
2049 lines
65 KiB
Go
2049 lines
65 KiB
Go
// Copyright (c) 2016 Intel Corporation
|
|
// Copyright (c) 2014,2015,2016,2017 Docker, Inc.
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
//
|
|
|
|
package virtcontainers
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"slices"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
|
deviceUtils "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
|
|
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
|
|
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
|
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
|
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
|
|
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
|
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
|
|
|
|
"github.com/moby/sys/mountinfo"
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// containerTracingTags holds the static tags passed to katatrace.Trace by the
// container code in this file.
var containerTracingTags = map[string]string{
	"source":    "runtime",
	"package":   "virtcontainers",
	"subsystem": "container",
}
|
|
|
|
// cdromMajors maps the major device numbers of CD-ROM drivers to the name of
// the macro that defines them. filterDevices uses it to drop CD-ROM devices.
//
// The numbers come from the kernel's list of major device numbers:
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
var cdromMajors = map[int64]string{
	11: "SCSI_CDROM_MAJOR",
	15: "CDU31A_CDROM_MAJOR",
	16: "GOLDSTAR_CDROM_MAJOR",
	17: "OPTICS_CDROM_MAJOR",
	18: "SANYO_CDROM_MAJOR",
	20: "MITSUMI_X_CDROM_MAJOR",
	23: "MITSUMI_CDROM_MAJOR",
	24: "CDU535_CDROM_MAJOR",
	25: "MATSUSHITA_CDROM_MAJOR",
	26: "MATSUSHITA_CDROM2_MAJOR",
	27: "MATSUSHITA_CDROM3_MAJOR",
	28: "MATSUSHITA_CDROM4_MAJOR",
	29: "AZTECH_CDROM_MAJOR",
	32: "CM206_CDROM_MAJOR",
}
|
|
|
|
// floppyMajor is the major device number of floppy drives; filterDevices
// uses it to drop floppy devices.
//
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
// #define FLOPPY_MAJOR 2
const floppyMajor = int64(2)
|
|
|
|
// Process gathers data related to a container process.
type Process struct {
	StartTime time.Time

	// Token is the process execution context ID. It must be
	// unique per sandbox.
	// Token is used to manipulate processes for containers
	// that have not started yet, and later to identify them
	// uniquely within a sandbox.
	Token string

	// Pid is the process ID as seen by the host software
	// stack, e.g. CRI-O, containerd. This is typically the
	// shim PID.
	Pid int
}
|
|
|
|
// ContainerStatus describes a container status.
type ContainerStatus struct {
	Spec *specs.Spec

	// Annotations allow clients to store arbitrary values,
	// for example to add additional status values required
	// to support particular specifications.
	Annotations map[string]string

	ID        string
	RootFs    string
	StartTime time.Time
	State     types.ContainerState

	PID int
}
|
|
|
|
// ThrottlingData gathers the data related to container CPU throttling.
type ThrottlingData struct {
	// Number of periods with throttling active.
	Periods uint64 `json:"periods,omitempty"`
	// Number of periods when the container hit its throttling limit.
	ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
	// Aggregate time the container was throttled for, in nanoseconds.
	ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
|
|
|
|
// CPUUsage denotes the usage of a CPU.
// All CPU stats are aggregated since container inception.
type CPUUsage struct {
	// Total CPU time consumed per core.
	// Units: nanoseconds.
	PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
	// Total CPU time consumed.
	// Units: nanoseconds.
	TotalUsage uint64 `json:"total_usage,omitempty"`
	// Time spent by tasks of the cgroup in kernel mode.
	// Units: nanoseconds.
	UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
	// Time spent by tasks of the cgroup in user mode.
	// Units: nanoseconds.
	UsageInUsermode uint64 `json:"usage_in_usermode"`
}
|
|
|
|
// CPUStats describes the CPU stats: the usage counters and the throttling
// counters.
type CPUStats struct {
	CPUUsage       CPUUsage       `json:"cpu_usage,omitempty"`
	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}
|
|
|
|
// MemoryData gathers the data related to one kind of memory accounting.
// MemoryStats uses it for memory, swap, kernel and kernel TCP usage.
type MemoryData struct {
	Usage    uint64 `json:"usage,omitempty"`
	MaxUsage uint64 `json:"max_usage,omitempty"`
	Failcnt  uint64 `json:"failcnt"`
	Limit    uint64 `json:"limit"`
}
|
|
|
|
// MemoryStats describes the memory stats.
type MemoryStats struct {
	Stats map[string]uint64 `json:"stats,omitempty"`
	// Usage of memory.
	Usage MemoryData `json:"usage,omitempty"`
	// Usage of memory swap.
	SwapUsage MemoryData `json:"swap_usage,omitempty"`
	// Usage of kernel memory.
	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
	// Usage of kernel TCP memory.
	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
	// Memory used for cache.
	Cache uint64 `json:"cache,omitempty"`
	// If true, memory usage is accounted for throughout a hierarchy of cgroups.
	UseHierarchy bool `json:"use_hierarchy"`
}
|
|
|
|
// PidsStats describes the pids stats.
type PidsStats struct {
	// Number of pids in the cgroup.
	Current uint64 `json:"current,omitempty"`
	// Active pids hard limit.
	Limit uint64 `json:"limit,omitempty"`
}
|
|
|
|
// BlkioStatEntry gathers the data related to a block device: one value for
// one operation on the device identified by Major and Minor.
type BlkioStatEntry struct {
	Op    string `json:"op,omitempty"`
	Major uint64 `json:"major,omitempty"`
	Minor uint64 `json:"minor,omitempty"`
	Value uint64 `json:"value,omitempty"`
}
|
|
|
|
// BlkioStats describes block I/O stats.
type BlkioStats struct {
	// Number of bytes transferred to and from the block device.
	IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
	IoServicedRecursive     []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
	// NOTE(review): the JSON key is "io_queue_recursive" (no "d") while the
	// field is IoQueuedRecursive; the key is part of the wire format.
	IoQueuedRecursive      []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
	IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
	IoWaitTimeRecursive    []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
	IoMergedRecursive      []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
	IoTimeRecursive        []BlkioStatEntry `json:"io_time_recursive,omitempty"`
	SectorsRecursive       []BlkioStatEntry `json:"sectors_recursive,omitempty"`
}
|
|
|
|
// HugetlbStats describes the hugetlb (huge page) memory stats.
type HugetlbStats struct {
	// Current res_counter usage for hugetlb.
	Usage uint64 `json:"usage,omitempty"`
	// Maximum usage ever recorded.
	MaxUsage uint64 `json:"max_usage,omitempty"`
	// Number of times a hugetlb usage allocation failed.
	Failcnt uint64 `json:"failcnt"`
}
|
|
|
|
// CgroupStats describes the stats of all cgroup subsystems.
type CgroupStats struct {
	// The map is in the format "size of hugepage: stats of the hugepage".
	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
	BlkioStats   BlkioStats              `json:"blkio_stats,omitempty"`
	CPUStats     CPUStats                `json:"cpu_stats,omitempty"`
	MemoryStats  MemoryStats             `json:"memory_stats,omitempty"`
	PidsStats    PidsStats               `json:"pids_stats,omitempty"`
}
|
|
|
|
// NetworkStats describes the stats of one network interface: receive (Rx)
// and transmit (Tx) counters for bytes, packets, errors and drops.
type NetworkStats struct {
	// Name is the name of the network interface.
	Name string `json:"name,omitempty"`

	RxBytes   uint64 `json:"rx_bytes,omitempty"`
	RxPackets uint64 `json:"rx_packets,omitempty"`
	RxErrors  uint64 `json:"rx_errors,omitempty"`
	RxDropped uint64 `json:"rx_dropped,omitempty"`
	TxBytes   uint64 `json:"tx_bytes,omitempty"`
	TxPackets uint64 `json:"tx_packets,omitempty"`
	TxErrors  uint64 `json:"tx_errors,omitempty"`
	TxDropped uint64 `json:"tx_dropped,omitempty"`
}
|
|
|
|
// ContainerStats describes the stats of a container: its cgroup stats and
// one NetworkStats entry per network interface.
type ContainerStats struct {
	CgroupStats  *CgroupStats
	NetworkStats []*NetworkStats
}
|
|
|
|
// ContainerResources describes container resources.
type ContainerResources struct {
	// VCPUs is the number of vCPUs that are being used by the container.
	VCPUs uint32

	// MemByte is the memory that is being used by the container.
	// NOTE(review): the unit is presumably bytes, going by the field name.
	MemByte int64
}
|
|
|
|
// ContainerConfig describes one container runtime configuration.
type ContainerConfig struct {
	// Device configuration for devices that must be available within the container.
	DeviceInfos []config.DeviceInfo

	Mounts []Mount

	// Raw OCI specification, it won't be saved to disk.
	CustomSpec *specs.Spec `json:"-"`

	// Annotations allow clients to store arbitrary values,
	// for example to add additional status values required
	// to support particular specifications.
	Annotations map[string]string

	// ID is the container identifier; valid() rejects a configuration
	// whose ID is empty.
	ID string

	// Resources holds the container resources.
	Resources specs.LinuxResources

	// Cmd specifies the command to run on a container.
	Cmd types.Cmd

	// RootFs is the container workload image on the host.
	RootFs RootFs

	// ReadonlyRootfs indicates if the rootfs should be mounted readonly.
	ReadonlyRootfs bool
}
|
|
|
|
// valid checks that the container configuration is valid.
|
|
func (c *ContainerConfig) valid() bool {
|
|
if c == nil {
|
|
return false
|
|
}
|
|
|
|
if c.ID == "" {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// SystemMountsInfo describes additional information for system mounts that
// the agent needs to handle.
type SystemMountsInfo struct {
	// BindMountDev indicates if /dev has been passed as a bind mount for the host /dev.
	BindMountDev bool

	// DevShmSize is the size of /dev/shm assigned on the host.
	DevShmSize uint
}
|
|
|
|
// ContainerDevice describes a device associated with a container.
type ContainerDevice struct {
	// ID is the device id referencing the device from the sandbox's device manager.
	ID string

	// ContainerPath is the device path displayed in the container.
	ContainerPath string

	// FileMode holds the permission bits for the device.
	FileMode os.FileMode

	// UID is the user ID in the container namespace.
	UID uint32

	// GID is the group ID in the container namespace.
	GID uint32

	// Shared indicates whether the device is shared across containers.
	Shared bool
}
|
|
|
|
// EphemeralDisk holds information about an ephemeral disk created for
// block-based emptyDir volumes.
type EphemeralDisk struct {
	// DiskPath is the path to the disk image file.
	DiskPath string

	// SourcePath is the emptyDir source path, i.e. the folder created by
	// Kubelet on the host.
	SourcePath string
}
|
|
|
|
// RootFs describes the container's rootfs.
type RootFs struct {
	// Source specifies the BlockDevice path.
	Source string
	// Target specifies where the rootfs is mounted if it has been mounted.
	Target string
	// Type specifies the type of filesystem to mount.
	Type string
	// Options specifies zero or more fstab style mount options.
	// createVirtualVolumeDevices also scans these options for entries that
	// start with VirtualVolumePrefix.
	Options []string
	// Mounted specifies whether the rootfs has been mounted or not.
	Mounted bool
}
|
|
|
|
// Container holds the runtime view of one container of a sandbox: its
// configuration, its owning sandbox, and its mounts, devices, state and
// process.
// A Container can be created, deleted, started, stopped, listed, entered, paused and restored.
type Container struct {
	// ctx is copied from the sandbox's ctx by newContainer.
	ctx context.Context

	config  *ContainerConfig
	sandbox *Sandbox

	id        string
	sandboxID string
	// containerPath is "<sandbox id>/<container id>" as built by newContainer.
	containerPath string
	// rootfsSuffix is set to "rootfs" by newContainer.
	rootfsSuffix string

	// mounts starts as the Mounts of the container configuration.
	mounts []Mount

	devices []ContainerDevice

	state types.ContainerState

	process Process

	rootFs RootFs

	systemMountsInfo SystemMountsInfo
}
|
|
|
|
// ID returns the container identifier string.
func (c *Container) ID() string {
	return c.id
}
|
|
|
|
// Logger returns a logrus logger appropriate for logging Container messages
|
|
func (c *Container) Logger() *logrus.Entry {
|
|
return virtLog.WithFields(logrus.Fields{
|
|
"subsystem": "container",
|
|
"sandbox": c.sandboxID,
|
|
"container": c.id,
|
|
})
|
|
}
|
|
|
|
// Sandbox returns the sandbox handler related to this container.
func (c *Container) Sandbox() VCSandbox {
	return c.sandbox
}
|
|
|
|
// Process returns a copy of the container process data.
func (c *Container) Process() Process {
	return c.process
}
|
|
|
|
// GetToken returns the token related to this container's process.
func (c *Container) GetToken() string {
	return c.process.Token
}
|
|
|
|
// GetPid returns the pid related to this container's process.
func (c *Container) GetPid() int {
	return c.process.Pid
}
|
|
|
|
// setStateFstype records fstype in the container's in-memory state. It does
// not persist the state and always returns nil.
func (c *Container) setStateFstype(fstype string) error {
	c.state.Fstype = fstype

	return nil
}
|
|
|
|
// GetAnnotations returns the container's annotations. The map is the one
// held by the container configuration, not a copy.
func (c *Container) GetAnnotations() map[string]string {
	return c.config.Annotations
}
|
|
|
|
// GetPatchedOCISpec returns the container's OCI specification, i.e. the
// CustomSpec of the container configuration (not a copy).
// This OCI specification was patched when the sandbox was created
// by containerCapabilities(), SetEphemeralStorageType() and others
// in order to support:
// * Capabilities
// * Ephemeral storage
// * k8s empty dir
// If you need the original (vanilla) OCI spec,
// use compatoci.GetContainerSpec() instead.
func (c *Container) GetPatchedOCISpec() *specs.Spec {
	return c.config.CustomSpec
}
|
|
|
|
// setContainerState sets both the in-memory and on-disk state of the
|
|
// container.
|
|
func (c *Container) setContainerState(state types.StateString) error {
|
|
if state == "" {
|
|
return types.ErrNeedState
|
|
}
|
|
|
|
c.Logger().Debugf("Setting container state from %v to %v", c.state.State, state)
|
|
// update in-memory state
|
|
c.state.State = state
|
|
|
|
// flush data to storage
|
|
if err := c.sandbox.Save(); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// mountSharedDirMounts handles bind-mounts by bindmounting to the host shared
|
|
// directory which is mounted through virtiofs/9pfs in the VM.
|
|
// It also updates the container mount list with the HostPath info, and store
|
|
// container mounts to the storage. This way, we will have the HostPath info
|
|
// available when we will need to unmount those mounts.
|
|
func (c *Container) mountSharedDirMounts(ctx context.Context, sharedDirMounts, ignoredMounts map[string]Mount) (storages []*grpc.Storage, err error) {
|
|
var devicesToDetach []string
|
|
defer func() {
|
|
if err != nil {
|
|
for _, id := range devicesToDetach {
|
|
c.sandbox.devManager.DetachDevice(ctx, id, c.sandbox)
|
|
}
|
|
}
|
|
}()
|
|
|
|
for idx, m := range c.mounts {
|
|
// Skip mounting certain system paths from the source on the host side
|
|
// into the container as it does not make sense to do so.
|
|
// Example sources could be /sys/fs/cgroup etc.
|
|
if isSystemMount(m.Source) {
|
|
continue
|
|
}
|
|
|
|
// Check if mount is a block device file. If it is, the block device will be attached to the host
|
|
// instead of passing this as a shared mount:
|
|
if len(m.BlockDeviceID) > 0 {
|
|
// Attach this block device, all other devices passed in the config have been attached at this point
|
|
if err = c.sandbox.devManager.AttachDevice(ctx, m.BlockDeviceID, c.sandbox); err != nil {
|
|
return storages, err
|
|
}
|
|
devicesToDetach = append(devicesToDetach, m.BlockDeviceID)
|
|
continue
|
|
}
|
|
|
|
// For non-block based mounts, we are only interested in bind mounts
|
|
if m.Type != "bind" {
|
|
continue
|
|
}
|
|
|
|
// We need to treat /dev/shm as a special case. This is passed as a bind mount in the spec,
|
|
// but it does not make sense to pass this as a 9p mount from the host side.
|
|
// This needs to be handled purely in the guest, by allocating memory for this inside the VM.
|
|
if m.Destination == "/dev/shm" {
|
|
continue
|
|
}
|
|
|
|
// Ignore /dev, directories and all other device files. We handle
|
|
// only regular files in /dev. It does not make sense to pass the host
|
|
// device nodes to the guest. We also ignore inaccessible host
|
|
// devices in case we're mounting a device that is only
|
|
// accessible in the guest.
|
|
//
|
|
// Note: K8s/containerd seems to create the source path as a
|
|
// directory on the host if it does not already exist.
|
|
// isHostDevice() will still return true in that case, so the
|
|
// above contract holds.
|
|
if isDevice, err := isHostDevice(m.Source); isDevice || err != nil {
|
|
continue
|
|
}
|
|
|
|
sharedFile, err := c.sandbox.fsShare.ShareFile(ctx, c, &c.mounts[idx])
|
|
if err != nil {
|
|
return storages, err
|
|
}
|
|
|
|
// Expand the list of mounts to ignore.
|
|
if sharedFile == nil {
|
|
ignoredMounts[m.Source] = Mount{Source: m.Source}
|
|
continue
|
|
}
|
|
sharedDirMount := Mount{
|
|
Source: sharedFile.guestPath,
|
|
Destination: m.Destination,
|
|
Type: m.Type,
|
|
Options: m.Options,
|
|
ReadOnly: m.ReadOnly,
|
|
}
|
|
|
|
// virtiofs does not support inotify. To workaround this limitation, we want to special case
|
|
// mounts that are commonly 'watched'. "watchable" mounts include:
|
|
// - Kubernetes configmap
|
|
// - Kubernetes secret
|
|
// If we identify one of these, we'll need to carry out polling in the guest in order to present the
|
|
// container with a mount that supports inotify. To do this, we create a Storage object for
|
|
// the "watchable-bind" driver. This will have the agent create a new mount that is watchable,
|
|
// who's effective source is the original mount (the agent will poll the original mount for changes and
|
|
// manually update the path that is mounted into the container).
|
|
// Based on this, let's make sure we update the sharedDirMount structure with the new watchable-mount as
|
|
// the source (this is what is utilized to update the OCI spec).
|
|
caps := c.sandbox.hypervisor.Capabilities(ctx)
|
|
if isWatchableMount(m.Source) && caps.IsFsSharingSupported() {
|
|
|
|
// Create path in shared directory for creating watchable mount:
|
|
watchableHostPath := filepath.Join(getMountPath(c.sandboxID), "watchable")
|
|
if err := os.MkdirAll(watchableHostPath, DirMode); err != nil {
|
|
return storages, fmt.Errorf("unable to create watchable path: %s: %v", watchableHostPath, err)
|
|
}
|
|
|
|
watchableGuestMount := filepath.Join(kataGuestSharedDir(), "watchable", filepath.Base(sharedFile.guestPath))
|
|
|
|
storage := &grpc.Storage{
|
|
Driver: kataWatchableBindDevType,
|
|
Source: sharedFile.guestPath,
|
|
Fstype: "bind",
|
|
MountPoint: watchableGuestMount,
|
|
Options: m.Options,
|
|
}
|
|
storages = append(storages, storage)
|
|
|
|
// Update the sharedDirMount, in order to identify what will
|
|
// change in the OCI spec.
|
|
sharedDirMount.Source = watchableGuestMount
|
|
}
|
|
|
|
sharedDirMounts[sharedDirMount.Destination] = sharedDirMount
|
|
}
|
|
|
|
return storages, nil
|
|
}
|
|
|
|
func (c *Container) unmountHostMounts(ctx context.Context) error {
|
|
span, ctx := katatrace.Trace(ctx, c.Logger(), "unmountHostMounts", containerTracingTags, map[string]string{"container_id": c.id})
|
|
defer span.End()
|
|
|
|
unmountFunc := func(m Mount) (err error) {
|
|
span, _ := katatrace.Trace(ctx, c.Logger(), "unmount", containerTracingTags, map[string]string{"container_id": c.id, "host-path": m.HostPath})
|
|
defer func() {
|
|
if err != nil {
|
|
katatrace.AddTags(span, "error", err)
|
|
}
|
|
span.End()
|
|
}()
|
|
|
|
if err = c.sandbox.fsShare.UnshareFile(ctx, c, &m); err != nil {
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"host-path": m.HostPath,
|
|
"error": err,
|
|
}).Warn("Could not umount")
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
for _, m := range c.mounts {
|
|
if m.HostPath != "" {
|
|
if err := unmountFunc(m); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func filterDevices(c *Container, devices []ContainerDevice) (ret []ContainerDevice) {
|
|
for _, dev := range devices {
|
|
major, _ := c.sandbox.devManager.GetDeviceByID(dev.ID).GetMajorMinor()
|
|
if _, ok := cdromMajors[major]; ok {
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"device": dev.ContainerPath,
|
|
}).Info("Not attach device because it is a CDROM")
|
|
continue
|
|
}
|
|
|
|
if major == floppyMajor {
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"device": dev.ContainerPath,
|
|
}).Info("Not attaching device because it is a floppy drive")
|
|
continue
|
|
}
|
|
|
|
ret = append(ret, dev)
|
|
}
|
|
return
|
|
}
|
|
|
|
// Add any mount based block devices to the device manager and Save the
|
|
// device ID for the particular mount. This'll occur when the mountpoint source
|
|
// is a block device.
|
|
func (c *Container) createBlockDevices(ctx context.Context) error {
|
|
// iterate all mounts and create block device if it's block based.
|
|
for i := range c.mounts {
|
|
// If block devices are disabled, we selectively only hotplug if
|
|
// the mount is an encrypted block-based emptyDir, to avoid
|
|
// cases that could regress 20ca4d2.
|
|
if !c.checkBlockDeviceSupport(ctx) && (c.sandbox.config.EmptyDirMode != EmptyDirModeVirtioBlkEncrypted || !Isk8sHostEmptyDir(c.mounts[i].Source)) {
|
|
c.Logger().Warn("Block device not supported")
|
|
continue
|
|
}
|
|
|
|
if len(c.mounts[i].BlockDeviceID) > 0 {
|
|
// Non-empty m.BlockDeviceID indicates there's already one device
|
|
// associated with the mount,so no need to create a new device for it
|
|
// and we only create block device for bind mount
|
|
continue
|
|
}
|
|
|
|
isBlockFile := HasOption(c.mounts[i].Options, vcAnnotations.IsFileBlockDevice)
|
|
if c.mounts[i].Type != "bind" && !isBlockFile {
|
|
// We only handle for bind and block device mounts.
|
|
continue
|
|
}
|
|
|
|
// Handle directly assigned volume. Update the mount info based on the mount info json.
|
|
mntInfo, e := volume.VolumeMountInfo(c.mounts[i].Source)
|
|
if e != nil && !os.IsNotExist(e) {
|
|
c.Logger().WithError(e).WithField("mount-source", c.mounts[i].Source).
|
|
Error("failed to parse the mount info file for a direct assigned volume")
|
|
continue
|
|
}
|
|
|
|
if mntInfo != nil {
|
|
// Write out sandbox info file on the mount source to allow CSI to communicate with the runtime
|
|
if err := volume.RecordSandboxID(c.sandboxID, c.mounts[i].Source); err != nil {
|
|
c.Logger().WithError(err).Error("error writing sandbox info")
|
|
}
|
|
|
|
// When using direct volume assignment, we assume the source file is a disk if it's a regular file.
|
|
fileInfo, err := os.Stat(mntInfo.Device)
|
|
if err == nil && fileInfo.Mode().IsRegular() {
|
|
isBlockFile = true
|
|
}
|
|
|
|
readonly := false
|
|
for _, flag := range mntInfo.Options {
|
|
if flag == "ro" {
|
|
readonly = true
|
|
break
|
|
}
|
|
}
|
|
|
|
c.mounts[i].Source = mntInfo.Device
|
|
c.mounts[i].Type = mntInfo.FsType
|
|
c.mounts[i].Options = mntInfo.Options
|
|
c.mounts[i].ReadOnly = readonly
|
|
|
|
for key, value := range mntInfo.Metadata {
|
|
switch key {
|
|
case volume.EncryptionKeyMetadataKey:
|
|
c.mounts[i].EncryptionKey = value
|
|
case volume.FSGroupMetadataKey:
|
|
gid, err := strconv.Atoi(value)
|
|
if err != nil {
|
|
c.Logger().WithError(err).Errorf("invalid group id value %s provided for key %s", value, volume.FSGroupMetadataKey)
|
|
continue
|
|
}
|
|
c.mounts[i].FSGroup = &gid
|
|
case volume.FSGroupChangePolicyMetadataKey:
|
|
if _, exists := mntInfo.Metadata[volume.FSGroupMetadataKey]; !exists {
|
|
c.Logger().Errorf("%s specified without provding the group id with key %s", volume.FSGroupChangePolicyMetadataKey, volume.FSGroupMetadataKey)
|
|
continue
|
|
}
|
|
c.mounts[i].FSGroupChangePolicy = volume.FSGroupChangePolicy(value)
|
|
default:
|
|
c.Logger().Warnf("Ignoring unsupported direct-assignd volume metadata key: %s, value: %s", key, value)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if mount is a block device file. If it is, the block device will be attached to the host
|
|
// instead of passing this as a shared mount.
|
|
di, err := c.createDeviceInfo(c.mounts[i].Source, c.mounts[i].Destination, c.mounts[i].ReadOnly, isBlockFile)
|
|
if err == nil && di != nil {
|
|
b, err := c.sandbox.devManager.NewDevice(*di)
|
|
if err != nil {
|
|
// Do not return an error, try to create
|
|
// devices for other mounts
|
|
c.Logger().WithError(err).WithField("mount-source", c.mounts[i].Source).
|
|
Error("device manager failed to create new device")
|
|
continue
|
|
|
|
}
|
|
|
|
c.mounts[i].BlockDeviceID = b.DeviceID()
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *Container) initConfigResourcesMemory() {
|
|
ociSpec := c.GetPatchedOCISpec()
|
|
c.config.Resources.Memory = &specs.LinuxMemory{}
|
|
ociSpec.Linux.Resources.Memory = c.config.Resources.Memory
|
|
}
|
|
|
|
// newContainer creates a Container structure from a sandbox and a container configuration.
|
|
func newContainer(ctx context.Context, sandbox *Sandbox, contConfig *ContainerConfig) (*Container, error) {
|
|
span, ctx := katatrace.Trace(ctx, nil, "newContainer", containerTracingTags, map[string]string{"container_id": contConfig.ID, "sandbox_id": sandbox.id})
|
|
defer span.End()
|
|
|
|
if !contConfig.valid() {
|
|
return &Container{}, fmt.Errorf("Invalid container configuration")
|
|
}
|
|
|
|
c := &Container{
|
|
id: contConfig.ID,
|
|
sandboxID: sandbox.id,
|
|
rootFs: contConfig.RootFs,
|
|
config: contConfig,
|
|
sandbox: sandbox,
|
|
containerPath: filepath.Join(sandbox.id, contConfig.ID),
|
|
rootfsSuffix: "rootfs",
|
|
state: types.ContainerState{},
|
|
process: Process{},
|
|
mounts: contConfig.Mounts,
|
|
ctx: sandbox.ctx,
|
|
}
|
|
|
|
// Set the Annotations of SWAP to Resources
|
|
if resourceSwappinessStr, ok := c.config.Annotations[vcAnnotations.ContainerResourcesSwappiness]; ok {
|
|
resourceSwappiness, err := strconv.ParseUint(resourceSwappinessStr, 0, 64)
|
|
if err == nil && resourceSwappiness > 200 {
|
|
err = fmt.Errorf("swapiness should not bigger than 200")
|
|
}
|
|
if err != nil {
|
|
return &Container{}, fmt.Errorf("Invalid container configuration Annotations %s %v", vcAnnotations.ContainerResourcesSwappiness, err)
|
|
}
|
|
if c.config.Resources.Memory == nil {
|
|
c.initConfigResourcesMemory()
|
|
}
|
|
c.config.Resources.Memory.Swappiness = &resourceSwappiness
|
|
}
|
|
if resourceSwapInBytesStr, ok := c.config.Annotations[vcAnnotations.ContainerResourcesSwapInBytes]; ok {
|
|
resourceSwapInBytesInUint, err := strconv.ParseUint(resourceSwapInBytesStr, 0, 64)
|
|
if err != nil {
|
|
return &Container{}, fmt.Errorf("Invalid container configuration Annotations %s %v", vcAnnotations.ContainerResourcesSwapInBytes, err)
|
|
}
|
|
if c.config.Resources.Memory == nil {
|
|
c.initConfigResourcesMemory()
|
|
}
|
|
resourceSwapInBytes := int64(resourceSwapInBytesInUint)
|
|
c.config.Resources.Memory.Swap = &resourceSwapInBytes
|
|
}
|
|
|
|
// experimental runtime use "persist.json" instead of legacy "state.json" as storage
|
|
err := c.Restore()
|
|
if err == nil {
|
|
//container restored
|
|
return c, nil
|
|
}
|
|
|
|
// Unexpected error
|
|
if !os.IsNotExist(err) && err != errContainerPersistNotExist {
|
|
return nil, err
|
|
}
|
|
|
|
if err := c.createEphemeralDisks(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// If mounts are block devices, add to devmanager
|
|
if err := c.createMounts(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Add container's devices to sandbox's device-manager
|
|
if err := c.createDevices(ctx, contConfig); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return c, nil
|
|
}
|
|
|
|
// Create Device Information about the block device
|
|
func (c *Container) createDeviceInfo(source, destination string, readonly, isBlockFile bool) (*config.DeviceInfo, error) {
|
|
var stat unix.Stat_t
|
|
if err := unix.Stat(source, &stat); err != nil {
|
|
return nil, fmt.Errorf("stat %q failed: %v", source, err)
|
|
}
|
|
|
|
var di *config.DeviceInfo
|
|
var err error
|
|
|
|
if stat.Mode&unix.S_IFMT == unix.S_IFBLK {
|
|
di = &config.DeviceInfo{
|
|
HostPath: source,
|
|
ContainerPath: destination,
|
|
DevType: "b",
|
|
Major: int64(unix.Major(uint64(stat.Rdev))),
|
|
Minor: int64(unix.Minor(uint64(stat.Rdev))),
|
|
ReadOnly: readonly,
|
|
}
|
|
} else if isBlockFile && stat.Mode&unix.S_IFMT == unix.S_IFREG {
|
|
di = &config.DeviceInfo{
|
|
HostPath: source,
|
|
ContainerPath: destination,
|
|
DevType: "b",
|
|
Major: -1,
|
|
Minor: 0,
|
|
ReadOnly: readonly,
|
|
}
|
|
// Check whether source can be used as a pmem device
|
|
} else if di, err = config.PmemDeviceInfo(source, destination); err != nil {
|
|
c.Logger().WithError(err).
|
|
WithField("mount-source", source).
|
|
Debug("no loop device")
|
|
}
|
|
return di, err
|
|
}
|
|
|
|
// call hypervisor to create device about KataVirtualVolume.
|
|
func (c *Container) createVirtualVolumeDevices() ([]config.DeviceInfo, error) {
|
|
var deviceInfos []config.DeviceInfo
|
|
for _, o := range c.rootFs.Options {
|
|
if strings.HasPrefix(o, VirtualVolumePrefix) {
|
|
virtVolume, err := types.ParseKataVirtualVolume(strings.TrimPrefix(o, VirtualVolumePrefix))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
c.Logger().Infof("KataVirtualVolume volumetype = %s", virtVolume.VolumeType)
|
|
}
|
|
}
|
|
return deviceInfos, nil
|
|
}
|
|
|
|
// getFilesystemCapacity return the total size in bytes of the filesystem
|
|
// under path.
|
|
func getFilesystemCapacity(path string) (uint64, error) {
|
|
var stat unix.Statfs_t
|
|
if err := unix.Statfs(path, &stat); err != nil {
|
|
return 0, err
|
|
}
|
|
return stat.Blocks * uint64(stat.Bsize), nil
|
|
}
|
|
|
|
func (c *Container) createEphemeralDisks() error {
|
|
if c.sandbox.config.EmptyDirMode != EmptyDirModeVirtioBlkEncrypted {
|
|
return nil
|
|
}
|
|
|
|
for i := range c.mounts {
|
|
if !Isk8sHostEmptyDir(c.mounts[i].Source) {
|
|
continue
|
|
}
|
|
|
|
// Mark the mount as shared so the block device isn't removed when a container stops.
|
|
c.mounts[i].Shared = true
|
|
|
|
if mounted, err := volume.IsVolumeMounted(c.mounts[i].Source); err != nil {
|
|
return err
|
|
} else if mounted {
|
|
continue
|
|
}
|
|
|
|
diskPath, err := c.setupEphemeralDisk(c.mounts[i].Source)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
c.sandbox.ephemeralDisks = append(c.sandbox.ephemeralDisks, EphemeralDisk{
|
|
DiskPath: diskPath,
|
|
SourcePath: c.mounts[i].Source,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// setupEphemeralDisk creates and configures an ephemeral disk image
|
|
// inside the given emptyDir. It returns the path to the created disk
|
|
// image. The fd is always closed and the disk image is removed if any
|
|
// step after creation fails.
|
|
func (c *Container) setupEphemeralDisk(emptyDirPath string) (diskPath string, err error) {
|
|
// Create the disk file in the same folder as the original
|
|
// emptyDir mount so that Kubelet can enforce the sizeLimit.
|
|
diskPath = filepath.Join(emptyDirPath, "disk.img")
|
|
f, err := os.Create(diskPath)
|
|
if err != nil {
|
|
c.Logger().WithError(err).Errorf("failed to create disk file at %s", diskPath)
|
|
return
|
|
}
|
|
defer f.Close()
|
|
|
|
defer func() {
|
|
if err != nil {
|
|
if removeErr := os.Remove(diskPath); removeErr != nil {
|
|
c.Logger().WithError(removeErr).Errorf("failed to clean up disk file %s after error", diskPath)
|
|
}
|
|
}
|
|
}()
|
|
|
|
emptyDirFsCapacity, err := getFilesystemCapacity(emptyDirPath)
|
|
if err != nil {
|
|
c.Logger().WithError(err).Errorf("failed to get filesystem capacity for mount %s", emptyDirPath)
|
|
return
|
|
}
|
|
|
|
if err = f.Truncate(int64(emptyDirFsCapacity)); err != nil {
|
|
c.Logger().WithError(err).Errorf("failed to truncate disk file")
|
|
return
|
|
}
|
|
|
|
var sourceStat unix.Stat_t
|
|
if err = unix.Stat(emptyDirPath, &sourceStat); err != nil {
|
|
c.Logger().WithError(err).Errorf("failed to stat mount source: %s", emptyDirPath)
|
|
return
|
|
}
|
|
|
|
metadata := map[string]string{
|
|
volume.EncryptionKeyMetadataKey: "ephemeral",
|
|
}
|
|
if sourceStat.Gid != 0 {
|
|
metadata[volume.FSGroupMetadataKey] = strconv.FormatUint(uint64(sourceStat.Gid), 10)
|
|
}
|
|
|
|
if err = volume.AddMountInfo(emptyDirPath, volume.MountInfo{
|
|
VolumeType: "blk",
|
|
Device: diskPath,
|
|
FsType: "ext4",
|
|
Metadata: metadata,
|
|
}); err != nil {
|
|
c.Logger().WithError(err).Errorf("failed to assign direct volume for mount %s", emptyDirPath)
|
|
return
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func (c *Container) createMounts(ctx context.Context) error {
|
|
// Create block devices for newly created container
|
|
return c.createBlockDevices(ctx)
|
|
}
|
|
|
|
func findErofsMountSource(mounts []*mountinfo.Info, mnt string) (string, error) {
|
|
for _, m := range mounts {
|
|
if m.FSType == "erofs" && m.Mountpoint == mnt {
|
|
return m.Source, nil
|
|
}
|
|
}
|
|
return "", fmt.Errorf("erofs mount not found for %s", mnt)
|
|
}
|
|
|
|
func parseOverlayUpperFs(mounts []*mountinfo.Info, path string) (*mountinfo.Info, error) {
|
|
for _, m := range mounts {
|
|
if strings.HasPrefix(m.Mountpoint, filepath.Dir(path)) {
|
|
return m, nil
|
|
}
|
|
}
|
|
return nil, fmt.Errorf("upper mount not found for %s", path)
|
|
}
|
|
|
|
func (c *Container) createErofsDevices(ctx context.Context) ([]config.DeviceInfo, error) {
|
|
var deviceInfos []config.DeviceInfo
|
|
if IsErofsRootFS(c.rootFs) {
|
|
mounts, err := mountinfo.GetMounts(nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
lowerdirs, upperdir := parseErofsRootFsOptions(c.rootFs.Options)
|
|
for _, path := range lowerdirs {
|
|
s, err := findErofsMountSource(mounts, path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if strings.HasPrefix(s, "/dev/loop") {
|
|
b, err := os.ReadFile(fmt.Sprintf("/sys/block/loop%s/loop/backing_file", strings.TrimPrefix(s, "/dev/loop")))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
s = strings.TrimSuffix(string(b), "\n")
|
|
}
|
|
if filepath.Base(s) != "layer.erofs" {
|
|
return nil, fmt.Errorf("unsupported mount source %s for %s", s, path)
|
|
}
|
|
di, err := c.createDeviceInfo(s, s, true, true)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
deviceInfos = append(deviceInfos, *di)
|
|
}
|
|
|
|
if upperdir != "" {
|
|
// check if upperdir is really backed by a filesystem
|
|
if m, err := parseOverlayUpperFs(mounts, upperdir); err == nil {
|
|
s := m.Source
|
|
if strings.HasPrefix(s, "/dev/loop") {
|
|
b, err := os.ReadFile(fmt.Sprintf("/sys/block/loop%s/loop/backing_file", strings.TrimPrefix(s, "/dev/loop")))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
s = strings.TrimSuffix(string(b), "\n")
|
|
}
|
|
if filepath.Base(s) != "rwlayer.img" {
|
|
return nil, fmt.Errorf("unsupported upper blockfile %s for %s", s, upperdir)
|
|
}
|
|
// XXX: we cannot umount here because it's in another mntns,
|
|
// therefore remountRo for safety; should adapt containerd
|
|
// custom mount type instead.
|
|
if err := remount(ctx, syscall.MS_RDONLY, m.Mountpoint); err != nil {
|
|
return nil, fmt.Errorf("failed to unmount rwlayer %s", m.Mountpoint)
|
|
}
|
|
|
|
di, err := c.createDeviceInfo(s, s, false, true)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
deviceInfos = append(deviceInfos, *di)
|
|
}
|
|
}
|
|
}
|
|
return deviceInfos, nil
|
|
}
|
|
|
|
// physicalEndpointBDFs returns the set of host PCI BDFs backing physical
|
|
// network endpoints already cold-plugged into the sandbox.
|
|
func (c *Container) physicalEndpointBDFs() map[string]struct{} {
|
|
bdfs := map[string]struct{}{}
|
|
if c.sandbox == nil || c.sandbox.network == nil {
|
|
return bdfs
|
|
}
|
|
for _, ep := range c.sandbox.network.Endpoints() {
|
|
if ep.Type() != PhysicalEndpointType {
|
|
continue
|
|
}
|
|
if pe, ok := ep.(*PhysicalEndpoint); ok && pe.BDF != "" {
|
|
bdfs[pe.BDF] = struct{}{}
|
|
}
|
|
}
|
|
return bdfs
|
|
}
|
|
|
|
// vfioDeviceIsPhysicalEndpoint reports whether the VFIO device at devPath
|
|
// resolves to a BDF owned by a physical network endpoint.
|
|
func (c *Container) vfioDeviceIsPhysicalEndpoint(devPath string, endpointBDFs map[string]struct{}) bool {
|
|
for _, bdf := range vfioDeviceBDFs(devPath) {
|
|
if _, ok := endpointBDFs[bdf]; ok {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// vfioDeviceBDFs resolves a VFIO device path to the host PCI BDF(s) it exposes.
|
|
// Handles both the iommufd cdev (/dev/vfio/devices/vfio<N>) and the legacy
|
|
// group node (/dev/vfio/<group>, which may contain multiple devices).
|
|
func vfioDeviceBDFs(devPath string) []string {
|
|
if strings.HasPrefix(filepath.Base(devPath), "vfio") {
|
|
// IOMMUFD device (/dev/vfio/devices/vfio<NUM>): single device per char dev
|
|
major, minor, err := deviceUtils.GetMajorMinorFromDevPath(devPath)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
bdf, err := deviceUtils.GetBDFFromVFIODev(major, minor)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return []string{bdf}
|
|
}
|
|
// Legacy VFIO group (/dev/vfio/<GROUP>): may contain multiple devices
|
|
vfioGroup := filepath.Base(devPath)
|
|
iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
|
|
deviceFiles, err := os.ReadDir(iommuDevicesPath)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var bdfs []string
|
|
for _, deviceFile := range deviceFiles {
|
|
bdf, _, _, err := deviceUtils.GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
bdfs = append(bdfs, bdf)
|
|
}
|
|
return bdfs
|
|
}
|
|
|
|
func (c *Container) createDevices(ctx context.Context, contConfig *ContainerConfig) error {
|
|
// If devices were not found in storage, create Device implementations
|
|
// from the configuration. This should happen at create.
|
|
var storedDevices []ContainerDevice
|
|
virtualVolumesDeviceInfos, err := c.createVirtualVolumeDevices()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
deviceInfos := append(virtualVolumesDeviceInfos, contConfig.DeviceInfos...)
|
|
|
|
erofsDeviceInfos, err := c.createErofsDevices(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
deviceInfos = append(erofsDeviceInfos, deviceInfos...)
|
|
|
|
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
|
|
// until we have TDISP/IDE PCIe support.
|
|
coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort)
|
|
// Aggregate all the containner devices for hot-plug and use them to dedcue
|
|
// the correct amount of ports to reserve for the hypervisor.
|
|
hotPlugVFIO := (c.sandbox.config.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
|
|
|
hotPlugDevices := []config.DeviceInfo{}
|
|
vfioColdPlugDevices := []config.DeviceInfo{}
|
|
|
|
// VFs that back a physical network endpoint are already cold-plugged into
|
|
// QEMU during sandbox network setup (PhysicalEndpoint.Attach -> AddDevice).
|
|
// The SR-IOV device plugin sometimes ALSO lists the same VF in the
|
|
// container's linux.devices. Attaching it again here fails ("port is not
|
|
// set" / already present), so build the set of BDFs owned by physical
|
|
// endpoints and skip any container VFIO device that resolves to one of
|
|
// them. Exposing these VFs to the agent is handled separately by
|
|
// kata_agent.appendPhysicalEndpointDevices().
|
|
physicalEndpointBDFs := c.physicalEndpointBDFs()
|
|
|
|
for i, vfio := range deviceInfos {
|
|
// Only considering VFIO updates for Port and ColdPlug or
|
|
// HotPlug updates
|
|
isVFIODevice := deviceManager.IsVFIODevice(vfio.ContainerPath)
|
|
|
|
if isVFIODevice && len(physicalEndpointBDFs) > 0 {
|
|
if c.vfioDeviceIsPhysicalEndpoint(vfio.ContainerPath, physicalEndpointBDFs) {
|
|
c.Logger().WithField("device", vfio.ContainerPath).Info(
|
|
"skipping VFIO device already cold-plugged as a physical network endpoint")
|
|
continue
|
|
}
|
|
}
|
|
|
|
if hotPlugVFIO && isVFIODevice {
|
|
deviceInfos[i].ColdPlug = false
|
|
deviceInfos[i].Port = c.sandbox.config.HypervisorConfig.HotPlugVFIO
|
|
hotPlugDevices = append(hotPlugDevices, deviceInfos[i])
|
|
continue
|
|
}
|
|
// Device is already cold-plugged at sandbox creation time
|
|
// ignore it for the container creation
|
|
if coldPlugVFIO && isVFIODevice {
|
|
vfioColdPlugDevices = append(vfioColdPlugDevices, deviceInfos[i])
|
|
continue
|
|
}
|
|
hotPlugDevices = append(hotPlugDevices, deviceInfos[i])
|
|
}
|
|
|
|
// If modeVFIO is enabled we need 1st to attach the VFIO control group
|
|
// device /dev/vfio/vfio an 2nd the actuall device(s) afterwards.
|
|
// Sort the devices starting with device #1 being the VFIO control group
|
|
// device and the next the actuall device(s) /dev/vfio/<group>
|
|
//
|
|
// Cold-plug VFIO devices must also reach the agent in
|
|
// `VfioMode == GuestKernel`. The agent's `vfio-pci-gk` handler
|
|
// returns `dev: None` (so /dev/vfio/<group> is *not* materialised in
|
|
// the container spec — `constrainGRPCSpec(stripVfio=true)` will have
|
|
// already removed it from `grpcSpec.Linux.Devices`), but it still
|
|
// records the host->guest PCI mapping into `sandbox.pcimap[cid]`.
|
|
// Without that mapping, `update_env_pci` cannot translate the
|
|
// `PCIDEVICE_<RES>=<host-BDF>` env vars set by the SR-IOV device
|
|
// plugin and aborts the container creation with
|
|
// "No PCI mapping found for container <id>".
|
|
//
|
|
// `devManager.NewDevice` calls `FindDevice` first, which matches the
|
|
// already-cold-plugged sandbox-level device by HostPath/major/minor,
|
|
// so this does not double-attach.
|
|
if coldPlugVFIO {
|
|
// DeviceInfo should still be added to the sandbox's device manager
|
|
// if vfio_mode is VFIO and coldPlugVFIO is true (e.g. vfio-ap-cold).
|
|
// This ensures that ociSpec.Linux.Devices is updated with
|
|
// this information before the container is created on the guest.
|
|
sortedVFIODevices := sortContainerVFIODevices(vfioColdPlugDevices)
|
|
// Combine sorted VFIO devices with hot-plug devices
|
|
deviceInfos = append(sortedVFIODevices, hotPlugDevices...)
|
|
} else {
|
|
deviceInfos = sortContainerVFIODevices(hotPlugDevices)
|
|
}
|
|
|
|
for _, info := range deviceInfos {
|
|
dev, err := c.sandbox.devManager.NewDevice(info)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
storedDevices = append(storedDevices, ContainerDevice{
|
|
ID: dev.DeviceID(),
|
|
ContainerPath: info.ContainerPath,
|
|
FileMode: info.FileMode,
|
|
UID: info.UID,
|
|
GID: info.GID,
|
|
})
|
|
}
|
|
c.devices = filterDevices(c, storedDevices)
|
|
|
|
// If we're hot-plugging this will be a no-op because at this stage
|
|
// no devices are attached to the root-port or switch-port
|
|
if err := c.annotateContainerWithVFIOMetadata(vfioColdPlugDevices); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// rollbackFailingContainerCreation rolls back important steps that might have
|
|
// been performed before the container creation failed.
|
|
// - Unplug CPU and memory resources from the VM.
|
|
// - Unplug devices from the VM.
|
|
func (c *Container) rollbackFailingContainerCreation(ctx context.Context) {
|
|
if err := c.detachDevices(ctx); err != nil {
|
|
c.Logger().WithError(err).Error("rollback failed detachDevices()")
|
|
}
|
|
if err := c.removeDrive(ctx); err != nil {
|
|
c.Logger().WithError(err).Error("rollback failed removeDrive()")
|
|
}
|
|
if err := c.unmountHostMounts(ctx); err != nil {
|
|
c.Logger().WithError(err).Error("rollback failed unmountHostMounts()")
|
|
}
|
|
|
|
if IsNydusRootFSType(c.rootFs.Type) {
|
|
if err := nydusContainerCleanup(ctx, getMountPath(c.sandbox.id), c); err != nil {
|
|
c.Logger().WithError(err).Error("rollback failed nydusContainerCleanup()")
|
|
}
|
|
} else {
|
|
if err := c.sandbox.fsShare.UnshareRootFilesystem(ctx, c); err != nil {
|
|
c.Logger().WithError(err).Error("rollback failed UnshareRootFilesystem()")
|
|
}
|
|
}
|
|
}
|
|
|
|
func (c *Container) checkBlockDeviceSupport(ctx context.Context) bool {
|
|
if !c.sandbox.config.HypervisorConfig.DisableBlockDeviceUse {
|
|
agentCaps := c.sandbox.agent.capabilities()
|
|
hypervisorCaps := c.sandbox.hypervisor.Capabilities(ctx)
|
|
|
|
if agentCaps.IsBlockDeviceSupported() && hypervisorCaps.IsBlockDeviceHotplugSupported() {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// Sort the devices starting with device #1 being the VFIO control group
|
|
// device and the next the actuall device(s) e.g. /dev/vfio/<group>
|
|
func sortContainerVFIODevices(devices []config.DeviceInfo) []config.DeviceInfo {
|
|
var vfioDevices []config.DeviceInfo
|
|
|
|
for _, device := range devices {
|
|
if deviceManager.IsVFIOControlDevice(device.ContainerPath) {
|
|
vfioDevices = append([]config.DeviceInfo{device}, vfioDevices...)
|
|
continue
|
|
}
|
|
vfioDevices = append(vfioDevices, device)
|
|
}
|
|
return vfioDevices
|
|
}
|
|
|
|
// errNoSiblingFound is the sentinel wrapped by siblingAnnotation when the
// VFIO device is not of a supported CDI device type, i.e. it has no entry in
// the cdiDeviceKind table (e.g. NVSwitches). Callers should handle it as a
// non-fatal "device not applicable" condition, not as a sibling-matching
// failure.
var errNoSiblingFound = errors.New("no suitable sibling found")
|
|
|
|
// cdiDeviceKey identifies a device type by its PCI vendor ID together with a
// PCI class prefix.
type cdiDeviceKey struct {
	VendorID    string
	ClassPrefix string
}

// cdiDeviceKind is the table of known device types and the CDI annotation
// kind each of them maps to.
var cdiDeviceKind = map[cdiDeviceKey]string{
	{VendorID: "0x10de", ClassPrefix: "0x030"}: "nvidia.com/gpu",
}

// cdiKindForDevice looks up the CDI kind for a device with the given vendor
// ID and PCI class. A table entry matches when its vendor ID is equal to
// vendorID and its class prefix occurs in class. When nothing matches, the
// empty string and false are returned.
func cdiKindForDevice(vendorID, class string) (string, bool) {
	for key, kind := range cdiDeviceKind {
		if key.VendorID != vendorID {
			continue
		}
		if !strings.Contains(class, key.ClassPrefix) {
			continue
		}
		return kind, true
	}

	return "", false
}
|
|
|
|
// DeviceRelation describes one PCIe device known to the sandbox that maps to
// a CDI kind, together with the index it is given once all such devices have
// been ordered by bus.
type DeviceRelation struct {
	// Bus is the bus the device is attached to; it is the sort key.
	Bus string
	// Path is the host path of the device.
	Path string
	// Index is the position of the device after sorting by Bus.
	Index int
	// BDF is the PCI bus/device/function address of the device.
	BDF string
	// CDIKind is the CDI annotation kind, e.g. "nvidia.com/gpu".
	CDIKind string
}
|
|
|
|
// Depending on the HW we might need to inject metadata into the container
|
|
// In this case for the NV GPU we need to provide the correct mapping from
|
|
// VFIO-<NUM> to GPU index inside of the VM when vfio_mode="guest-kernel",
|
|
// otherwise we do not know which GPU is which.
|
|
func (c *Container) annotateContainerWithVFIOMetadata(devices interface{}) error {
|
|
|
|
if ContainerType(c.config.Annotations[vcAnnotations.ContainerTypeKey]).IsCriSandbox() {
|
|
c.Logger().Info("Skipping VFIO metadata annotation for sandbox container")
|
|
return nil
|
|
}
|
|
|
|
modeIsGK := (c.sandbox.config.VfioMode == config.VFIOModeGuestKernel)
|
|
|
|
if modeIsGK {
|
|
// Hot plug is done let's update meta information about the
|
|
// hot plugged devices especially VFIO devices in modeIsGK
|
|
siblings := make([]DeviceRelation, 0)
|
|
// In the sandbox we first create the root-ports and secondly
|
|
// the switch-ports. The range over map is not deterministic
|
|
// so lets first iterate over all root-port devices and then
|
|
// switch-port devices no special handling for bridge-port (PCI)
|
|
for _, dev := range config.PCIeDevicesPerPort["root-port"] {
|
|
if kind, ok := cdiKindForDevice(dev.VendorID, dev.Class); ok {
|
|
siblings = append(siblings, DeviceRelation{Bus: dev.Bus, Path: dev.HostPath, BDF: dev.BDF, CDIKind: kind})
|
|
}
|
|
}
|
|
for _, dev := range config.PCIeDevicesPerPort["switch-port"] {
|
|
if kind, ok := cdiKindForDevice(dev.VendorID, dev.Class); ok {
|
|
siblings = append(siblings, DeviceRelation{Bus: dev.Bus, Path: dev.HostPath, BDF: dev.BDF, CDIKind: kind})
|
|
}
|
|
}
|
|
// We need to sort the VFIO devices by bus to get the correct
|
|
// ordering root-port < switch-port
|
|
sort.Slice(siblings, func(i, j int) bool {
|
|
return siblings[i].Bus < siblings[j].Bus
|
|
})
|
|
|
|
for i := range siblings {
|
|
siblings[i].Index = i
|
|
}
|
|
|
|
// Collect container paths from either hot-plug or cold-plug devices
|
|
var containerPaths []string
|
|
if devs, ok := devices.([]ContainerDevice); ok {
|
|
for _, dev := range devs {
|
|
containerPaths = append(containerPaths, dev.ContainerPath)
|
|
}
|
|
}
|
|
if devs, ok := devices.([]config.DeviceInfo); ok {
|
|
for _, dev := range devs {
|
|
containerPaths = append(containerPaths, dev.ContainerPath)
|
|
}
|
|
}
|
|
|
|
// Now that we have the index lets connect the /dev/vfio/<num>
|
|
// to the correct index
|
|
for _, devPath := range containerPaths {
|
|
if !strings.HasPrefix(devPath, "/dev/vfio") {
|
|
c.Logger().Infof("skipping guest annotations for non-VFIO device %q", devPath)
|
|
continue
|
|
}
|
|
if devPath == "/dev/vfio/vfio" {
|
|
c.Logger().Infof("skipping /dev/vfio/vfio for vfio_mode=guest-kernel")
|
|
continue
|
|
}
|
|
if err := c.siblingAnnotation(devPath, siblings); err != nil {
|
|
if errors.Is(err, errNoSiblingFound) {
|
|
c.Logger().Infof("no CDI annotation for device %s (not a known CDI device type)", devPath)
|
|
continue
|
|
}
|
|
return err
|
|
}
|
|
}
|
|
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// createCDIAnnotation adds a container annotation mapping a VFIO device to a device index.
|
|
//
|
|
// devPath is the path to the VFIO device, which can be in the format
|
|
// "/dev/vfio/<num>" or "/dev/vfio/devices/vfio<num>". The function extracts
|
|
// the device number from the path and creates an annotation with the key
|
|
// "cdi.k8s.io/vfio<num>" and the value "<cdiKind>=<index>", where
|
|
// <cdiKind> is the CDI device kind (e.g. "nvidia.com/gpu"),
|
|
// <num> is the device number and <index> is the provided device index.
|
|
// The annotation is stored in c.config.CustomSpec.Annotations.
|
|
func (c *Container) createCDIAnnotation(devPath string, index int, cdiKind string) {
|
|
// We have here either /dev/vfio/<num> or /dev/vfio/devices/vfio<num>
|
|
baseName := filepath.Base(devPath)
|
|
vfioNum := baseName
|
|
// For IOMMUFD format /dev/vfio/devices/vfio<num>, strip "vfio" prefix
|
|
if strings.HasPrefix(baseName, "vfio") {
|
|
vfioNum = strings.TrimPrefix(baseName, "vfio")
|
|
}
|
|
annoKey := fmt.Sprintf("cdi.k8s.io/vfio%s", vfioNum)
|
|
annoValue := fmt.Sprintf("%s=%d", cdiKind, index)
|
|
if c.config.CustomSpec.Annotations == nil {
|
|
c.config.CustomSpec.Annotations = make(map[string]string)
|
|
}
|
|
c.config.CustomSpec.Annotations[annoKey] = annoValue
|
|
}
|
|
|
|
func (c *Container) siblingAnnotation(devPath string, siblings []DeviceRelation) error {
|
|
// Resolve the device's BDFs once upfront. This serves two purposes:
|
|
// 1. Determine if the device is a known CDI type (if not, skip it)
|
|
// 2. Reuse the BDFs for sibling matching without redundant sysfs reads
|
|
isKnownCDIDevice := false
|
|
var devBDFs []string
|
|
|
|
if strings.HasPrefix(filepath.Base(devPath), "vfio") {
|
|
// IOMMUFD device (/dev/vfio/devices/vfio<NUM>): single device per char dev
|
|
major, minor, err := deviceUtils.GetMajorMinorFromDevPath(devPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
bdf, err := deviceUtils.GetBDFFromVFIODev(major, minor)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
devBDFs = []string{bdf}
|
|
vendorID := deviceUtils.GetPCIDeviceProperty(bdf, deviceUtils.PCISysFsDevicesVendor)
|
|
class := deviceUtils.GetPCIDeviceProperty(bdf, deviceUtils.PCISysFsDevicesClass)
|
|
_, isKnownCDIDevice = cdiKindForDevice(vendorID, class)
|
|
} else {
|
|
// Legacy VFIO group (/dev/vfio/<GROUP>): may contain multiple devices
|
|
vfioGroup := filepath.Base(devPath)
|
|
iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
|
|
deviceFiles, err := os.ReadDir(iommuDevicesPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, deviceFile := range deviceFiles {
|
|
deviceBDF, _, _, err := deviceUtils.GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
devBDFs = append(devBDFs, deviceBDF)
|
|
if !isKnownCDIDevice {
|
|
vendorID := deviceUtils.GetPCIDeviceProperty(deviceBDF, deviceUtils.PCISysFsDevicesVendor)
|
|
class := deviceUtils.GetPCIDeviceProperty(deviceBDF, deviceUtils.PCISysFsDevicesClass)
|
|
if _, ok := cdiKindForDevice(vendorID, class); ok {
|
|
isKnownCDIDevice = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if !isKnownCDIDevice {
|
|
return fmt.Errorf("device %s: %w", devPath, errNoSiblingFound)
|
|
}
|
|
|
|
for _, sibling := range siblings {
|
|
if sibling.Path == devPath || slices.Contains(devBDFs, sibling.BDF) {
|
|
c.createCDIAnnotation(devPath, sibling.Index, sibling.CDIKind)
|
|
return nil
|
|
}
|
|
}
|
|
return fmt.Errorf("device %s is a known CDI device type but failed to match any sibling by path or BDF", devPath)
|
|
}
|
|
|
|
// create creates and starts a container inside a Sandbox. It has to be
|
|
// called only when a new container, not known by the sandbox, has to be created.
|
|
func (c *Container) create(ctx context.Context) (err error) {
|
|
// In case the container creation fails, the following takes care
|
|
// of rolling back all the actions previously performed.
|
|
defer func() {
|
|
if err != nil {
|
|
c.Logger().WithError(err).Error("container create failed")
|
|
c.rollbackFailingContainerCreation(ctx)
|
|
}
|
|
}()
|
|
|
|
if c.checkBlockDeviceSupport(ctx) && !IsNydusRootFSType(c.rootFs.Type) && !IsErofsRootFS(c.rootFs) {
|
|
// If the rootfs is backed by a block device, go ahead and hotplug it to the guest
|
|
if err = c.hotplugDrive(ctx); err != nil {
|
|
return
|
|
}
|
|
}
|
|
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"devices": c.devices,
|
|
}).Info("Attach devices")
|
|
if err = c.attachDevices(ctx); err != nil {
|
|
return
|
|
}
|
|
|
|
if err := c.annotateContainerWithVFIOMetadata(c.devices); err != nil {
|
|
return fmt.Errorf("annotating VFIO devices: %w", err)
|
|
}
|
|
|
|
// Deduce additional system mount info that should be handled by the agent
|
|
// inside the VM
|
|
c.getSystemMountInfo()
|
|
|
|
process, err := c.sandbox.agent.createContainer(ctx, c.sandbox, c)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
c.process = *process
|
|
|
|
if err = c.setContainerState(types.StateReady); err != nil {
|
|
return
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *Container) delete(ctx context.Context) error {
|
|
if c.state.State != types.StateReady &&
|
|
c.state.State != types.StateStopped {
|
|
return fmt.Errorf("Container not ready or stopped, impossible to delete")
|
|
}
|
|
|
|
// Remove the container from sandbox structure
|
|
if err := c.sandbox.removeContainer(c.id); err != nil {
|
|
return err
|
|
}
|
|
|
|
return c.sandbox.storeSandbox(ctx)
|
|
}
|
|
|
|
// checkSandboxRunning validates the container state.
|
|
//
|
|
// cmd specifies the operation (or verb) that the retrieval is destined
|
|
// for and is only used to make the returned error as descriptive as
|
|
// possible.
|
|
func (c *Container) checkSandboxRunning(cmd string) error {
|
|
if cmd == "" {
|
|
return fmt.Errorf("Cmd cannot be empty")
|
|
}
|
|
|
|
if c.sandbox.state.State != types.StateRunning {
|
|
return fmt.Errorf("Sandbox not running, impossible to %s the container", cmd)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *Container) getSystemMountInfo() {
|
|
// Check if /dev needs to be bind mounted from host /dev
|
|
c.systemMountsInfo.BindMountDev = false
|
|
|
|
for _, m := range c.mounts {
|
|
if m.Source == "/dev" && m.Destination == "/dev" && m.Type == "bind" {
|
|
c.systemMountsInfo.BindMountDev = true
|
|
}
|
|
}
|
|
|
|
// TODO Deduce /dev/shm size. See https://github.com/clearcontainers/runtime/issues/138
|
|
}
|
|
|
|
func (c *Container) start(ctx context.Context) error {
|
|
if err := c.checkSandboxRunning("start"); err != nil {
|
|
return err
|
|
}
|
|
|
|
if c.state.State != types.StateReady &&
|
|
c.state.State != types.StateStopped {
|
|
return fmt.Errorf("Container not ready or stopped, impossible to start")
|
|
}
|
|
|
|
if err := c.state.ValidTransition(c.state.State, types.StateRunning); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := c.sandbox.agent.startContainer(ctx, c.sandbox, c); err != nil {
|
|
c.Logger().WithError(err).Error("Failed to start container")
|
|
|
|
if err := c.stop(ctx, true); err != nil {
|
|
c.Logger().WithError(err).Warn("Failed to stop container")
|
|
}
|
|
return err
|
|
}
|
|
|
|
return c.setContainerState(types.StateRunning)
|
|
}
|
|
|
|
func (c *Container) stop(ctx context.Context, force bool) error {
|
|
span, ctx := katatrace.Trace(ctx, c.Logger(), "stop", containerTracingTags, map[string]string{"container_id": c.id})
|
|
defer span.End()
|
|
|
|
// In case the container status has been updated implicitly because
|
|
// the container process has terminated, it might be possible that
|
|
// someone try to stop the container, and we don't want to issue an
|
|
// error in that case. This should be a no-op.
|
|
//
|
|
// This has to be handled before the transition validation since this
|
|
// is an exception.
|
|
if c.state.State == types.StateStopped {
|
|
c.Logger().Info("Container already stopped")
|
|
return nil
|
|
}
|
|
|
|
if err := c.state.ValidTransition(c.state.State, types.StateStopped); err != nil {
|
|
if !force {
|
|
return err
|
|
}
|
|
c.Logger().WithError(err).Warn("invalid state transition to Stopped; continuing because force is set")
|
|
}
|
|
|
|
// Force the container to be killed. For most of the cases, this
|
|
// should not matter and it should return an error that will be
|
|
// ignored.
|
|
c.kill(ctx, syscall.SIGKILL, true)
|
|
|
|
// Since the agent has supported the MultiWaitProcess, it's better to
|
|
// wait the process here to make sure the process has exited before to
|
|
// issue stopContainer, otherwise the RemoveContainerRequest in it will
|
|
// get failed if the process hasn't exited.
|
|
c.sandbox.agent.waitProcess(ctx, c, c.id)
|
|
|
|
if c.sandbox.config.HypervisorConfig.SharedFS == config.NoSharedFS &&
|
|
c.config.Annotations["io.kubernetes.container.terminationMessagePolicy"] == "File" {
|
|
terminationMessagePath := c.config.Annotations["io.kubernetes.container.terminationMessagePath"]
|
|
if terminationMessagePath != "" {
|
|
data, err := c.sandbox.agent.getDiagnosticData(ctx, "termination_log", c.id)
|
|
if err != nil {
|
|
c.Logger().WithError(err).Warn("Failed to get termination message from guest")
|
|
} else if data != "" {
|
|
// The kubelet bind-mounts a host file into the container at
|
|
// terminationMessagePath, then reads back from that host file.
|
|
// With shared_fs=none the guest cannot write through that mount,
|
|
// so we locate the host-side path from the OCI mounts and write
|
|
// the data there directly.
|
|
var hostPath string
|
|
for _, m := range c.mounts {
|
|
if m.Destination == terminationMessagePath {
|
|
hostPath = m.Source
|
|
break
|
|
}
|
|
}
|
|
if hostPath == "" {
|
|
c.Logger().Warn("No host mount found for termination message path")
|
|
} else if err := os.WriteFile(hostPath, []byte(data), 0644); err != nil {
|
|
c.Logger().WithError(err).Warn("Failed to write termination message")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
defer func() {
|
|
// Save device and drive data.
|
|
// TODO: can we merge this saving with setContainerState()?
|
|
if err := c.sandbox.Save(); err != nil {
|
|
c.Logger().WithError(err).Info("Save container state failed")
|
|
}
|
|
}()
|
|
|
|
if err := c.sandbox.agent.stopContainer(ctx, c.sandbox, *c); err != nil && !force {
|
|
return err
|
|
}
|
|
|
|
if err := c.unmountHostMounts(ctx); err != nil && !force {
|
|
return err
|
|
}
|
|
|
|
if IsNydusRootFSType(c.rootFs.Type) {
|
|
if err := nydusContainerCleanup(ctx, getMountPath(c.sandbox.id), c); err != nil && !force {
|
|
return err
|
|
}
|
|
} else {
|
|
if err := c.sandbox.fsShare.UnshareRootFilesystem(ctx, c); err != nil && !force {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if err := c.sandbox.agent.removeStaleVirtiofsShareMounts(ctx); err != nil && !force {
|
|
return err
|
|
}
|
|
|
|
if err := c.detachDevices(ctx); err != nil && !force {
|
|
return err
|
|
}
|
|
|
|
if err := c.removeDrive(ctx); err != nil && !force {
|
|
return err
|
|
}
|
|
|
|
// container was killed by force, container MUST change its state
|
|
// as soon as possible just in case one of below operations fail leaving
|
|
// the containers in a bad state.
|
|
if err := c.setContainerState(types.StateStopped); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *Container) enter(ctx context.Context, cmd types.Cmd) (*Process, error) {
|
|
if err := c.checkSandboxRunning("enter"); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if c.state.State != types.StateReady &&
|
|
c.state.State != types.StateRunning {
|
|
return nil, fmt.Errorf("Container not ready or running, " +
|
|
"impossible to enter")
|
|
}
|
|
|
|
process, err := c.sandbox.agent.exec(ctx, c.sandbox, *c, cmd)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return process, nil
|
|
}
|
|
|
|
func (c *Container) wait(ctx context.Context, processID string) (int32, error) {
|
|
if c.state.State != types.StateReady &&
|
|
c.state.State != types.StateRunning {
|
|
return 0, fmt.Errorf("Container not ready or running, " +
|
|
"impossible to wait")
|
|
}
|
|
|
|
return c.sandbox.agent.waitProcess(ctx, c, processID)
|
|
}
|
|
|
|
func (c *Container) kill(ctx context.Context, signal syscall.Signal, all bool) error {
|
|
return c.signalProcess(ctx, c.process.Token, signal, all)
|
|
}
|
|
|
|
func (c *Container) signalProcess(ctx context.Context, processID string, signal syscall.Signal, all bool) error {
|
|
if c.sandbox.state.State != types.StateReady && c.sandbox.state.State != types.StateRunning {
|
|
return fmt.Errorf("Sandbox not ready or running, impossible to signal the container")
|
|
}
|
|
|
|
if c.state.State != types.StateReady && c.state.State != types.StateRunning && c.state.State != types.StatePaused {
|
|
return fmt.Errorf("Container not ready, running or paused, impossible to signal the container")
|
|
}
|
|
|
|
// kill(2) method can return ESRCH in certain cases, which is not handled by containerd cri server in container_stop.go.
|
|
// CRIO server also doesn't handle ESRCH. So kata runtime will swallow it here.
|
|
var err error
|
|
if err = c.sandbox.agent.signalProcess(ctx, c, processID, signal, all); err != nil &&
|
|
strings.Contains(err.Error(), "ESRCH: No such process") {
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"container": c.id,
|
|
"process-id": processID,
|
|
}).Warn("signal encounters ESRCH, process already finished")
|
|
return nil
|
|
}
|
|
return err
|
|
}
|
|
|
|
func (c *Container) winsizeProcess(ctx context.Context, processID string, height, width uint32) error {
|
|
if c.state.State != types.StateReady && c.state.State != types.StateRunning {
|
|
return fmt.Errorf("Container not ready or running, impossible to signal the container")
|
|
}
|
|
|
|
return c.sandbox.agent.winsizeProcess(ctx, c, processID, height, width)
|
|
}
|
|
|
|
func (c *Container) ioStream(processID string) (io.WriteCloser, io.Reader, io.Reader, error) {
|
|
if c.state.State != types.StateReady && c.state.State != types.StateRunning {
|
|
return nil, nil, nil, fmt.Errorf("Container not ready or running, impossible to signal the container")
|
|
}
|
|
|
|
stream := newIOStream(c.sandbox, c, processID)
|
|
|
|
return stream.stdin(), stream.stdout(), stream.stderr(), nil
|
|
}
|
|
|
|
func (c *Container) stats(ctx context.Context) (*ContainerStats, error) {
|
|
if err := c.checkSandboxRunning("stats"); err != nil {
|
|
return nil, err
|
|
}
|
|
return c.sandbox.agent.statsContainer(ctx, c.sandbox, *c)
|
|
}
|
|
|
|
func (c *Container) update(ctx context.Context, resources specs.LinuxResources) error {
|
|
if err := c.checkSandboxRunning("update"); err != nil {
|
|
return err
|
|
}
|
|
|
|
if state := c.state.State; state != types.StateRunning && state != types.StateReady {
|
|
return fmt.Errorf("container(%s) not running or ready, impossible to update", state)
|
|
}
|
|
|
|
if c.config.Resources.CPU == nil {
|
|
c.config.Resources.CPU = &specs.LinuxCPU{}
|
|
}
|
|
|
|
if cpu := resources.CPU; cpu != nil {
|
|
if p := cpu.Period; p != nil && *p != 0 {
|
|
c.config.Resources.CPU.Period = p
|
|
}
|
|
if q := cpu.Quota; q != nil && *q != 0 {
|
|
c.config.Resources.CPU.Quota = q
|
|
}
|
|
if cpu.Cpus != "" {
|
|
c.config.Resources.CPU.Cpus = cpu.Cpus
|
|
}
|
|
if cpu.Mems != "" {
|
|
c.config.Resources.CPU.Mems = cpu.Mems
|
|
}
|
|
}
|
|
|
|
if c.config.Resources.Memory == nil {
|
|
c.config.Resources.Memory = &specs.LinuxMemory{}
|
|
}
|
|
|
|
if mem := resources.Memory; mem != nil && mem.Limit != nil {
|
|
c.config.Resources.Memory.Limit = mem.Limit
|
|
}
|
|
|
|
if err := c.sandbox.updateResources(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Cpus/Mems in cgroup cpuset are host-relative; clear Cpus since vCPU
|
|
// numbering differs inside the guest. For Mems, translate host NUMA node
|
|
// IDs to guest node IDs when multi-NUMA is configured, otherwise clear.
|
|
if resources.CPU != nil {
|
|
resources.CPU.Cpus = ""
|
|
numaNodes := c.sandbox.config.HypervisorConfig.GuestNUMANodes
|
|
if len(numaNodes) > 1 && resources.CPU.Mems != "" {
|
|
resources.CPU.Mems = translateHostMemsToGuest(resources.CPU.Mems, numaNodes)
|
|
} else {
|
|
resources.CPU.Mems = ""
|
|
}
|
|
}
|
|
|
|
return c.sandbox.agent.updateContainer(ctx, c.sandbox, *c, resources)
|
|
}
|
|
|
|
func (c *Container) pause(ctx context.Context) error {
|
|
if err := c.checkSandboxRunning("pause"); err != nil {
|
|
return err
|
|
}
|
|
|
|
if c.state.State != types.StateRunning {
|
|
return fmt.Errorf("Container not running, impossible to pause")
|
|
}
|
|
|
|
if err := c.sandbox.agent.pauseContainer(ctx, c.sandbox, *c); err != nil {
|
|
return err
|
|
}
|
|
|
|
return c.setContainerState(types.StatePaused)
|
|
}
|
|
|
|
func (c *Container) resume(ctx context.Context) error {
|
|
if err := c.checkSandboxRunning("resume"); err != nil {
|
|
return err
|
|
}
|
|
|
|
if c.state.State != types.StatePaused {
|
|
return fmt.Errorf("Container not paused, impossible to resume")
|
|
}
|
|
|
|
if err := c.sandbox.agent.resumeContainer(ctx, c.sandbox, *c); err != nil {
|
|
return err
|
|
}
|
|
|
|
return c.setContainerState(types.StateRunning)
|
|
}
|
|
|
|
// hotplugDrive will attempt to hotplug the container rootfs if it is backed by a
|
|
// block device
|
|
func (c *Container) hotplugDrive(ctx context.Context) error {
|
|
var dev device
|
|
var err error
|
|
|
|
// Check to see if the rootfs is an umounted block device (source) or if the
|
|
// mount (target) is backed by a block device:
|
|
if !c.rootFs.Mounted {
|
|
dev, err = getDeviceForPath(c.rootFs.Source)
|
|
// there is no "rootfs" dir on block device backed rootfs
|
|
c.rootfsSuffix = ""
|
|
} else {
|
|
dev, err = getDeviceForPath(c.rootFs.Target)
|
|
}
|
|
|
|
if err == errMountPointNotFound {
|
|
return nil
|
|
}
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"device-major": dev.major,
|
|
"device-minor": dev.minor,
|
|
"mount-point": dev.mountPoint,
|
|
}).Info("device details")
|
|
|
|
isBD, err := checkStorageDriver(dev.major, dev.minor)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !isBD {
|
|
return nil
|
|
}
|
|
|
|
devicePath := c.rootFs.Source
|
|
fsType := c.rootFs.Type
|
|
if c.rootFs.Mounted {
|
|
if dev.mountPoint == c.rootFs.Target {
|
|
c.rootfsSuffix = ""
|
|
}
|
|
// If device mapper device, then fetch the full path of the device
|
|
devicePath, fsType, _, err = utils.GetDevicePathAndFsTypeOptions(dev.mountPoint)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
devicePath, err = filepath.EvalSymlinks(devicePath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"device-path": devicePath,
|
|
"fs-type": fsType,
|
|
}).Info("Block device detected")
|
|
|
|
if err = c.plugDevice(ctx, devicePath); err != nil {
|
|
return err
|
|
}
|
|
|
|
return c.setStateFstype(fsType)
|
|
}
|
|
|
|
// plugDevice will attach the rootfs if blockdevice is supported (this is rootfs specific)
|
|
func (c *Container) plugDevice(ctx context.Context, devicePath string) error {
|
|
var stat unix.Stat_t
|
|
if err := unix.Stat(devicePath, &stat); err != nil {
|
|
return fmt.Errorf("stat %q failed: %v", devicePath, err)
|
|
}
|
|
|
|
if c.checkBlockDeviceSupport(ctx) && stat.Mode&unix.S_IFBLK == unix.S_IFBLK {
|
|
b, err := c.sandbox.devManager.NewDevice(config.DeviceInfo{
|
|
HostPath: devicePath,
|
|
ContainerPath: filepath.Join(kataGuestSharedDir(), c.id),
|
|
DevType: "b",
|
|
Major: int64(unix.Major(uint64(stat.Rdev))),
|
|
Minor: int64(unix.Minor(uint64(stat.Rdev))),
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("device manager failed to create rootfs device for %q: %v", devicePath, err)
|
|
}
|
|
|
|
c.state.BlockDeviceID = b.DeviceID()
|
|
|
|
// attach rootfs device
|
|
if err := c.sandbox.devManager.AttachDevice(ctx, b.DeviceID(), c.sandbox); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// isDriveUsed checks if a drive has been used for container rootfs
|
|
func (c *Container) isDriveUsed() bool {
|
|
return c.state.Fstype != ""
|
|
}
|
|
|
|
func (c *Container) removeDrive(ctx context.Context) (err error) {
|
|
if c.isDriveUsed() {
|
|
c.Logger().Info("unplugging block device")
|
|
|
|
devID := c.state.BlockDeviceID
|
|
err := c.sandbox.devManager.DetachDevice(ctx, devID, c.sandbox)
|
|
if err != nil && err != deviceManager.ErrDeviceNotAttached {
|
|
return err
|
|
}
|
|
|
|
if err = c.sandbox.devManager.RemoveDevice(devID); err != nil {
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"container": c.id,
|
|
"device-id": devID,
|
|
}).WithError(err).Error("remove device failed")
|
|
|
|
// ignore the device not exist error
|
|
if err != deviceManager.ErrDeviceNotExist {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *Container) attachDevices(ctx context.Context) error {
|
|
// there's no need to do rollback when error happens,
|
|
// because if attachDevices fails, container creation will fail too,
|
|
// and rollbackFailingContainerCreation could do all the rollbacks
|
|
|
|
// since devices with large bar space require delayed attachment,
|
|
// the devices need to be split into two lists, normalAttachedDevs and delayAttachedDevs.
|
|
// so c.device is not used here. See issue https://github.com/kata-containers/runtime/issues/2460.
|
|
for _, dev := range c.devices {
|
|
if err := c.sandbox.devManager.AttachDevice(ctx, dev.ID, c.sandbox); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *Container) detachDevices(ctx context.Context) error {
|
|
for _, dev := range c.devices {
|
|
// Skip detaching shared devices - they are shared across
|
|
// containers (e.g., block-based emptyDirs) and will be cleaned
|
|
// up when the sandbox is deleted.
|
|
if dev.Shared {
|
|
continue
|
|
}
|
|
|
|
err := c.sandbox.devManager.DetachDevice(ctx, dev.ID, c.sandbox)
|
|
if err != nil && err != deviceManager.ErrDeviceNotAttached {
|
|
return err
|
|
}
|
|
|
|
if err = c.sandbox.devManager.RemoveDevice(dev.ID); err != nil {
|
|
c.Logger().WithFields(logrus.Fields{
|
|
"container": c.id,
|
|
"device-id": dev.ID,
|
|
}).WithError(err).Error("remove device failed")
|
|
|
|
// ignore the device not exist error
|
|
if err != deviceManager.ErrDeviceNotExist {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|