runtime: add support for kata overlays

When at least one `io.katacontainers.fs-opt.layer` option is added to
the rootfs, it gets inserted into the VM as a layer, and the file system
is mounted as an overlay of all layers using the overlayfs driver.

Additionally, if the `io.katacontainers.fs-opt.block_device=file` option
is present in a layer, it is mounted as a block device backed by a file
on the host.

Fixes: #7536

Signed-off-by: Wedson Almeida Filho <walmeida@microsoft.com>
This commit is contained in:
Wedson Almeida Filho 2023-04-20 04:52:27 -03:00
parent 6c867d9e86
commit 7e1b1949d4
9 changed files with 136 additions and 11 deletions

View File

@ -16,6 +16,7 @@ import (
"path" "path"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings"
"syscall" "syscall"
containerd_types "github.com/containerd/containerd/api/types" containerd_types "github.com/containerd/containerd/api/types"
@ -23,6 +24,8 @@ import (
taskAPI "github.com/containerd/containerd/runtime/v2/task" taskAPI "github.com/containerd/containerd/runtime/v2/task"
"github.com/containerd/typeurl" "github.com/containerd/typeurl"
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils" "github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
"github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors" "github.com/pkg/errors"
@ -48,6 +51,28 @@ var defaultStartManagementServerFunc startManagementServerFunc = func(s *service
shimLog.Info("management server started") shimLog.Info("management server started")
} }
// copyLayersToMounts appends one OCI mount to spec.Mounts for every entry in
// rootFs.Options that starts with annotations.FileSystemLayer; all other
// options are ignored.
//
// The text following the prefix is split on commas: the first field becomes
// the mount source, the second the mount type, and any remaining fields the
// mount options. The destination is the base name of the source placed under
// the sandbox layers directory. An error is returned as soon as a layer option
// with fewer than two fields is encountered; mounts appended before that point
// are left in spec.
func copyLayersToMounts(rootFs *vc.RootFs, spec *specs.Spec) error {
	const layersDir = "/run/kata-containers/sandbox/layers/"

	for _, opt := range rootFs.Options {
		if strings.HasPrefix(opt, annotations.FileSystemLayer) {
			parts := strings.Split(strings.TrimPrefix(opt, annotations.FileSystemLayer), ",")
			if len(parts) < 2 {
				return fmt.Errorf("Missing fields in rootfs layer: %q", opt)
			}

			source, fsType, extra := parts[0], parts[1], parts[2:]
			layer := specs.Mount{
				Destination: layersDir + filepath.Base(source),
				Type:        fsType,
				Source:      source,
				Options:     extra,
			}
			spec.Mounts = append(spec.Mounts, layer)
		}
	}

	return nil
}
func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*container, error) { func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*container, error) {
rootFs := vc.RootFs{} rootFs := vc.RootFs{}
if len(r.Rootfs) == 1 { if len(r.Rootfs) == 1 {
@ -63,6 +88,11 @@ func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*con
if err != nil { if err != nil {
return nil, err return nil, err
} }
if err := copyLayersToMounts(&rootFs, ociSpec); err != nil {
return nil, err
}
containerType, err := oci.ContainerType(*ociSpec) containerType, err := oci.ContainerType(*ociSpec)
if err != nil { if err != nil {
return nil, err return nil, err
@ -268,6 +298,11 @@ func checkAndMount(s *service, r *taskAPI.CreateTaskRequest) (bool, error) {
if katautils.IsBlockDevice(m.Source) && !s.config.HypervisorConfig.DisableBlockDeviceUse { if katautils.IsBlockDevice(m.Source) && !s.config.HypervisorConfig.DisableBlockDeviceUse {
return false, nil return false, nil
} }
if virtcontainers.HasOptionPrefix(m.Options, annotations.FileSystemLayer) {
return false, nil
}
if m.Type == vc.NydusRootFSType { if m.Type == vc.NydusRootFSType {
// if kata + nydus, do not mount // if kata + nydus, do not mount
return false, nil return false, nil

View File

@ -480,6 +480,10 @@ func GetHostPath(devInfo DeviceInfo, vhostUserStoreEnabled bool, vhostUserStoreP
return "", fmt.Errorf("Empty path provided for device") return "", fmt.Errorf("Empty path provided for device")
} }
if devInfo.Major == -1 {
return devInfo.HostPath, nil
}
// Filter out vhost-user storage devices by device Major numbers. // Filter out vhost-user storage devices by device Major numbers.
if vhostUserStoreEnabled && devInfo.DevType == "b" && if vhostUserStoreEnabled && devInfo.DevType == "b" &&
(devInfo.Major == VhostUserSCSIMajor || devInfo.Major == VhostUserBlkMajor) { (devInfo.Major == VhostUserSCSIMajor || devInfo.Major == VhostUserBlkMajor) {

View File

@ -83,10 +83,21 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
return dm return dm
} }
func (dm *deviceManager) findDeviceByMajorMinor(major, minor int64) api.Device { func (dm *deviceManager) findDevice(devInfo *config.DeviceInfo) api.Device {
// For devices with a major of -1, we use the host path to find existing instances.
if devInfo.Major == -1 {
for _, dev := range dm.devices {
dma, _ := dev.GetMajorMinor()
if dma == -1 && dev.GetHostPath() == devInfo.HostPath {
return dev
}
}
return nil
}
for _, dev := range dm.devices { for _, dev := range dm.devices {
dma, dmi := dev.GetMajorMinor() dma, dmi := dev.GetMajorMinor()
if dma == major && dmi == minor { if dma == devInfo.Major && dmi == devInfo.Minor {
return dev return dev
} }
} }
@ -111,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
} }
}() }()
if existingDev := dm.findDeviceByMajorMinor(devInfo.Major, devInfo.Minor); existingDev != nil { if existingDev := dm.findDevice(&devInfo); existingDev != nil {
return existingDev, nil return existingDev, nil
} }

View File

@ -802,7 +802,12 @@ func (q *QMP) blockdevAddBaseArgs(driver string, blockDevice *BlockDevice) map[s
// used to name the device. As this identifier will be passed directly to QMP, // used to name the device. As this identifier will be passed directly to QMP,
// it must obey QMP's naming rules, e.g., it must start with a letter. // it must obey QMP's naming rules, e.g., it must start with a letter.
func (q *QMP) ExecuteBlockdevAdd(ctx context.Context, blockDevice *BlockDevice) error { func (q *QMP) ExecuteBlockdevAdd(ctx context.Context, blockDevice *BlockDevice) error {
args := q.blockdevAddBaseArgs("host_device", blockDevice) var args map[string]interface{}
if fi, err := os.Stat(blockDevice.File); err == nil && fi.Mode().IsRegular() {
args = q.blockdevAddBaseArgs("file", blockDevice)
} else {
args = q.blockdevAddBaseArgs("host_device", blockDevice)
}
return q.executeCommand(ctx, "blockdev-add", args, nil) return q.executeCommand(ctx, "blockdev-add", args, nil)
} }

View File

@ -608,8 +608,9 @@ func (c *Container) createBlockDevices(ctx context.Context) error {
continue continue
} }
if c.mounts[i].Type != "bind" { isBlockFile := HasOption(c.mounts[i].Options, vcAnnotations.IsFileBlockDevice)
// We only handle for bind-mounts if c.mounts[i].Type != "bind" && !isBlockFile {
// We only handle for bind and block device mounts.
continue continue
} }
@ -671,7 +672,7 @@ func (c *Container) createBlockDevices(ctx context.Context) error {
// Check if mount is a block device file. If it is, the block device will be attached to the host // Check if mount is a block device file. If it is, the block device will be attached to the host
// instead of passing this as a shared mount. // instead of passing this as a shared mount.
if stat.Mode&unix.S_IFBLK == unix.S_IFBLK { if stat.Mode&unix.S_IFMT == unix.S_IFBLK {
di = &config.DeviceInfo{ di = &config.DeviceInfo{
HostPath: c.mounts[i].Source, HostPath: c.mounts[i].Source,
ContainerPath: c.mounts[i].Destination, ContainerPath: c.mounts[i].Destination,
@ -680,6 +681,15 @@ func (c *Container) createBlockDevices(ctx context.Context) error {
Minor: int64(unix.Minor(uint64(stat.Rdev))), Minor: int64(unix.Minor(uint64(stat.Rdev))),
ReadOnly: c.mounts[i].ReadOnly, ReadOnly: c.mounts[i].ReadOnly,
} }
} else if isBlockFile && stat.Mode&unix.S_IFMT == unix.S_IFREG {
di = &config.DeviceInfo{
HostPath: c.mounts[i].Source,
ContainerPath: c.mounts[i].Destination,
DevType: "b",
Major: -1,
Minor: 0,
ReadOnly: c.mounts[i].ReadOnly,
}
// Check whether source can be used as a pmem device // Check whether source can be used as a pmem device
} else if di, err = config.PmemDeviceInfo(c.mounts[i].Source, c.mounts[i].Destination); err != nil { } else if di, err = config.PmemDeviceInfo(c.mounts[i].Source, c.mounts[i].Destination); err != nil {
c.Logger().WithError(err). c.Logger().WithError(err).

View File

@ -22,6 +22,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
) )
@ -376,6 +377,20 @@ func (f *FilesystemShare) ShareRootFilesystem(ctx context.Context, c *Container)
} }
rootfsGuestPath := filepath.Join(kataGuestSharedDir(), c.id, c.rootfsSuffix) rootfsGuestPath := filepath.Join(kataGuestSharedDir(), c.id, c.rootfsSuffix)
if HasOptionPrefix(c.rootFs.Options, annotations.FileSystemLayer) {
path := filepath.Join("/run/kata-containers", c.id, "rootfs")
return &SharedFile{
storage: &grpc.Storage{
MountPoint: path,
Source: "none",
Fstype: c.rootFs.Type,
Driver: kataOverlayDevType,
Options: c.rootFs.Options,
},
guestPath: path,
}, nil
}
if c.state.Fstype != "" && c.state.BlockDeviceID != "" { if c.state.Fstype != "" && c.state.BlockDeviceID != "" {
// The rootfs storage volume represents the container rootfs // The rootfs storage volume represents the container rootfs
// mount point inside the guest. // mount point inside the guest.

View File

@ -907,6 +907,8 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st
for _, m := range spec.Mounts { for _, m := range spec.Mounts {
if _, found := ignoredMounts[m.Source]; found { if _, found := ignoredMounts[m.Source]; found {
k.Logger().WithField("removed-mount", m.Source).Debug("Removing OCI mount") k.Logger().WithField("removed-mount", m.Source).Debug("Removing OCI mount")
} else if HasOption(m.Options, vcAnnotations.IsFileSystemLayer) {
k.Logger().WithField("removed-mount", m.Source).Debug("Removing layer")
} else { } else {
mounts = append(mounts, m) mounts = append(mounts, m)
} }
@ -1293,13 +1295,17 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
// Block based volumes will require some adjustments in the OCI spec, and creation of // Block based volumes will require some adjustments in the OCI spec, and creation of
// storage objects to pass to the agent. // storage objects to pass to the agent.
volumeStorages, err := k.handleBlkOCIMounts(c, ociSpec) layerStorages, volumeStorages, err := k.handleBlkOCIMounts(c, ociSpec)
if err != nil { if err != nil {
return nil, err return nil, err
} }
ctrStorages = append(ctrStorages, volumeStorages...) ctrStorages = append(ctrStorages, volumeStorages...)
// Layer storage objects are prepended to the list so that they come _before_ the
// rootfs because the rootfs depends on them (it's an overlay of the layers).
ctrStorages = append(layerStorages, ctrStorages...)
grpcSpec, err := grpc.OCItoGRPC(ociSpec) grpcSpec, err := grpc.OCItoGRPC(ociSpec)
if err != nil { if err != nil {
return nil, err return nil, err
@ -1611,9 +1617,10 @@ func (k *kataAgent) createBlkStorageObject(c *Container, m Mount) (*grpc.Storage
// handleBlkOCIMounts will create a unique destination mountpoint in the guest for each volume in the // handleBlkOCIMounts will create a unique destination mountpoint in the guest for each volume in the
// given container and will update the OCI spec to utilize this mount point as the new source for the // given container and will update the OCI spec to utilize this mount point as the new source for the
// container volume. The container mount structure is updated to store the guest destination mountpoint. // container volume. The container mount structure is updated to store the guest destination mountpoint.
func (k *kataAgent) handleBlkOCIMounts(c *Container, spec *specs.Spec) ([]*grpc.Storage, error) { func (k *kataAgent) handleBlkOCIMounts(c *Container, spec *specs.Spec) ([]*grpc.Storage, []*grpc.Storage, error) {
var volumeStorages []*grpc.Storage var volumeStorages []*grpc.Storage
var layerStorages []*grpc.Storage
for i, m := range c.mounts { for i, m := range c.mounts {
id := m.BlockDeviceID id := m.BlockDeviceID
@ -1629,7 +1636,12 @@ func (k *kataAgent) handleBlkOCIMounts(c *Container, spec *specs.Spec) ([]*grpc.
// Create Storage structure // Create Storage structure
vol, err := k.createBlkStorageObject(c, m) vol, err := k.createBlkStorageObject(c, m)
if vol == nil || err != nil { if vol == nil || err != nil {
return nil, err return nil, nil, err
}
if HasOption(m.Options, vcAnnotations.IsFileSystemLayer) {
layerStorages = append(layerStorages, vol)
continue
} }
// Each device will be mounted at a unique location within the VM only once. Mounting // Each device will be mounted at a unique location within the VM only once. Mounting
@ -1660,7 +1672,7 @@ func (k *kataAgent) handleBlkOCIMounts(c *Container, spec *specs.Spec) ([]*grpc.
volumeStorages = append(volumeStorages, vol) volumeStorages = append(volumeStorages, vol)
} }
return volumeStorages, nil return layerStorages, volumeStorages, nil
} }
// handlePidNamespace checks if Pid namespace for a container needs to be shared with its sandbox // handlePidNamespace checks if Pid namespace for a container needs to be shared with its sandbox

View File

@ -415,3 +415,21 @@ func isWatchableMount(path string) bool {
return false return false
} }
// HasOption reports whether option occurs in options as an exact,
// case-sensitive match. A nil or empty options slice yields false.
func HasOption(options []string, option string) bool {
	for i := range options {
		if options[i] != option {
			continue
		}
		return true
	}
	return false
}
// HasOptionPrefix reports whether at least one entry of options starts with
// prefix. A nil or empty options slice yields false.
func HasOptionPrefix(options []string, prefix string) bool {
	for i := range options {
		if !strings.HasPrefix(options[i], prefix) {
			continue
		}
		return true
	}
	return false
}

View File

@ -310,6 +310,21 @@ const (
ContainerResourcesSwapInBytes = kataAnnotContainerResourcePrefix + "swap_in_bytes" ContainerResourcesSwapInBytes = kataAnnotContainerResourcePrefix + "swap_in_bytes"
) )
// Annotations related to file system options. These values are looked up in
// rootfs and mount option lists (via HasOption / HasOptionPrefix).
const (
// kataAnnotFsOptPrefix is the common prefix of every file system option below.
kataAnnotFsOptPrefix = kataAnnotationsPrefix + "fs-opt."
// FileSystemLayer describes a layer of an overlay filesystem. It is a prefix:
// the text after "=" is a comma-separated list whose first field is the layer
// source, second field is the filesystem type, and remaining fields are mount
// options.
FileSystemLayer = kataAnnotFsOptPrefix + "layer="
// IsFileSystemLayer indicates that the annotated filesystem is a layer of an overlay fs.
// Mounts carrying this option are removed from the OCI spec mounts and their
// storage objects are ordered before the rootfs storage.
IsFileSystemLayer = kataAnnotFsOptPrefix + "is-layer"
// IsFileBlockDevice indicates that the annotated filesystem is mounted on a block device
// backed by a host file. It only takes effect when the mount source is a
// regular file; such devices are recorded with a major number of -1.
IsFileBlockDevice = kataAnnotFsOptPrefix + "block_device=file"
)
const ( const (
// SHA512 is the SHA-512 (64) hash algorithm // SHA512 is the SHA-512 (64) hash algorithm
SHA512 string = "sha512" SHA512 string = "sha512"