diff --git a/src/runtime/pkg/containerd-shim-v2/create.go b/src/runtime/pkg/containerd-shim-v2/create.go index 474925b122..ee9bbb36b6 100644 --- a/src/runtime/pkg/containerd-shim-v2/create.go +++ b/src/runtime/pkg/containerd-shim-v2/create.go @@ -16,6 +16,7 @@ import ( "path" "path/filepath" "strconv" + "strings" "syscall" containerd_types "github.com/containerd/containerd/api/types" @@ -23,6 +24,8 @@ import ( taskAPI "github.com/containerd/containerd/runtime/v2/task" "github.com/containerd/typeurl" "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless" "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" @@ -48,6 +51,28 @@ var defaultStartManagementServerFunc startManagementServerFunc = func(s *service shimLog.Info("management server started") } +func copyLayersToMounts(rootFs *vc.RootFs, spec *specs.Spec) error { + for _, o := range rootFs.Options { + if !strings.HasPrefix(o, annotations.FileSystemLayer) { + continue + } + + fields := strings.Split(o[len(annotations.FileSystemLayer):], ",") + if len(fields) < 2 { + return fmt.Errorf("Missing fields in rootfs layer: %q", o) + } + + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: "/run/kata-containers/sandbox/layers/" + filepath.Base(fields[0]), + Type: fields[1], + Source: fields[0], + Options: fields[2:], + }) + } + + return nil +} + func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*container, error) { rootFs := vc.RootFs{} if len(r.Rootfs) == 1 { @@ -63,6 +88,11 @@ func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*con if err != nil { return nil, err } + + if err := copyLayersToMounts(&rootFs, ociSpec); err != nil { + return nil, err + } + containerType, err := oci.ContainerType(*ociSpec) if err != nil { return nil, err @@ -268,6 +298,11 @@ func checkAndMount(s *service, r *taskAPI.CreateTaskRequest) (bool, error) { if katautils.IsBlockDevice(m.Source) && !s.config.HypervisorConfig.DisableBlockDeviceUse { return false, nil } + + if virtcontainers.HasOptionPrefix(m.Options, annotations.FileSystemLayer) { + return false, nil + } + if m.Type == vc.NydusRootFSType { // if kata + nydus, do not mount return false, nil diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index ef2a5c4b03..773eaaa2d5 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -480,6 +480,10 @@ func GetHostPath(devInfo DeviceInfo, vhostUserStoreEnabled bool, vhostUserStoreP return "", fmt.Errorf("Empty path provided for device") } + if devInfo.Major == -1 { + return devInfo.HostPath, nil + } + // Filter out vhost-user storage devices by device Major numbers. if vhostUserStoreEnabled && devInfo.DevType == "b" && (devInfo.Major == VhostUserSCSIMajor || devInfo.Major == VhostUserBlkMajor) { diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index cb5e86a045..ed3708dc9a 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -83,10 +83,21 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS return dm } -func (dm *deviceManager) findDeviceByMajorMinor(major, minor int64) api.Device { +func (dm *deviceManager) findDevice(devInfo *config.DeviceInfo) api.Device { + // For devices with a major of -1, we use the host path to find existing instances. + if devInfo.Major == -1 { + for _, dev := range dm.devices { + dma, _ := dev.GetMajorMinor() + if dma == -1 && dev.GetHostPath() == devInfo.HostPath { + return dev + } + } + return nil + } + for _, dev := range dm.devices { dma, dmi := dev.GetMajorMinor() - if dma == major && dmi == minor { + if dma == devInfo.Major && dmi == devInfo.Minor { return dev } } @@ -111,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device } }() - if existingDev := dm.findDeviceByMajorMinor(devInfo.Major, devInfo.Minor); existingDev != nil { + if existingDev := dm.findDevice(&devInfo); existingDev != nil { return existingDev, nil } diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 46eb1c0827..8241d91729 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -802,7 +802,12 @@ func (q *QMP) blockdevAddBaseArgs(driver string, blockDevice *BlockDevice) map[s // used to name the device. As this identifier will be passed directly to QMP, // it must obey QMP's naming rules, e,g., it must start with a letter. func (q *QMP) ExecuteBlockdevAdd(ctx context.Context, blockDevice *BlockDevice) error { - args := q.blockdevAddBaseArgs("host_device", blockDevice) + var args map[string]interface{} + if fi, err := os.Stat(blockDevice.File); err == nil && fi.Mode().IsRegular() { + args = q.blockdevAddBaseArgs("file", blockDevice) + } else { + args = q.blockdevAddBaseArgs("host_device", blockDevice) + } return q.executeCommand(ctx, "blockdev-add", args, nil) } diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index 0f68810c40..1f8646a63c 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -608,8 +608,9 @@ func (c *Container) createBlockDevices(ctx context.Context) error { continue } - if c.mounts[i].Type != "bind" { - // We only handle for bind-mounts + isBlockFile := HasOption(c.mounts[i].Options, vcAnnotations.IsFileBlockDevice) + if c.mounts[i].Type != "bind" && !isBlockFile { + // We only handle for bind and block device mounts. continue } @@ -671,7 +672,7 @@ func (c *Container) createBlockDevices(ctx context.Context) error { // Check if mount is a block device file. If it is, the block device will be attached to the host // instead of passing this as a shared mount. - if stat.Mode&unix.S_IFBLK == unix.S_IFBLK { + if stat.Mode&unix.S_IFMT == unix.S_IFBLK { di = &config.DeviceInfo{ HostPath: c.mounts[i].Source, ContainerPath: c.mounts[i].Destination, @@ -680,6 +681,15 @@ func (c *Container) createBlockDevices(ctx context.Context) error { Minor: int64(unix.Minor(uint64(stat.Rdev))), ReadOnly: c.mounts[i].ReadOnly, } + } else if isBlockFile && stat.Mode&unix.S_IFMT == unix.S_IFREG { + di = &config.DeviceInfo{ + HostPath: c.mounts[i].Source, + ContainerPath: c.mounts[i].Destination, + DevType: "b", + Major: -1, + Minor: 0, + ReadOnly: c.mounts[i].ReadOnly, + } // Check whether source can be used as a pmem device } else if di, err = config.PmemDeviceInfo(c.mounts[i].Source, c.mounts[i].Destination); err != nil { c.Logger().WithError(err). diff --git a/src/runtime/virtcontainers/fs_share_linux.go b/src/runtime/virtcontainers/fs_share_linux.go index 085f3b4f8b..d2f9039726 100644 --- a/src/runtime/virtcontainers/fs_share_linux.go +++ b/src/runtime/virtcontainers/fs_share_linux.go @@ -22,6 +22,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) @@ -376,6 +377,20 @@ func (f *FilesystemShare) ShareRootFilesystem(ctx context.Context, c *Container) } rootfsGuestPath := filepath.Join(kataGuestSharedDir(), c.id, c.rootfsSuffix) + if HasOptionPrefix(c.rootFs.Options, annotations.FileSystemLayer) { + path := filepath.Join("/run/kata-containers", c.id, "rootfs") + return &SharedFile{ + storage: &grpc.Storage{ + MountPoint: path, + Source: "none", + Fstype: c.rootFs.Type, + Driver: kataOverlayDevType, + Options: c.rootFs.Options, + }, + guestPath: path, + }, nil + } + if c.state.Fstype != "" && c.state.BlockDeviceID != "" { // The rootfs storage volume represents the container rootfs // mount point inside the guest. diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index a3e5f5d421..e102484c4c 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -907,6 +907,8 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st for _, m := range spec.Mounts { if _, found := ignoredMounts[m.Source]; found { k.Logger().WithField("removed-mount", m.Source).Debug("Removing OCI mount") + } else if HasOption(m.Options, vcAnnotations.IsFileSystemLayer) { + k.Logger().WithField("removed-mount", m.Source).Debug("Removing layer") } else { mounts = append(mounts, m) } @@ -1293,13 +1295,17 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co // Block based volumes will require some adjustments in the OCI spec, and creation of // storage objects to pass to the agent. - volumeStorages, err := k.handleBlkOCIMounts(c, ociSpec) + layerStorages, volumeStorages, err := k.handleBlkOCIMounts(c, ociSpec) if err != nil { return nil, err } ctrStorages = append(ctrStorages, volumeStorages...) + // Layer storage objects are prepended to the list so that they come _before_ the + // rootfs because the rootfs depends on them (it's an overlay of the layers). + ctrStorages = append(layerStorages, ctrStorages...) + grpcSpec, err := grpc.OCItoGRPC(ociSpec) if err != nil { return nil, err @@ -1611,9 +1617,10 @@ func (k *kataAgent) createBlkStorageObject(c *Container, m Mount) (*grpc.Storage // handleBlkOCIMounts will create a unique destination mountpoint in the guest for each volume in the // given container and will update the OCI spec to utilize this mount point as the new source for the // container volume. The container mount structure is updated to store the guest destination mountpoint. -func (k *kataAgent) handleBlkOCIMounts(c *Container, spec *specs.Spec) ([]*grpc.Storage, error) { +func (k *kataAgent) handleBlkOCIMounts(c *Container, spec *specs.Spec) ([]*grpc.Storage, []*grpc.Storage, error) { var volumeStorages []*grpc.Storage + var layerStorages []*grpc.Storage for i, m := range c.mounts { id := m.BlockDeviceID @@ -1629,7 +1636,12 @@ func (k *kataAgent) handleBlkOCIMounts(c *Container, spec *specs.Spec) ([]*grpc. // Create Storage structure vol, err := k.createBlkStorageObject(c, m) if vol == nil || err != nil { - return nil, err + return nil, nil, err + } + + if HasOption(m.Options, vcAnnotations.IsFileSystemLayer) { + layerStorages = append(layerStorages, vol) + continue } // Each device will be mounted at a unique location within the VM only once. Mounting @@ -1660,7 +1672,7 @@ func (k *kataAgent) handleBlkOCIMounts(c *Container, spec *specs.Spec) ([]*grpc. volumeStorages = append(volumeStorages, vol) } - return volumeStorages, nil + return layerStorages, volumeStorages, nil } // handlePidNamespace checks if Pid namespace for a container needs to be shared with its sandbox diff --git a/src/runtime/virtcontainers/mount.go b/src/runtime/virtcontainers/mount.go index 6c2e204208..f261618905 100644 --- a/src/runtime/virtcontainers/mount.go +++ b/src/runtime/virtcontainers/mount.go @@ -415,3 +415,21 @@ func isWatchableMount(path string) bool { return false } + +func HasOption(options []string, option string) bool { + for _, o := range options { + if o == option { + return true + } + } + return false +} + +func HasOptionPrefix(options []string, prefix string) bool { + for _, o := range options { + if strings.HasPrefix(o, prefix) { + return true + } + } + return false +} diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index eb8656cc28..d51447d73e 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -310,6 +310,21 @@ const ( ContainerResourcesSwapInBytes = kataAnnotContainerResourcePrefix + "swap_in_bytes" ) +// Annotations related to file system options. +const ( + kataAnnotFsOptPrefix = kataAnnotationsPrefix + "fs-opt." + + // FileSystemLayer describes a layer of an overlay filesystem. + FileSystemLayer = kataAnnotFsOptPrefix + "layer=" + + // IsFileSystemLayer indicates that the annotated filesystem is a layer of an overlay fs. + IsFileSystemLayer = kataAnnotFsOptPrefix + "is-layer" + + // IsFileBlockDevice indicates that the annotated filesystem is mounted on a block device + // backed by a host file. + IsFileBlockDevice = kataAnnotFsOptPrefix + "block_device=file" +) + const ( // SHA512 is the SHA-512 (64) hash algorithm SHA512 string = "sha512"