From 20ca4d2d79aa5bf63aa1254f08915da84f19e92a Mon Sep 17 00:00:00 2001 From: Dan Mihai Date: Wed, 7 Jan 2026 00:09:08 +0000 Subject: [PATCH] runtime: DEFDISABLEBLOCK := true MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Add disable_block_device_use to CLH settings file, for parity with the already existing QEMU settings. 2. Set DEFDISABLEBLOCK := true by default for both QEMU and CLH. After this change, Kata Guests will use by default virtio-fs to access container rootfs directories from their Hosts. Hosts that were designed to use Host block devices attached to the Guests can re-enable these rootfs block devices by changing the value of disable_block_device_use back to false in their settings files. 3. Add test using container image without any rootfs layers. Depending on the container runtime and image snapshotter being used, the empty container rootfs image might get stored on a host block device that cannot be safely hotplugged to a guest VM, because the host is using the same block device. 4. Add block device hotplug safety warning into the Kata Shim configuration files. 
Signed-off-by: Dan Mihai Signed-off-by: Fabiano Fidêncio Signed-off-by: Cameron McDermott --- docs/design/architecture/storage.md | 7 ++- docs/how-to/how-to-set-sandbox-config-kata.md | 2 +- src/runtime/Makefile | 2 +- src/runtime/config/configuration-clh.toml.in | 14 +++++ .../config/configuration-qemu-cca.toml.in | 8 ++- .../configuration-qemu-coco-dev.toml.in | 8 ++- .../configuration-qemu-nvidia-gpu-snp.toml.in | 8 ++- .../configuration-qemu-nvidia-gpu-tdx.toml.in | 8 ++- .../configuration-qemu-nvidia-gpu.toml.in | 8 ++- .../config/configuration-qemu-se.toml.in | 8 ++- .../config/configuration-qemu-snp.toml.in | 8 ++- .../config/configuration-qemu-tdx.toml.in | 8 ++- src/runtime/config/configuration-qemu.toml.in | 8 ++- .../config/configuration-stratovirt.toml.in | 8 ++- .../kubernetes/k8s-empty-image.bats | 59 +++++++++++++++++++ .../kubernetes/run_kubernetes_tests.sh | 1 + .../no-layer-image.yaml | 13 ++++ 17 files changed, 163 insertions(+), 15 deletions(-) create mode 100644 tests/integration/kubernetes/k8s-empty-image.bats create mode 100644 tests/integration/kubernetes/runtimeclass_workloads/no-layer-image.yaml diff --git a/docs/design/architecture/storage.md b/docs/design/architecture/storage.md index 3aefc7ecf9..d3cb71ad85 100644 --- a/docs/design/architecture/storage.md +++ b/docs/design/architecture/storage.md @@ -51,6 +51,7 @@ containers started after the VM has been launched. Users can check to see if the container uses the `devicemapper` block device as its rootfs by calling `mount(8)` within the container. If the `devicemapper` block device is used, the root filesystem (`/`) -will be mounted from `/dev/vda`. Users can disable direct mounting of -the underlying block device through the runtime -[configuration](README.md#configuration). +will be mounted from `/dev/vda`. Users can enable direct mounting of +the underlying block device by setting the runtime +[configuration](README.md#configuration) flag `disable_block_device_use` to +`false`. 
diff --git a/docs/how-to/how-to-set-sandbox-config-kata.md b/docs/how-to/how-to-set-sandbox-config-kata.md index aa044367b3..37c454cba4 100644 --- a/docs/how-to/how-to-set-sandbox-config-kata.md +++ b/docs/how-to/how-to-set-sandbox-config-kata.md @@ -50,7 +50,7 @@ There are several kinds of Kata configurations and they are listed below. | `io.katacontainers.config.hypervisor.default_max_vcpus` | uint32| the maximum number of vCPUs allocated for the VM by the hypervisor | | `io.katacontainers.config.hypervisor.default_memory` | uint32| the memory assigned for a VM by the hypervisor in `MiB` | | `io.katacontainers.config.hypervisor.default_vcpus` | float32| the default vCPUs assigned for a VM by the hypervisor | -| `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disallow a block device from being used | +| `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disable hotplugging host block devices to guest VMs for container rootfs | | `io.katacontainers.config.hypervisor.disable_image_nvdimm` | `boolean` | specify if a `nvdimm` device should be used as rootfs for the guest (QEMU) | | `io.katacontainers.config.hypervisor.disable_vhost_net` | `boolean` | specify if `vhost-net` is not available on the host | | `io.katacontainers.config.hypervisor.enable_hugepages` | `boolean` | if the memory should be `pre-allocated` from huge pages | diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 270943e152..662f8dcb84 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -250,7 +250,7 @@ DEFSECCOMPSANDBOXPARAM := DEFENTROPYSOURCE := /dev/urandom DEFVALIDENTROPYSOURCES := [\"/dev/urandom\",\"/dev/random\",\"\"] -DEFDISABLEBLOCK := false +DEFDISABLEBLOCK := true DEFSHAREDFS_CLH_VIRTIOFS := virtio-fs DEFSHAREDFS_QEMU_VIRTIOFS := virtio-fs # Please keep DEFSHAREDFS_QEMU_COCO_DEV_VIRTIOFS in sync with TDX/SNP diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in 
index fe5fca4883..937d25d6ba 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -109,6 +109,20 @@ memory_slots = @DEFMEMSLOTS@ # > amount of physical RAM --> will be set to the actual amount of physical RAM default_maxmemory = @DEFMAXMEMSZ@ +# Disable hotplugging host block devices to guest VMs for container rootfs. +# In case of a storage driver like devicemapper where a container's +# root file system is backed by a block device, the block device is passed +# directly to the hypervisor for performance reasons. +# This flag prevents the block device from being passed to the hypervisor, +# virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. +disable_block_device_use = @DEFDISABLEBLOCK@ + # Shared file system type: # - virtio-fs (default) # - virtio-fs-nydus diff --git a/src/runtime/config/configuration-qemu-cca.toml.in b/src/runtime/config/configuration-qemu-cca.toml.in index e1469b59ce..7d71dc1d47 100644 --- a/src/runtime/config/configuration-qemu-cca.toml.in +++ b/src/runtime/config/configuration-qemu-cca.toml.in @@ -159,12 +159,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. 
+# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-qemu-coco-dev.toml.in b/src/runtime/config/configuration-qemu-coco-dev.toml.in index 64b3917124..1a792d9bf8 100644 --- a/src/runtime/config/configuration-qemu-coco-dev.toml.in +++ b/src/runtime/config/configuration-qemu-coco-dev.toml.in @@ -145,12 +145,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. 
disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in index ef0333d75a..0da0ccd413 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in @@ -185,12 +185,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in index 90a0707800..3b3fc11d27 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in @@ -162,12 +162,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. 
# In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in index 65323638e7..ea03eff328 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in @@ -144,12 +144,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. 
disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-qemu-se.toml.in b/src/runtime/config/configuration-qemu-se.toml.in index a7732bd1f9..483fb348b3 100644 --- a/src/runtime/config/configuration-qemu-se.toml.in +++ b/src/runtime/config/configuration-qemu-se.toml.in @@ -153,12 +153,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-qemu-snp.toml.in b/src/runtime/config/configuration-qemu-snp.toml.in index e79051fec6..13a364de9a 100644 --- a/src/runtime/config/configuration-qemu-snp.toml.in +++ b/src/runtime/config/configuration-qemu-snp.toml.in @@ -184,12 +184,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. 
# This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-qemu-tdx.toml.in b/src/runtime/config/configuration-qemu-tdx.toml.in index 287d356a55..5029c3ec92 100644 --- a/src/runtime/config/configuration-qemu-tdx.toml.in +++ b/src/runtime/config/configuration-qemu-tdx.toml.in @@ -161,12 +161,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. 
disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index c31d17f489..af971558ca 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -144,12 +144,18 @@ memory_offset = 0 # Default false enable_virtio_mem = false -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/src/runtime/config/configuration-stratovirt.toml.in b/src/runtime/config/configuration-stratovirt.toml.in index a86a584a52..b9f28a74a6 100644 --- a/src/runtime/config/configuration-stratovirt.toml.in +++ b/src/runtime/config/configuration-stratovirt.toml.in @@ -103,12 +103,18 @@ default_maxmemory = @DEFMAXMEMSZ@ # Default 0 memory_offset = 0 -# Disable block device from being used for a container's rootfs. +# Disable hotplugging host block devices to guest VMs for container rootfs. # In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. 
# This flag prevents the block device from being passed to the hypervisor, # virtio-fs is used instead to pass the rootfs. +# WARNING: +# Don't set this flag to false if you don't understand well the behavior of +# your container runtime and image snapshotter. Some snapshotters might use +# container image storage devices that are not meant to be hotplugged into a +# guest VM - e.g., because they contain files used by the host or by other +# guests. disable_block_device_use = @DEFDISABLEBLOCK@ # Shared file system type: diff --git a/tests/integration/kubernetes/k8s-empty-image.bats b/tests/integration/kubernetes/k8s-empty-image.bats new file mode 100644 index 0000000000..6d003b2aad --- /dev/null +++ b/tests/integration/kubernetes/k8s-empty-image.bats @@ -0,0 +1,59 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2025 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +load "${BATS_TEST_DIRNAME}/../../common.bash" +load "${BATS_TEST_DIRNAME}/lib.sh" +load "${BATS_TEST_DIRNAME}/tests_common.sh" + +setup() { + setup_common || die "setup_common failed" + pod_name="no-layer-image" + get_pod_config_dir + + yaml_file="${pod_config_dir}/${pod_name}.yaml" + + # genpolicy fails for this unusual container image, so use the allow_all policy. 
+ add_allow_all_policy_to_yaml "${yaml_file}" +} + +@test "Test image with no layers cannot run" { + # Error from run-k8s-tests (ubuntu, qemu, small): + # + # failed to create containerd task: failed to create shim task: the file sleep was not found + # + # Error from run-k8s-tests-on-tee (sev-snp, qemu-snp): + # + # failed to create containerd task: failed to create shim task: rpc status: + # Status { code: INTERNAL, message: "[CDH] [ERROR]: Image Pull error: Failed to pull image + # ghcr.io/kata-containers/no-layer-image:latest from all mirror/mapping locations or original location: image: + # ghcr.io/kata-containers/no-layer-image:latest, error: Internal error", details: [], special_fields: + # SpecialFields { unknown_fields: UnknownFields { fields: None }, cached_size: CachedSize { size: 0 } } } + # + # Error from run-k8s-tests-coco-nontee-with-erofs-snapshotter (qemu-coco-dev, erofs, default): + # + # failed to create containerd task: failed to create shim task: failed to mount + # /run/kata-containers/shared/containers/fadd1af7ea2a7bfc6caf26471f70e9a913a2989fd4a1be9d001b59e48c0781aa/rootfs + # to /run/kata-containers/fadd1af7ea2a7bfc6caf26471f70e9a913a2989fd4a1be9d001b59e48c0781aa/rootfs, with error: + # ENOENT: No such file or directory + + kubectl create -f "${yaml_file}" + + local -r command="kubectl describe "pod/${pod_name}" | grep -E \ + 'the file sleep was not found|\[CDH\] \[ERROR\]: Image Pull error|ENOENT: No such file or directory'" + info "Waiting ${wait_time} seconds for: ${command}" + waitForProcess "${wait_time}" "${sleep_time}" "${command}" >/dev/null 2>/dev/null +} + +teardown() { + # Debugging information + kubectl describe "pod/${pod_name}" + kubectl get "pod/${pod_name}" -o yaml + + kubectl delete pod "${pod_name}" + + teardown_common "${node}" "${node_start_time:-}" +} diff --git a/tests/integration/kubernetes/run_kubernetes_tests.sh b/tests/integration/kubernetes/run_kubernetes_tests.sh index a1c24c11d7..bdd6a79def 100755 --- 
a/tests/integration/kubernetes/run_kubernetes_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_tests.sh @@ -42,6 +42,7 @@ else ) K8S_TEST_SMALL_HOST_UNION=( \ + "k8s-empty-image.bats" \ "k8s-guest-pull-image.bats" \ "k8s-confidential.bats" \ "k8s-sealed-secret.bats" \ diff --git a/tests/integration/kubernetes/runtimeclass_workloads/no-layer-image.yaml b/tests/integration/kubernetes/runtimeclass_workloads/no-layer-image.yaml new file mode 100644 index 0000000000..0e552eb5d3 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/no-layer-image.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Pod +metadata: + name: no-layer-image +spec: + runtimeClassName: kata + containers: + - name: no-layer-image + image: ghcr.io/kata-containers/no-layer-image:latest + resources: {} + command: + - sleep + - infinity