runtime: Simplify mounting guest devices when using hostPath volumes

This change crystallizes and simplifies the current handling of /dev
hostPath mounts with virtually no functional change.

Before this change:

 - If a mount DESTINATION is in /dev and it is a non-regular file on the HOST,
   the shim passes the OCI bind mount as is to the guest (e.g.
   /dev/kmsg:/dev/kmsg). The container rightfully sees the GUEST device.

 - If the mount DESTINATION does not exist on the host, the shim relies on
   k8s/containerd to automatically create a directory (i.e. a non-regular file)
   on the HOST. The shim then also passes the OCI bind mount as is to the guest.
   The container rightfully sees the GUEST device.

 - For other /dev mounts, the shim passes the device major/minor to the guest
   over virtio-fs. The container rightfully sees the GUEST device.

After this change:

 - If a mount SOURCE is in /dev and it is a non-regular file on the HOST,
   the shim passes the OCI bind mount as is to the guest. The container
   rightfully sees the GUEST device.

 - The shim no longer relies on k8s/containerd to create missing mount
   directories. Instead, it explicitly handles missing mount SOURCES and
   treats them like the previous bullet point.

 - The shim no longer uses virtio-fs to pass /dev device major/minor to the
   guest; instead, it passes the OCI bind mount as is.

Signed-off-by: Aurélien Bombo <abombo@microsoft.com>
This commit is contained in:
Aurélien Bombo
2025-09-22 15:57:33 -05:00
parent 285aaad13e
commit 5c21b1faf3
8 changed files with 114 additions and 15 deletions

View File

@@ -166,6 +166,15 @@ moment.
See [this issue](https://github.com/kata-containers/runtime/issues/2812) for more details.
[Another issue](https://github.com/kata-containers/kata-containers/issues/1728) focuses on the case of `emptyDir`.
### Kubernetes [hostPath][k8s-hostpath] volumes
When the source path of a hostPath volume is under `/dev`, and the path
is either a non-regular file on the host (for example a device node or a
directory) or cannot be inspected by the Kata shim (for example because it
does not exist on the host), the Kata agent bind mounts the source path
directly from the *guest* filesystem into the container.
[k8s-hostpath]: https://kubernetes.io/docs/concepts/storage/volumes/#hostpath
## Host resource sharing
### Privileged containers

View File

@@ -469,8 +469,15 @@ func (c *Container) mountSharedDirMounts(ctx context.Context, sharedDirMounts, i
// Ignore /dev, directories and all other device files. We handle
// only regular files in /dev. It does not make sense to pass the host
// device nodes to the guest.
if isHostDevice(m.Destination) {
// device nodes to the guest. We also ignore inaccessible host
// devices in case we're mounting a device that is only
// accessible in the guest.
//
// Note: K8s/containerd seems to create the source path as a
// directory on the host if it does not already exist.
// isHostDevice() will still return true in that case, so the
// above contract holds.
if isDevice, err := isHostDevice(m.Source); isDevice || err != nil {
continue
}

View File

@@ -54,33 +54,32 @@ func isSystemMount(m string) bool {
return false
}
func isHostDevice(m string) bool {
// isHostDevice returns whether the given path is a non-regular file
// under /dev (or /dev itself) on the host. If os.Stat fails on the
// file, it returns false plus the error from os.Stat.
func isHostDevice(m string) (bool, error) {
m = filepath.Clean(m)
if m == "/dev" {
return true
return true, nil
}
if strings.HasPrefix(m, "/dev/") {
// Check if regular file
s, err := os.Stat(m)
// This should not happen. In case file does not exist let the
// error be handled by the agent, simply return false here.
if err != nil {
return false
return false, err
}
if s.Mode().IsRegular() {
return false
return false, nil
}
// This is not a regular file in /dev. It is either a
// device file, directory or any other special file which is
// specific to the host system.
return true
return true, nil
}
return false
return false, nil
}
func major(dev uint64) int {
@@ -131,7 +130,7 @@ func getDeviceForPath(path string) (device, error) {
return device{}, err
}
if isHostDevice(path) {
if isDevice, _ := isHostDevice(path); isDevice {
// stat.Rdev describes the device that this file (inode) represents.
devMajor = major(uint64(stat.Rdev))
devMinor = minor(uint64(stat.Rdev))

View File

@@ -255,7 +255,7 @@ func TestIsHostDevice(t *testing.T) {
}
for _, test := range tests {
result := isHostDevice(test.mnt)
result, _ := isHostDevice(test.mnt)
assert.Equal(result, test.expected)
}
}

View File

@@ -65,7 +65,9 @@ func TestIsHostDeviceCreateFile(t *testing.T) {
assert.NoError(err)
f.Close()
assert.False(isHostDevice(path))
isDevice, err := isHostDevice(path)
assert.False(isDevice)
assert.NoError(err)
assert.NoError(os.Remove(path))
}

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env bats
#
# Copyright (c) 2025 Microsoft Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
load "${BATS_TEST_DIRNAME}/../../common.bash"
load "${BATS_TEST_DIRNAME}/lib.sh"
load "${BATS_TEST_DIRNAME}/tests_common.sh"
# Bats per-test setup: resolves the pod manifest path, defines the commands
# the test will `kubectl exec` in the pod, and generates an agent policy that
# allows those commands.
setup() {
    setup_common
    # NOTE(review): presumably sets ${pod_config_dir}, which is read below —
    # confirm against tests_common.sh.
    get_pod_config_dir

    pod_name="hostpath-kmsg"
    yaml_file="${pod_config_dir}/pod-hostpath-kmsg.yaml"

    # Commands executed inside the container by the test. They are kept as
    # arrays so the exact same argv is registered in the policy and passed to
    # `kubectl exec`.
    cmd_mountinfo=(sh -c "grep /dev/kmsg /proc/self/mountinfo")
    cmd_stat=(sh -c "stat -c '%t,%T' /dev/kmsg")
    cmd_head=(sh -c "head -10 /dev/kmsg")

    # Build temporary policy settings that allow each exec command above, then
    # generate the policy for the pod manifest from them.
    policy_settings_dir="$(create_tmp_policy_settings_dir "${pod_config_dir}")"
    add_exec_to_policy_settings "${policy_settings_dir}" "${cmd_mountinfo[@]}"
    add_exec_to_policy_settings "${policy_settings_dir}" "${cmd_stat[@]}"
    add_exec_to_policy_settings "${policy_settings_dir}" "${cmd_head[@]}"
    # NOTE(review): ReadStreamRequest is presumably needed to read the exec
    # output back from the agent — confirm against the policy helpers.
    add_requests_to_policy_settings "${policy_settings_dir}" "ReadStreamRequest"
    auto_generate_policy "${policy_settings_dir}" "${yaml_file}"
}
# Verifies that a hostPath volume whose source is /dev/kmsg shows up in the
# container as a bind mount of the guest's own device node rather than as a
# file shared from the host over virtio-fs.
@test "/dev hostPath volume bind mounts the guest device and skips virtio-fs" {
    kubectl apply -f "${yaml_file}"
    kubectl wait --for=condition=Ready --timeout="${timeout}" pod "${pod_name}"

    # Check the mount info.
    #
    # A mountinfo line looks like:
    #   ID PARENT MAJ:MIN ROOT MOUNTPOINT OPTIONS [OPTIONAL...] - FSTYPE SOURCE SUPEROPTS
    # The number of optional fields (field 7 onwards) is variable, zero
    # included, so the filesystem type is located relative to the "-"
    # separator instead of being read from a fixed column.
    mount_info="$(kubectl exec "${pod_name}" -- "${cmd_mountinfo[@]}")"
    read -r root mountpoint fstype < <(awk '{
        for (i = 7; i < NF; i++) {
            if ($i == "-") {
                print $4, $5, $(i + 1)
                exit
            }
        }
    }' <<< "${mount_info}")
    [ "$root" == "/kmsg" ] # Would look like "/<CONTAINER_ID>-<RANDOM_ID>-kmsg" with virtio-fs.
    [ "$mountpoint" == "/dev/kmsg" ]
    [ "$fstype" == "devtmpfs" ] # Would be "virtiofs" with virtio-fs.

    # Check the device major/minor. stat prints both in hexadecimal, so "1,b"
    # is major 1, minor 11.
    majminor="$(kubectl exec "${pod_name}" -- "${cmd_stat[@]}")"
    [ "$majminor" == "1,b" ]

    # Check that the device is actually accessible.
    kubectl exec "${pod_name}" -- "${cmd_head[@]}"
}
# Bats per-test teardown: removes the temporary policy settings directory
# created in setup() and runs the shared teardown helper.
teardown() {
    delete_tmp_policy_settings_dir "${policy_settings_dir}"
    # NOTE(review): ${node} and ${node_start_time} are not assigned in this
    # file; presumably setup_common sets them — confirm. node_start_time
    # falls back to an empty string when unset.
    teardown_common "${node}" "${node_start_time:-}"
}

View File

@@ -60,6 +60,7 @@ else
"k8s-exec.bats" \
"k8s-file-volume.bats" \
"k8s-hostname.bats" \
"k8s-hostpath-volume.bats" \
"k8s-inotify.bats" \
"k8s-ip6tables.bats" \
"k8s-job.bats" \

View File

@@ -0,0 +1,24 @@
#
# Copyright (c) 2025 Microsoft Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Pod that mounts /dev/kmsg through a hostPath volume at the same path inside
# its single container.
apiVersion: v1
kind: Pod
metadata:
  name: hostpath-kmsg
spec:
  terminationGracePeriodSeconds: 0
  runtimeClassName: kata
  restartPolicy: Never
  volumes:
    # hostPath volume whose source lives under /dev.
    - name: dev-kmsg
      hostPath:
        path: /dev/kmsg
  containers:
    - image: quay.io/prometheus/busybox:latest
      name: container
      volumeMounts:
        # Mounted at the same path as the hostPath source.
        - name: dev-kmsg
          mountPath: /dev/kmsg