From 6203e28bef1e9a1916295fcd65e3b95adb55dce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 15 Jun 2026 23:18:03 +0200 Subject: [PATCH 1/2] runtime: Propagate block-mode device read-only flag to the VMM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Block-mode volumes (e.g. Kubernetes volumeDevices) are passed to the container as device nodes in spec.Linux.Devices and carry no mount "ro" option. Their read-only intent is expressed only via the cgroup device access in spec.Linux.Resources.Devices ("rm" = read+mknod, no write, for read-only; "rwm" for read-write). The device path ignored that signal: newLinuxDeviceInfo() built the DeviceInfo without ever setting ReadOnly (it only consumed FileMode, the node permission bits, not the read/write access), so the device was always attached read-write. This is problematic for filesystems such as XFS, which inspect the block device read-only state to decide whether to attempt journal/log recovery. When the guest device is writable, XFS tries to replay the log even for a read-only mount, which fails badly. Mounting "-o ro" in the guest is not enough; the device itself must advertise read-only, which only happens when the VMM opens the backing device read-only (DeviceInfo.ReadOnly -> BlockDrive.ReadOnly -> qemu read-only=on / clh Readonly). Derive the read-only flag from two independent signals, combined with OR so either one marks the device read-only: - the cgroup device access rule that exactly matches the device, so a block-mode volume marked read-only by the orchestrator (e.g. a pod volume with persistentVolumeClaim.readOnly: true) is honored, and - the host block device's own read-only flag (queried via the BLKROGET ioctl). Block-mode volumes frequently carry no read-only signal in the OCI spec at all, so the device flag is often the only reliable source. The BLKROGET probe is shared (pkg/device/config.BlockDeviceIsReadOnly, Linux-only with a stub on other platforms) between the device-node path (newLinuxDeviceInfo, probing /dev/block/:) and the bind-mounted/filesystem block path (createDeviceInfo). None of this relies on external host tooling such as "blockdev --setro". Signed-off-by: Fabiano FidĂȘncio Assisted-by: Cursor --- src/runtime/pkg/device/config/config_linux.go | 37 ++++++++ .../pkg/device/config/config_nonlinux.go | 18 ++++ src/runtime/pkg/oci/utils.go | 75 +++++++++++++++- src/runtime/pkg/oci/utils_test.go | 89 +++++++++++++++++++ src/runtime/virtcontainers/container.go | 12 +++ 5 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 src/runtime/pkg/device/config/config_linux.go create mode 100644 src/runtime/pkg/device/config/config_nonlinux.go diff --git a/src/runtime/pkg/device/config/config_linux.go b/src/runtime/pkg/device/config/config_linux.go new file mode 100644 index 0000000000..732c0d2972 --- /dev/null +++ b/src/runtime/pkg/device/config/config_linux.go @@ -0,0 +1,37 @@ +// Copyright (c) 2017-2018 Intel Corporation +// Copyright (c) 2018 Huawei Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package config + +import ( + "os" + + "golang.org/x/sys/unix" +) + +// BlockDeviceIsReadOnly queries the host block device at path for its +// read-only flag (BLKROGET). This reflects the device's actual writability, +// which is the ground truth for whether the guest should see it read-only: +// when the host backing is read-only, writes from the guest fail at the host +// anyway, so the device must be exposed read-only (e.g. so the guest does not +// attempt a journal replay that cannot succeed). The read-only intent for such +// devices is frequently not carried in the OCI spec (no "ro" mount option and +// no read-only cgroup device rule), so the device flag is the only reliable +// source. +func BlockDeviceIsReadOnly(path string) (bool, error) { + f, err := os.OpenFile(path, os.O_RDONLY|unix.O_CLOEXEC|unix.O_NONBLOCK, 0) + if err != nil { + return false, err + } + defer f.Close() + + ro, err := unix.IoctlGetInt(int(f.Fd()), unix.BLKROGET) + if err != nil { + return false, err + } + + return ro != 0, nil +} diff --git a/src/runtime/pkg/device/config/config_nonlinux.go b/src/runtime/pkg/device/config/config_nonlinux.go new file mode 100644 index 0000000000..0b2741d2ef --- /dev/null +++ b/src/runtime/pkg/device/config/config_nonlinux.go @@ -0,0 +1,18 @@ +// Copyright (c) 2017-2018 Intel Corporation +// Copyright (c) 2018 Huawei Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +//go:build !linux + +package config + +import "fmt" + +// BlockDeviceIsReadOnly is only meaningful on Linux, where the BLKROGET ioctl +// is available. On other platforms it is a no-op stub so the package still +// builds (e.g. for the macOS CI), and callers treat the error as "unknown". +func BlockDeviceIsReadOnly(path string) (bool, error) { + return false, fmt.Errorf("BlockDeviceIsReadOnly is not supported on this platform") +} diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index 300b1a5084..c90cbe3149 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -320,7 +320,62 @@ func checkAnnotationNameIsValid(list []string, name string, prefix string) bool return true } -func newLinuxDeviceInfo(d specs.LinuxDevice) (*config.DeviceInfo, error) { +// deviceCgroupAccessIsReadOnly derives a device's read-only intent from the +// cgroup device access rules. Block-mode volumes (e.g. Kubernetes +// volumeDevices) are passed as device nodes in spec.Linux.Devices and carry no +// mount "ro" option; their read-only intent is expressed solely through the +// cgroup device access in spec.Linux.Resources.Devices ("rm" = read+mknod, no +// write, for read-only; "rwm" for read-write). +// +// The allow rule that exactly matches the device (type and exact major/minor) +// decides: the device is read-only when that rule grants access without the +// write ("w") bit. Wildcard rules (nil major/minor) describe broad device +// classes and are ignored so they cannot override a specific device's access. +// If no exact rule matches, the device is left read-write, preserving the +// previous behavior. +func deviceCgroupAccessIsReadOnly(resources *specs.LinuxResources, devType string, major, minor int64) bool { + if resources == nil { + return false + } + + for _, r := range resources.Devices { + if !r.Allow { + continue + } + if r.Major == nil || r.Minor == nil { + continue + } + if *r.Major != major || *r.Minor != minor { + continue + } + if r.Type != "" && r.Type != "a" && r.Type != devType { + continue + } + + return !strings.Contains(r.Access, "w") + } + + return false +} + +// blockDeviceReadOnlyProbe reports whether the host block device identified by +// major:minor advertises the read-only flag (BLKROGET). It is a package +// variable so tests can stub the host probe. The default implementation does a +// best-effort probe of /dev/block/: (the canonical sysfs-backed +// node that always exists for a registered block device); any failure is logged +// and treated as not-read-only so it can never flip a positive signal back. +var blockDeviceReadOnlyProbe = func(major, minor int64) bool { + path := fmt.Sprintf("/dev/block/%d:%d", major, minor) + ro, err := config.BlockDeviceIsReadOnly(path) + if err != nil { + ociLog.WithError(err).WithField("device", path). + Warn("could not query block device read-only flag") + return false + } + return ro +} + +func newLinuxDeviceInfo(d specs.LinuxDevice, resources *specs.LinuxResources) (*config.DeviceInfo, error) { allowedDeviceTypes := []string{"c", "b", "u", "p"} if !contains(allowedDeviceTypes, d.Type) { @@ -331,11 +386,22 @@ func newLinuxDeviceInfo(d specs.LinuxDevice) (*config.DeviceInfo, error) { return nil, fmt.Errorf("Path cannot be empty for device") } + // Read-only intent comes from the cgroup device access rule. For block + // devices, also honor the host device's own read-only flag (BLKROGET): + // block-mode volumes frequently carry no read-only signal in the OCI spec, + // so the device flag is the only reliable source. Either signal being + // positive marks the device read-only. + readOnly := deviceCgroupAccessIsReadOnly(resources, d.Type, d.Major, d.Minor) + if !readOnly && d.Type == "b" { + readOnly = blockDeviceReadOnlyProbe(d.Major, d.Minor) + } + deviceInfo := config.DeviceInfo{ ContainerPath: d.Path, DevType: d.Type, Major: d.Major, Minor: d.Minor, + ReadOnly: readOnly, } if d.UID != nil { deviceInfo.UID = *d.UID @@ -359,9 +425,14 @@ func containerDeviceInfos(spec specs.Spec) ([]config.DeviceInfo, error) { return []config.DeviceInfo{}, nil } + var resources *specs.LinuxResources + if spec.Linux != nil { + resources = spec.Linux.Resources + } + var devices []config.DeviceInfo for _, d := range ociLinuxDevices { - linuxDeviceInfo, err := newLinuxDeviceInfo(d) + linuxDeviceInfo, err := newLinuxDeviceInfo(d, resources) if err != nil { return []config.DeviceInfo{}, err } diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index 729f5c4b95..1d4125e6cd 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -375,6 +375,95 @@ func TestDevicePathEmpty(t *testing.T) { assert.NotNil(t, err, "This test should fail as path cannot be empty for device") } +func TestDeviceCgroupAccessIsReadOnly(t *testing.T) { + assert := assert.New(t) + + i64 := func(v int64) *int64 { return &v } + major, minor := int64(8), int64(0) + + rule := func(allow bool, typ string, maj, min *int64, access string) specs.LinuxDeviceCgroup { + return specs.LinuxDeviceCgroup{ + Allow: allow, + Type: typ, + Major: maj, + Minor: min, + Access: access, + } + } + + resources := func(rules ...specs.LinuxDeviceCgroup) *specs.LinuxResources { + return &specs.LinuxResources{Devices: rules} + } + + tests := []struct { + name string + resources *specs.LinuxResources + expected bool + }{ + {"nil resources", nil, false}, + {"no rules", resources(), false}, + {"exact match read-only (rm)", resources(rule(true, "b", i64(major), i64(minor), "rm")), true}, + {"exact match read-only (r)", resources(rule(true, "b", i64(major), i64(minor), "r")), true}, + {"exact match read-write (rwm)", resources(rule(true, "b", i64(major), i64(minor), "rwm")), false}, + {"type all (a) read-only", resources(rule(true, "a", i64(major), i64(minor), "rm")), true}, + {"empty type read-only", resources(rule(true, "", i64(major), i64(minor), "rm")), true}, + {"deny rule is ignored", resources(rule(false, "b", i64(major), i64(minor), "rm")), false}, + {"wildcard major is ignored", resources(rule(true, "b", nil, i64(minor), "rm")), false}, + {"wildcard minor is ignored", resources(rule(true, "b", i64(major), nil, "rm")), false}, + {"type mismatch is ignored", resources(rule(true, "c", i64(major), i64(minor), "rm")), false}, + {"different device is ignored", resources(rule(true, "b", i64(9), i64(1), "rm")), false}, + { + "first exact match wins", + resources( + rule(true, "b", i64(major), i64(minor), "rm"), + rule(true, "b", i64(major), i64(minor), "rwm"), + ), + true, + }, + } + + for _, tt := range tests { + assert.Equal(tt.expected, deviceCgroupAccessIsReadOnly(tt.resources, "b", major, minor), tt.name) + } +} + +func TestContainerDeviceInfosReadOnly(t *testing.T) { + assert := assert.New(t) + + // Stub the host BLKROGET probe so the test is deterministic and does no + // real device I/O: only the probe-only device (8:32) reports read-only. + savedProbe := blockDeviceReadOnlyProbe + defer func() { blockDeviceReadOnlyProbe = savedProbe }() + blockDeviceReadOnlyProbe = func(major, minor int64) bool { + return major == 8 && minor == 32 + } + + i64 := func(v int64) *int64 { return &v } + + var ociSpec specs.Spec + ociSpec.Linux = &specs.Linux{ + Devices: []specs.LinuxDevice{ + {Path: "/dev/cgroup-ro-blk", Type: "b", Major: 8, Minor: 0}, + {Path: "/dev/rw-blk", Type: "b", Major: 8, Minor: 16}, + {Path: "/dev/blkroget-ro-blk", Type: "b", Major: 8, Minor: 32}, + }, + Resources: &specs.LinuxResources{ + Devices: []specs.LinuxDeviceCgroup{ + {Allow: true, Type: "b", Major: i64(8), Minor: i64(0), Access: "rm"}, + {Allow: true, Type: "b", Major: i64(8), Minor: i64(16), Access: "rwm"}, + {Allow: true, Type: "b", Major: i64(8), Minor: i64(32), Access: "rwm"}, + }, + }, + } + + devices, err := containerDeviceInfos(ociSpec) + assert.NoError(err) + assert.Len(devices, 3) + assert.True(devices[0].ReadOnly, "device with cgroup access \"rm\" should be read-only") + assert.False(devices[1].ReadOnly, "device with cgroup access \"rwm\" and writable host device should be read-write") + assert.True(devices[2].ReadOnly, "device flagged read-only via BLKROGET should be read-only even with \"rwm\" cgroup access") +} + func TestGetShmSize(t *testing.T) { containerConfig := vc.ContainerConfig{ Mounts: []vc.Mount{}, diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index aa78dc693c..0513293cb6 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -814,6 +814,18 @@ func (c *Container) createDeviceInfo(source, destination string, readonly, isBlo var err error if stat.Mode&unix.S_IFMT == unix.S_IFBLK { + // Honor the host block device's own read-only flag in addition to the + // mount-derived intent, so a device marked read-only on the host is + // exposed read-only to the guest. + if !readonly { + if ro, roErr := config.BlockDeviceIsReadOnly(source); roErr != nil { + c.Logger().WithError(roErr).WithField("mount-source", source). + Warn("could not query block device read-only flag") + } else { + readonly = ro + } + } + di = &config.DeviceInfo{ HostPath: source, ContainerPath: destination, From cfab6f496b346bab6af98c851acc74d957bb2b75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 15 Jun 2026 23:18:20 +0200 Subject: [PATCH 2/2] runtime-rs: Propagate block device read-only flag to the VMM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Block volumes and block-mode device nodes were attached to the guest read-write regardless of the volume's read-only intent, so the guest-visible virtio-blk device was always writable. This matters beyond simple write protection: filesystems such as XFS inspect the block device read-only state to decide whether to attempt journal/log recovery. When the device is writable, XFS tries to replay the log even on a read-only mount, which fails badly. Mounting with "-o ro" inside the guest is not sufficient; the device itself must advertise read-only (VIRTIO_BLK_F_RO), which only happens when the VMM opens the backing image read-only. Set is_readonly on the block device config from two signals, combined with OR so either one marks the device read-only: - the read-only intent from the OCI spec: * bind-mounted block volumes and direct-assigned (raw block) volumes derive it from the "ro" mount option, and * block-mode volumes (e.g. Kubernetes volumeDevices) arrive as device nodes in spec.Linux.Devices with no mount option; their intent is expressed only via the cgroup device access in spec.Linux.Resources.Devices ("rm" = read+mknod, no write, for read-only; "rwm" for read-write). handler_devices() derives the flag from the matching cgroup allow rule, and - the host block device's own read-only flag (queried via the BLKROGET ioctl). Both the volume path (block_volume/rawblock_volume) and the device-node path (handler_devices, resolving the host node via get_host_path) honor it, so a device that is physically read-only on the host is exposed read-only to the guest even when the intent is not encoded in the OCI spec. All in-tree hypervisors (qemu, cloud-hypervisor, dragonball) already honor BlockConfig.is_readonly, so no hypervisor changes are required. Signed-off-by: Fabiano FidĂȘncio Assisted-by: Cursor --- .../crates/resource/src/manager_inner.rs | 177 +++++++++++++++++- .../resource/src/volume/block_volume.rs | 20 +- .../volume/direct_volumes/rawblock_volume.rs | 23 ++- .../crates/resource/src/volume/utils.rs | 29 +++ 4 files changed, 245 insertions(+), 4 deletions(-) diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 9f99c0944c..34d424d5f6 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -12,7 +12,7 @@ use async_trait::async_trait; use hypervisor::{ device::{ device_manager::{do_handle_device, get_block_device_info, DeviceManager}, - util::{get_host_path, DEVICE_TYPE_CHAR}, + util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR}, DeviceConfig, DeviceType, }, utils::uses_native_ccw_bus, @@ -41,7 +41,7 @@ use crate::{ resource_persist::ResourceState, rootfs::{RootFsResource, Rootfs}, share_fs::{self, sandbox_bind_mounts::SandboxBindMounts, NydusShareFs, ShareFs}, - volume::{Volume, VolumeResource}, + volume::{utils::is_block_device_readonly, Volume, VolumeResource}, ResourceConfig, ResourceUpdateOp, }; @@ -535,9 +535,21 @@ impl ResourceManagerInner { match d.typ() { LinuxDeviceType::B => { let blkdev_info = get_block_device_info(&self.device_manager).await; + // Read-only intent comes from the cgroup device access rule. + // Also honor the host device's own read-only flag (BLKROGET): + // block-mode volumes frequently carry no read-only signal in + // the OCI spec, so the device flag is the only reliable + // source. Either signal being positive marks it read-only. + let is_readonly = device_cgroup_access_is_readonly( + linux, + LinuxDeviceType::B, + d.major(), + d.minor(), + ) || block_device_node_is_readonly(d.major(), d.minor()); let dev_info = DeviceConfig::BlockCfg(BlockConfig { major: d.major(), minor: d.minor(), + is_readonly, driver_option: blkdev_info.block_device_driver, blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio), num_queues: blkdev_info.num_queues, @@ -1199,3 +1211,164 @@ async fn resolve_physical_endpoint_pci_paths( } } } + +/// Derive a device's read-only intent from the cgroup device access rules. +/// +/// Block-mode volumes (e.g. Kubernetes volumeDevices) are passed as device +/// nodes in `spec.Linux.Devices` and carry no mount "ro" option; their +/// read-only intent is expressed solely through the cgroup device access in +/// `spec.Linux.Resources.Devices` ("rm" = read+mknod, no write, for read-only; +/// "rwm" for read-write). +/// +/// The allow rule that exactly matches the device (type and exact major/minor) +/// decides: the device is read-only when that rule grants access without the +/// write ("w") bit. Wildcard rules (no major/minor) describe broad device +/// classes and are ignored so they cannot override a specific device's access. +/// If no exact rule matches, the device is left read-write. +fn device_cgroup_access_is_readonly( + linux: &Linux, + dev_type: LinuxDeviceType, + major: i64, + minor: i64, +) -> bool { + let devices = match linux.resources().as_ref().and_then(|r| r.devices().as_ref()) { + Some(devices) => devices, + None => return false, + }; + + for r in devices.iter() { + if !r.allow() { + continue; + } + let (rule_major, rule_minor) = match (r.major(), r.minor()) { + (Some(major), Some(minor)) => (major, minor), + _ => continue, + }; + if rule_major != major || rule_minor != minor { + continue; + } + // A specific type must match; `A` (all) and an unset type are wildcards. + if let Some(typ) = r.typ() { + if typ != LinuxDeviceType::A && typ != dev_type { + continue; + } + } + + return !r.access().as_deref().unwrap_or("").contains('w'); + } + + false +} + +/// block_device_node_is_readonly reports whether the host block device +/// identified by major:minor advertises the read-only flag (BLKROGET). This is +/// the ground truth for a device's writability: block-mode volumes frequently +/// carry no read-only signal in the OCI spec, so the device flag is the only +/// reliable source. Any failure is logged and treated as not-read-only so it +/// can never flip a positive signal back. +fn block_device_node_is_readonly(major: i64, minor: i64) -> bool { + let host_path = match get_host_path(DEVICE_TYPE_BLOCK, major, minor) { + Ok(path) if !path.is_empty() => path, + Ok(_) => return false, + Err(e) => { + warn!( + sl!(), + "could not resolve host path for block device {}:{}: {:?}", major, minor, e + ); + return false; + } + }; + + is_block_device_readonly(&host_path).unwrap_or_else(|e| { + warn!( + sl!(), + "could not query block device read-only flag for {}: {:?}", host_path, e + ); + false + }) +} + +#[cfg(test)] +mod tests { + use super::device_cgroup_access_is_readonly; + use oci_spec::runtime::{ + Linux, LinuxBuilder, LinuxDeviceCgroup, LinuxDeviceCgroupBuilder, LinuxDeviceType, + LinuxResourcesBuilder, + }; + use rstest::rstest; + + const MAJOR: i64 = 8; + const MINOR: i64 = 0; + + fn rule( + allow: bool, + typ: LinuxDeviceType, + major: Option, + minor: Option, + access: &str, + ) -> LinuxDeviceCgroup { + let mut builder = LinuxDeviceCgroupBuilder::default() + .allow(allow) + .typ(typ) + .access(access); + if let Some(major) = major { + builder = builder.major(major); + } + if let Some(minor) = minor { + builder = builder.minor(minor); + } + builder.build().unwrap() + } + + fn linux_with_rules(rules: Vec) -> Linux { + LinuxBuilder::default() + .resources( + LinuxResourcesBuilder::default() + .devices(rules) + .build() + .unwrap(), + ) + .build() + .unwrap() + } + + #[rstest] + #[case::no_rules(vec![], false)] + #[case::exact_match_rm(vec![rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rm")], true)] + #[case::exact_match_r(vec![rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "r")], true)] + #[case::exact_match_rwm(vec![rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rwm")], false)] + #[case::type_all_is_wildcard(vec![rule(true, LinuxDeviceType::A, Some(MAJOR), Some(MINOR), "rm")], true)] + #[case::deny_rule_ignored(vec![rule(false, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rm")], false)] + #[case::wildcard_major_ignored(vec![rule(true, LinuxDeviceType::B, None, Some(MINOR), "rm")], false)] + #[case::wildcard_minor_ignored(vec![rule(true, LinuxDeviceType::B, Some(MAJOR), None, "rm")], false)] + #[case::type_mismatch_ignored(vec![rule(true, LinuxDeviceType::C, Some(MAJOR), Some(MINOR), "rm")], false)] + #[case::different_device_ignored(vec![rule(true, LinuxDeviceType::B, Some(9), Some(1), "rm")], false)] + #[case::first_exact_match_wins( + vec![ + rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rm"), + rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rwm"), + ], + true + )] + fn test_device_cgroup_access_is_readonly( + #[case] rules: Vec, + #[case] expected: bool, + ) { + let linux = linux_with_rules(rules); + assert_eq!( + device_cgroup_access_is_readonly(&linux, LinuxDeviceType::B, MAJOR, MINOR), + expected + ); + } + + #[test] + fn test_no_resources() { + let linux = LinuxBuilder::default().build().unwrap(); + assert!(!device_cgroup_access_is_readonly( + &linux, + LinuxDeviceType::B, + MAJOR, + MINOR + )); + } +} diff --git a/src/runtime-rs/crates/resource/src/volume/block_volume.rs b/src/runtime-rs/crates/resource/src/volume/block_volume.rs index f5cb4ad786..cb0edaab5e 100644 --- a/src/runtime-rs/crates/resource/src/volume/block_volume.rs +++ b/src/runtime-rs/crates/resource/src/volume/block_volume.rs @@ -5,7 +5,9 @@ // use super::Volume; -use crate::volume::utils::{handle_block_volume, DEFAULT_VOLUME_FS_TYPE, KATA_MOUNT_BIND_TYPE}; +use crate::volume::utils::{ + handle_block_volume, is_block_device_readonly, DEFAULT_VOLUME_FS_TYPE, KATA_MOUNT_BIND_TYPE, +}; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use hypervisor::{ @@ -42,9 +44,25 @@ impl BlockVolume { let blkdev_info = get_block_device_info(d).await; let fstat = stat::stat(mnt_src).context(format!("stat {}", mnt_src.display()))?; + + // Honor the host block device's own read-only flag in addition to the + // mount-derived intent, so a device marked read-only on the host is + // exposed read-only to the guest. + let read_only = read_only + || is_block_device_readonly(mnt_src).unwrap_or_else(|e| { + warn!( + sl!(), + "could not query block device read-only flag for {}: {:?}", + mnt_src.display(), + e + ); + false + }); + let block_device_config = BlockConfig { major: stat::major(fstat.st_rdev) as i64, minor: stat::minor(fstat.st_rdev) as i64, + is_readonly: read_only, driver_option: blkdev_info.block_device_driver, blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio), num_queues: blkdev_info.num_queues, diff --git a/src/runtime-rs/crates/resource/src/volume/direct_volumes/rawblock_volume.rs b/src/runtime-rs/crates/resource/src/volume/direct_volumes/rawblock_volume.rs index 2efcac4077..0a8a89c74f 100644 --- a/src/runtime-rs/crates/resource/src/volume/direct_volumes/rawblock_volume.rs +++ b/src/runtime-rs/crates/resource/src/volume/direct_volumes/rawblock_volume.rs @@ -18,7 +18,11 @@ use nix::sys::{stat, stat::SFlag}; use oci_spec::runtime as oci; use tokio::sync::RwLock; -use crate::volume::{direct_volumes::KATA_DIRECT_VOLUME_TYPE, utils::handle_block_volume, Volume}; +use crate::volume::{ + direct_volumes::KATA_DIRECT_VOLUME_TYPE, + utils::{handle_block_volume, is_block_device_readonly}, + Volume, +}; #[derive(Clone)] pub(crate) struct RawblockVolume { @@ -58,8 +62,25 @@ impl RawblockVolume { )); } + // For a real block device, honor its host read-only flag (BLKROGET) in + // addition to the mount-derived intent, so a device marked read-only on + // the host is exposed read-only to the guest. (Not applicable to + // regular-file backed images.) + let read_only = read_only + || (SFlag::from_bits_truncate(fstat.st_mode) == SFlag::S_IFBLK + && is_block_device_readonly(mount_info.device.as_str()).unwrap_or_else(|e| { + warn!( + sl!(), + "could not query block device read-only flag for {}: {:?}", + mount_info.device, + e + ); + false + })); + let block_config = BlockConfigModern { path_on_host: mount_info.device.clone(), + is_readonly: read_only, driver_option: blkdev_info.block_device_driver, blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio), num_queues: blkdev_info.num_queues, diff --git a/src/runtime-rs/crates/resource/src/volume/utils.rs b/src/runtime-rs/crates/resource/src/volume/utils.rs index d74ad43efb..1efe44b13b 100644 --- a/src/runtime-rs/crates/resource/src/volume/utils.rs +++ b/src/runtime-rs/crates/resource/src/volume/utils.rs @@ -6,6 +6,8 @@ use std::{ fs, + fs::OpenOptions, + os::unix::{fs::OpenOptionsExt, io::AsRawFd}, path::{Path, PathBuf}, }; @@ -26,6 +28,33 @@ use hypervisor::device::DeviceType; pub const DEFAULT_VOLUME_FS_TYPE: &str = "ext4"; pub const KATA_MOUNT_BIND_TYPE: &str = "bind"; +// BLKROGET (_IO(0x12, 94)) returns the block device's read-only flag into an +// int. It is encoded as an `_IO` ioctl but actually transfers data, so it is a +// "bad" ioctl; request_code_none! produces the correct, arch-aware value. +nix::ioctl_read_bad!(blkroget, nix::request_code_none!(0x12, 94), libc::c_int); + +/// Query the host block device's read-only flag (BLKROGET). This reflects the +/// device's actual writability, which is the ground truth for whether the guest +/// should see it read-only: when the host backing is read-only, writes from the +/// guest fail at the host anyway, so the device must be exposed read-only. The +/// read-only intent for such devices is frequently not carried in the OCI spec +/// (no "ro" mount option), so the device flag is the only reliable source. +pub(crate) fn is_block_device_readonly>(path: P) -> Result { + let path = path.as_ref(); + let file = OpenOptions::new() + .read(true) + .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) + .open(path) + .with_context(|| format!("open {} for readonly probe", path.display()))?; + + let mut ro: libc::c_int = 0; + // Safe: file owns a valid fd for the duration of the call and `ro` is a + // valid, properly aligned pointer to an initialized int. + unsafe { blkroget(file.as_raw_fd(), &mut ro).context("ioctl BLKROGET")? }; + + Ok(ro != 0) +} + pub fn get_file_name>(src: P) -> Result { let file_name = src .as_ref()