runtime-rs: Propagate block device read-only flag to the VMM

Block volumes and block-mode device nodes were attached to the guest
read-write regardless of the volume's read-only intent, so the
guest-visible virtio-blk device was always writable.

This matters beyond simple write protection: filesystems such as XFS
inspect the block device read-only state to decide whether to attempt
journal/log recovery. When the device is writable, XFS tries to replay
the log even on a read-only mount, which fails badly. Mounting with
"-o ro" inside the guest is not sufficient; the device itself must
advertise read-only (VIRTIO_BLK_F_RO), which only happens when the VMM
opens the backing image read-only.

Set is_readonly on the block device config from two signals, combined
with OR so either one marks the device read-only:

  - the read-only intent from the OCI spec:
      * bind-mounted block volumes and direct-assigned (raw block)
        volumes derive it from the "ro" mount option, and
      * block-mode volumes (e.g. Kubernetes volumeDevices) arrive as
        device nodes in spec.Linux.Devices with no mount option; their
        intent is expressed only via the cgroup device access in
        spec.Linux.Resources.Devices ("rm" = read+mknod, no write, for
        read-only; "rwm" for read-write). handler_devices() derives the
        flag from the matching cgroup allow rule, and
  - the host block device's own read-only flag (queried via the BLKROGET
    ioctl). Both the volume path (block_volume/rawblock_volume) and the
    device-node path (handler_devices, resolving the host node via
    get_host_path) honor it, so a device that is physically read-only on
    the host is exposed read-only to the guest even when the intent is
    not encoded in the OCI spec.

All in-tree hypervisors (qemu, cloud-hypervisor, dragonball) already
honor BlockConfig.is_readonly, so no hypervisor changes are required.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor
This commit is contained in:
Fabiano Fidêncio
2026-06-15 23:18:20 +02:00
parent 6203e28bef
commit cfab6f496b
4 changed files with 245 additions and 4 deletions

View File

@@ -12,7 +12,7 @@ use async_trait::async_trait;
use hypervisor::{
device::{
device_manager::{do_handle_device, get_block_device_info, DeviceManager},
util::{get_host_path, DEVICE_TYPE_CHAR},
util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR},
DeviceConfig, DeviceType,
},
utils::uses_native_ccw_bus,
@@ -41,7 +41,7 @@ use crate::{
resource_persist::ResourceState,
rootfs::{RootFsResource, Rootfs},
share_fs::{self, sandbox_bind_mounts::SandboxBindMounts, NydusShareFs, ShareFs},
volume::{Volume, VolumeResource},
volume::{utils::is_block_device_readonly, Volume, VolumeResource},
ResourceConfig, ResourceUpdateOp,
};
@@ -535,9 +535,21 @@ impl ResourceManagerInner {
match d.typ() {
LinuxDeviceType::B => {
let blkdev_info = get_block_device_info(&self.device_manager).await;
// Read-only intent comes from the cgroup device access rule.
// Also honor the host device's own read-only flag (BLKROGET):
// block-mode volumes frequently carry no read-only signal in
// the OCI spec, so the device flag is the only reliable
// source. Either signal being positive marks it read-only.
let is_readonly = device_cgroup_access_is_readonly(
linux,
LinuxDeviceType::B,
d.major(),
d.minor(),
) || block_device_node_is_readonly(d.major(), d.minor());
let dev_info = DeviceConfig::BlockCfg(BlockConfig {
major: d.major(),
minor: d.minor(),
is_readonly,
driver_option: blkdev_info.block_device_driver,
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
num_queues: blkdev_info.num_queues,
@@ -1199,3 +1211,164 @@ async fn resolve_physical_endpoint_pci_paths(
}
}
}
/// Report whether the cgroup device rules mark a device as read-only.
///
/// Block-mode volumes (e.g. Kubernetes volumeDevices) show up as device nodes
/// in `spec.Linux.Devices` and have no mount "ro" option, so the only place
/// their read-only intent is recorded is the access string of the cgroup
/// device rule in `spec.Linux.Resources.Devices` ("rm" = read+mknod without
/// write means read-only; "rwm" means read-write).
///
/// The first allow rule that names exactly this `major`/`minor` and whose type
/// is compatible with `dev_type` decides the result: the device is read-only
/// when that rule's access string has no write ("w") bit. A rule type of `A`
/// (all) or an unset type is compatible with any `dev_type`. Deny rules and
/// rules that leave major or minor unset are skipped, so broad device-class
/// rules cannot override a specific device's access.
///
/// Returns `false` (read-write) when there are no resources, no device rules,
/// or no rule matches.
fn device_cgroup_access_is_readonly(
    linux: &Linux,
    dev_type: LinuxDeviceType,
    major: i64,
    minor: i64,
) -> bool {
    // `A` (all) and an unset type act as wildcards; any other type must be
    // equal to the queried one.
    let type_applies = |rule_type: Option<LinuxDeviceType>| match rule_type {
        Some(specific) => specific == LinuxDeviceType::A || specific == dev_type,
        None => true,
    };

    let deciding_rule = linux
        .resources()
        .as_ref()
        .and_then(|resources| resources.devices().as_ref())
        .and_then(|rules| {
            rules.iter().find(|rule| {
                rule.allow()
                    && rule.major() == Some(major)
                    && rule.minor() == Some(minor)
                    && type_applies(rule.typ())
            })
        });

    match deciding_rule {
        // A missing access string is treated like an empty one: no "w" bit.
        Some(rule) => !rule.access().as_deref().unwrap_or("").contains('w'),
        None => false,
    }
}
/// Report whether the host block device `major:minor` has its read-only flag
/// set (BLKROGET).
///
/// The host node is looked up with `get_host_path` and then probed with
/// `is_block_device_readonly`. Block-mode volumes frequently carry no
/// read-only signal in the OCI spec, which is why the device's own flag is
/// consulted as well.
///
/// This never fails: a lookup or probe error is logged as a warning and
/// reported as `false`, and an empty host path is reported as `false` without
/// logging. Callers OR the result with other signals, so a failure here can
/// never turn a positive read-only signal back into read-write.
fn block_device_node_is_readonly(major: i64, minor: i64) -> bool {
    let host_path = match get_host_path(DEVICE_TYPE_BLOCK, major, minor) {
        Err(e) => {
            warn!(
                sl!(),
                "could not resolve host path for block device {}:{}: {:?}", major, minor, e
            );
            return false;
        }
        Ok(path) => path,
    };

    // No host node to probe: treat the device as writable.
    if host_path.is_empty() {
        return false;
    }

    match is_block_device_readonly(&host_path) {
        Ok(read_only) => read_only,
        Err(e) => {
            warn!(
                sl!(),
                "could not query block device read-only flag for {}: {:?}", host_path, e
            );
            false
        }
    }
}
#[cfg(test)]
mod tests {
    use super::device_cgroup_access_is_readonly;
    use oci_spec::runtime::{
        Linux, LinuxBuilder, LinuxDeviceCgroup, LinuxDeviceCgroupBuilder, LinuxDeviceType,
        LinuxResourcesBuilder,
    };
    use rstest::rstest;

    // Device number that every test case queries.
    const MAJOR: i64 = 8;
    const MINOR: i64 = 0;

    /// Build a single cgroup device rule. Passing `None` for `major` or
    /// `minor` leaves that field unset on the rule.
    fn rule(
        allow: bool,
        typ: LinuxDeviceType,
        major: Option<i64>,
        minor: Option<i64>,
        access: &str,
    ) -> LinuxDeviceCgroup {
        let base = LinuxDeviceCgroupBuilder::default()
            .allow(allow)
            .typ(typ)
            .access(access);
        let with_major = match major {
            Some(value) => base.major(value),
            None => base,
        };
        let with_minor = match minor {
            Some(value) => with_major.minor(value),
            None => with_major,
        };
        with_minor.build().unwrap()
    }

    /// Wrap `rules` in a `Linux` section whose resources carry only those
    /// device cgroup rules.
    fn linux_with_rules(rules: Vec<LinuxDeviceCgroup>) -> Linux {
        let resources = LinuxResourcesBuilder::default()
            .devices(rules)
            .build()
            .unwrap();
        LinuxBuilder::default().resources(resources).build().unwrap()
    }

    #[rstest]
    #[case::no_rules(vec![], false)]
    #[case::exact_match_rm(
        vec![rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rm")],
        true
    )]
    #[case::exact_match_r(
        vec![rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "r")],
        true
    )]
    #[case::exact_match_rwm(
        vec![rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rwm")],
        false
    )]
    #[case::type_all_is_wildcard(
        vec![rule(true, LinuxDeviceType::A, Some(MAJOR), Some(MINOR), "rm")],
        true
    )]
    #[case::deny_rule_ignored(
        vec![rule(false, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rm")],
        false
    )]
    #[case::wildcard_major_ignored(
        vec![rule(true, LinuxDeviceType::B, None, Some(MINOR), "rm")],
        false
    )]
    #[case::wildcard_minor_ignored(
        vec![rule(true, LinuxDeviceType::B, Some(MAJOR), None, "rm")],
        false
    )]
    #[case::type_mismatch_ignored(
        vec![rule(true, LinuxDeviceType::C, Some(MAJOR), Some(MINOR), "rm")],
        false
    )]
    #[case::different_device_ignored(
        vec![rule(true, LinuxDeviceType::B, Some(9), Some(1), "rm")],
        false
    )]
    #[case::first_exact_match_wins(
        vec![
            rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rm"),
            rule(true, LinuxDeviceType::B, Some(MAJOR), Some(MINOR), "rwm"),
        ],
        true
    )]
    fn test_device_cgroup_access_is_readonly(
        #[case] rules: Vec<LinuxDeviceCgroup>,
        #[case] expected: bool,
    ) {
        let linux = linux_with_rules(rules);
        let actual = device_cgroup_access_is_readonly(&linux, LinuxDeviceType::B, MAJOR, MINOR);
        assert_eq!(actual, expected);
    }

    /// A spec without any resources section yields read-write.
    #[test]
    fn test_no_resources() {
        let linux = LinuxBuilder::default().build().unwrap();
        let read_only = device_cgroup_access_is_readonly(&linux, LinuxDeviceType::B, MAJOR, MINOR);
        assert!(!read_only);
    }
}

View File

@@ -5,7 +5,9 @@
//
use super::Volume;
use crate::volume::utils::{handle_block_volume, DEFAULT_VOLUME_FS_TYPE, KATA_MOUNT_BIND_TYPE};
use crate::volume::utils::{
handle_block_volume, is_block_device_readonly, DEFAULT_VOLUME_FS_TYPE, KATA_MOUNT_BIND_TYPE,
};
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use hypervisor::{
@@ -42,9 +44,25 @@ impl BlockVolume {
let blkdev_info = get_block_device_info(d).await;
let fstat = stat::stat(mnt_src).context(format!("stat {}", mnt_src.display()))?;
// Honor the host block device's own read-only flag in addition to the
// mount-derived intent, so a device marked read-only on the host is
// exposed read-only to the guest.
let read_only = read_only
|| is_block_device_readonly(mnt_src).unwrap_or_else(|e| {
warn!(
sl!(),
"could not query block device read-only flag for {}: {:?}",
mnt_src.display(),
e
);
false
});
let block_device_config = BlockConfig {
major: stat::major(fstat.st_rdev) as i64,
minor: stat::minor(fstat.st_rdev) as i64,
is_readonly: read_only,
driver_option: blkdev_info.block_device_driver,
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
num_queues: blkdev_info.num_queues,

View File

@@ -18,7 +18,11 @@ use nix::sys::{stat, stat::SFlag};
use oci_spec::runtime as oci;
use tokio::sync::RwLock;
use crate::volume::{direct_volumes::KATA_DIRECT_VOLUME_TYPE, utils::handle_block_volume, Volume};
use crate::volume::{
direct_volumes::KATA_DIRECT_VOLUME_TYPE,
utils::{handle_block_volume, is_block_device_readonly},
Volume,
};
#[derive(Clone)]
pub(crate) struct RawblockVolume {
@@ -58,8 +62,25 @@ impl RawblockVolume {
));
}
// For a real block device, honor its host read-only flag (BLKROGET) in
// addition to the mount-derived intent, so a device marked read-only on
// the host is exposed read-only to the guest. (Not applicable to
// regular-file backed images.)
let read_only = read_only
|| (SFlag::from_bits_truncate(fstat.st_mode) == SFlag::S_IFBLK
&& is_block_device_readonly(mount_info.device.as_str()).unwrap_or_else(|e| {
warn!(
sl!(),
"could not query block device read-only flag for {}: {:?}",
mount_info.device,
e
);
false
}));
let block_config = BlockConfigModern {
path_on_host: mount_info.device.clone(),
is_readonly: read_only,
driver_option: blkdev_info.block_device_driver,
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
num_queues: blkdev_info.num_queues,

View File

@@ -6,6 +6,8 @@
use std::{
fs,
fs::OpenOptions,
os::unix::{fs::OpenOptionsExt, io::AsRawFd},
path::{Path, PathBuf},
};
@@ -26,6 +28,33 @@ use hypervisor::device::DeviceType;
pub const DEFAULT_VOLUME_FS_TYPE: &str = "ext4";
pub const KATA_MOUNT_BIND_TYPE: &str = "bind";
// BLKROGET (_IO(0x12, 94)) returns the block device's read-only flag into an
// int. Its request number is encoded as an `_IO` ioctl (no direction or size
// bits) even though the kernel writes data back through the argument pointer,
// which makes it a "bad" ioctl in nix's terminology. The `_bad` macro variant
// takes the request code verbatim, and request_code_none! produces the
// correct, arch-aware value for it.
//
// This generates an unsafe `blkroget(fd, data)` function, where `data` is a
// `*mut libc::c_int` that receives the flag (non-zero means read-only).
nix::ioctl_read_bad!(blkroget, nix::request_code_none!(0x12, 94), libc::c_int);
/// Query the host block device's read-only flag (BLKROGET).
///
/// The flag reflects the device's actual writability, which is the ground
/// truth for whether the guest should see it read-only: when the host backing
/// is read-only, writes from the guest fail at the host anyway, so the device
/// must be exposed read-only. The read-only intent for such devices is
/// frequently not carried in the OCI spec (no "ro" mount option), so the
/// device flag is the only reliable source.
///
/// Returns `Ok(true)` when the ioctl reports a non-zero flag and `Ok(false)`
/// when it reports zero.
///
/// # Errors
///
/// Returns an error when `path` cannot be opened for reading or when the
/// BLKROGET ioctl itself fails.
pub(crate) fn is_block_device_readonly<P: AsRef<Path>>(path: P) -> Result<bool> {
    let path = path.as_ref();
    // The probe only needs a readable fd; the device is never written to.
    let file = OpenOptions::new()
        .read(true)
        .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK)
        .open(path)
        .with_context(|| format!("open {} for readonly probe", path.display()))?;

    let mut ro: libc::c_int = 0;
    // SAFETY: `file` is alive until the end of this function, so the fd is
    // valid for the duration of the call, and `ro` is a properly aligned,
    // initialized int that the ioctl may write to.
    let ioctl_result = unsafe { blkroget(file.as_raw_fd(), &mut ro) };
    ioctl_result.context("ioctl BLKROGET")?;

    Ok(ro != 0)
}
pub fn get_file_name<P: AsRef<Path>>(src: P) -> Result<String> {
let file_name = src
.as_ref()