From 94b7936f5122d90250da6a7db77dc6c4f4b666d9 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 2 Nov 2021 16:32:04 +1100 Subject: [PATCH 01/15] agent/device: Use nix::sys::stat::{major,minor} instead of libc::* update_spec_devices() includes an unsafe block, in order to call the libc functions to get the major and minor numbers from a device ID. However, the nix crate already has a safe wrapper for this function, which we use in other places in the file. Signed-off-by: David Gibson --- src/agent/src/device.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index df0221c746..e01b7d3df8 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 // -use libc::{c_uint, major, minor}; use nix::sys::stat; use regex::Regex; use std::collections::HashMap; @@ -450,9 +449,6 @@ fn update_spec_device( vm_path: &str, final_path: &str, ) -> Result<()> { - let major_id: c_uint; - let minor_id: c_uint; - // If no container_path is provided, we won't be able to match and // update the device in the OCI spec device list. This is an error. if host_path.is_empty() { @@ -470,10 +466,8 @@ fn update_spec_device( let meta = fs::metadata(vm_path)?; let dev_id = meta.rdev(); - unsafe { - major_id = major(dev_id); - minor_id = minor(dev_id); - } + let major_id = stat::major(dev_id); + let minor_id = stat::minor(dev_id); info!( sl!(), From 0c51da3dd0a04a25b8d923f72d99e2b1fff3bfb7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 4 Nov 2021 16:47:45 +1100 Subject: [PATCH 02/15] agent/device: Correct misleading error message in update_spec_device() This error is returned if we have information for a device from the runtime, but a matching device does not appear in the OCI spec. However, the name for the device we print is the name from the VM, rather than the name from the container which is what we actually expect in the spec. Signed-off-by: David Gibson --- src/agent/src/device.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index e01b7d3df8..fe79b44413 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -511,8 +511,8 @@ fn update_spec_device( Ok(()) } else { Err(anyhow!( - "Should have found a matching device {} in the spec", - vm_path + "Should have found a device {} in the spec", + host_path )) } } From 57541315db19be2c43a3d3adb751344204ee8628 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 29 Oct 2021 14:12:47 +1100 Subject: [PATCH 03/15] agent/device: Correct misleading parameter name in update_spec_device() update_spec_device() takes a 'host_path' parameter which it uses to locate the device to correct in the OCI spec. Although this will usually be the path of the device on the host, it doesn't have to be - a traditional runtime like runc would create a device node of that name in the container with the given (host) major and minor numbers. To clarify that, rename it to 'container_path'. We also update the block comment to explain the distinctions more carefully. Finally we update some variable names in tests to match. Signed-off-by: David Gibson --- src/agent/src/device.rs | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index fe79b44413..1aef6e00b0 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -437,22 +437,25 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { // update_spec_device updates the device list in the OCI spec to make // it include details appropriate for the VM, instead of the host. It -// is given the host path to the device (to locate the device in the -// original OCI spec) and the VM path which it uses to determine the -// VM major/minor numbers, and the final path with which to present -// the device in the (inner) container +// is given: +// container_path: the path to the device in the original OCI spec +// vm_path: the path to the equivalent device in the VM (used to +// determine the correct major/minor numbers) +// final_path: a new path to update the device to in the "inner" +// container specification (usually the same as +// container_path) #[instrument] fn update_spec_device( spec: &mut Spec, devidx: &DevIndex, - host_path: &str, + container_path: &str, vm_path: &str, final_path: &str, ) -> Result<()> { // If no container_path is provided, we won't be able to match and // update the device in the OCI spec device list. This is an error. - if host_path.is_empty() { - return Err(anyhow!("Host path cannot empty for device")); + if container_path.is_empty() { + return Err(anyhow!("Container path cannot be empty for device")); } let linux = spec @@ -474,7 +477,7 @@ fn update_spec_device( "update_spec_device(): vm_path={}, major: {}, minor: {}\n", vm_path, major_id, minor_id ); - if let Some(idxdata) = devidx.0.get(host_path) { + if let Some(idxdata) = devidx.0.get(container_path) { let dev = &mut linux.devices[idxdata.idx]; let host_major = dev.major; let host_minor = dev.minor; @@ -486,7 +489,7 @@ fn update_spec_device( info!( sl!(), "change the device from path: {} major: {} minor: {} to vm device path: {} major: {} minor: {}", - host_path, + container_path, host_major, host_minor, dev.path, @@ -512,7 +515,7 @@ fn update_spec_device( } else { Err(anyhow!( "Should have found a device {} in the spec", - host_path + container_path )) } } @@ -1107,14 +1110,14 @@ mod tests { let guest_major = stat::major(null_rdev) as i64; let guest_minor = stat::minor(null_rdev) as i64; - let host_path = "/dev/host"; + let container_path = "/dev/original"; let host_major: i64 = 99; let host_minor: i64 = 99; let mut spec = Spec { linux: Some(Linux { devices: vec![oci::LinuxDevice { - path: host_path.to_string(), + path: container_path.to_string(), r#type: "c".to_string(), major: host_major, minor: host_minor, @@ -1129,7 +1132,7 @@ mod tests { let vm_path = "/dev/null"; let final_path = "/dev/final"; - let res = update_spec_device(&mut spec, &devidx, host_path, vm_path, final_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, final_path); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; From 2029eeebca1b24ba5b2501e6055d912f838de1e5 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 2 Nov 2021 11:07:51 +1100 Subject: [PATCH 04/15] agent/device: Improve update_spec_device() final_path handling update_spec_device() takes a 'final_path' parameter which gives the name the device should be given in the "inner" OCI spec. We need this for VFIO devices where the name the payload sees needs to match the VM's IOMMU groups. However, in all other cases (for now, and maybe forever), this is the same as the original 'container_path' given in the input OCI spec. To make this clearer and simplify callers, make this parameter an Option, and only update the device name if it is non-None. Additionally, update_spec_device() needs to call to_string() on update_path to get an owned version. Rust convention[0] is to let the caller decide whether it should copy, or just give an existing owned version to the function. Change from &str to String to allow that; it doesn't buy us anything right now, but will make some things a little nicer in future. [0] https://rust-lang.github.io/api-guidelines/flexibility.html?highlight=clone#caller-decides-where-to-copy-and-place-data-c-caller-control Signed-off-by: David Gibson --- src/agent/src/device.rs | 88 ++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 55 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 1aef6e00b0..238d556401 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -450,7 +450,7 @@ fn update_spec_device( devidx: &DevIndex, container_path: &str, vm_path: &str, - final_path: &str, + final_path: Option, ) -> Result<()> { // If no container_path is provided, we won't be able to match and // update the device in the OCI spec device list. This is an error. @@ -484,7 +484,9 @@ fn update_spec_device( dev.major = major_id as i64; dev.minor = minor_id as i64; - dev.path = final_path.to_string(); + if let Some(final_path) = final_path { + dev.path = final_path; + } info!( sl!(), @@ -533,13 +535,7 @@ async fn virtiommio_blk_device_handler( return Err(anyhow!("Invalid path for virtio mmio blk device")); } - update_spec_device( - spec, - devidx, - &device.container_path, - &device.vm_path, - &device.container_path, - ) + update_spec_device(spec, devidx, &device.container_path, &device.vm_path, None) } // device.Id should be a PCI path string @@ -555,13 +551,7 @@ async fn virtio_blk_device_handler( dev.vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - update_spec_device( - spec, - devidx, - &dev.container_path, - &dev.vm_path, - &dev.container_path, - ) + update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path, None) } // device.id should be a CCW path string @@ -606,13 +596,7 @@ async fn virtio_scsi_device_handler( ) -> Result<()> { let mut dev = device.clone(); dev.vm_path = get_scsi_device_name(sandbox, &device.id).await?; - update_spec_device( - spec, - devidx, - &dev.container_path, - &dev.vm_path, - &dev.container_path, - ) + update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path, None) } #[instrument] @@ -626,13 +610,7 @@ async fn virtio_nvdimm_device_handler( return Err(anyhow!("Invalid path for nvdimm device")); } - update_spec_device( - spec, - devidx, - &device.container_path, - &device.vm_path, - &device.container_path, - ) + update_spec_device(spec, devidx, &device.container_path, &device.vm_path, None) } fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { @@ -695,7 +673,13 @@ async fn vfio_device_handler( let group = group.unwrap(); let vmpath = get_vfio_device_name(sandbox, group).await?; - update_spec_device(spec, devidx, &device.container_path, &vmpath, &vmpath)?; + update_spec_device( + spec, + devidx, + &device.container_path, + &vmpath, + Some(vmpath.to_string()), + )?; } Ok(()) @@ -845,20 +829,20 @@ mod tests { let container_path = ""; let vm_path = ""; let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); assert!(res.is_err()); // linux is empty let container_path = "/dev/null"; let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); assert!(res.is_err()); spec.linux = Some(Linux::default()); // linux.devices is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); assert!(res.is_err()); spec.linux.as_mut().unwrap().devices = vec![oci::LinuxDevice { @@ -870,14 +854,14 @@ mod tests { // vm_path empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); assert!(res.is_err()); let vm_path = "/dev/null"; // guest and host path are not the same let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); assert!( res.is_err(), "container_path={:?} vm_path={:?} spec={:?}", @@ -890,7 +874,7 @@ mod tests { // spec.linux.resources is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); assert!(res.is_ok()); // update both devices and cgroup lists @@ -911,7 +895,7 @@ mod tests { }); let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); assert!(res.is_ok()); } @@ -991,13 +975,7 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let res = update_spec_device( - &mut spec, - &devidx, - container_path_a, - vm_path_a, - container_path_a, - ); + let res = update_spec_device(&mut spec, &devidx, container_path_a, vm_path_a, None); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -1012,13 +990,7 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let res = update_spec_device( - &mut spec, - &devidx, - container_path_b, - vm_path_b, - container_path_b, - ); + let res = update_spec_device(&mut spec, &devidx, container_path_b, vm_path_b, None); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -1093,7 +1065,7 @@ mod tests { assert_eq!(Some(host_major), specresources.devices[1].major); assert_eq!(Some(host_minor), specresources.devices[1].minor); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path); + let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); assert!(res.is_ok()); // Only the char device, not the block device should be updated @@ -1130,9 +1102,15 @@ mod tests { let devidx = DevIndex::new(&spec); let vm_path = "/dev/null"; - let final_path = "/dev/final"; + let final_path = "/dev/new"; - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, final_path); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + vm_path, + Some(final_path.to_string()), + ); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; From e7beed543066619b3ae2eebe4c4b16f6a6622479 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 4 Nov 2021 15:18:04 +1100 Subject: [PATCH 05/15] agent/device: Remove unneeded clone() from several device handlers virtio_blk_device_handler(), virtio_blk_ccw_device_handler() and virtio_scsi_device_handler() all take a clone of their 'device' parameter. They appear to do this in order to get a mutable copy in which they can update the vm_path field. However, the copy is dropped at the end of the function, so the only thing that's used in it is the vm_path field passed to update_spec_device() afterwards. We can avoid the clone by just using a local variable for the vm_path. Signed-off-by: David Gibson --- src/agent/src/device.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 238d556401..afae70dd91 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -546,12 +546,10 @@ async fn virtio_blk_device_handler( sandbox: &Arc>, devidx: &DevIndex, ) -> Result<()> { - let mut dev = device.clone(); let pcipath = pci::Path::from_str(&device.id)?; + let vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - dev.vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - - update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path, None) + update_spec_device(spec, devidx, &device.container_path, &vm_path, None) } // device.id should be a CCW path string @@ -563,15 +561,14 @@ async fn virtio_blk_ccw_device_handler( sandbox: &Arc>, devidx: &DevIndex, ) -> Result<()> { - let mut dev = device.clone(); let ccw_device = ccw::Device::from_str(&device.id)?; - dev.vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?; + let vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?; update_spec_device( spec, devidx, - &dev.container_path, - &dev.vm_path, - &dev.container_path, + &device.container_path, + &vm_path, + &device.container_path, ) } @@ -594,9 +591,8 @@ async fn virtio_scsi_device_handler( sandbox: &Arc>, devidx: &DevIndex, ) -> Result<()> { - let mut dev = device.clone(); - dev.vm_path = get_scsi_device_name(sandbox, &device.id).await?; - update_spec_device(spec, devidx, &dev.container_path, &dev.vm_path, None) + let vm_path = get_scsi_device_name(sandbox, &device.id).await?; + update_spec_device(spec, devidx, &device.container_path, &vm_path, None) } #[instrument] From 46a4020e9e5831a588e61a4cc8dadb6efa233e7a Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 4 Nov 2021 15:37:41 +1100 Subject: [PATCH 06/15] agent/device: Types to represent update for a device in the OCI spec Currently update_spec_device() takes parameters 'vm_path' and 'final_path' to give it the information it needs to update a single device in the OCI spec for the guest. This bundles these parameters into a single structure type describing the updates to a single device. This doesn't accomplish much immediately, but will allow a number of further cleanups. At the same time we change the representation of vm_path from a Unicode string to a std::path::Path, which is a bit more natural since we are performing file operations on it. Signed-off-by: David Gibson --- src/agent/src/device.rs | 187 ++++++++++++++++++++++++++++++++-------- 1 file changed, 153 insertions(+), 34 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index afae70dd91..d68f533270 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -11,7 +11,7 @@ use std::fmt; use std::fs; use std::os::unix::ffi::OsStrExt; use std::os::unix::fs::MetadataExt; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::str::FromStr; use std::sync::Arc; use tokio::sync::Mutex; @@ -435,22 +435,74 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { Ok(()) } +#[derive(Debug, Clone)] +struct DevNumUpdate { + // the path to the equivalent device in the VM (used to determine + // the correct major/minor numbers) + vm_path: PathBuf, +} + +impl DevNumUpdate { + fn from_vm_path>(vm_path: T) -> Result { + Ok(DevNumUpdate { + vm_path: vm_path.as_ref().to_owned(), + }) + } +} + +// Represents the device-node and resource related updates to the OCI +// spec needed for a particular device +#[derive(Debug, Clone)] +struct DevUpdate { + num: DevNumUpdate, + // an optional new path to update the device to in the "inner" container + // specification + final_path: Option, +} + +impl DevUpdate { + fn from_vm_path>(vm_path: T, final_path: String) -> Result { + Ok(DevUpdate { + final_path: Some(final_path), + ..DevNumUpdate::from_vm_path(vm_path)?.into() + }) + } +} + +impl From for DevUpdate { + fn from(num: DevNumUpdate) -> Self { + DevUpdate { + num, + final_path: None, + } + } +} + +// Represents the updates to the OCI spec needed for a particular device +#[derive(Debug, Clone, Default)] +struct SpecUpdate { + dev: Option, +} + +impl> From for SpecUpdate { + fn from(dev: T) -> Self { + SpecUpdate { + dev: Some(dev.into()), + } + } +} + // update_spec_device updates the device list in the OCI spec to make // it include details appropriate for the VM, instead of the host. It // is given: // container_path: the path to the device in the original OCI spec -// vm_path: the path to the equivalent device in the VM (used to -// determine the correct major/minor numbers) -// final_path: a new path to update the device to in the "inner" -// container specification (usually the same as -// container_path) +// update: information on changes to make to the device #[instrument] fn update_spec_device( spec: &mut Spec, devidx: &DevIndex, container_path: &str, - vm_path: &str, - final_path: Option, + update: DevUpdate, ) -> Result<()> { // If no container_path is provided, we won't be able to match and // update the device in the OCI spec device list. This is an error. @@ -463,18 +515,24 @@ fn update_spec_device( .as_mut() .ok_or_else(|| anyhow!("Spec didn't container linux field"))?; - if !Path::new(vm_path).exists() { - return Err(anyhow!("vm_path:{} doesn't exist", vm_path)); + if !update.num.vm_path.exists() { + return Err(anyhow!( + "vm_path:{} doesn't exist", + update.num.vm_path.display() + )); } - let meta = fs::metadata(vm_path)?; + let meta = fs::metadata(&update.num.vm_path)?; let dev_id = meta.rdev(); let major_id = stat::major(dev_id); let minor_id = stat::minor(dev_id); info!( sl!(), - "update_spec_device(): vm_path={}, major: {}, minor: {}\n", vm_path, major_id, minor_id + "update_spec_device(): vm_path={}, major: {}, minor: {}\n", + update.num.vm_path.display(), + major_id, + minor_id ); if let Some(idxdata) = devidx.0.get(container_path) { @@ -484,7 +542,7 @@ fn update_spec_device( dev.major = major_id as i64; dev.minor = minor_id as i64; - if let Some(final_path) = final_path { + if let Some(final_path) = update.final_path { dev.path = final_path; } @@ -535,7 +593,12 @@ async fn virtiommio_blk_device_handler( return Err(anyhow!("Invalid path for virtio mmio blk device")); } - update_spec_device(spec, devidx, &device.container_path, &device.vm_path, None) + update_spec_device( + spec, + devidx, + &device.container_path, + DevNumUpdate::from_vm_path(&device.vm_path)?.into(), + ) } // device.Id should be a PCI path string @@ -549,7 +612,12 @@ async fn virtio_blk_device_handler( let pcipath = pci::Path::from_str(&device.id)?; let vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - update_spec_device(spec, devidx, &device.container_path, &vm_path, None) + update_spec_device( + spec, + devidx, + &device.container_path, + DevNumUpdate::from_vm_path(vm_path)?.into(), + ) } // device.id should be a CCW path string @@ -563,12 +631,12 @@ async fn virtio_blk_ccw_device_handler( ) -> Result<()> { let ccw_device = ccw::Device::from_str(&device.id)?; let vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?; + update_spec_device( spec, devidx, &device.container_path, - &vm_path, - &device.container_path, + DevNumUpdate::from_vm_path(vm_path)?.into(), ) } @@ -592,7 +660,13 @@ async fn virtio_scsi_device_handler( devidx: &DevIndex, ) -> Result<()> { let vm_path = get_scsi_device_name(sandbox, &device.id).await?; - update_spec_device(spec, devidx, &device.container_path, &vm_path, None) + + update_spec_device( + spec, + devidx, + &device.container_path, + DevNumUpdate::from_vm_path(vm_path)?.into(), + ) } #[instrument] @@ -606,7 +680,12 @@ async fn virtio_nvdimm_device_handler( return Err(anyhow!("Invalid path for nvdimm device")); } - update_spec_device(spec, devidx, &device.container_path, &device.vm_path, None) + update_spec_device( + spec, + devidx, + &device.container_path, + DevNumUpdate::from_vm_path(&device.vm_path)?.into(), + ) } fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { @@ -667,14 +746,13 @@ async fn vfio_device_handler( if vfio_in_guest { // If there are any devices at all, logic above ensures that group is not None let group = group.unwrap(); - let vmpath = get_vfio_device_name(sandbox, group).await?; + let vm_path = get_vfio_device_name(sandbox, group).await?; update_spec_device( spec, devidx, &device.container_path, - &vmpath, - Some(vmpath.to_string()), + DevUpdate::from_vm_path(&vm_path, vm_path.clone())?, )?; } @@ -825,20 +903,35 @@ mod tests { let container_path = ""; let vm_path = ""; let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + ); assert!(res.is_err()); // linux is empty let container_path = "/dev/null"; let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + ); assert!(res.is_err()); spec.linux = Some(Linux::default()); // linux.devices is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + ); assert!(res.is_err()); spec.linux.as_mut().unwrap().devices = vec![oci::LinuxDevice { @@ -850,14 +943,24 @@ mod tests { // vm_path empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + ); assert!(res.is_err()); let vm_path = "/dev/null"; // guest and host path are not the same let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + ); assert!( res.is_err(), "container_path={:?} vm_path={:?} spec={:?}", @@ -870,7 +973,12 @@ mod tests { // spec.linux.resources is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + ); assert!(res.is_ok()); // update both devices and cgroup lists @@ -891,7 +999,12 @@ mod tests { }); let devidx = DevIndex::new(&spec); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + ); assert!(res.is_ok()); } @@ -971,7 +1084,8 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let res = update_spec_device(&mut spec, &devidx, container_path_a, vm_path_a, None); + let update_a = DevNumUpdate::from_vm_path(vm_path_a).unwrap().into(); + let res = update_spec_device(&mut spec, &devidx, container_path_a, update_a); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -986,7 +1100,8 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let res = update_spec_device(&mut spec, &devidx, container_path_b, vm_path_b, None); + let update_b = DevNumUpdate::from_vm_path(vm_path_b).unwrap().into(); + let res = update_spec_device(&mut spec, &devidx, container_path_b, update_b); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -1061,7 +1176,12 @@ mod tests { assert_eq!(Some(host_major), specresources.devices[1].major); assert_eq!(Some(host_minor), specresources.devices[1].minor); - let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, None); + let res = update_spec_device( + &mut spec, + &devidx, + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + ); assert!(res.is_ok()); // Only the char device, not the block device should be updated @@ -1104,8 +1224,7 @@ mod tests { &mut spec, &devidx, container_path, - vm_path, - Some(final_path.to_string()), + DevUpdate::from_vm_path(vm_path, final_path.to_string()).unwrap(), ); assert!(res.is_ok()); From f10e8c816562d76c79af3d573b8148ec3a4a7ae1 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 29 Oct 2021 14:36:07 +1100 Subject: [PATCH 07/15] agent/device: Batch changes to the OCI specification As we process container devices in the agent, we repeatedly call update_spec_device() to adjust the OCI spec as necessary for differences between the host and the VM. This means that for the whole of a pretty complex call graph, the spec is in a partially-updated state - neither fully as it was on the host, not fully as it will be for the container within the VM. Worse, it's not discernable from the contents itself which parts of the spec have already been updated and which have not. We used to have real bugs because of this, until the DevIndex structure was introduced, but that means a whole, fairly complex, parallel data structure needs to be passed around this call graph just to keep track of the state we're in. Start simplifying this by having the device handler functions not directly update the spec, but instead return an update structure describing the change they need. Once all the devices are added, add_devices() will process all the updates as a batch. Note that collecting the updates in a HashMap, rather than a simple Vec doesn't make a lot of sense in the current code, but will reduce churn in future changes which make use of it. Signed-off-by: David Gibson --- src/agent/src/device.rs | 117 ++++++++++++---------------------------- 1 file changed, 34 insertions(+), 83 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index d68f533270..66ee9d9fa7 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -585,39 +585,25 @@ fn update_spec_device( #[instrument] async fn virtiommio_blk_device_handler( device: &Device, - spec: &mut Spec, _sandbox: &Arc>, - devidx: &DevIndex, -) -> Result<()> { +) -> Result { if device.vm_path.is_empty() { return Err(anyhow!("Invalid path for virtio mmio blk device")); } - update_spec_device( - spec, - devidx, - &device.container_path, - DevNumUpdate::from_vm_path(&device.vm_path)?.into(), - ) + Ok(DevNumUpdate::from_vm_path(&device.vm_path)?.into()) } // device.Id should be a PCI path string #[instrument] async fn virtio_blk_device_handler( device: &Device, - spec: &mut Spec, sandbox: &Arc>, - devidx: &DevIndex, -) -> Result<()> { +) -> Result { let pcipath = pci::Path::from_str(&device.id)?; let vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?; - update_spec_device( - spec, - devidx, - &device.container_path, - DevNumUpdate::from_vm_path(vm_path)?.into(), - ) + Ok(DevNumUpdate::from_vm_path(vm_path)?.into()) } // device.id should be a CCW path string @@ -625,29 +611,17 @@ async fn virtio_blk_device_handler( #[instrument] async fn virtio_blk_ccw_device_handler( device: &Device, - spec: &mut Spec, sandbox: &Arc>, - devidx: &DevIndex, -) -> Result<()> { +) -> Result { let ccw_device = ccw::Device::from_str(&device.id)?; let vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?; - update_spec_device( - spec, - devidx, - &device.container_path, - DevNumUpdate::from_vm_path(vm_path)?.into(), - ) + Ok(DevNumUpdate::from_vm_path(vm_path)?.into()) } #[cfg(not(target_arch = "s390x"))] #[instrument] -async fn virtio_blk_ccw_device_handler( - _: &Device, - _: &mut Spec, - _: &Arc>, - _: &DevIndex, -) -> Result<()> { +async fn virtio_blk_ccw_device_handler(_: &Device, _: &Arc>) -> Result { Err(anyhow!("CCW is only supported on s390x")) } @@ -655,37 +629,23 @@ async fn virtio_blk_ccw_device_handler( #[instrument] async fn virtio_scsi_device_handler( device: &Device, - spec: &mut Spec, sandbox: &Arc>, - devidx: &DevIndex, -) -> Result<()> { +) -> Result { let vm_path = get_scsi_device_name(sandbox, &device.id).await?; - update_spec_device( - spec, - devidx, - &device.container_path, - DevNumUpdate::from_vm_path(vm_path)?.into(), - ) + Ok(DevNumUpdate::from_vm_path(vm_path)?.into()) } #[instrument] async fn virtio_nvdimm_device_handler( device: &Device, - spec: &mut Spec, _sandbox: &Arc>, - devidx: &DevIndex, -) -> Result<()> { +) -> Result { if device.vm_path.is_empty() { return Err(anyhow!("Invalid path for nvdimm device")); } - update_spec_device( - spec, - devidx, - &device.container_path, - DevNumUpdate::from_vm_path(&device.vm_path)?.into(), - ) + Ok(DevNumUpdate::from_vm_path(&device.vm_path)?.into()) } fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { @@ -703,12 +663,7 @@ fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { // Each option should have the form "DDDD:BB:DD.F=" // DDDD:BB:DD.F is the device's PCI address in the host // is a PCI path to the device in the guest (see pci.rs) -async fn vfio_device_handler( - device: &Device, - spec: &mut Spec, - sandbox: &Arc>, - devidx: &DevIndex, -) -> Result<()> { +async fn vfio_device_handler(device: &Device, sandbox: &Arc>) -> Result { let vfio_in_guest = device.field_type != DRIVER_VFIO_GK_TYPE; let mut group = None; @@ -743,20 +698,15 @@ async fn vfio_device_handler( } } - if vfio_in_guest { + Ok(if vfio_in_guest { // If there are any devices at all, logic above ensures that group is not None let group = group.unwrap(); let vm_path = get_vfio_device_name(sandbox, group).await?; - update_spec_device( - spec, - devidx, - &device.container_path, - DevUpdate::from_vm_path(&vm_path, vm_path.clone())?, - )?; - } - - Ok(()) + DevUpdate::from_vm_path(&vm_path, vm_path.clone())?.into() + } else { + SpecUpdate::default() + }) } impl DevIndex { @@ -790,22 +740,25 @@ pub async fn add_devices( spec: &mut Spec, sandbox: &Arc>, ) -> Result<()> { - let devidx = DevIndex::new(spec); + let mut dev_updates = HashMap::<&str, DevUpdate>::with_capacity(devices.len()); for device in devices.iter() { - add_device(device, spec, sandbox, &devidx).await?; + let update = add_device(device, sandbox).await?; + if let Some(dev_update) = update.dev { + dev_updates.insert(&device.container_path, dev_update); + } + } + + let devidx = DevIndex::new(spec); + for (container_path, update) in dev_updates.drain() { + update_spec_device(spec, &devidx, container_path, update)?; } Ok(()) } #[instrument] -async fn add_device( - device: &Device, - spec: &mut Spec, - sandbox: &Arc>, - devidx: &DevIndex, -) -> Result<()> { +async fn add_device(device: &Device, sandbox: &Arc>) -> Result { // log before validation to help with debugging gRPC protocol version differences. info!(sl!(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}", device.id, device.field_type, device.vm_path, device.container_path, device.options); @@ -823,14 +776,12 @@ async fn add_device( } match device.field_type.as_str() { - DRIVER_BLK_TYPE => virtio_blk_device_handler(device, spec, sandbox, devidx).await, - DRIVER_BLK_CCW_TYPE => virtio_blk_ccw_device_handler(device, spec, sandbox, devidx).await, - DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, spec, sandbox, devidx).await, - DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, spec, sandbox, devidx).await, - DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, spec, sandbox, devidx).await, - DRIVER_VFIO_GK_TYPE | DRIVER_VFIO_TYPE => { - vfio_device_handler(device, spec, sandbox, devidx).await - } + DRIVER_BLK_TYPE => virtio_blk_device_handler(device, sandbox).await, + DRIVER_BLK_CCW_TYPE => virtio_blk_ccw_device_handler(device, sandbox).await, + DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, sandbox).await, + DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, sandbox).await, + DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, sandbox).await, + DRIVER_VFIO_GK_TYPE | DRIVER_VFIO_TYPE => vfio_device_handler(device, sandbox).await, _ => Err(anyhow!("Unknown device type {}", device.field_type)), } } From f4982130e16ae5add3d5c119d5e522f3d0aed001 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 5 Nov 2021 16:04:20 +1100 Subject: [PATCH 08/15] agent/device: Check for conflicting device updates For each device in the OCI spec we need to update it to reflect the guest rather than the host. We do this with additional device information provided by the runtime. There should only be one update for each device though, if there are multiple, something has gone horribly wrong. Detect and report this situation, for safety. Signed-off-by: David Gibson --- src/agent/src/device.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 66ee9d9fa7..a8c49ecd95 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -745,7 +745,15 @@ pub async fn add_devices( for device in devices.iter() { let update = add_device(device, sandbox).await?; if let Some(dev_update) = update.dev { - dev_updates.insert(&device.container_path, dev_update); + if dev_updates + .insert(&device.container_path, dev_update) + .is_some() + { + return Err(anyhow!( + "Conflicting device updates for {}", + &device.container_path + )); + } } } From d6a3ebc496df78b0a9ef8ab711ad7c4aa923effe Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 4 Nov 2021 16:43:45 +1100 Subject: [PATCH 09/15] agent/device: Obtain guest major/minor numbers when creating DevNumUpdate Currently the DevNumUpdate structure is created with a path to a device node in the VM, which is then used by update_spec_device(). However the only piece of information that update_spec_device() actually needs is the VM side major and minor numbers for the device. We can determine those when we create the DevNumUpdate structure. This means we detect errors earlier and as a bonus we don't need to make a copy of the vm path string. Since that change requires updating 2 of the log statements, we take the opportunity to update all the log statements to structured style. Signed-off-by: David Gibson --- src/agent/src/device.rs | 85 +++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index a8c49ecd95..06cdc4b5ff 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -11,7 +11,7 @@ use std::fmt; use std::fs; use std::os::unix::ffi::OsStrExt; use std::os::unix::fs::MetadataExt; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::str::FromStr; use std::sync::Arc; use tokio::sync::Mutex; @@ -437,15 +437,26 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> { #[derive(Debug, Clone)] struct DevNumUpdate { - // the path to the equivalent device in the VM (used to determine - // the correct major/minor numbers) - vm_path: PathBuf, + // the major and minor numbers for the device within the guest + guest_major: i64, + guest_minor: i64, } impl DevNumUpdate { fn from_vm_path>(vm_path: T) -> Result { + let vm_path = vm_path.as_ref(); + + if !vm_path.exists() { + return Err(anyhow!("VM device path {:?} doesn't exist", vm_path)); + } + + let devid = fs::metadata(vm_path)?.rdev(); + let guest_major = stat::major(devid) as i64; + let guest_minor = stat::minor(devid) as i64; + Ok(DevNumUpdate { - vm_path: vm_path.as_ref().to_owned(), + guest_major, + guest_minor, }) } } @@ -515,24 +526,12 @@ fn update_spec_device( .as_mut() .ok_or_else(|| anyhow!("Spec didn't container linux field"))?; - if !update.num.vm_path.exists() { - return Err(anyhow!( - "vm_path:{} doesn't exist", - update.num.vm_path.display() - )); - } - - let meta = fs::metadata(&update.num.vm_path)?; - let dev_id = meta.rdev(); - let major_id = stat::major(dev_id); - let minor_id = stat::minor(dev_id); - info!( sl!(), - "update_spec_device(): vm_path={}, major: {}, minor: {}\n", - update.num.vm_path.display(), - major_id, - minor_id + "update_spec_device() considering device"; + "container_path" => container_path, + "guest_major" => update.num.guest_major, + "guest_minor" => update.num.guest_minor, ); if let Some(idxdata) = devidx.0.get(container_path) { @@ -540,21 +539,21 @@ fn update_spec_device( let host_major = dev.major; let host_minor = dev.minor; - dev.major = major_id as i64; - dev.minor = minor_id as i64; + dev.major = update.num.guest_major; + dev.minor = update.num.guest_minor; if let Some(final_path) = update.final_path { dev.path = final_path; } info!( sl!(), - "change the device from path: {} major: {} minor: {} to vm device path: {} major: {} minor: {}", - container_path, - host_major, - host_minor, - dev.path, - dev.major, - dev.minor, + "update_spec_device(): updating device"; + "container_path" => container_path, + "host_major" => host_major, + "host_minor" => host_minor, + "updated_path" => &dev.path, + "guest_major" => dev.major, + "guest_minor" => dev.minor, ); // Resources must be updated since they are used to identify @@ -563,12 +562,14 @@ fn update_spec_device( // unwrap is safe, because residx would be empty if there // were no resources let res = &mut linux.resources.as_mut().unwrap().devices[*ridx]; - res.major = Some(major_id as i64); - res.minor = Some(minor_id as i64); + res.major = Some(update.num.guest_major); + res.minor = Some(update.num.guest_minor); info!( sl!(), - "set resources for device major: {} minor: {}\n", major_id, minor_id + "set resources for resource"; + "guest_major" => update.num.guest_major, + "guest_minor" => update.num.guest_minor, ); } Ok(()) @@ -858,9 +859,13 @@ mod tests { let (major, minor) = (7, 2); let mut spec = Spec::default(); + // vm_path empty + let update = DevNumUpdate::from_vm_path(""); + assert!(update.is_err()); + // container_path empty let container_path = ""; - let vm_path = ""; + let vm_path = "/dev/null"; let devidx = DevIndex::new(&spec); let res = update_spec_device( &mut spec, @@ -900,18 +905,6 @@ mod tests { ..oci::LinuxDevice::default() }]; - // vm_path empty - let devidx = DevIndex::new(&spec); - let res = update_spec_device( - &mut spec, - &devidx, - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), - ); - assert!(res.is_err()); - - let vm_path = "/dev/null"; - // guest and host path are not the same let devidx = DevIndex::new(&spec); let res = update_spec_device( From 084538d33407f5c681db56647c45cc3d6b61c907 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 4 Nov 2021 19:58:52 +1100 Subject: [PATCH 10/15] agent/device: Change update_spec_device to handle multiple devices at once update_spec_device() adjusts the OCI spec for device differences between the host and guest. It is called repeatedly for each device we need to alter. These calls are now all in a single loop in add_devices(), so it makes more sense to move the loop into a renamed update_spec_devices() and process all the fixups in one call. Signed-off-by: David Gibson --- src/agent/src/device.rs | 221 +++++++++++++++++++++------------------- 1 file changed, 114 insertions(+), 107 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 06cdc4b5ff..1f4ed0b3d1 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -503,82 +503,83 @@ impl> From for SpecUpdate { } } -// update_spec_device updates the device list in the OCI spec to make +// update_spec_devices updates the device list in the OCI spec to make // it include details appropriate for the VM, instead of the host. It -// is given: +// is given a map of (container_path => update) where: // container_path: the path to the device in the original OCI spec // update: information on changes to make to the device #[instrument] -fn update_spec_device( +fn update_spec_devices( spec: &mut Spec, devidx: &DevIndex, - container_path: &str, - update: DevUpdate, + updates: HashMap<&str, DevUpdate>, ) -> Result<()> { - // If no container_path is provided, we won't be able to match and - // update the device in the OCI spec device list. This is an error. - if container_path.is_empty() { - return Err(anyhow!("Container path cannot be empty for device")); - } - - let linux = spec - .linux - .as_mut() - .ok_or_else(|| anyhow!("Spec didn't container linux field"))?; - - info!( - sl!(), - "update_spec_device() considering device"; - "container_path" => container_path, - "guest_major" => update.num.guest_major, - "guest_minor" => update.num.guest_minor, - ); - - if let Some(idxdata) = devidx.0.get(container_path) { - let dev = &mut linux.devices[idxdata.idx]; - let host_major = dev.major; - let host_minor = dev.minor; - - dev.major = update.num.guest_major; - dev.minor = update.num.guest_minor; - if let Some(final_path) = update.final_path { - dev.path = final_path; + for (container_path, update) in updates { + // If no container_path is provided, we won't be able to match and + // update the device in the OCI spec device list. This is an error. + if container_path.is_empty() { + return Err(anyhow!("Container path cannot be empty for device")); } + let linux = spec + .linux + .as_mut() + .ok_or_else(|| anyhow!("Spec didn't container linux field"))?; + info!( sl!(), - "update_spec_device(): updating device"; + "update_spec_devices() considering device"; "container_path" => container_path, - "host_major" => host_major, - "host_minor" => host_minor, - "updated_path" => &dev.path, - "guest_major" => dev.major, - "guest_minor" => dev.minor, + "guest_major" => update.num.guest_major, + "guest_minor" => update.num.guest_minor, ); - // Resources must be updated since they are used to identify - // the device in the devices cgroup. - for ridx in &idxdata.residx { - // unwrap is safe, because residx would be empty if there - // were no resources - let res = &mut linux.resources.as_mut().unwrap().devices[*ridx]; - res.major = Some(update.num.guest_major); - res.minor = Some(update.num.guest_minor); + if let Some(idxdata) = devidx.0.get(container_path) { + let dev = &mut linux.devices[idxdata.idx]; + let host_major = dev.major; + let host_minor = dev.minor; + + dev.major = update.num.guest_major; + dev.minor = update.num.guest_minor; + if let Some(final_path) = update.final_path { + dev.path = final_path; + } info!( sl!(), - "set resources for resource"; - "guest_major" => update.num.guest_major, - "guest_minor" => update.num.guest_minor, + "update_spec_devices(): updating device"; + "container_path" => container_path, + "host_major" => host_major, + "host_minor" => host_minor, + "updated_path" => &dev.path, + "guest_major" => dev.major, + "guest_minor" => dev.minor, ); + + // Resources must be updated since they are used to identify + // the device in the devices cgroup. + for ridx in &idxdata.residx { + // unwrap is safe, because residx would be empty if there + // were no resources + let res = &mut linux.resources.as_mut().unwrap().devices[*ridx]; + res.major = Some(update.num.guest_major); + res.minor = Some(update.num.guest_minor); + + info!( + sl!(), + "set resources for resource"; + "guest_major" => update.num.guest_major, + "guest_minor" => update.num.guest_minor, + ); + } + } else { + return Err(anyhow!( + "Should have found a device {} in the spec", + container_path + )); } - Ok(()) - } else { - Err(anyhow!( - "Should have found a device {} in the spec", - container_path - )) } + Ok(()) } // device.Id should be the predicted device name (vda, vdb, ...) @@ -759,11 +760,7 @@ pub async fn add_devices( } let devidx = DevIndex::new(spec); - for (container_path, update) in dev_updates.drain() { - update_spec_device(spec, &devidx, container_path, update)?; - } - - Ok(()) + update_spec_devices(spec, &devidx, dev_updates) } #[instrument] @@ -831,6 +828,7 @@ mod tests { use super::*; use crate::uevent::spawn_test_watcher; use oci::Linux; + use std::iter::FromIterator; use tempfile::tempdir; #[test] @@ -855,7 +853,7 @@ mod tests { } #[test] - fn test_update_spec_device() { + fn test_update_spec_devices() { let (major, minor) = (7, 2); let mut spec = Spec::default(); @@ -867,22 +865,26 @@ mod tests { let container_path = ""; let vm_path = "/dev/null"; let devidx = DevIndex::new(&spec); - let res = update_spec_device( + let res = update_spec_devices( &mut spec, &devidx, - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + HashMap::from_iter(vec![( + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + )]), ); assert!(res.is_err()); // linux is empty let container_path = "/dev/null"; let devidx = DevIndex::new(&spec); - let res = update_spec_device( + let res = update_spec_devices( &mut spec, &devidx, - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + HashMap::from_iter(vec![( + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + )]), ); assert!(res.is_err()); @@ -890,11 +892,13 @@ mod tests { // linux.devices is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device( + let res = update_spec_devices( &mut spec, &devidx, - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + HashMap::from_iter(vec![( + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + )]), ); assert!(res.is_err()); @@ -907,11 +911,13 @@ mod tests { // guest and host path are not the same let devidx = DevIndex::new(&spec); - let res = update_spec_device( + let res = update_spec_devices( &mut spec, &devidx, - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + HashMap::from_iter(vec![( + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + )]), ); assert!( res.is_err(), @@ -925,11 +931,13 @@ mod tests { // spec.linux.resources is empty let devidx = DevIndex::new(&spec); - let res = update_spec_device( + let res = update_spec_devices( &mut spec, &devidx, - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + HashMap::from_iter(vec![( + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + )]), ); assert!(res.is_ok()); @@ -951,17 +959,19 @@ mod tests { }); let devidx = DevIndex::new(&spec); - let res = update_spec_device( + let res = update_spec_devices( &mut spec, &devidx, - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + HashMap::from_iter(vec![( + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + )]), ); assert!(res.is_ok()); } #[test] - fn test_update_spec_device_guest_host_conflict() { + fn test_update_spec_devices_guest_host_conflict() { let null_rdev = fs::metadata("/dev/null").unwrap().rdev(); let zero_rdev = fs::metadata("/dev/zero").unwrap().rdev(); let full_rdev = fs::metadata("/dev/full").unwrap().rdev(); @@ -1036,24 +1046,17 @@ mod tests { assert_eq!(Some(host_major_b), specresources.devices[1].major); assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - let update_a = DevNumUpdate::from_vm_path(vm_path_a).unwrap().into(); - let res = update_spec_device(&mut spec, &devidx, container_path_a, update_a); - assert!(res.is_ok()); - - let specdevices = &spec.linux.as_ref().unwrap().devices; - assert_eq!(guest_major_a, specdevices[0].major); - assert_eq!(guest_minor_a, specdevices[0].minor); - assert_eq!(host_major_b, specdevices[1].major); - assert_eq!(host_minor_b, specdevices[1].minor); - - let specresources = spec.linux.as_ref().unwrap().resources.as_ref().unwrap(); - assert_eq!(Some(guest_major_a), specresources.devices[0].major); - assert_eq!(Some(guest_minor_a), specresources.devices[0].minor); - assert_eq!(Some(host_major_b), specresources.devices[1].major); - assert_eq!(Some(host_minor_b), specresources.devices[1].minor); - - let update_b = DevNumUpdate::from_vm_path(vm_path_b).unwrap().into(); - let res = update_spec_device(&mut spec, &devidx, container_path_b, update_b); + let updates = HashMap::from_iter(vec![ + ( + container_path_a, + DevNumUpdate::from_vm_path(vm_path_a).unwrap().into(), + ), + ( + container_path_b, + DevNumUpdate::from_vm_path(vm_path_b).unwrap().into(), + ), + ]); + let res = update_spec_devices(&mut spec, &devidx, updates); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -1070,7 +1073,7 @@ mod tests { } #[test] - fn test_update_spec_device_char_block_conflict() { + fn test_update_spec_devices_char_block_conflict() { let null_rdev = fs::metadata("/dev/null").unwrap().rdev(); let guest_major = stat::major(null_rdev) as i64; @@ -1128,11 +1131,13 @@ mod tests { assert_eq!(Some(host_major), specresources.devices[1].major); assert_eq!(Some(host_minor), specresources.devices[1].minor); - let res = update_spec_device( + let res = update_spec_devices( &mut spec, &devidx, - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + HashMap::from_iter(vec![( + container_path, + DevNumUpdate::from_vm_path(vm_path).unwrap().into(), + )]), ); assert!(res.is_ok()); @@ -1145,7 +1150,7 @@ mod tests { } #[test] - fn test_update_spec_device_final_path() { + fn test_update_spec_devices_final_path() { let null_rdev = fs::metadata("/dev/null").unwrap().rdev(); let guest_major = stat::major(null_rdev) as i64; let guest_minor = stat::minor(null_rdev) as i64; @@ -1172,11 +1177,13 @@ mod tests { let vm_path = "/dev/null"; let final_path = "/dev/new"; - let res = update_spec_device( + let res = update_spec_devices( &mut spec, &devidx, - container_path, - DevUpdate::from_vm_path(vm_path, final_path.to_string()).unwrap(), + HashMap::from_iter(vec![( + container_path, + DevUpdate::from_vm_path(vm_path, final_path.to_string()).unwrap(), + )]), ); assert!(res.is_ok()); From c855a312f011a85c180811dc3a73ab3114cf83be Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 4 Nov 2021 20:46:10 +1100 Subject: [PATCH 11/15] agent/device: Make DevIndex local to update_spec_devices() The DevIndex data structure keeps track of devices in the OCI specification. We used to carry it around to quite a lot of functions, but it's now used only within update_spec_devices(). That means we can simplify things a bit by just open coding the maps we need, rather than declaring a special type. Signed-off-by: David Gibson --- src/agent/src/device.rs | 93 ++++++++++++----------------------------- 1 file changed, 26 insertions(+), 67 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 1f4ed0b3d1..3ab58d66be 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -52,15 +52,6 @@ pub const DRIVER_VFIO_GK_TYPE: &str = "vfio-gk"; // container as a VFIO device node pub const DRIVER_VFIO_TYPE: &str = "vfio"; -#[derive(Debug)] -struct DevIndexEntry { - idx: usize, - residx: Vec, -} - -#[derive(Debug)] -struct DevIndex(HashMap); - #[instrument] pub fn online_device(path: &str) -> Result<()> { fs::write(path, "1")?; @@ -509,11 +500,27 @@ impl> From for SpecUpdate { // container_path: the path to the device in the original OCI spec // update: information on changes to make to the device #[instrument] -fn update_spec_devices( - spec: &mut Spec, - devidx: &DevIndex, - updates: HashMap<&str, DevUpdate>, -) -> Result<()> { +fn update_spec_devices(spec: &mut Spec, updates: HashMap<&str, DevUpdate>) -> Result<()> { + let linux = spec + .linux + .as_mut() + .ok_or_else(|| anyhow!("Spec didn't contain linux field"))?; + + let mut devidx = HashMap::)>::new(); + + for (i, d) in linux.devices.iter().enumerate() { + let mut residx = Vec::new(); + + if let Some(linuxres) = linux.resources.as_ref() { + for (j, r) in linuxres.devices.iter().enumerate() { + if r.r#type == d.r#type && r.major == Some(d.major) && r.minor == Some(d.minor) { + residx.push(j); + } + } + } + devidx.insert(d.path.clone(), (i, residx)); + } + for (container_path, update) in updates { // If no container_path is provided, we won't be able to match and // update the device in the OCI spec device list. This is an error. @@ -521,11 +528,6 @@ fn update_spec_devices( return Err(anyhow!("Container path cannot be empty for device")); } - let linux = spec - .linux - .as_mut() - .ok_or_else(|| anyhow!("Spec didn't container linux field"))?; - info!( sl!(), "update_spec_devices() considering device"; @@ -534,8 +536,8 @@ fn update_spec_devices( "guest_minor" => update.num.guest_minor, ); - if let Some(idxdata) = devidx.0.get(container_path) { - let dev = &mut linux.devices[idxdata.idx]; + if let Some(idxdata) = devidx.get(container_path) { + let dev = &mut linux.devices[idxdata.0]; let host_major = dev.major; let host_minor = dev.minor; @@ -558,7 +560,7 @@ fn update_spec_devices( // Resources must be updated since they are used to identify // the device in the devices cgroup. - for ridx in &idxdata.residx { + for ridx in &idxdata.1 { // unwrap is safe, because residx would be empty if there // were no resources let res = &mut linux.resources.as_mut().unwrap().devices[*ridx]; @@ -711,31 +713,6 @@ async fn vfio_device_handler(device: &Device, sandbox: &Arc>) -> }) } -impl DevIndex { - fn new(spec: &Spec) -> DevIndex { - let mut map = HashMap::new(); - - if let Some(linux) = spec.linux.as_ref() { - for (i, d) in linux.devices.iter().enumerate() { - let mut residx = Vec::new(); - - if let Some(linuxres) = linux.resources.as_ref() { - for (j, r) in linuxres.devices.iter().enumerate() { - if r.r#type == d.r#type - && r.major == Some(d.major) - && r.minor == Some(d.minor) - { - residx.push(j); - } - } - } - map.insert(d.path.clone(), DevIndexEntry { idx: i, residx }); - } - } - DevIndex(map) - } -} - #[instrument] pub async fn add_devices( devices: &[Device], @@ -759,8 +736,7 @@ pub async fn add_devices( } } - let devidx = DevIndex::new(spec); - update_spec_devices(spec, &devidx, dev_updates) + update_spec_devices(spec, dev_updates) } #[instrument] @@ -864,10 +840,8 @@ mod tests { // container_path empty let container_path = ""; let vm_path = "/dev/null"; - let devidx = DevIndex::new(&spec); let res = update_spec_devices( &mut spec, - &devidx, HashMap::from_iter(vec![( container_path, DevNumUpdate::from_vm_path(vm_path).unwrap().into(), @@ -877,10 +851,8 @@ mod tests { // linux is empty let container_path = "/dev/null"; - let devidx = DevIndex::new(&spec); let res = update_spec_devices( &mut spec, - &devidx, HashMap::from_iter(vec![( container_path, DevNumUpdate::from_vm_path(vm_path).unwrap().into(), @@ -891,10 +863,8 @@ mod tests { spec.linux = Some(Linux::default()); // linux.devices is empty - let devidx = DevIndex::new(&spec); let res = update_spec_devices( &mut spec, - &devidx, HashMap::from_iter(vec![( container_path, DevNumUpdate::from_vm_path(vm_path).unwrap().into(), @@ -910,10 +880,8 @@ mod tests { }]; // guest and host path are not the same - let devidx = DevIndex::new(&spec); let res = update_spec_devices( &mut spec, - &devidx, HashMap::from_iter(vec![( container_path, DevNumUpdate::from_vm_path(vm_path).unwrap().into(), @@ -930,10 +898,8 @@ mod tests { spec.linux.as_mut().unwrap().devices[0].path = container_path.to_string(); // spec.linux.resources is empty - let devidx = DevIndex::new(&spec); let res = update_spec_devices( &mut spec, - &devidx, HashMap::from_iter(vec![( container_path, DevNumUpdate::from_vm_path(vm_path).unwrap().into(), @@ -958,10 +924,8 @@ mod tests { ..oci::LinuxResources::default() }); - let devidx = DevIndex::new(&spec); let res = update_spec_devices( &mut spec, - &devidx, HashMap::from_iter(vec![( container_path, DevNumUpdate::from_vm_path(vm_path).unwrap().into(), @@ -1020,7 +984,6 @@ mod tests { }), ..Spec::default() }; - let devidx = DevIndex::new(&spec); let container_path_a = "/dev/a"; let vm_path_a = "/dev/zero"; @@ -1056,7 +1019,7 @@ mod tests { DevNumUpdate::from_vm_path(vm_path_b).unwrap().into(), ), ]); - let res = update_spec_devices(&mut spec, &devidx, updates); + let res = update_spec_devices(&mut spec, updates); assert!(res.is_ok()); let specdevices = &spec.linux.as_ref().unwrap().devices; @@ -1120,7 +1083,6 @@ mod tests { }), ..Spec::default() }; - let devidx = DevIndex::new(&spec); let container_path = "/dev/char"; let vm_path = "/dev/null"; @@ -1133,7 +1095,6 @@ mod tests { let res = update_spec_devices( &mut spec, - &devidx, HashMap::from_iter(vec![( container_path, DevNumUpdate::from_vm_path(vm_path).unwrap().into(), @@ -1172,14 +1133,12 @@ mod tests { }), ..Spec::default() }; - let devidx = DevIndex::new(&spec); let vm_path = "/dev/null"; let final_path = "/dev/new"; let res = update_spec_devices( &mut spec, - &devidx, HashMap::from_iter(vec![( container_path, DevUpdate::from_vm_path(vm_path, final_path.to_string()).unwrap(), From 89ff7000380d0456c9a879125723b457873a0fc5 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 5 Nov 2021 16:37:22 +1100 Subject: [PATCH 12/15] agent/device: Remove unnecessary check for empty container_path update_spec_devices() explicitly checks for being called with an empty container path and fails. We have a unit test to verify this behaviour. But while an empty container_path probably does mean something has gone wrong elsewhere, that's also true of any number of other bad paths. Having an empty string here doesn't prevent what we're doing in this function making sense - we can compare it to the strings in the OCI spec perfectly well (though more likely we simply won't find it there). So, there's no real reason to check this one particular odd case. Signed-off-by: David Gibson --- src/agent/src/device.rs | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 3ab58d66be..16bae152c7 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -522,12 +522,6 @@ fn update_spec_devices(spec: &mut Spec, updates: HashMap<&str, DevUpdate>) -> Re } for (container_path, update) in updates { - // If no container_path is provided, we won't be able to match and - // update the device in the OCI spec device list. This is an error. - if container_path.is_empty() { - return Err(anyhow!("Container path cannot be empty for device")); - } - info!( sl!(), "update_spec_devices() considering device"; @@ -837,20 +831,9 @@ mod tests { let update = DevNumUpdate::from_vm_path(""); assert!(update.is_err()); - // container_path empty - let container_path = ""; - let vm_path = "/dev/null"; - let res = update_spec_devices( - &mut spec, - HashMap::from_iter(vec![( - container_path, - DevNumUpdate::from_vm_path(vm_path).unwrap().into(), - )]), - ); - assert!(res.is_err()); - // linux is empty let container_path = "/dev/null"; + let vm_path = "/dev/null"; let res = update_spec_devices( &mut spec, HashMap::from_iter(vec![( From b60622786d0275028fedf4ae4645cf840cecf3fc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 5 Nov 2021 17:28:27 +1100 Subject: [PATCH 13/15] agent/device: Correct misleading comment on test case We have a test case commented as testing the case where linux.devices is empty in the OCI spec. While it's true that linux.devices is empth in this example, the reason it fails isn't specifically because it's empty but because it doesn't contain a device for the update we're trying to apply. Signed-off-by: David Gibson --- src/agent/src/device.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 16bae152c7..0994a3e2dc 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -845,7 +845,7 @@ mod tests { spec.linux = Some(Linux::default()); - // linux.devices is empty + // linux.devices doesn't contain the updated device let res = update_spec_devices( &mut spec, HashMap::from_iter(vec![( From 4530e7df294f1832a1a134615ac32e975fe2ce12 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 5 Nov 2021 17:12:30 +1100 Subject: [PATCH 14/15] agent/device: Use simpler structure in update_spec_devices() update_spec_devices() takes a bunch of updates for the device entries in the OCI spec and applies them, adjusting things in both the linux.devices and linux.resources.devices sections of the spec. It's important that each entry in the spec only be updated once. Currently we ensure this by first creating an index of where the entries are, then consulting that as we apply each update, so that earlier updates don't cause us to incorrectly detect an entry as being relevant to a later update. This method works, but it's quite awkward. This inverts the loop structure in update_spec_devices() to make this clearer. Instead of stepping through each update and finding the relevant entries in the spec to change, we step through each entry in the spec and find the relevant update. This makes it structurally clear that we're only updating each entry once. Signed-off-by: David Gibson --- src/agent/src/device.rs | 125 +++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 60 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 0994a3e2dc..8297ccb4ef 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -500,81 +500,86 @@ impl> From for SpecUpdate { // container_path: the path to the device in the original OCI spec // update: information on changes to make to the device #[instrument] -fn update_spec_devices(spec: &mut Spec, updates: HashMap<&str, DevUpdate>) -> Result<()> { +fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -> Result<()> { let linux = spec .linux .as_mut() .ok_or_else(|| anyhow!("Spec didn't contain linux field"))?; + let mut res_updates = HashMap::<(&str, i64, i64), DevNumUpdate>::with_capacity(updates.len()); - let mut devidx = HashMap::)>::new(); - - for (i, d) in linux.devices.iter().enumerate() { - let mut residx = Vec::new(); - - if let Some(linuxres) = linux.resources.as_ref() { - for (j, r) in linuxres.devices.iter().enumerate() { - if r.r#type == d.r#type && r.major == Some(d.major) && r.minor == Some(d.minor) { - residx.push(j); - } - } - } - devidx.insert(d.path.clone(), (i, residx)); - } - - for (container_path, update) in updates { - info!( - sl!(), - "update_spec_devices() considering device"; - "container_path" => container_path, - "guest_major" => update.num.guest_major, - "guest_minor" => update.num.guest_minor, - ); - - if let Some(idxdata) = devidx.get(container_path) { - let dev = &mut linux.devices[idxdata.0]; - let host_major = dev.major; - let host_minor = dev.minor; - - dev.major = update.num.guest_major; - dev.minor = update.num.guest_minor; - if let Some(final_path) = update.final_path { - dev.path = final_path; - } + for specdev in &mut linux.devices { + if let Some(update) = updates.remove(specdev.path.as_str()) { + let host_major = specdev.major; + let host_minor = specdev.minor; info!( sl!(), - "update_spec_devices(): updating device"; - "container_path" => container_path, + "update_spec_devices() updating device"; + "container_path" => &specdev.path, + "type" => &specdev.r#type, "host_major" => host_major, "host_minor" => host_minor, - "updated_path" => &dev.path, - "guest_major" => dev.major, - "guest_minor" => dev.minor, + "guest_major" => update.num.guest_major, + "guest_minor" => update.num.guest_minor, + "final_path" => update.final_path.as_ref(), ); - // Resources must be updated since they are used to identify - // the device in the devices cgroup. - for ridx in &idxdata.1 { - // unwrap is safe, because residx would be empty if there - // were no resources - let res = &mut linux.resources.as_mut().unwrap().devices[*ridx]; - res.major = Some(update.num.guest_major); - res.minor = Some(update.num.guest_minor); - - info!( - sl!(), - "set resources for resource"; - "guest_major" => update.num.guest_major, - "guest_minor" => update.num.guest_minor, - ); + specdev.major = update.num.guest_major; + specdev.minor = update.num.guest_minor; + if let Some(final_path) = update.final_path { + specdev.path = final_path; + } + + if res_updates + .insert( + (specdev.r#type.as_str(), host_major, host_minor), + update.num, + ) + .is_some() + { + return Err(anyhow!( + "Conflicting resource updates for host_major={} host_minor={}", + host_major, + host_minor + )); } - } else { - return Err(anyhow!( - "Should have found a device {} in the spec", - container_path - )); } } + + // Make sure we applied all of our updates + if !updates.is_empty() { + return Err(anyhow!( + "Missing devices in OCI spec: {:?}", + updates + .keys() + .map(|d| format!("{:?}", d)) + .collect::>() + .join(" ") + )); + } + + if let Some(resources) = linux.resources.as_mut() { + for r in &mut resources.devices { + if let (Some(host_major), Some(host_minor)) = (r.major, r.minor) { + if let Some(update) = res_updates.get(&(r.r#type.as_str(), host_major, host_minor)) + { + info!( + sl!(), + "update_spec_devices() updating resource"; + "type" => &r.r#type, + "host_major" => host_major, + "host_minor" => host_minor, + "guest_major" => update.guest_major, + "guest_minor" => update.guest_minor, + ); + + r.major = Some(update.guest_major); + r.minor = Some(update.guest_minor); + } + } + } + } + Ok(()) } From 78dff468bff95d33b0042cb6e3aef2fc8d2d2f14 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 10 Nov 2021 13:07:25 +1100 Subject: [PATCH 15/15] agent/device: Adjust PCIDEVICE_* container environment variables for VM The k8s SR-IOV plugin, when it assigns a VFIO device to a container, adds an variable of the form PCIDEVICE_ to the container's environment, so that the payload knows which device is which. The contents of the variable gives the PCI address of the device to use. Kata allows VFIO devices to be passed in to a Kata container, however it runs within a VM which has a different PCI topology. In order for the payload to find the right device, the environment variables therefore need to be converted to list the guest PCI addresses instead of the host PCI addresses. fixes #2897 Signed-off-by: David Gibson --- src/agent/src/device.rs | 116 +++++++++++++++++++++++++++++++++++++--- src/agent/src/pci.rs | 4 +- 2 files changed, 112 insertions(+), 8 deletions(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 8297ccb4ef..c22a657951 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -22,7 +22,7 @@ use crate::linux_abi::*; use crate::pci; use crate::sandbox::Sandbox; use crate::uevent::{wait_for_uevent, Uevent, UeventMatcher}; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; use oci::{LinuxDeviceCgroup, LinuxResources, Spec}; use protocols::agent::Device; use tracing::instrument; @@ -484,12 +484,15 @@ impl From for DevUpdate { #[derive(Debug, Clone, Default)] struct SpecUpdate { dev: Option, + // optional corrections for PCI addresses + pci: Vec<(pci::Address, pci::Address)>, } impl> From for SpecUpdate { fn from(dev: T) -> Self { SpecUpdate { dev: Some(dev.into()), + pci: Vec::new(), } } } @@ -583,6 +586,43 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) - Ok(()) } +// update_spec_pci PCI addresses in the OCI spec to be guest addresses +// instead of host addresses. It is given a map of (host address => +// guest address) +#[instrument] +fn update_spec_pci(spec: &mut Spec, updates: HashMap) -> Result<()> { + // Correct PCI addresses in the environment + if let Some(process) = spec.process.as_mut() { + for envvar in process.env.iter_mut() { + let eqpos = envvar + .find('=') + .ok_or_else(|| anyhow!("Malformed OCI env entry {:?}", envvar))?; + + let (name, eqval) = envvar.split_at(eqpos); + let val = &eqval[1..]; + + if !name.starts_with("PCIDEVICE_") { + continue; + } + + let mut guest_addrs = Vec::::new(); + + for host_addr in val.split(',') { + let host_addr = pci::Address::from_str(host_addr) + .with_context(|| format!("Can't parse {} environment variable", name))?; + let guest_addr = updates + .get(&host_addr) + .ok_or_else(|| anyhow!("Unable to translate host PCI address {}", host_addr))?; + guest_addrs.push(format!("{}", guest_addr)); + } + + envvar.replace_range(eqpos + 1.., guest_addrs.join(",").as_str()); + } + } + + Ok(()) +} + // device.Id should be the predicted device name (vda, vdb, ...) // device.VmPath already provides a way to send it in #[instrument] @@ -668,11 +708,14 @@ fn split_vfio_option(opt: &str) -> Option<(&str, &str)> { // is a PCI path to the device in the guest (see pci.rs) async fn vfio_device_handler(device: &Device, sandbox: &Arc>) -> Result { let vfio_in_guest = device.field_type != DRIVER_VFIO_GK_TYPE; + let mut pci_fixups = Vec::<(pci::Address, pci::Address)>::new(); let mut group = None; for opt in device.options.iter() { - let (_, pcipath) = + let (host, pcipath) = split_vfio_option(opt).ok_or_else(|| anyhow!("Malformed VFIO option {:?}", opt))?; + let host = + pci::Address::from_str(host).context("Bad host PCI address in VFIO option {:?}")?; let pcipath = pci::Path::from_str(pcipath)?; let guestdev = wait_for_pci_device(sandbox, &pcipath).await?; @@ -698,17 +741,24 @@ async fn vfio_device_handler(device: &Device, sandbox: &Arc>) -> } group = devgroup; + + pci_fixups.push((host, guestdev)); } } - Ok(if vfio_in_guest { + let dev_update = if vfio_in_guest { // If there are any devices at all, logic above ensures that group is not None let group = group.unwrap(); let vm_path = get_vfio_device_name(sandbox, group).await?; - DevUpdate::from_vm_path(&vm_path, vm_path.clone())?.into() + Some(DevUpdate::from_vm_path(&vm_path, vm_path.clone())?) } else { - SpecUpdate::default() + None + }; + + Ok(SpecUpdate { + dev: dev_update, + pci: pci_fixups, }) } @@ -719,6 +769,7 @@ pub async fn add_devices( sandbox: &Arc>, ) -> Result<()> { let mut dev_updates = HashMap::<&str, DevUpdate>::with_capacity(devices.len()); + let mut pci_updates = HashMap::::new(); for device in devices.iter() { let update = add_device(device, sandbox).await?; @@ -732,6 +783,17 @@ pub async fn add_devices( &device.container_path )); } + + for (host, guest) in update.pci { + if let Some(other_guest) = pci_updates.insert(host, guest) { + return Err(anyhow!( + "Conflicting guest address for host device {} ({} versus {})", + host, + guest, + other_guest + )); + } + } } } @@ -802,7 +864,7 @@ pub fn update_device_cgroup(spec: &mut Spec) -> Result<()> { mod tests { use super::*; use crate::uevent::spawn_test_watcher; - use oci::Linux; + use oci::{Linux, Process}; use std::iter::FromIterator; use tempfile::tempdir; @@ -1140,6 +1202,48 @@ mod tests { assert_eq!(final_path, specdevices[0].path); } + #[test] + fn test_update_spec_pci() { + let example_map = [ + // Each is a host,guest pair of pci addresses + ("0000:1a:01.0", "0000:01:01.0"), + ("0000:1b:02.0", "0000:01:02.0"), + // This one has the same host address as guest address + // above, to test that we're not double-translating + ("0000:01:01.0", "ffff:02:1f.7"), + ]; + + let mut spec = Spec { + process: Some(Process { + env: vec![ + "PCIDEVICE_x=0000:1a:01.0,0000:1b:02.0".to_string(), + "PCIDEVICE_y=0000:01:01.0".to_string(), + "NOTAPCIDEVICE_blah=abcd:ef:01.0".to_string(), + ], + ..Process::default() + }), + ..Spec::default() + }; + + let pci_fixups = example_map + .iter() + .map(|(h, g)| { + ( + pci::Address::from_str(h).unwrap(), + pci::Address::from_str(g).unwrap(), + ) + }) + .collect(); + + let res = update_spec_pci(&mut spec, pci_fixups); + assert!(res.is_ok()); + + let env = &spec.process.as_ref().unwrap().env; + assert_eq!(env[0], "PCIDEVICE_x=0000:01:01.0,0000:01:02.0"); + assert_eq!(env[1], "PCIDEVICE_y=ffff:02:1f.7"); + assert_eq!(env[2], "NOTAPCIDEVICE_blah=abcd:ef:01.0"); + } + #[test] fn test_pcipath_to_sysfs() { let testdir = tempdir().expect("failed to create tmpdir"); diff --git a/src/agent/src/pci.rs b/src/agent/src/pci.rs index 25ad9a6a09..4cb6b521ae 100644 --- a/src/agent/src/pci.rs +++ b/src/agent/src/pci.rs @@ -20,7 +20,7 @@ const FUNCTION_MAX: u8 = (1 << FUNCTION_BITS) - 1; // Represents a PCI function's slot (a.k.a. device) and function // numbers, giving its location on a single logical bus -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct SlotFn(u8); impl SlotFn { @@ -94,7 +94,7 @@ impl fmt::Display for SlotFn { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct Address { domain: u16, bus: u8,