From 248d04c20cf2992306fccbce7c81c09814cac829 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Tue, 4 Mar 2025 18:39:32 +0000 Subject: [PATCH] agent: Enable VFIO and initContainers We had a static mapping of host-to-guest PCI addresses, which prevented the use of VFIO devices in initContainers. We now track the host-guest mapping per container and remove that mapping when the container is removed. Signed-off-by: Zvonko Kaiser --- src/agent/src/device/mod.rs | 26 ++++++++++++++++++++------ src/agent/src/rpc.rs | 9 +++++++-- src/agent/src/sandbox.rs | 4 +++- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/agent/src/device/mod.rs b/src/agent/src/device/mod.rs index 7bfa802f83..eb07e0b777 100644 --- a/src/agent/src/device/mod.rs +++ b/src/agent/src/device/mod.rs @@ -9,6 +9,7 @@ use self::nvdimm_device_handler::VirtioNvdimmDeviceHandler; use self::scsi_device_handler::ScsiDeviceHandler; use self::vfio_device_handler::{VfioApDeviceHandler, VfioPciDeviceHandler}; use crate::pci; +use crate::sandbox::PciHostGuestMapping; use crate::sandbox::Sandbox; use anyhow::{anyhow, Context, Result}; use cdi::annotations::parse_annotations; @@ -180,6 +181,7 @@ lazy_static! 
{ #[instrument] pub async fn add_devices( + cid: &String, logger: &Logger, devices: &[Device], spec: &mut Spec, @@ -211,8 +213,9 @@ pub async fn add_devices( } let mut sb = sandbox.lock().await; + let mut host_guest: PciHostGuestMapping = HashMap::new(); for (host, guest) in update.pci { - if let Some(other_guest) = sb.pcimap.insert(host, guest) { + if let Some(other_guest) = host_guest.insert(host, guest) { return Err(anyhow!( "Conflicting guest address for host device {} ({} versus {})", host, @@ -221,6 +224,9 @@ pub async fn add_devices( )); } } + // Save all the host -> guest mappings per container upon + // removal of the container, the mappings will be removed + sb.pcimap.insert(cid.clone(), host_guest); } Err(e) => { error!(logger, "failed to add devices, error: {e:?}"); @@ -238,7 +244,7 @@ pub async fn add_devices( if let Some(process) = spec.process_mut() { let env_vec: &mut Vec = &mut process.env_mut().get_or_insert_with(Vec::new).to_vec(); - update_env_pci(env_vec, &sandbox.lock().await.pcimap)? + update_env_pci(cid, env_vec, &sandbox.lock().await.pcimap)? 
} update_spec_devices(logger, spec, dev_updates) } @@ -391,8 +397,9 @@ pub fn insert_devices_cgroup_rule( // given a map of (host address => guest address) #[instrument] pub fn update_env_pci( + cid: &String, env: &mut [String], - pcimap: &HashMap, + pcimap: &HashMap, ) -> Result<()> { // SR-IOV device plugin may add two environment variables for one resource: // - PCIDEVICE__: a list of PCI device ids separated by comma @@ -418,7 +425,10 @@ pub fn update_env_pci( for host_addr_str in val.split(',') { let host_addr = pci::Address::from_str(host_addr_str) .with_context(|| format!("Can't parse {} environment variable", name))?; - let guest_addr = pcimap + let host_guest = pcimap + .get(cid) + .ok_or_else(|| anyhow!("No PCI mapping found for container {}", cid))?; + let guest_addr = host_guest .get(&host_addr) .ok_or_else(|| anyhow!("Unable to translate host PCI address {}", host_addr))?; @@ -1052,7 +1062,7 @@ mod tests { "NOTAPCIDEVICE_blah=abcd:ef:01.0".to_string(), ]; - let pci_fixups = example_map + let _pci_fixups = example_map .iter() .map(|(h, g)| { ( @@ -1062,7 +1072,11 @@ mod tests { }) .collect(); - let res = update_env_pci(&mut env, &pci_fixups); + let cid = "0".to_string(); + let mut pci_fixups: HashMap> = HashMap::new(); + pci_fixups.insert(cid.clone(), _pci_fixups); + + let res = update_env_pci(&cid, &mut env, &pci_fixups); assert!(res.is_ok(), "error: {}", res.err().unwrap()); assert_eq!(env[0], "PCIDEVICE_x=0000:01:01.0,0000:01:02.0"); diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 126061f63d..eea1a4e85e 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -230,7 +230,7 @@ impl AgentService { // updates the devices listed in the OCI spec, so that they actually // match real devices inside the VM. This step is necessary since we // cannot predict everything from the caller. 
- add_devices(&sl(), &req.devices, &mut oci, &self.sandbox).await?; + add_devices(&cid, &sl(), &req.devices, &mut oci, &self.sandbox).await?; // In guest-kernel mode some devices need extra handling. Taking the // GPU as an example the shim will inject CDI annotations that will @@ -377,6 +377,9 @@ impl AgentService { ) -> Result<()> { let cid = req.container_id; + // Drop the host guest mapping for this container so we can reuse the + // PCI slots for the next containers + if req.timeout == 0 { let mut sandbox = self.sandbox.lock().await; sandbox.bind_watcher.remove_container(&cid).await; @@ -432,7 +435,7 @@ impl AgentService { .ok_or_else(|| anyhow!("Unable to parse process from ExecProcessRequest"))?; // Apply any necessary corrections for PCI addresses - update_env_pci(&mut process.Env, &sandbox.pcimap)?; + update_env_pci(&cid, &mut process.Env, &sandbox.pcimap)?; let pipe_size = AGENT_CONFIG.container_pipe_size; let ocip = process.into(); @@ -1878,6 +1881,8 @@ async fn remove_container_resources(sandbox: &mut Sandbox, cid: &str) -> Result< sandbox.container_mounts.remove(cid); sandbox.containers.remove(cid); + // Remove any host -> guest mappings for this container + sandbox.pcimap.remove(cid); Ok(()) } diff --git a/src/agent/src/sandbox.rs b/src/agent/src/sandbox.rs index 35e3079d65..c68bc8905c 100644 --- a/src/agent/src/sandbox.rs +++ b/src/agent/src/sandbox.rs @@ -95,6 +95,8 @@ impl StorageState { } } +pub type PciHostGuestMapping = HashMap; + #[derive(Debug)] pub struct Sandbox { pub logger: Logger, @@ -118,7 +120,7 @@ pub struct Sandbox { pub event_rx: Arc>>, pub event_tx: Option>, pub bind_watcher: BindWatcher, - pub pcimap: HashMap, + pub pcimap: HashMap, pub devcg_info: Arc>, }