From 21272884371fbb3115bdb4448c55181e9a6e2d6d Mon Sep 17 00:00:00 2001 From: Archana Shinde <archana.m.shinde@intel.com> Date: Mon, 24 Jun 2024 19:18:31 -0700 Subject: [PATCH 1/4] agent: Bring interface down before renaming it. In case we are dealing with multiple interfaces and there exists a network interface with a conflicting name, we temporarily rename it to avoid name conflicts. Before doing this, we need to rename bring the interface down. Failure to do so results in netlink returning Resource busy errors. The resource needs to be down for subsequent operation when the name is swapped back as well. This solves the issue of passing multiple networks in case of nerdctl as: nerdctl run --rm --net foo --net bar docker.io/library/busybox:latest ip a Fixes: #9900 Signed-off-by: Archana Shinde <archana.m.shinde@intel.com> --- src/agent/src/netlink.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/agent/src/netlink.rs b/src/agent/src/netlink.rs index b41cf9b096..bb5be32f58 100644 --- a/src/agent/src/netlink.rs +++ b/src/agent/src/netlink.rs @@ -95,16 +95,30 @@ impl Handle { let mut new_link = None; if link.name() != iface.name { if let Ok(link) = self.find_link(LinkFilter::Name(iface.name.as_str())).await { + // Bring down interface if it is UP + if link.is_up() { + self.enable_link(link.index(), false).await?; + } + // update the existing interface name with a temporary name, otherwise // it would failed to udpate this interface with an existing name. let mut request = self.handle.link().set(link.index()); request.message_mut().header = link.header.clone(); + let link_name = link.name(); + let temp_name = link_name.clone() + "_temp"; request - .name(format!("{}_temp", link.name())) - .up() + .name(temp_name.clone()) .execute() - .await?; + .await + .map_err(|err| { + anyhow!( + "Failed to rename interface {} to {}with error: {}", + link_name, + temp_name, + err + ) + })?; new_link = Some(link); } From 82a1892d34c8e0055dcae0a8630c5f4f878cfe9c Mon Sep 17 00:00:00 2001 From: Archana Shinde <archana.m.shinde@intel.com> Date: Mon, 24 Jun 2024 19:37:24 -0700 Subject: [PATCH 2/4] agent: Add additional info while returning errors for update_interface This should provide additional context for errors while updating network interface. Signed-off-by: Archana Shinde <archana.m.shinde@intel.com> --- src/agent/src/netlink.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/agent/src/netlink.rs b/src/agent/src/netlink.rs index bb5be32f58..6bbbab91f1 100644 --- a/src/agent/src/netlink.rs +++ b/src/agent/src/netlink.rs @@ -134,14 +134,33 @@ impl Handle { .arp(iface.raw_flags & libc::IFF_NOARP as u32 == 0) .up() .execute() - .await?; + .await + .map_err(|err| { + anyhow!( + "Failure in LinkSetRequest for interface {}: {}", + iface.name.as_str(), + err + ) + })?; // swap the updated iface's name. if let Some(nlink) = new_link { let mut request = self.handle.link().set(nlink.index()); request.message_mut().header = nlink.header.clone(); - request.name(link.name()).up().execute().await?; + request + .name(link.name()) + .up() + .execute() + .await + .map_err(|err| { + anyhow!( + "Error swapping back interface name {} to {}: {}", + nlink.name().as_str(), + link.name(), + err + ) + })?; } Ok(()) From 49fbae4fb1b08d132b90c8fc2c2e09f79841ce72 Mon Sep 17 00:00:00 2001 From: Archana Shinde <archana.m.shinde@intel.com> Date: Tue, 25 Jun 2024 00:21:16 -0700 Subject: [PATCH 3/4] agent: Wait for interface in update_interface For nerdctl and docker runtimes, network is hot-plugged instead of cold-plugged. While this change was made in the runtime, we did not have the agent waiting for the device to be ready. On some systems, the device hotplug could take some time causing the update_interface rpc call to fail as the interface is not available. Add a watcher for the network interface based on the pci-path of the network interface. Note, waiting on the device based on name is really not reliable especially in case multiple networks are hotplugged. Signed-off-by: Archana Shinde <archana.m.shinde@intel.com> --- src/agent/src/device.rs | 94 ++++++++++++++++++++++++++++++++++++++ src/agent/src/linux_abi.rs | 1 + src/agent/src/rpc.rs | 16 ++++++- 3 files changed, 110 insertions(+), 1 deletion(-) diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index a06b54fc36..2040bf8c7c 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -379,6 +379,65 @@ pub async fn wait_for_pci_device( Ok(addr) } +#[derive(Debug)] +struct NetPciMatcher { + devpath: String, +} + +impl NetPciMatcher { + fn new(relpath: &str) -> NetPciMatcher { + let root_bus = create_pci_root_bus_path(); + + NetPciMatcher { + devpath: format!("{}{}", root_bus, relpath), + } + } +} + +impl UeventMatcher for NetPciMatcher { + fn is_match(&self, uev: &Uevent) -> bool { + uev.devpath.starts_with(self.devpath.as_str()) + && uev.subsystem == "net" + && !uev.interface.is_empty() + && uev.action == "add" + } +} + +pub async fn wait_for_net_interface( + sandbox: &Arc<Mutex<Sandbox>>, + pcipath: &pci::Path, +) -> Result<()> { + let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path()); + let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; + + let matcher = NetPciMatcher::new(&sysfs_rel_path); + + // Check if the interface is already added in case network is cold-plugged + // or the uevent loop is started before network is added. + // We check for the pci deive in the sysfs directory for network devices. + let pattern = format!( + r"[./]+{}/[a-z0-9/]*net/[a-z0-9/]*", + matcher.devpath.as_str() + ); + let re = Regex::new(&pattern).expect("BUG: Failed to compile regex for NetPciMatcher"); + + for entry in fs::read_dir(SYSFS_NET_PATH)? { + let entry = entry?; + let path = entry.path(); + let target_path = fs::read_link(path)?; + let target_path_str = target_path + .to_str() + .ok_or_else(|| anyhow!("Expected symlink in dir {}", SYSFS_NET_PATH))?; + + if re.is_match(target_path_str) { + return Ok(()); + } + } + let _uev = wait_for_uevent(sandbox, matcher).await?; + + Ok(()) +} + #[derive(Debug)] struct VfioMatcher { syspath: String, @@ -1691,6 +1750,41 @@ mod tests { assert!(!matcher_a.is_match(&uev_b)); } + #[tokio::test] + #[allow(clippy::redundant_clone)] + async fn test_net_pci_matcher() { + let root_bus = create_pci_root_bus_path(); + let relpath_a = "/0000:00:02.0/0000:01:01.0"; + + let mut uev_a = crate::uevent::Uevent::default(); + uev_a.action = crate::linux_abi::U_EVENT_ACTION_ADD.to_string(); + uev_a.devpath = format!("{}{}", root_bus, relpath_a); + uev_a.subsystem = String::from("net"); + uev_a.interface = String::from("eth0"); + let matcher_a = NetPciMatcher::new(relpath_a); + println!("Matcher a : {}", matcher_a.devpath); + + let relpath_b = "/0000:00:02.0/0000:01:02.0"; + let mut uev_b = uev_a.clone(); + uev_b.devpath = format!("{}{}", root_bus, relpath_b); + let matcher_b = NetPciMatcher::new(relpath_b); + + assert!(matcher_a.is_match(&uev_a)); + assert!(matcher_b.is_match(&uev_b)); + assert!(!matcher_b.is_match(&uev_a)); + assert!(!matcher_a.is_match(&uev_b)); + + let relpath_c = "/0000:00:02.0/0000:01:03.0"; + let net_substr = "/net/eth0"; + let mut uev_c = uev_a.clone(); + uev_c.devpath = format!("{}{}{}", root_bus, relpath_c, net_substr); + let matcher_c = NetPciMatcher::new(relpath_c); + + assert!(matcher_c.is_match(&uev_c)); + assert!(!matcher_a.is_match(&uev_c)); + assert!(!matcher_b.is_match(&uev_c)); + } + #[tokio::test] #[allow(clippy::redundant_clone)] async fn test_mmio_block_matcher() { diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs index b87da3ceb6..37e5a6fafd 100644 --- a/src/agent/src/linux_abi.rs +++ b/src/agent/src/linux_abi.rs @@ -89,6 +89,7 @@ pub const SYSFS_MEMORY_HOTPLUG_PROBE_PATH: &str = "/sys/devices/system/memory/pr pub const SYSFS_MEMORY_ONLINE_PATH: &str = "/sys/devices/system/memory"; pub const SYSFS_SCSI_HOST_PATH: &str = "/sys/class/scsi_host"; +pub const SYSFS_NET_PATH: &str = "/sys/class/net"; pub const SYSFS_BUS_PCI_PATH: &str = "/sys/bus/pci"; diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index aa598bc7ca..062c8096bc 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -13,6 +13,7 @@ use std::fmt::Debug; use std::io; use std::os::unix::ffi::OsStrExt; use std::path::Path; +use std::str::FromStr; use std::sync::Arc; use ttrpc::{ self, @@ -52,7 +53,9 @@ use nix::sys::{stat, statfs}; use nix::unistd::{self, Pid}; use rustjail::process::ProcessOperations; -use crate::device::{add_devices, get_virtio_blk_pci_device_name, update_env_pci}; +use crate::device::{ + add_devices, get_virtio_blk_pci_device_name, update_env_pci, wait_for_net_interface, +}; use crate::features::get_build_features; use crate::linux_abi::*; use crate::metrics::get_metrics; @@ -915,6 +918,17 @@ impl agent_ttrpc::AgentService for AgentService { "empty update interface request", )?; + // For network devices passed on the pci bus, check for the network interface + // to be available first. + if !interface.pciPath.is_empty() { + let pcipath = pci::Path::from_str(&interface.pciPath) + .map_ttrpc_err(|e| format!("Unexpected pci-path for network interface: {:?}", e))?; + + wait_for_net_interface(&self.sandbox, &pcipath) + .await + .map_ttrpc_err(|e| format!("interface not available: {:?}", e))?; + } + self.sandbox .lock() .await From 64d6293bb0f28fb6df5d3fc7544ae7aa06154b22 Mon Sep 17 00:00:00 2001 From: Archana Shinde <archana.m.shinde@intel.com> Date: Tue, 25 Jun 2024 11:25:44 -0700 Subject: [PATCH 4/4] tests:Add nerdctl test for testing with multiple netwokrs Add integration test that creates two bridge networks with nerdctl and verifies that Kata container is brought up while passing the networks created. Signed-off-by: Archana Shinde <archana.m.shinde@intel.com> --- tests/integration/nerdctl/gha-run.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/integration/nerdctl/gha-run.sh b/tests/integration/nerdctl/gha-run.sh index c7b2e27621..b8cceb867b 100644 --- a/tests/integration/nerdctl/gha-run.sh +++ b/tests/integration/nerdctl/gha-run.sh @@ -93,6 +93,12 @@ function run() { info "Creating macvlan network with eth0 interface on host as parent" sudo nerdctl network create ${macvlan_net_name} --driver ipvlan --subnet=10.8.0.0/24 -o parent=${parent_interface} + # Create two bridge networks for testing multiple networks with Kata + local net1="foo" + local net2="bar" + sudo nerdctl network create ${net1} + sudo nerdctl network create ${net2} + enabling_hypervisor if [ -n "${GITHUB_ENV:-}" ]; then @@ -106,6 +112,9 @@ function run() { info "Running nerdctl with Kata Containers (${KATA_HYPERVISOR})" sudo nerdctl run --rm --runtime io.containerd.kata-${KATA_HYPERVISOR}.v2 --entrypoint nping instrumentisto/nmap --tcp-connect -c 2 -p 80 www.github.com + info "Running nerdctl with Kata Containers (${KATA_HYPERVISOR}) and multiple bridge nwtorks" + sudo nerdctl run --rm --net ${net1} --net ${net2} --runtime io.containerd.kata-${KATA_HYPERVISOR}.v2 alpine ip a + info "Running nerdctl with Kata Containers (${KATA_HYPERVISOR}) and ipvlan network" sudo nerdctl run --rm --net ${ipvlan_net_name} --runtime io.containerd.kata-${KATA_HYPERVISOR}.v2 alpine ip a | grep "eth0"