diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs index aff7152005..e7b4baf237 100644 --- a/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs @@ -76,16 +76,27 @@ impl FcInner { } pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> { - debug!(sl(), "Starting sandbox"); + // For Firecracker, the VMM process was already started in prepare_vm. + // Network interfaces must be configured before InstanceStart, but + // OCI hooks (which create the container veth via CNI) have not run + // yet. Defer the network flush and InstanceStart to boot_vm(), which + // sandbox.rs calls after the hooks + network rescan. + debug!(sl(), "FC start_vm: VMM already running; deferring InstanceStart to boot_vm"); + Ok(()) + } - // Flush all buffered network devices before sending InstanceStart. - // FC rejects PUT /network-interfaces once the VM is running, so network - // interfaces must be configured here, immediately before the start action. + pub(crate) async fn boot_vm(&mut self) -> Result<()> { + debug!(sl(), "FC boot_vm: flushing network devices and sending InstanceStart"); + + // Flush all buffered network devices. These were populated by + // add_device(Network) after the OCI hooks ran and the netns was + // rescanned by sandbox.rs. FC rejects PUT /network-interfaces once + // the VM is running, so this must happen before InstanceStart. 
let net_devices = std::mem::take(&mut self.pending_net_devices); for (config, device_id) in net_devices { self.add_net_device(&config, device_id) .await - .context("configure network interface before start")?; + .context("configure network interface before InstanceStart")?; } let body: String = serde_json::json!({ diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/mod.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/mod.rs index 05fd0c57cb..b2a73444a8 100644 --- a/src/runtime-rs/crates/hypervisor/src/firecracker/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/mod.rs @@ -75,6 +75,11 @@ impl Hypervisor for Firecracker { inner.start_vm(timeout).await } + async fn boot_vm(&self) -> Result<()> { + let mut inner = self.inner.write().await; + inner.boot_vm().await + } + async fn stop_vm(&self) -> Result<()> { let mut inner = self.inner.write().await; inner.stop_vm().await diff --git a/src/runtime-rs/crates/hypervisor/src/lib.rs b/src/runtime-rs/crates/hypervisor/src/lib.rs index c1bd48f590..357b2efaa6 100644 --- a/src/runtime-rs/crates/hypervisor/src/lib.rs +++ b/src/runtime-rs/crates/hypervisor/src/lib.rs @@ -106,6 +106,23 @@ pub trait Hypervisor: std::fmt::Debug + Send + Sync { selinux_label: Option<String>, ) -> Result<()>; async fn start_vm(&self, timeout: i32) -> Result<()>; + + /// Finalize VM boot after OCI hooks and network setup have run. + /// + /// For hypervisors that require all devices (including network) to be + /// registered before the guest boots (e.g. Firecracker, which has no + /// hotplug), `start_vm` does not boot the guest, while `boot_vm` + /// flushes the device queue and issues the actual boot command + /// (InstanceStart for FC). For hypervisors that start the guest + /// immediately in `start_vm` (QEMU, dragonball, cloud-hypervisor), the + /// default no-op implementation is sufficient. + /// + /// sandbox.rs calls this after OCI hooks and the post-hooks network + /// rescan, but before connecting to the kata-agent. 
+ async fn boot_vm(&self) -> Result<()> { + Ok(()) + } + async fn stop_vm(&self) -> Result<()>; async fn wait_vm(&self) -> Result<i32>; async fn pause_vm(&self) -> Result<()>; diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 592a033c13..68a181384e 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -611,14 +611,10 @@ impl Sandbox for VirtSandbox { .await .context("set up device before start vm")?; + // start vm + self.hypervisor.start_vm(10_000).await.context("start vm")?; + // execute pre-start hook functions, including Prestart Hooks and CreateRuntime Hooks - // - // These must run BEFORE start_vm so that: - // (a) createRuntime hooks (e.g. nerdctl's CNI hook) can create the veth pair - // in the container netns while the VMM process (already started by - // prepare_vm and placed in the netns) is still pre-InstanceStart, and - // (b) hypervisors that do not support network-interface hotplug (e.g. - // Firecracker) can configure the interface before InstanceStart. let (prestart_hooks, create_runtime_hooks) = if let Some(hooks) = sandbox_config.hooks.as_ref() { ( @@ -636,15 +632,12 @@ impl Sandbox for VirtSandbox { ) .await?; - // Rescan the netns and update the network configuration before start_vm: - // 1. When network_created==true the veth is set up by the createRuntime hook - // above; we must scan now so the network device lands in pending_net_devices - // before InstanceStart (required for FC which has no hotplug). - // 2. When there are pre-start hooks the network config may have changed. - // 3. Do not scan if disable_new_netns is set. + // 1. if there are pre-start hook functions, network config might have been changed. + // We need to rescan the netns to handle the change. + // 2. Do not scan the netns if we want no network for the VM. 
+ // TODO In case of vm factory, scan the netns to hotplug interfaces after the VM is started. let config = self.resource_manager.config().await; - if (sandbox_config.network_env.network_created - || self.has_prestart_hooks(&prestart_hooks, &create_runtime_hooks)) + if self.has_prestart_hooks(&prestart_hooks, &create_runtime_hooks) && !config.runtime.disable_new_netns && !dan_config_path(&config, &self.sid).exists() { @@ -663,12 +656,20 @@ impl Sandbox for VirtSandbox { self.resource_manager .handle_network(network_resource) .await - .context("set up network before start vm")?; + .context("set up device after start vm")?; } } - // start vm - self.hypervisor.start_vm(10_000).await.context("start vm")?; + // Give the hypervisor a chance to finalize boot now that OCI hooks and + // the post-hooks network rescan have completed. For hypervisors that + // require all devices (including network) to be registered before the + // guest boots (e.g. Firecracker), start_vm defers the actual boot + // command to this call. For hypervisors that boot the guest in + // start_vm (QEMU, dragonball, cloud-hypervisor), this is a no-op. + self.hypervisor + .boot_vm() + .await + .context("boot vm")?; info!(sl!(), "start vm"); // connect agent