diff --git a/docs/.nav.yml b/docs/.nav.yml index 7dc1b12238..fa96b03a40 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -16,6 +16,8 @@ nav: - NVIDIA GPU Passthrough: use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md - NVIDIA vGPU: use-cases/NVIDIA-GPU-passthrough-and-Kata.md - Intel QAT: use-cases/using-Intel-QAT-and-kata.md + - How To: + - NUMA Support: how-to/how-to-use-numa-with-kata.md - Contributing: - Documentation: doc-contributing.md - Misc: diff --git a/docs/how-to/README.md b/docs/how-to/README.md index e2742ef374..134dd35e67 100644 --- a/docs/how-to/README.md +++ b/docs/how-to/README.md @@ -52,4 +52,5 @@ - [How to use seccomp with runtime-rs](how-to-use-seccomp-with-runtime-rs.md) - [How to use passthroughfd-IO with runtime-rs and Dragonball](how-to-use-passthroughfd-io-within-runtime-rs.md) - [How to use EROFS snapshotter with Kata Containers](how-to-use-erofs-snapshotter-with-kata.md) +- [How to use NUMA with Kata Containers](how-to-use-numa-with-kata.md) diff --git a/docs/how-to/how-to-use-numa-with-kata.md b/docs/how-to/how-to-use-numa-with-kata.md new file mode 100644 index 0000000000..248dec80ea --- /dev/null +++ b/docs/how-to/how-to-use-numa-with-kata.md @@ -0,0 +1,633 @@ +# NUMA Support for Kata Containers with QEMU + +## Overview + +Non-Uniform Memory Access (NUMA) is a memory architecture where access +latency depends on which CPU is accessing which memory region. On +multi-socket or multi-chiplet systems, each NUMA node has local memory that +its CPUs can access faster than remote memory belonging to other nodes. + +When running performance-sensitive workloads — particularly GPU passthrough +via VFIO — cross-NUMA memory access can significantly degrade throughput. +Kata Containers can expose the host NUMA topology to the guest VM so that +vCPUs, memory, and devices are all placed on the correct NUMA node, preserving +memory locality. 
+ +This guide walks through the full setup end-to-end: host inspection, +Kubernetes configuration, Kata configuration, pod deployment, and +verification. + +> **Note:** +> +> NUMA support is currently available only for the **Go runtime** with the +> **QEMU hypervisor** on **amd64** and **arm64** architectures. The Rust +> runtime (`runtime-rs`) does not yet support NUMA topology. + +## Step 1: Inspect the Host NUMA Topology + +Before configuring anything, understand your host. Run on each worker node: + +```bash +$ numactl --hardware +``` + +Example output on a 2-socket system with 8 CPUs per socket: + +``` +available: 2 nodes (0-1) +node 0 cpus: 0 1 2 3 4 5 6 7 +node 0 size: 65536 MB +node 1 cpus: 8 9 10 11 12 13 14 15 +node 1 size: 65536 MB +node distances: +node 0 1 + 0: 10 21 + 1: 21 10 +``` + +Take note of: +- How many NUMA nodes exist (here: 2) +- Which CPUs belong to each node (here: 0-7 on node 0, 8-15 on node 1) +- The distance matrix (here: 10 local, 21 remote) + +If you have GPUs, check which NUMA node each GPU is attached to: + +```bash +$ lspci -nnk -d 10de: | grep -A2 "NVIDIA" +$ cat /sys/bus/pci/devices/0000:41:00.0/numa_node +``` + +Replace `0000:41:00.0` with your GPU's PCI address. The output (`0` or `1`) +tells you which NUMA node the GPU sits on. + +On a single-NUMA host (only node 0), enabling NUMA is a harmless no-op — +the runtime detects one node and skips multi-NUMA topology. + +## Step 2: Kubernetes CPU Manager Policy + +Kata's NUMA-aware vCPU pinning works **without** `cpuManagerPolicy: static`. +The recommended policy is the default (`none`): + +```yaml +apiVersion: kubelet.config.k8s.io/v1beta1 +kind: KubeletConfiguration +cpuManagerPolicy: "none" +``` + +> **Why not `static`?** +> +> With `cpuManagerPolicy: static`, Kubernetes assigns dedicated CPUs to +> Guaranteed QoS pods. On a multi-NUMA host, those CPUs are often all from +> a **single** NUMA node (depending on the topology manager policy). 
This +> causes the sandbox CPUSet to cover only one NUMA node, which defeats the +> purpose of multi-NUMA guest topology. +> +> With `cpuManagerPolicy: none` (the default), the pod inherits the full +> node CPUSet spanning all NUMA nodes, and Kata's NUMA-aware pinning +> distributes vCPU threads proportionally across host NUMA nodes. + +### 2.1 Check the current policy + +```bash +$ grep cpuManagerPolicy /var/lib/kubelet/config.yaml +``` + +If it shows `static`, switch to `none`: + +```bash +$ sudo sed -i 's/cpuManagerPolicy:.*/cpuManagerPolicy: "none"/' /var/lib/kubelet/config.yaml +$ sudo rm -f /var/lib/kubelet/cpu_manager_state +$ sudo systemctl restart kubelet +``` + +## Step 3: Configure Kata Containers for NUMA + +> **Note:** +> +> If you are using the NVIDIA GPU runtime classes +> (`kata-qemu-nvidia-gpu`, `kata-qemu-nvidia-gpu-snp`, +> `kata-qemu-nvidia-gpu-tdx`), NUMA is already enabled by default in their +> configuration templates. You only need the steps below for the base +> `kata-qemu` runtime class or custom configurations. + +Never edit the base `configuration-qemu.toml` directly — use a +**configuration drop-in** so your customizations survive upgrades. + +### 3.1 Via kata-deploy Helm chart (recommended) + +Add a custom runtime with a NUMA drop-in in your Helm values file: + +```yaml +customRuntimes: + enabled: true + runtimes: + numa: + baseConfig: qemu + runtimeClass: | + apiVersion: node.k8s.io/v1 + kind: RuntimeClass + metadata: + name: kata-qemu-numa + handler: kata-qemu-numa + dropIn: | + [hypervisor.qemu] + enable_numa = true + numa_mapping = [] + + [runtime] + static_sandbox_resource_mgmt = true + enable_vcpus_pinning = true +``` + +Then install (or upgrade) the Helm chart: + +```bash +$ helm upgrade kata-deploy \ + --namespace kata-system \ + -f my-values.yaml \ + "${CHART}" --version "${VERSION}" +``` + +Pods using `runtimeClassName: kata-qemu-numa` will get the NUMA-enabled +configuration. 
+ +With `numa_mapping = []` (empty), the runtime auto-discovers host NUMA nodes +and creates a 1:1 guest-to-host mapping, then **right-sizes** the resulting +topology: if the sandbox's CPU and memory budget fits on a single host +NUMA node — and any cold-plugged VFIO devices live on that same node — +the guest topology collapses to that one node so the workload keeps full +memory locality without paying a multi-node penalty. Sandboxes that +genuinely span multiple host nodes keep the auto-derived multi-node +topology. An explicit `numa_mapping` opts out of right-sizing and is +honored verbatim — useful when you want a specific layout regardless of +sandbox size, or to group multiple host nodes into fewer guest nodes +(e.g., on a 4-socket system): + +```yaml + dropIn: | + [hypervisor.qemu] + enable_numa = true + numa_mapping = ["0-1", "2-3"] +``` + +Each entry is a cpuset-style string (ranges like `0-3` and lists like +`0,2,4` are both valid). + +### 3.2 Via manual drop-in on the node + +If you manage nodes directly (without kata-deploy), create a drop-in file +under the `config.d/` directory. Use a `50-*` prefix (the reserved range +for user customizations): + +```bash +$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-numa.toml <<'EOF' +[hypervisor.qemu] +enable_numa = true +numa_mapping = [] + +[runtime] +static_sandbox_resource_mgmt = true +enable_vcpus_pinning = true +EOF +``` + +The drop-in is merged on top of the base `configuration-qemu.toml` +automatically. No restart is needed — the shim reads the configuration +at pod creation time. + +> **Note:** +> +> For details on the drop-in mechanism, reserved prefix ranges, and +> additional Helm examples, see the +> [Helm configuration guide](../../docs/helm-configuration.md). 
+ +### 3.3 Verify the effective configuration + +After applying the drop-in, verify the merged configuration on the node: + +```bash +$ grep -rE "enable_numa|numa_mapping|static_sandbox_resource_mgmt|enable_vcpus_pinning" \ + /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/ +``` + +## Step 4: Deploy a NUMA-Aware Pod + +### 4.1 Basic NUMA pod + +Create a pod that requests enough CPUs to span both NUMA nodes. Use the +runtime class matching your NUMA configuration from Step 3 (e.g., +`kata-qemu-numa` if you created a custom runtime, or `kata-qemu` if you +applied a drop-in to the base config). Kata sizes the VM based on +`limits`, so set `limits.cpu` to the desired vCPU count: + +```bash +$ cat <<'EOF' | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: numa-test +spec: + runtimeClassName: kata-qemu-numa + containers: + - name: numa-check + image: ubuntu:24.04 + command: ["sleep", "infinity"] + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + cpu: "80" + memory: "64Gi" +EOF +``` + +> **Note:** +> +> Kata sizes the VM based on `limits` (not `requests`). Using different +> values for `requests` and `limits` makes the pod **Burstable** QoS, +> which avoids Kubernetes CPU manager interference with NUMA-aware +> pinning. The large `limits.cpu` value tells Kata to create a VM with +> that many vCPUs distributed across NUMA nodes. + +### 4.2 GPU passthrough pod with NUMA + +For GPU workloads, use the NVIDIA GPU runtime class. 
NUMA is enabled by +default in the GPU configuration templates: + +```bash +$ cat <<'EOF' | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: gpu-numa-test +spec: + runtimeClassName: kata-qemu-nvidia-gpu + containers: + - name: cuda-test + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04 + resources: + limits: + cpu: "4" + memory: "8Gi" + nvidia.com/pgpu: "1" +EOF +``` + +## Step 5: Verify NUMA Inside the Guest + +### 5.1 Check guest NUMA topology + +Exec into the running pod and inspect the NUMA layout: + +```bash +$ kubectl exec -it numa-test -- bash +``` + +Inside the pod: + +```bash +$ apt-get update && apt-get install -y numactl +$ numactl --hardware +``` + +Expected output on a 2-NUMA-node guest: + +``` +available: 2 nodes (0-1) +node 0 cpus: 0 1 +node 0 size: 2048 MB +node 1 cpus: 2 3 +node 1 size: 2048 MB +node distances: +node 0 1 + 0: 10 21 + 1: 21 10 +``` + +Key things to verify: +- **Number of nodes** matches your host (or `numa_mapping` configuration). +- **CPUs** are distributed across nodes (not all on node 0). +- **Memory** is split across nodes (not all on node 0). +- **Distances** mirror the host distances. 
+ +### 5.2 Check CPU-to-NUMA mapping + +```bash +$ lscpu | grep -i numa +``` + +Expected: + +``` +NUMA node(s): 2 +NUMA node0 CPU(s): 0,1 +NUMA node1 CPU(s): 2,3 +``` + +### 5.3 Check from /proc and /sys inside the guest + +```bash +$ cat /sys/devices/system/node/node*/cpulist +``` + +Expected: + +``` +0-1 +2-3 +``` + +```bash +$ cat /sys/devices/system/node/node*/meminfo | grep MemTotal +``` + +Expected (values will vary based on your pod's memory limit): + +``` +Node 0 MemTotal: 2097152 kB +Node 1 MemTotal: 2097152 kB +``` + +## Step 6: Verify NUMA on the Host + +### 6.1 Check vCPU pinning + +From the host, find the QEMU process and check its thread affinities: + +```bash +$ QEMU_PID=$(pgrep -f "qemu.*numa-test") +$ ls /proc/${QEMU_PID}/task/ | while read tid; do + echo "TID ${tid}: $(taskset -p ${tid} 2>/dev/null)" + done +``` + +With NUMA pinning enabled, you should see vCPU threads pinned to specific +CPUs (not the full CPU mask). For example, on a 2-NUMA-node host with +CPUs 0-7 on node 0 and CPUs 8-15 on node 1: + +``` +TID 12345: pid 12345's current affinity mask: 1 # CPU 0 +TID 12346: pid 12346's current affinity mask: 2 # CPU 1 +TID 12347: pid 12347's current affinity mask: 100 # CPU 8 +TID 12348: pid 12348's current affinity mask: 200 # CPU 9 +``` + +### 6.2 Check the shim logs for NUMA configuration + +```bash +$ POD_SANDBOX_ID=$(crictl pods --name numa-test -q) +$ journalctl -t kata | grep "${POD_SANDBOX_ID}" | grep -i numa +``` + +Look for lines like: + +``` +buildNUMATopology: creating 2 guest NUMA nodes +VFIO device NUMA placement validated bdf=0000:41:00.0 host-numa=1 guest-numa=1 +``` + +### 6.3 Check the QEMU command line + +```bash +$ cat /proc/${QEMU_PID}/cmdline | tr '\0' '\n' | grep -E "numa|memory-backend" +``` + +Expected output (varies by configuration): + +``` +-object 
+memory-backend-ram,id=numa-mem1,size=2048M,host-nodes=1,policy=bind,share=on +-numa +node,nodeid=1,memdev=numa-mem1,cpus=2-3 +-numa +dist,src=0,dst=1,val=21 +-numa +dist,src=1,dst=0,val=21 +``` + +Key things to verify: +- Each `-object memory-backend-*` has `host-nodes=N` and `policy=bind` + matching the correct host NUMA node. +- Each `-numa node` has a `cpus=` range and `memdev=` pointing to the + correct memory backend. +- `-numa dist` entries mirror the host distances. + +## Step 7: Verify GPU NUMA Placement (GPU Passthrough Only) + +If using GPU passthrough, verify the device landed on the correct NUMA node: + +### 7.1 Check host-side GPU NUMA node + +```bash +$ GPU_BDF="0000:41:00.0" # Replace with your GPU's PCI address +$ cat /sys/bus/pci/devices/${GPU_BDF}/numa_node +``` + +### 7.2 Check shim logs for VFIO placement validation + +```bash +$ journalctl -t kata | grep -i "VFIO device NUMA" +``` + +Healthy output: + +``` +VFIO device NUMA placement validated bdf=0000:41:00.0 host-numa=1 guest-numa=1 +``` + +Warning output (indicates misconfiguration): + +``` +VFIO device on host NUMA node not covered by guest NUMA topology bdf=0000:41:00.0 host-numa=2 covered-nodes=map[0:0 1:1] +``` + +If you see the warning, extend your `numa_mapping` to include the GPU's host +NUMA node. + +### 7.3 Check GPU NUMA inside the guest + +Inside the GPU pod: + +```bash +$ nvidia-smi topo --matrix +``` + +This shows the GPU's relationship to NUMA nodes from the guest perspective. + +## How It Works + +When a VM is created with NUMA enabled, the runtime: + +1. **Discovers host NUMA**: Reads + `/sys/devices/system/node/node*/distance` to build the host distance + matrix. + +2. **Right-sizes the topology** (auto-discovery only): When `numa_mapping` + is empty, the runtime compares the sandbox's vCPU and memory budget + against per-node host capacity (read from + `/sys/devices/system/node/node*/meminfo` and `cpulist`). 
If any + cold-plugged VFIO device pins the sandbox to specific host nodes, the + chosen subset must cover those; otherwise the smallest single host + node that fits the workload is picked. When the resulting subset has + one node, the topology collapses to a flat (no `-numa`) layout so QEMU + uses a single memory backend. Sandboxes that exceed any single node + keep the full auto-derived multi-node topology. An explicit + `numa_mapping` opts out of this step entirely and is honored verbatim. + +3. **Builds guest topology**: Creates guest NUMA nodes with per-node memory + backends (`policy=bind` to lock memory to host NUMA nodes), distributes + vCPUs proportionally to host CPU counts, and mirrors distances. For + confidential guests (SEV-SNP, TDX), QEMU automatically enables + `guest_memfd` on each memory backend for private/shared memory + attribute tracking (requires the cross-region conversion patch). + +4. **Restructures SMP**: Sets `sockets = num_NUMA_nodes` and + `cores = ceil(maxvcpus / num_NUMA_nodes)` so QEMU groups vCPUs by socket + per NUMA node. + +5. **Pins vCPUs** (when enabled): Each vCPU thread is pinned to a host CPU + belonging to the same NUMA node. Right-sized single-node sandboxes + also go through this NUMA-aware path, so all vCPUs land on the chosen + host NUMA node's CPUs. + +6. **Validates VFIO devices**: Checks each cold-plugged device's host NUMA + node against the guest topology and logs placement status. + +7. **Translates cpuset.mems**: Converts host NUMA node IDs to guest node IDs + before forwarding to the agent. + +## Troubleshooting + +### Guest reports a single NUMA node on a multi-NUMA host + +**Symptom:** Inside a small pod on a 2+ NUMA-node host, `numactl --hardware` +shows only one NUMA node, and the QEMU command line has no `-numa` +arguments. + +**Cause:** Right-sizing collapsed the auto-derived topology because the +sandbox's vCPU + memory budget fits on one host NUMA node. 
This is the +intended optimization — the pod gets full memory locality without paying +the cross-node penalty for a workload that does not need it. + +**Fix (only if you really want the multi-node layout):** either +- set an explicit `numa_mapping = ["0", "1"]` (or similar) — explicit + mappings skip right-sizing and are honored verbatim, or +- raise the pod's `limits.cpu` / `limits.memory` so the sandbox truly + exceeds any single host node's capacity. + +### Multi-NUMA topology is skipped (too few vCPUs) + +**Symptom:** The shim logs show: + +``` +DefaultMaxVCPUs < NUMA node count; skipping multi-NUMA topology vcpus=1 numa-nodes=2 +``` + +**Cause:** The pod requested fewer CPUs than there are NUMA nodes. Each +NUMA node needs at least one vCPU. + +**Fix:** Request at least as many CPUs as NUMA nodes in the pod spec: + +```yaml +resources: + limits: + cpu: "2" # At least 2 for a 2-NUMA-node host +``` + +Or increase `default_vcpus` via a drop-in: + +```bash +$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-default-vcpus.toml <<'EOF' +[hypervisor.qemu] +default_vcpus = 2 +EOF +``` + +### vCPU pinning is skipped (empty CPUSet) + +**Symptom:** The shim logs show: + +``` +sandbox CPUSet is empty; skipping vCPU pinning +``` + +**Cause:** The runtime could not determine a CPUSet for pinning. With +`cpuManagerPolicy: none` and multi-NUMA enabled, the runtime derives the +CPUSet from the guest NUMA nodes' `HostCPUs`. This message indicates no +NUMA topology was built (e.g., the host has only one NUMA node). + +**Fix:** Verify: + +1. The host has multiple NUMA nodes (`numactl --hardware`) +2. `enable_numa = true` is set in the Kata configuration +3. `enable_vcpus_pinning = true` is set in the Kata configuration +4. 
`static_sandbox_resource_mgmt = true` is set (so all vCPUs boot at start) + +### NUMA pinning fallback warning + +**Symptom:** The shim logs show: + +``` +NUMA node HostCPUs do not intersect sandbox CPUSet; falling back to full cpuset +``` + +**Cause:** The CPUs Kubernetes assigned to the pod do not overlap with the +host CPUs on the NUMA node. This means NUMA locality is lost for that node. + +**Fix:** Verify that your `numa_mapping` matches the actual host topology: + +```bash +$ numactl --hardware # Check which CPUs are on which nodes +``` + +Ensure the Kubernetes node has CPUs from all mapped NUMA nodes available +for scheduling. + +### Configuration validation error at startup + +**Symptom:** + +``` +NUMA support requires static_sandbox_resource_mgmt to be enabled +``` + +**Fix:** Add `static_sandbox_resource_mgmt` via a drop-in: + +```bash +$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-static-resources.toml <<'EOF' +[runtime] +static_sandbox_resource_mgmt = true +EOF +``` + +## Configuration Reference + +| Option | Section | Default | Description | +|--------|---------|---------|-------------| +| `enable_numa` | `[hypervisor.qemu]` | `false` | Enable guest NUMA topology | +| `numa_mapping` | `[hypervisor.qemu]` | `[]` | Map guest NUMA nodes to host nodes. Empty = auto-discover with right-sizing (small sandboxes collapse to one node); non-empty = honored verbatim | +| `static_sandbox_resource_mgmt` | `[runtime]` | varies | Size VM at boot (required for NUMA) | +| `enable_vcpus_pinning` | `[runtime]` | `false` | Pin vCPU threads to host CPUs (NUMA-aware when NUMA enabled) | + +## Limitations + +- NUMA is only supported with the **Go runtime** and **QEMU** hypervisor. +- Only **amd64** and **arm64** architectures are supported. +- NUMA requires `static_sandbox_resource_mgmt = true` (no dynamic + CPU/memory hotplug). +- The VM needs at least as many vCPUs as NUMA nodes. 
If fewer vCPUs are + available, multi-NUMA is skipped and the shim logs the reason. 
+- vCPU pinning with NUMA works best with `cpuManagerPolicy: none` (the + default). Using `static` may restrict the pod's CPUSet to a single NUMA + node, preventing balanced pinning across nodes. +- Confidential guests (SEV-SNP, TDX) with NUMA require a QEMU patch + ([accel/kvm: Fix kvm_convert_memory calls crossing memory regions](https://github.com/AMDESE/qemu/commit/6b0eaa20)) + to handle page conversions that span multiple NUMA memory backends. + The GPU-experimental QEMU builds (`gpu-snp`, `gpu-tdx`) include this + patch. Without it, QEMU crashes with + `ram_block_attributes_state_change, invalid range`. diff --git a/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md b/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md index b20a9d9d3e..118cf16919 100644 --- a/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md +++ b/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md @@ -506,6 +506,17 @@ To stop the pod, run: `kubectl delete pod cuda-vectoradd-kata`. ### Next steps +#### NUMA topology for GPU locality + +On multi-NUMA hosts, enabling NUMA support ensures GPU memory accesses stay +local to the NUMA node where the GPU is physically attached, avoiding +cross-NUMA latency. The NVIDIA GPU configuration templates ship with +`enable_numa = true` by default. + +For details on NUMA configuration, topology verification, and +troubleshooting, see the +[NUMA support guide](../how-to/how-to-use-numa-with-kata.md). 
+ #### Use multi-GPU passthrough If you have machines supporting multi-GPU passthrough, use a pod deployment diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs index cb5c6bc3f0..a89454263e 100644 --- a/src/agent/src/linux_abi.rs +++ b/src/agent/src/linux_abi.rs @@ -26,15 +26,29 @@ pub fn create_pci_root_bus_path(root_complex: &str) -> String { format!("/devices/pci0000:{root_complex}") } -// This is used in several modules, let's create a helper function to parse the -// qom path and switch easily once the shim sends us the full NUMA path +// Parses a device tree path into a (root_complex, PCI path) pair. +// +// Supports two formats: +// - Full NUMA path: "root_complex/bus/device" (e.g. "10/00/02") where the +// first segment is the root complex and the rest form the PCI path. +// - Legacy path: "bus/device" (e.g. "00/02") which defaults to root complex "00". pub fn pcipath_from_dev_tree_path(dev_tree_path: &str) -> Result<(&str, pci::Path)> { - // Placeholder until the shim send us the full NUMA path - // via shim in the form of root_complex/bus/device 10/00/02 - // Currently the shim only sends us the bus/device 00/02 - let pci_path = pci::Path::from_str(dev_tree_path) - .with_context(|| format!("Failed to parse PCI path from QOM path '{}'", dev_tree_path))?; - Ok(("00", pci_path)) + let segments: Vec<&str> = dev_tree_path.split('/').collect(); + if segments.len() >= 3 { + let root_complex = segments[0]; + let pci_part = &dev_tree_path[root_complex.len() + 1..]; + let pci_path = pci::Path::from_str(pci_part).with_context(|| { + format!( + "Failed to parse PCI path from NUMA path '{}'", + dev_tree_path + ) + })?; + Ok((root_complex, pci_path)) + } else { + let pci_path = pci::Path::from_str(dev_tree_path) + .with_context(|| format!("Failed to parse PCI path from '{}'", dev_tree_path))?; + Ok(("00", pci_path)) + } } #[cfg(target_arch = "aarch64")] diff --git a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs 
b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs index 7fbcdb2e06..d72dc73efe 100644 --- a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs +++ b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs @@ -7,6 +7,7 @@ use std::collections::{HashMap, HashSet}; use std::process; use std::str::FromStr; +use std::time::Duration; use anyhow::{anyhow, Context, Result}; use cgroups_rs::manager::is_systemd_cgroup; @@ -16,6 +17,7 @@ use kata_types::cpu::CpuSet; use nix::sched::{sched_setaffinity, CpuSet as NixCpuSet}; use nix::unistd::Pid; use oci_spec::runtime::{LinuxCpu, LinuxCpuBuilder, LinuxResources, LinuxResourcesBuilder}; +use tokio::time::sleep; use crate::cgroups::utils::get_tgid_from_pid; use crate::cgroups::CgroupConfig; @@ -187,12 +189,46 @@ impl CgroupsResourceInner { let needs_thread_ids = self.overhead_cgroup.is_some() || self.enable_vcpus_pinning; let thread_ids = if needs_thread_ids { - Some( - hypervisor - .get_thread_ids() - .await - .context("get vCPU thread IDs")?, - ) + let mut tids = hypervisor + .get_thread_ids() + .await + .context("get vCPU thread IDs")?; + + // QEMU may not have spawned all vCPU threads yet. Retry with + // exponential backoff until we see the expected count. 
+ let expected = hypervisor.hypervisor_config().await.cpu_info.default_vcpus.ceil() as usize; + if expected > 0 && tids.vcpus.len() < expected { + const MAX_ATTEMPTS: u32 = 10; + let mut backoff = Duration::from_millis(50); + for attempt in 2..=MAX_ATTEMPTS { + if tids.vcpus.len() >= expected { + break; + } + info!( + sl!(), + "waiting for all vCPU threads: have {}, want {}, attempt {}", + tids.vcpus.len(), + expected, + attempt + ); + sleep(backoff).await; + backoff *= 2; + tids = hypervisor + .get_thread_ids() + .await + .context("get vCPU thread IDs (retry)")?; + } + if tids.vcpus.len() < expected { + warn!( + sl!(), + "not all vCPU threads available after retries: have {}, want {}; pinning available ones", + tids.vcpus.len(), + expected + ); + } + } + + Some(tids) } else { None }; diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 91d3eb976c..88ef8077ff 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -511,6 +511,8 @@ ifneq (,$(QEMUCMD)) DEFENABLEVCPUPINNING_NV = true + DEFENABLENUMA_NV = true + # NVIDIA profile: rootfs filesystem type (erofs for read-only, compressed images) DEFROOTFSTYPE_NV := $(ROOTFSTYPE_EROFS) @@ -689,6 +691,7 @@ USER_VARS += DEFAULTTIMEOUT_NV USER_VARS += DEFAULTLAUNCHPROCESSTIMEOUT_NV USER_VARS += DEFSANDBOXCGROUPONLY_NV USER_VARS += DEFENABLEVCPUPINNING_NV +USER_VARS += DEFENABLENUMA_NV USER_VARS += DEFROOTFSTYPE_NV USER_VARS += DEFROOTFSTYPE USER_VARS += MACHINETYPE diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in index 4dae978b9b..b15186867d 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in @@ -360,7 +360,12 @@ enable_iommu_platform = false # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs. # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should # also be enabled for memory pre-allocation. 
-enable_numa = false +# +# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime +# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA +# node covered by the guest NUMA topology, ensuring memory locality. It is +# therefore enabled by default in this NVIDIA GPU profile. +enable_numa = @DEFENABLENUMA_NV@ # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes. # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to. diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in index 1c1ce20b01..2928389b1c 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in @@ -337,7 +337,12 @@ enable_iommu_platform = false # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs. # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should # also be enabled for memory pre-allocation. -enable_numa = false +# +# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime +# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA +# node covered by the guest NUMA topology, ensuring memory locality. It is +# therefore enabled by default in this NVIDIA GPU profile. +enable_numa = @DEFENABLENUMA_NV@ # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes. # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to. diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in index 49f9db0d6e..f373082129 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in @@ -319,7 +319,12 @@ enable_iommu_platform = false # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs. 
# Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should # also be enabled for memory pre-allocation. -enable_numa = false +# +# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime +# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA +# node covered by the guest NUMA topology, ensuring memory locality. It is +# therefore enabled by default in this NVIDIA GPU profile. +enable_numa = @DEFENABLENUMA_NV@ # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes. # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to. diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 811884a088..5a51f628ca 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -314,6 +314,11 @@ enable_iommu_platform = false # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs. # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should # also be enabled for memory pre-allocation. +# +# When VFIO devices (e.g. GPUs) are cold-plugged and NUMA is enabled, the +# runtime validates that each device's host NUMA node is covered by the guest +# NUMA topology. A warning is logged if a device falls outside the configured +# nodes, indicating potential cross-NUMA memory access overhead. enable_numa = false # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes. diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index b912fc6377..489cf5e4dc 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -235,6 +235,17 @@ var ( // different types of PCI ports. We can deduces the Bus number from it // and eliminate duplicates being assigned. 
PCIeDevicesPerPort = map[PCIePort][]VFIODev{} + + // NUMARootPorts maps host NUMA node IDs to root port IDs on pxb-pcie + // bridges. When NUMA-aware PCIe topology is active (pxb-pcie), + // createPCIeTopology populates this so VFIODevice.Attach() can assign + // each device to the root port on its host NUMA node's pxb-pcie bus. + // Key: host NUMA node ID, Value: slice of root port IDs on that node's pxb. + NUMARootPorts = map[int][]string{} + + // NUMARootPortDeviceCount tracks how many devices have been assigned + // to each host NUMA node's root ports (for round-robin assignment). + NUMARootPortDeviceCount = map[int]int{} ) // DeviceInfo is an embedded type that contains device data common to all types of devices. @@ -418,6 +429,10 @@ type VFIODev struct { // Type of VFIO device Type VFIODeviceType + // NUMANode is the host NUMA node this device is attached to. + // -1 means no affinity or unknown. + NUMANode int + // IsPCIe specifies device is PCIe or PCI IsPCIe bool diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 1e7ba5f118..d111b9e2bb 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -46,6 +46,7 @@ var ( PCISysFsSlotsMaxBusSpeed PCISysFsProperty = "max_bus_speed" // /sys/bus/pci/slots/xxx/max_bus_speed PCISysFsDevicesVendor PCISysFsProperty = "vendor" // /sys/bus/pci/devices/xxx/vendor PCISysFsDevicesDevice PCISysFsProperty = "device" // /sys/bus/pci/devices/xxx/device + PCISysFsDevicesNUMANode PCISysFsProperty = "numa_node" // /sys/bus/pci/devices/xxx/numa_node ) func deviceLogger() *logrus.Entry { @@ -85,6 +86,20 @@ func GetPCIDeviceProperty(bdf string, property PCISysFsProperty) string { return rlt } +// GetPCIDeviceNUMANode returns the host NUMA node for a PCI device. +// Returns -1 if the device has no NUMA affinity or the value cannot be read. 
+func GetPCIDeviceNUMANode(bdf string) int { + raw := GetPCIDeviceProperty(bdf, PCISysFsDevicesNUMANode) + if raw == "" { + return -1 + } + n, err := strconv.Atoi(raw) + if err != nil { + return -1 + } + return n +} + func readPCIProperty(propertyPath string) (string, error) { var ( buf []byte @@ -240,6 +255,7 @@ func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) { Class: pciClass, VendorID: vendorID, DeviceID: deviceID, + NUMANode: GetPCIDeviceNUMANode(deviceBDF), Port: device.Port, HostPath: device.HostPath, } @@ -291,7 +307,6 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe vendorID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesVendor) deviceID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesDevice) - // Do not directly assign to `vfio` -- need to access field still vfio = config.VFIODev{ ID: id, Type: vfioDeviceType, @@ -301,6 +316,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe Class: pciClass, VendorID: vendorID, DeviceID: deviceID, + NUMANode: GetPCIDeviceNUMANode(deviceBDF), Port: device.Port, HostPath: device.HostPath, } @@ -315,6 +331,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe SysfsDev: deviceSysfsDev, Type: config.VFIOAPDeviceMediatedType, APDevices: devices, + NUMANode: -1, Port: device.Port, } default: diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 42d86e2dca..ff70c4ac76 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -90,11 +90,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } if vfio.IsPCIe { - busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) - vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) - // We need to keep track the number of devices per port to deduce - // the corectu bus number, additionally we can use the VFIO 
device - // info to act upon different Vendor IDs and Device IDs. + // When pxb-pcie NUMA topology is active, assign the device + // to a root port on the pxb-pcie bridge for its host NUMA + // node instead of the default rp/swdp numbering. + if rpIDs, ok := config.NUMARootPorts[vfio.NUMANode]; ok && len(rpIDs) > 0 { + idx := config.NUMARootPortDeviceCount[vfio.NUMANode] + vfio.Bus = rpIDs[idx%len(rpIDs)] + config.NUMARootPortDeviceCount[vfio.NUMANode] = idx + 1 + } else { + busIndex := len(config.PCIeDevicesPerPort[vfio.Port]) + vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) + } config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio) } } diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index 06f9117676..5726613e3a 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -71,6 +71,8 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0) config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0) config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0) + config.NUMARootPorts = make(map[int][]string) + config.NUMARootPortDeviceCount = make(map[int]int) for _, dev := range devices { dm.devices[dev.DeviceID()] = dev diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 7cf6915df9..9dca1e959e 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -50,6 +50,20 @@ const ( qgsSocketPath string = "/var/run/tdx-qgs/qgs.socket" ) +// hasPCIeRoot reports whether the configured QEMU machine type exposes a +// `pcie.0` root complex (q35 on x86, virt on arm64). 
Machines such as +// pseries (ppc64le -> pci.0), s390-ccw-virtio (s390x -> CCW transport) +// and microvm (no PCI at all) do not have a `pcie.0` bus, so emitting +// `bus=pcie.0` on virtio-pci leaf devices would fail to start QEMU. +// Used to gate the bus= pin we apply to keep leaf devices off pxb-pcie. +func hasPCIeRoot(config *Config) bool { + if config == nil { + return false + } + t := config.Machine.Type + return strings.HasPrefix(t, "q35") || strings.HasPrefix(t, "virt") +} + const ( // Well known vsock CID for host system. // https://man7.org/linux/man-pages/man7/vsock.7.html @@ -132,6 +146,10 @@ const ( // VHostVSockPCI is a generic Vsock vhost device with PCI transport. VHostVSockPCI DeviceDriver = "vhost-vsock-pci" + // PXBPCIe is a PCIe Expander Bridge that creates a new PCI root + // complex with NUMA node affinity. + PXBPCIe DeviceDriver = "pxb-pcie" + // PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port. PCIeRootPort DeviceDriver = "pcie-root-port" @@ -152,7 +170,7 @@ const ( func isDimmSupported(config *Config) bool { switch runtime.GOARCH { - case "amd64", "386", "ppc64le", "arm64": + case "amd64", "ppc64le", "arm64": if config != nil && config.Machine.Type == MachineTypeMicrovm { // microvm does not support NUMA return false @@ -1064,6 +1082,11 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string { if netdev.Bus != "" { deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus)) + } else if netdev.Transport.isVirtioPCI(config) && hasPCIeRoot(config) { + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device as the default bus. Skipped on machines + // without a `pcie.0` root (pseries, microvm, s390-ccw-virtio). 
+ deviceParams = append(deviceParams, "bus=pcie.0") } if netdev.Addr != "" { @@ -1586,8 +1609,15 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("netdev=%s", vhostuserDev.TypeDevID)) deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device. See hasPCIeRoot() for skipped machines. + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-netdev") @@ -1612,8 +1642,13 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vhostuserDev.TypeDevID)) deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -1637,8 +1672,13 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string { deviceParams = append(deviceParams, "size=512M") deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID)) - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - 
deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -1674,8 +1714,13 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string { } deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo)) } - if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + if vhostuserDev.Transport.isVirtioPCI(config) { + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } + if vhostuserDev.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile)) + } } qemuParams = append(qemuParams, "-device") @@ -1738,6 +1783,36 @@ func (vhostuserDev VhostUserDevice) deviceName(config *Config) string { } } +// PXBPCIeDevice represents a PCIe Expander Bridge (pxb-pcie). +// It creates a new PCI root complex with NUMA node affinity, allowing +// devices attached to its bus hierarchy to inherit the NUMA association. +// This is the only QEMU PCI device that carries a numa_node property. +type PXBPCIeDevice struct { + // ID is the QEMU device identifier (e.g. "pxb-numa0"). + ID string + + // BusNr is the guest PCI bus number for this root complex. + // Use values spaced apart (e.g. 0x20, 0x40) to leave room for + // bridges beneath each pxb-pcie. + BusNr uint8 + + // NUMANode is the guest NUMA node index this root complex belongs to. + NUMANode int +} + +// QemuParams returns the QEMU parameters for a pxb-pcie device. 
+func (dev PXBPCIeDevice) QemuParams(_ *Config) []string { + return []string{ + "-device", + fmt.Sprintf("pxb-pcie,id=%s,bus_nr=%d,numa_node=%d", dev.ID, dev.BusNr, dev.NUMANode), + } +} + +// Valid returns true if the PXBPCIeDevice structure is valid and complete. +func (dev PXBPCIeDevice) Valid() bool { + return dev.ID != "" +} + // PCIeRootPortDevice represents a memory balloon device. // nolint: govet type PCIeRootPortDevice struct { @@ -2310,8 +2385,15 @@ func (vsock VSOCKDevice) QemuParams(config *Config) []string { deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vsock.ID)) deviceParams = append(deviceParams, fmt.Sprintf("%s=%d", VSOCKGuestCID, vsock.ContextID)) - if vsock.Transport.isVirtioPCI(config) && vsock.ROMFile != "" { - deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile)) + if vsock.Transport.isVirtioPCI(config) { + // Pin to pcie.0 (when present) so pxb-pcie can't capture + // this leaf device. See hasPCIeRoot() for skipped machines. + if hasPCIeRoot(config) { + deviceParams = append(deviceParams, "bus=pcie.0") + } + if vsock.ROMFile != "" { + deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile)) + } } if vsock.Transport.isVirtioCCW(config) { @@ -2689,7 +2771,8 @@ type SMP struct { Sockets uint32 // MaxCPUs is the maximum number of VCPUs that a VM can have. - // This value, if non-zero, MUST BE equal to or greater than CPUs + // This value, if non-zero, MUST BE equal to or greater than CPUs, + // and must be equal to Sockets * Cores * Threads if all are non-zero. MaxCPUs uint32 } @@ -2775,6 +2858,36 @@ func (fwcfg FwCfg) QemuParams(config *Config) []string { return qemuParams } +// NUMANode describes a guest NUMA node and its mapping to host resources. +type NUMANode struct { + // NodeID is the guest NUMA node identifier (0-based). + NodeID uint32 + + // CPUs is the guest vCPU range assigned to this node (e.g. "0-3"). + CPUs string + + // MemSize is the amount of memory for this node (e.g.
"512M", "1G"). + MemSize string + + // HostNodes is the host NUMA node(s) this guest node maps to (e.g. "0" or "0-1"). + HostNodes string + + // MemBackendType selects the QEMU memory backend object type. + // Typical values: "memory-backend-ram" or "memory-backend-file". + MemBackendType string + + // MemBackendPath is the mem-path for memory-backend-file (e.g. a hugepages mount). + // Empty when using memory-backend-ram. + MemBackendPath string +} + +// NUMADist describes a NUMA distance entry for `-numa dist`. +type NUMADist struct { + Src uint32 + Dst uint32 + Val uint32 +} + // Knobs regroups a set of qemu boolean settings type Knobs struct { // NoUserConfig prevents qemu from loading user config files. @@ -2922,6 +3035,14 @@ type Config struct { IOThreads []IOThread + // NUMANodes defines multi-NUMA guest topology. When non-empty, + // appendMemoryKnobs creates per-node memory backends and -numa entries + // instead of a single flat memory region. + NUMANodes []NUMANode + + // NUMADists defines inter-node distance entries emitted as -numa dist.
+ NUMADists []NUMADist + // PidFile is the -pidfile parameter PidFile string @@ -3096,6 +3217,13 @@ func (config *Config) appendCPUs() error { return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d", config.SMP.MaxCPUs, config.SMP.CPUs) } + if len(config.NUMANodes) > 1 && config.SMP.Sockets > 0 && config.SMP.Cores > 0 && config.SMP.Threads > 0 { + expected := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads + if config.SMP.MaxCPUs != expected { + return fmt.Errorf("MaxCPUs %d must equal Sockets(%d) * Cores(%d) * Threads(%d) = %d", + config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads, expected) + } + } SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs)) } @@ -3169,6 +3297,12 @@ func (config *Config) appendMemoryKnobs() { if config.Memory.Size == "" { return } + + if len(config.NUMANodes) > 0 && isDimmSupported(config) { + config.appendMultiNUMAMemoryKnobs() + return + } + var objMemParam, numaMemParam string dimmName := "dimm1" if config.Knobs.HugePages { @@ -3200,6 +3334,49 @@ func (config *Config) appendMemoryKnobs() { } } +func (config *Config) appendMultiNUMAMemoryKnobs() { + for _, node := range config.NUMANodes { + memID := fmt.Sprintf("numa-mem%d", node.NodeID) + + backendType := node.MemBackendType + if backendType == "" { + backendType = "memory-backend-ram" + } + + objMemParam := fmt.Sprintf("%s,id=%s,size=%s", backendType, memID, node.MemSize) + + if node.MemBackendPath != "" { + objMemParam += ",mem-path=" + node.MemBackendPath + } + + if node.HostNodes != "" { + objMemParam += ",host-nodes=" + node.HostNodes + ",policy=bind" + } + + if config.Knobs.MemShared { + objMemParam += ",share=on" + } + if config.Knobs.MemPrealloc { + objMemParam += ",prealloc=on" + } + + config.qemuParams = append(config.qemuParams, "-object") + config.qemuParams = append(config.qemuParams, objMemParam) + + numaParam := fmt.Sprintf("node,nodeid=%d,memdev=%s", node.NodeID, memID) + if node.CPUs != "" { 
+ numaParam += ",cpus=" + node.CPUs + } + config.qemuParams = append(config.qemuParams, "-numa") + config.qemuParams = append(config.qemuParams, numaParam) + } + + for _, dist := range config.NUMADists { + config.qemuParams = append(config.qemuParams, "-numa") + config.qemuParams = append(config.qemuParams, fmt.Sprintf("dist,src=%d,dst=%d,val=%d", dist.Src, dist.Dst, dist.Val)) + } +} + func (config *Config) appendKnobs() { if config.Knobs.NoUserConfig { config.qemuParams = append(config.qemuParams, "-no-user-config") diff --git a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go index a14e0fb032..36e03254ae 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go @@ -14,8 +14,8 @@ var ( deviceNetworkString = "-netdev tap,id=tap0,vhost=on,ifname=ceth0,downscript=no,script=no -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,romfile=efi-virtio.rom" deviceNetworkStringMq = "-netdev tap,id=tap0,vhost=on,fds=3:4 -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,mq=on,vectors=6,romfile=efi-virtio.rom" deviceSerialString = "-device virtio-serial-pci,disable-modern=true,id=serial0,romfile=efi-virtio.rom,max_ports=2" - deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,romfile=efi-virtio.rom" - deviceVSOCKString = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=efi-virtio.rom" + deviceVhostUserNetString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,bus=pcie.0,romfile=efi-virtio.rom" + deviceVSOCKString = "-device 
vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,bus=pcie.0,romfile=efi-virtio.rom" deviceVFIOString = "-device vfio-pci,host=02:10.0,x-pci-vendor-id=0x1234,x-pci-device-id=0x5678,romfile=efi-virtio.rom" devicePCIeRootPortSimpleString = "-device pcie-root-port,id=rp1,bus=pcie.0,chassis=0x00,slot=0x00,multifunction=off" devicePCIeRootPortFullString = "-device pcie-root-port,id=rp2,bus=pcie.0,chassis=0x0,slot=0x1,addr=0x2,multifunction=on,bus-reserve=0x3,pref64-reserve=16G,mem-reserve=1G,io-reserve=512M,romfile=efi-virtio.rom" @@ -23,8 +23,8 @@ var ( deviceVFIOPCIeFullString = "-device vfio-pci,host=02:00.0,x-pci-vendor-id=0x10de,x-pci-device-id=0x15f8,romfile=efi-virtio.rom,bus=rp1" deviceSCSIControllerStr = "-device virtio-scsi-pci,id=foo,disable-modern=false,romfile=efi-virtio.rom" deviceSCSIControllerBusAddrStr = "-device virtio-scsi-pci,id=foo,bus=pci.0,addr=00:04.0,disable-modern=true,iothread=iothread1,romfile=efi-virtio.rom" - deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,romfile=efi-virtio.rom" - deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,romfile=efi-virtio.rom" + deviceVhostUserSCSIString = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,bus=pcie.0,romfile=efi-virtio.rom" + deviceVhostUserBlkString = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,bus=pcie.0,romfile=efi-virtio.rom" deviceBlockString = "-device virtio-blk-pci,disable-modern=true,drive=hd0,config-wce=off,romfile=efi-virtio.rom,share-rw=on,serial=hd0 -drive id=hd0,file=/var/lib/vm.img,aio=threads,format=qcow2,if=none,readonly=on" devicePCIBridgeString = "-device 
pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=on,addr=ff,romfile=efi-virtio.rom" devicePCIBridgeStringReserved = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=off,addr=ff,romfile=efi-virtio.rom,io-reserve=4k,mem-reserve=1m,pref64-reserve=1m" @@ -42,7 +42,8 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserBlk, ROMFile: romfile, } - testAppend(vhostuserBlkDevice, deviceVhostUserBlkString, t) + // vhost-user-pci device strings include bus=pcie.0 — gated to q35/virt. + testAppendQ35(vhostuserBlkDevice, deviceVhostUserBlkString, t) vhostuserSCSIDevice := VhostUserDevice{ SocketPath: "/tmp/nonexistentsocket.socket", @@ -52,7 +53,7 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserSCSI, ROMFile: romfile, } - testAppend(vhostuserSCSIDevice, deviceVhostUserSCSIString, t) + testAppendQ35(vhostuserSCSIDevice, deviceVhostUserSCSIString, t) vhostuserNetDevice := VhostUserDevice{ SocketPath: "/tmp/nonexistentsocket.socket", @@ -62,7 +63,7 @@ func TestAppendDeviceVhostUser(t *testing.T) { VhostUserType: VhostUserNet, ROMFile: romfile, } - testAppend(vhostuserNetDevice, deviceVhostUserNetString, t) + testAppendQ35(vhostuserNetDevice, deviceVhostUserNetString, t) } func TestAppendVirtioBalloon(t *testing.T) { diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 5d4c15ed9d..e4616a8231 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -9,6 +9,7 @@ import ( "fmt" "os" "reflect" + "runtime" "strings" "testing" ) @@ -23,6 +24,15 @@ func testAppend(structure interface{}, expected string, t *testing.T) { testConfigAppend(&config, structure, expected, t) } +// testAppendQ35 is testAppend with Config.Machine.Type set to "q35" so +// device emitters that gate on hasPCIeRoot() (e.g. virtio-pci leaves +// pinned to bus=pcie.0) take the PCIe path. 
Use this for tests whose +// expected string contains "bus=pcie.0". +func testAppendQ35(structure interface{}, expected string, t *testing.T) { + config := Config{Machine: Machine{Type: "q35"}} + testConfigAppend(&config, structure, expected, t) +} + func testConfigAppend(config *Config, structure interface{}, expected string, t *testing.T) { switch s := structure.(type) { case Machine: @@ -342,7 +352,32 @@ func TestAppendVSOCK(t *testing.T) { vsockDevice.DevNo = DevNo } - testAppend(vsockDevice, deviceVSOCKString, t) + // deviceVSOCKString includes bus=pcie.0 — gated to q35/virt machines. + testAppendQ35(vsockDevice, deviceVSOCKString, t) +} + +// TestAppendVSOCKNoPCIeRoot verifies that on machines without a `pcie.0` +// root (e.g. ppc64le's pseries, microvm, s390-ccw-virtio), we do NOT +// emit `bus=pcie.0` — doing so would crash QEMU with +// "Bus 'pcie.0' not found". Transport and ROMFile are set explicitly +// rather than using the arch-conditional `romfile` constant (which is +// "" on s390x via qemu_s390x_test.go), so the test exercises the +// same code path on every architecture. +func TestAppendVSOCKNoPCIeRoot(t *testing.T) { + const vsockRomfile = "efi-virtio.rom" + vsockDevice := VSOCKDevice{ + ID: "vhost-vsock-pci0", + ContextID: 4, + VHostFD: nil, + DisableModern: true, + ROMFile: vsockRomfile, + Transport: TransportPCI, + } + + // pseries -> hasPCIeRoot returns false -> no bus=pcie.0 emitted. 
+ expected := "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=" + vsockRomfile + config := Config{Machine: Machine{Type: "pseries"}} + testConfigAppend(&config, vsockDevice, expected, t) } func TestVSOCKValid(t *testing.T) { @@ -1117,6 +1152,140 @@ func TestBadMemoryKnobs(t *testing.T) { } } +func TestAppendMultiNUMAMemoryKnobs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + Slots: 8, + MaxMem: "4G", + }, + NUMANodes: []NUMANode{ + { + NodeID: 0, + CPUs: "0-3", + MemSize: "1G", + HostNodes: "0", + MemBackendType: "memory-backend-ram", + }, + { + NodeID: 1, + CPUs: "4-7", + MemSize: "1G", + HostNodes: "1", + MemBackendType: "memory-backend-ram", + }, + }, + Knobs: Knobs{ + MemShared: true, + MemPrealloc: true, + }, + } + + c.appendMemoryKnobs() + + expected := []string{ + "-object", "memory-backend-ram,id=numa-mem0,size=1G,host-nodes=0,policy=bind,share=on,prealloc=on", + "-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-3", + "-object", "memory-backend-ram,id=numa-mem1,size=1G,host-nodes=1,policy=bind,share=on,prealloc=on", + "-numa", "node,nodeid=1,memdev=numa-mem1,cpus=4-7", + } + if len(c.qemuParams) != len(expected) { + t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams) + } + for i, p := range expected { + if c.qemuParams[i] != p { + t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i]) + } + } +} + +func TestAppendMultiNUMAHugePages(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + Slots: 8, + MaxMem: "4G", + }, + NUMANodes: []NUMANode{ + { + NodeID: 0, + CPUs: "0-1", + MemSize: "1G", + HostNodes: "0", + MemBackendType: "memory-backend-file", + MemBackendPath: "/dev/hugepages", + }, + { + 
NodeID: 1, + CPUs: "2-3", + MemSize: "1G", + HostNodes: "1", + MemBackendType: "memory-backend-file", + MemBackendPath: "/dev/hugepages", + }, + }, + Knobs: Knobs{ + MemShared: true, + }, + } + + c.appendMemoryKnobs() + + expected := []string{ + "-object", "memory-backend-file,id=numa-mem0,size=1G,mem-path=/dev/hugepages,host-nodes=0,policy=bind,share=on", + "-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-1", + "-object", "memory-backend-file,id=numa-mem1,size=1G,mem-path=/dev/hugepages,host-nodes=1,policy=bind,share=on", + "-numa", "node,nodeid=1,memdev=numa-mem1,cpus=2-3", + } + if len(c.qemuParams) != len(expected) { + t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams) + } + for i, p := range expected { + if c.qemuParams[i] != p { + t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i]) + } + } +} + +func TestAppendNUMADist(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + c := &Config{ + Memory: Memory{ + Size: "2G", + }, + NUMANodes: []NUMANode{ + {NodeID: 0, CPUs: "0-1", MemSize: "1G", MemBackendType: "memory-backend-ram"}, + {NodeID: 1, CPUs: "2-3", MemSize: "1G", MemBackendType: "memory-backend-ram"}, + }, + NUMADists: []NUMADist{ + {Src: 0, Dst: 1, Val: 20}, + {Src: 1, Dst: 0, Val: 20}, + }, + } + + c.appendMemoryKnobs() + + expectedDist := []string{ + "-numa", "dist,src=0,dst=1,val=20", + "-numa", "dist,src=1,dst=0,val=20", + } + params := c.qemuParams + distParams := params[len(params)-4:] + for i, p := range expectedDist { + if distParams[i] != p { + t.Errorf("Dist param %d: expected %q, got %q", i, p, distParams[i]) + } + } +} + func TestBadBios(t *testing.T) { c := &Config{} c.appendBios() diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 061bf8b2ed..c5c5f70c34 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -1071,6 
+1071,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { IOMMU: h.IOMMU, IOMMUPlatform: h.getIOMMUPlatform(), GuestNUMANodes: h.defaultGuestNUMANodes(), + NUMAMapping: append([]string(nil), h.NUMAMapping...), FileBackedMemRootDir: h.FileBackedMemRootDir, FileBackedMemRootList: h.FileBackedMemRootList, Debug: h.Debug, @@ -1994,6 +1995,10 @@ func checkConfig(config oci.RuntimeConfig) error { return err } + if err := checkNumaConfig(config); err != nil { + return err + } + hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO machineType := config.HypervisorConfig.HypervisorMachineType @@ -2005,6 +2010,25 @@ func checkConfig(config oci.RuntimeConfig) error { return nil } +func checkNumaConfig(config oci.RuntimeConfig) error { + if len(config.HypervisorConfig.GuestNUMANodes) <= 1 { + return nil + } + + switch goruntime.GOARCH { + case "amd64", "arm64": + default: + return fmt.Errorf("multi-NUMA support is only available on amd64 and arm64, got %q", goruntime.GOARCH) + } + + if !config.StaticSandboxResourceMgmt { + return fmt.Errorf("NUMA support requires static_sandbox_resource_mgmt to be enabled; " + + "NUMA topology is not compatible with dynamic CPU/memory hotplug") + } + + return nil +} + // checkPCIeConfig ensures the PCIe configuration is valid. 
// Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index b09a97e994..229f065740 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -794,11 +794,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig } if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok { - guestNUMANodes, err := vcutils.GetGuestNUMANodes(strings.Fields(annotation)) + mapping := strings.Fields(annotation) + guestNUMANodes, err := vcutils.GetGuestNUMANodes(mapping) if err != nil { return err } sbConfig.HypervisorConfig.GuestNUMANodes = guestNUMANodes + // Record the raw user-provided mapping so the hypervisor + // backend honors it verbatim instead of right-sizing. + sbConfig.HypervisorConfig.NUMAMapping = mapping } return nil @@ -1457,7 +1461,7 @@ func (a *annotationConfiguration) setFloat32WithCheck(f func(float32) error) err // be added to the VM if sandbox annotations are provided with this sizing details func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) { var memory, quota int64 - var period uint64 + var shares, period uint64 var err error if spec == nil || spec.Annotations == nil { @@ -1488,6 +1492,15 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) } } + annotation, ok = spec.Annotations[ctrAnnotations.SandboxCPUShares] + if ok { + shares, err = strconv.ParseUint(annotation, 10, 64) + if err != nil { + ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUShares: %s", annotation) + shares = 0 + } + } + annotation, ok = spec.Annotations[ctrAnnotations.SandboxMem] if ok { memory, err = strconv.ParseInt(annotation, 10, 64) @@ -1497,7 +1510,16 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) } } - return calculateVMResources(period, quota, memory) + numCPU, memSizeMB = calculateVMResources(period, 
quota, memory) + + // When cpuManagerPolicy=static is in use, kubelet sets quota=-1 + // (unconstrained) and assigns CPUs via cpuset instead. Fall back + // to deriving the CPU count from shares (1024 shares per CPU). + if numCPU == 0 && shares > 0 { + numCPU = float32(math.Ceil(float64(shares) / 1024.0)) + } + + return numCPU, memSizeMB } // CalculateContainerSizing will calculate the number of CPUs and amount of memory that is needed diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index dc96e3cf39..c3784712f3 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -1742,12 +1742,17 @@ func (c *Container) update(ctx context.Context, resources specs.LinuxResources) return err } - // There currently isn't a notion of cpusets.cpus or mems being tracked - // inside of the guest. Make sure we clear these before asking agent to update - // the container's cgroups. + // Cpus/Mems in cgroup cpuset are host-relative; clear Cpus since vCPU + // numbering differs inside the guest. For Mems, translate host NUMA node + // IDs to guest node IDs when multi-NUMA is configured, otherwise clear. if resources.CPU != nil { - resources.CPU.Mems = "" resources.CPU.Cpus = "" + numaNodes := c.sandbox.config.HypervisorConfig.GuestNUMANodes + if len(numaNodes) > 1 && resources.CPU.Mems != "" { + resources.CPU.Mems = translateHostMemsToGuest(resources.CPU.Mems, numaNodes) + } else { + resources.CPU.Mems = "" + } } return c.sandbox.agent.updateContainer(ctx, c.sandbox, *c, resources) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index b631960f6b..8b93b31428 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -803,6 +803,15 @@ type HypervisorConfig struct { // GuestNUMANodes defines guest NUMA topology and mapping to host NUMA nodes and CPUs. 
GuestNUMANodes []types.GuestNUMANode + // NUMAMapping is the raw user-provided NUMA mapping (TOML + // `numa_mapping` or the io.katacontainers.config.hypervisor.numa_mapping + // annotation). When empty, GuestNUMANodes was auto-derived from the + // host topology and may be right-sized at sandbox creation (e.g. + // collapsed to a single host node when the sandbox fits, or + // restricted to host nodes containing attached VFIO devices). When + // non-empty, the topology is honored verbatim. + NUMAMapping []string + // DisableNestingChecks is used to override customizations performed // when running on top of another VMM. DisableNestingChecks bool diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go index 381f3c8f07..1e21e4867c 100644 --- a/src/runtime/virtcontainers/hypervisor_config_linux.go +++ b/src/runtime/virtcontainers/hypervisor_config_linux.go @@ -63,10 +63,6 @@ func validateHypervisorConfig(conf *HypervisorConfig) error { conf.DefaultMaxVCPUs = defaultMaxVCPUs } - if numNUMA := conf.NumGuestNUMANodes(); numNUMA > 1 { - conf.DefaultMaxVCPUs -= conf.DefaultMaxVCPUs % numNUMA - } - if conf.Msize9p == 0 && conf.SharedFS != config.VirtioFS { conf.Msize9p = defaultMsize9p } diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 452c64f9ce..8b34cb246a 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -34,6 +34,7 @@ import ( kataclient "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/client" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" @@ -1018,7 +1019,36 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st return nil } -func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error { +// translateHostMemsToGuest converts a host cpuset.mems string (e.g. "0,2") +// into guest NUMA node IDs. Each guest NUMA node index maps to a set of host +// nodes via GuestNUMANode.HostNodes. If a host node from `hostMems` appears in +// a GuestNUMANode's HostNodes, the corresponding guest node index is included. +func translateHostMemsToGuest(hostMems string, numaNodes []types.GuestNUMANode) string { + hostSet, err := cpuset.Parse(hostMems) + if err != nil { + return "" + } + hostSlice := hostSet.ToSlice() + var guestNodes []int + for guestIdx, gn := range numaNodes { + nodeSet, err := cpuset.Parse(gn.HostNodes) + if err != nil { + continue + } + for _, hostNode := range hostSlice { + if nodeSet.Contains(hostNode) { + guestNodes = append(guestNodes, guestIdx) + break + } + } + } + if len(guestNodes) == 0 { + return "" + } + return cpuset.NewCPUSet(guestNodes...).String() +} + +func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool, numaNodes []types.GuestNUMANode) error { // Disable Hooks since they have been handled on the host and there is // no reason to send them to the agent. It would make no sense to try // to apply them on the guest.
@@ -1060,7 +1090,6 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis } } - // By now only CPU constraints are supported // Issue: https://github.com/kata-containers/runtime/issues/158 // Issue: https://github.com/kata-containers/runtime/issues/204 grpcSpec.Linux.Resources.Devices = nil @@ -1069,7 +1098,12 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis grpcSpec.Linux.Resources.Network = nil if grpcSpec.Linux.Resources.CPU != nil { grpcSpec.Linux.Resources.CPU.Cpus = "" - grpcSpec.Linux.Resources.CPU.Mems = "" + if len(numaNodes) > 1 && grpcSpec.Linux.Resources.CPU.Mems != "" { + guestMems := translateHostMemsToGuest(grpcSpec.Linux.Resources.CPU.Mems, numaNodes) + grpcSpec.Linux.Resources.CPU.Mems = guestMems + } else { + grpcSpec.Linux.Resources.CPU.Mems = "" + } } // Disable network and time namespaces since they are handled on the host @@ -1495,7 +1529,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co // We need to constrain the spec to make sure we're not // passing irrelevant information to the agent. 
- err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel) + err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel, sandbox.config.HypervisorConfig.GuestNUMANodes) if err != nil { return nil, err } diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index 62bdd76eac..4b27f0c07e 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -638,7 +638,7 @@ func TestConstrainGRPCSpec(t *testing.T) { } k := kataAgent{} - k.constrainGRPCSpec(g, true, true, "", true) + k.constrainGRPCSpec(g, true, true, "", true, nil) // Check nil fields assert.Nil(g.Hooks) @@ -1370,3 +1370,51 @@ func TestKataAgentCreateContainerVFIODevices(t *testing.T) { }) } } + +func TestTranslateHostMemsToGuest(t *testing.T) { + assert := assert.New(t) + + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + + result := translateHostMemsToGuest("0", numaNodes) + assert.Equal("0", result) + + result = translateHostMemsToGuest("1", numaNodes) + assert.Equal("1", result) + + result = translateHostMemsToGuest("0-1", numaNodes) + assert.Equal("0-1", result) + + result = translateHostMemsToGuest("0,1", numaNodes) + assert.Equal("0-1", result) + + result = translateHostMemsToGuest("42", numaNodes) + assert.Equal("", result) + + result = translateHostMemsToGuest("invalid", numaNodes) + assert.Equal("", result) + + result = translateHostMemsToGuest("", numaNodes) + assert.Equal("", result) +} + +func TestTranslateHostMemsToGuestRangeNodes(t *testing.T) { + assert := assert.New(t) + + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-7"}, + {HostNodes: 
"2-3", HostCPUs: "8-15"}, + } + + result := translateHostMemsToGuest("1", numaNodes) + assert.Equal("0", result) + + result = translateHostMemsToGuest("2", numaNodes) + assert.Equal("1", result) + + result = translateHostMemsToGuest("0,3", numaNodes) + assert.Equal("0-1", result) +} diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 4066c85e48..74818ff5d6 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -21,6 +21,7 @@ import ( "os/user" "path/filepath" "regexp" + goruntime "runtime" "strconv" "strings" "sync" @@ -44,6 +45,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" pkgUtils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" "github.com/kata-containers/kata-containers/src/runtime/pkg/uuid" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) @@ -250,6 +252,14 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso span, _ := katatrace.Trace(ctx, q.Logger(), "setup", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() + // Right-size auto-derived NUMA topology before snapshotting the config. + // We mutate the caller-owned pointer so the sandbox's shared + // HypervisorConfig (used by vCPU pinning and cpuset.mems forwarding) + // observes the same trimmed topology that QEMU is launched with. + // No-op when numa_mapping was set explicitly or when the topology + // already has one or zero nodes. 
+ maybeRightSizeAutoNUMA(hypervisorConfig, q.Logger()) + if err := q.setConfig(hypervisorConfig); err != nil { return err } @@ -325,8 +335,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso return nil } -func (q *qemu) cpuTopology() govmmQemu.SMP { - return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs) +func (q *qemu) cpuTopology(effectiveNUMANodes uint32) govmmQemu.SMP { + return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, effectiveNUMANodes) } func (q *qemu) memoryTopology() (govmmQemu.Memory, error) { @@ -339,6 +349,407 @@ func (q *qemu) memoryTopology() (govmmQemu.Memory, error) { return q.arch.memoryTopology(memMb, 0, 0), nil } +// vfioHostNUMANodes walks the given VFIO devices and returns the set of +// host NUMA node IDs that contain at least one of them. Devices for which +// the NUMA node cannot be determined (returned as -1 by the kernel when +// the device is not bound to any node) are skipped silently. Resolution +// failures are logged as warnings and treated as "no constraint" for that +// device. The function is a free function (not a method) so it can be +// invoked before q.config is populated, e.g. during pre-setConfig +// right-sizing. 
+func vfioHostNUMANodes(devices []config.DeviceInfo, log *logrus.Entry) map[int]struct{} {
+	hostNodes := map[int]struct{}{}
+	for _, device := range devices {
+		resolved, err := config.GetHostPath(device, false, "")
+		if err != nil {
+			log.WithError(err).WithField("device", device.HostPath).Warn("Failed to resolve VFIO device host path for NUMA placement")
+			continue
+		}
+		device.HostPath = resolved
+
+		// Paths under the iommufd prefix are looked up directly; any other
+		// path is expanded through its IOMMU group.
+		var found []*config.VFIODev
+		switch {
+		case strings.HasPrefix(device.HostPath, pkgDevice.IommufdDevPath):
+			found, err = drivers.GetDeviceFromVFIODev(device)
+		default:
+			found, err = drivers.GetAllVFIODevicesFromIOMMUGroup(device)
+		}
+		if err != nil {
+			log.WithError(err).WithField("device", device.HostPath).Warn("Failed to enumerate VFIO device(s) for NUMA placement")
+			continue
+		}
+
+		for _, vfioDev := range found {
+			// Negative means "no NUMA node known" and adds no constraint.
+			if vfioDev.NUMANode < 0 {
+				continue
+			}
+			hostNodes[vfioDev.NUMANode] = struct{}{}
+		}
+	}
+	return hostNodes
+}
+
+// guestNodeCoversAny tells whether guestNode's HostNodes lists at least one
+// host NUMA ID that is a key of hostSet. An empty hostSet or an unparsable
+// HostNodes value yields false.
+func guestNodeCoversAny(guestNode types.GuestNUMANode, hostSet map[int]struct{}) bool {
+	if len(hostSet) == 0 {
+		return false
+	}
+	for _, hostID := range guestNodeHostIDs(guestNode) {
+		if _, present := hostSet[hostID]; present {
+			return true
+		}
+	}
+	return false
+}
+
+// guestNodeHostIDs expands gn.HostNodes into the individual host NUMA IDs it
+// names, or returns nil when the value cannot be parsed.
+func guestNodeHostIDs(gn types.GuestNUMANode) []int {
+	if set, err := cpuset.Parse(gn.HostNodes); err == nil {
+		return set.ToSlice()
+	}
+	return nil
+}
+
+// hostNUMACapFn reports the capacity of one host NUMA node as a
+// (cpu_count, mem_mb) pair. It exists as a type so that tests can substitute
+// the sysfs-backed implementation.
+type hostNUMACapFn func(nodeID int) (cpus int, memMB uint64, err error)
+
+// realHostNUMACapFn is the production capacity provider, backed by sysfs.
+func realHostNUMACapFn(nodeID int) (int, uint64, error) {
+	capacity, err := utils.GetHostNUMANodeCapacity(nodeID)
+	if err == nil {
+		return capacity.CPUs, capacity.MemMB, nil
+	}
+	return 0, 0, err
+}
+
+// sumNUMACapacity adds up the (cpu_count, mem_mb) capacity of every distinct
+// host NUMA node named by the given guest nodes. A host node is counted once
+// even when several guest nodes reference it, and a host node whose capacity
+// lookup fails contributes nothing.
+func sumNUMACapacity(nodes []types.GuestNUMANode, capFn hostNUMACapFn) (int, uint64) {
+	var (
+		cpuSum int
+		memSum uint64
+	)
+	visited := map[int]bool{}
+	for i := range nodes {
+		for _, hostID := range guestNodeHostIDs(nodes[i]) {
+			if visited[hostID] {
+				continue
+			}
+			// Mark before querying so a failing node is not retried.
+			visited[hostID] = true
+			if cpus, memMB, err := capFn(hostID); err == nil {
+				cpuSum += cpus
+				memSum += memMB
+			}
+		}
+	}
+	return cpuSum, memSum
+}
+
+// selectNUMANodes is the pure right-sizing decision: given an auto-derived
+// guest NUMA topology, the sandbox's CPU/memory budget, the set of host
+// NUMA nodes containing an attached VFIO device, and a capacity oracle,
+// return the smallest subset of numaNodes that satisfies the constraints.
+//
+// Heuristic, in order:
+//
+//  1. If a VFIO device is attached, keep the guest nodes covering host
+//     nodes that contain a device. If their combined capacity fits the
+//     sandbox, return only that subset.
+//  2. With no VFIO devices, if the smallest single host node has enough
+//     CPU+memory for the sandbox, return the first guest node.
+//  3. Otherwise, return the input unchanged.
+//
+// The function is pure (no I/O), so it is unit-testable. Callers must pass
+// a capFn that resolves host NUMA capacity; production code uses
+// realHostNUMACapFn.
+func selectNUMANodes(
+	numaNodes []types.GuestNUMANode,
+	vcpus uint32,
+	memMB uint64,
+	vfioHostSet map[int]struct{},
+	capFn hostNUMACapFn,
+	log *logrus.Entry,
+) []types.GuestNUMANode {
+	if len(numaNodes) <= 1 {
+		return numaNodes
+	}
+
+	// 1) VFIO-aware: keep the guest nodes covering device-bearing host nodes.
+	if len(vfioHostSet) > 0 {
+		var covered []types.GuestNUMANode
+		for _, gn := range numaNodes {
+			if guestNodeCoversAny(gn, vfioHostSet) {
+				covered = append(covered, gn)
+			}
+		}
+		if len(covered) == 0 {
+			log.WithField("vfio-host-nodes", vfioHostSet).
+				Warn("No guest NUMA node covers VFIO device host nodes; keeping full topology")
+			return numaNodes
+		}
+		cpus, memCap := sumNUMACapacity(covered, capFn)
+		if uint32(cpus) >= vcpus && memCap >= memMB {
+			log.WithFields(logrus.Fields{
+				"selected-nodes":  len(covered),
+				"input-nodes":     len(numaNodes),
+				"vfio-host-nodes": vfioHostSet,
+				"vcpus":           vcpus,
+				"mem-mb":          memMB,
+			}).Info("Right-sized NUMA topology to VFIO-aligned subset")
+			return covered
+		}
+		log.WithFields(logrus.Fields{
+			"vfio-host-nodes":  vfioHostSet,
+			"covered-cpus":     cpus,
+			"covered-mem-mb":   memCap,
+			"requested-vcpus":  vcpus,
+			"requested-mem-mb": memMB,
+		}).Info("VFIO-aligned NUMA subset too small for sandbox; keeping full topology")
+		return numaNodes
+	}
+
+	// 2) No VFIO constraints: collapse if the sandbox fits in a single
+	// (smallest) host node. The CPU and memory minima are tracked
+	// independently, so passing both checks means every guest node fits.
+	var smallestCPUs int = -1
+	var smallestMem uint64 = math.MaxUint64
+	for _, gn := range numaNodes {
+		cpus, memCap := sumNUMACapacity([]types.GuestNUMANode{gn}, capFn)
+		if smallestCPUs < 0 || cpus < smallestCPUs {
+			smallestCPUs = cpus
+		}
+		if memCap < smallestMem {
+			smallestMem = memCap
+		}
+	}
+	if smallestCPUs > 0 && uint32(smallestCPUs) >= vcpus && smallestMem >= memMB {
+		log.WithFields(logrus.Fields{
+			"input-nodes":         len(numaNodes),
+			"vcpus":               vcpus,
+			"mem-mb":              memMB,
+			"smallest-node-cpus":  smallestCPUs,
+			"smallest-node-memMB": smallestMem,
+		}).Info("Right-sized NUMA topology: sandbox fits in a single host node")
+		return numaNodes[:1]
+	}
+
+	// 3) Sandbox spans multiple nodes; preserve the auto-derived topology.
+	return numaNodes
+}
+
+// maybeRightSizeAutoNUMA right-sizes an auto-derived guest NUMA topology
+// in place on the given HypervisorConfig. It is a no-op when the user
+// configured an explicit numa_mapping (TOML or annotation), or when the
+// topology has at most one node.
+//
+// This must run before the config is consumed by the rest of the runtime
+// (sandbox vCPU pinning, cpuset.mems forwarding, QEMU command-line build),
+// so callers should invoke it on the *shared* HypervisorConfig pointer
+// owned by the sandbox, not on a local copy.
+func maybeRightSizeAutoNUMA(hc *HypervisorConfig, log *logrus.Entry) {
+	if hc == nil || len(hc.NUMAMapping) > 0 || len(hc.GuestNUMANodes) <= 1 {
+		return
+	}
+	hc.GuestNUMANodes = selectNUMANodes(
+		hc.GuestNUMANodes,
+		hc.DefaultMaxVCPUs,
+		uint64(hc.MemorySize),
+		vfioHostNUMANodes(hc.VFIODevices, log),
+		realHostNUMACapFn,
+		log,
+	)
+}
+
+// buildNUMATopology turns q.config.GuestNUMANodes into the per-node QEMU
+// NUMA description (vCPU range, memory size, host-node binding and memory
+// backend) plus the inter-node distance list.
+//
+// It returns (nil, nil, nil) when NUMA placement is not active or when
+// DefaultMaxVCPUs is smaller than the number of guest nodes. It returns an
+// error on architectures other than amd64/arm64, when the memory backend
+// path does not exist, when vCPUs cannot be distributed so that every node
+// gets at least one, or when MemorySize cannot be split on the required
+// alignment.
+func (q *qemu) buildNUMATopology() ([]govmmQemu.NUMANode, []govmmQemu.NUMADist, error) {
+	// q.config.GuestNUMANodes has already been right-sized (when applicable)
+	// by maybeRightSizeAutoNUMA() at hypervisor setup time. Empty means
+	// no NUMA topology; a single node may still carry a HostNodes binding
+	// (e.g. right-sized to the GPU's NUMA node), in which case we must
+	// emit it so memory is bound to the correct host node.
+	numaNodes := q.config.GuestNUMANodes
+	if !numaPlacementActive(numaNodes) {
+		return nil, nil, nil
+	}
+
+	switch goruntime.GOARCH {
+	case "amd64", "arm64":
+	default:
+		return nil, nil, fmt.Errorf("multi-NUMA not supported on architecture %s", goruntime.GOARCH)
+	}
+
+	// NUMA requires static_sandbox_resource_mgmt=true, which guarantees
+	// NumVCPUs == DefaultMaxVCPUs (set in oci/utils.go). All boot vCPUs
+	// are present at VM start, so the per-node CPU ranges below are valid.
+	//
+	// cpuTopology() rounds MaxCPUs up to (numNUMANodes * coresPerSocket)
+	// so that QEMU's SMP topology is consistent. We must cover all CPU
+	// slots in the NUMA map, otherwise QEMU warns about CPUs not present
+	// in any NUMA node. Apply the same ceiling here.
+	numNodes := uint32(len(numaNodes))
+	if q.config.DefaultMaxVCPUs < numNodes {
+		q.Logger().WithFields(logrus.Fields{
+			"vcpus":      q.config.DefaultMaxVCPUs,
+			"numa-nodes": numNodes,
+		}).Warn("DefaultMaxVCPUs < NUMA node count; skipping multi-NUMA topology")
+		return nil, nil, nil
+	}
+	coresPerSocket := (q.config.DefaultMaxVCPUs + numNodes - 1) / numNodes
+	maxVCPUs := numNodes * coresPerSocket
+
+	vcpusPerNode, err := utils.DistributeVCPUsProportionally(numaNodes, maxVCPUs)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to distribute vCPUs across NUMA nodes: %w", err)
+	}
+	// The loops below index vcpusPerNode by guest node and compute
+	// "start + count - 1" in unsigned arithmetic, so a short slice would
+	// panic and a zero count would wrap around into a bogus CPU range.
+	if len(vcpusPerNode) != len(numaNodes) {
+		return nil, nil, fmt.Errorf("vCPU distribution returned %d entries for %d NUMA nodes", len(vcpusPerNode), len(numaNodes))
+	}
+	for i := range vcpusPerNode {
+		if vcpusPerNode[i] == 0 {
+			return nil, nil, fmt.Errorf("guest NUMA node %d was assigned no vCPUs", i)
+		}
+	}
+
+	memMb := uint64(q.config.MemorySize)
+
+	// NOTE(review): the 2 MiB alignment presumably matches the default
+	// x86-64 huge page size; confirm for hosts using other page sizes.
+	var memAlign uint64 = 1
+	if q.config.HugePages {
+		memAlign = 2
+	}
+
+	backendType := "memory-backend-ram"
+	backendPath := ""
+	if q.config.HugePages {
+		backendType = "memory-backend-file"
+		backendPath = "/dev/hugepages"
+	} else if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus ||
+		q.config.FileBackedMemRootDir != "" {
+		backendType = "memory-backend-file"
+		if q.config.FileBackedMemRootDir != "" {
+			backendPath = q.config.FileBackedMemRootDir
+		} else {
+			backendPath = fallbackFileBackedMemDir
+		}
+	}
+	if backendPath != "" {
+		if _, err := os.Stat(backendPath); err != nil {
+			return nil, nil, fmt.Errorf("NUMA memory backend path %q does not exist: %w", backendPath, err)
+		}
+	}
+
+	// Distribute memory proportionally to vCPU counts, aligned to memAlign.
+	memPerNode := make([]uint64, numNodes)
+	var memAssigned uint64
+	for i := uint32(0); i < numNodes; i++ {
+		raw := memMb * uint64(vcpusPerNode[i]) / uint64(maxVCPUs)
+		memPerNode[i] = (raw / memAlign) * memAlign
+		if memPerNode[i] == 0 {
+			memPerNode[i] = memAlign
+		}
+		memAssigned += memPerNode[i]
+	}
+	// Give the remainder to the last node (must also be aligned).
+	if memAssigned < memMb {
+		remainder := memMb - memAssigned
+		if remainder%memAlign != 0 {
+			return nil, nil, fmt.Errorf("MemorySize (%d MiB) cannot be evenly distributed across %d NUMA nodes with %d MiB alignment",
+				memMb, numNodes, memAlign)
+		}
+		memPerNode[numNodes-1] += remainder
+	} else if memAssigned > memMb {
+		return nil, nil, fmt.Errorf("MemorySize (%d MiB) cannot be evenly distributed across %d NUMA nodes with %d MiB alignment",
+			memMb, numNodes, memAlign)
+	}
+
+	var nodes []govmmQemu.NUMANode
+	var cpuOffset uint32
+	for i, gn := range numaNodes {
+		// Guest vCPU IDs are handed out as consecutive ranges, node by node.
+		startCPU := cpuOffset
+		endCPU := startCPU + vcpusPerNode[i] - 1
+		cpuOffset = endCPU + 1
+		cpuRange := fmt.Sprintf("%d-%d", startCPU, endCPU)
+
+		nodes = append(nodes, govmmQemu.NUMANode{
+			NodeID:         uint32(i),
+			CPUs:           cpuRange,
+			MemSize:        fmt.Sprintf("%dM", memPerNode[i]),
+			HostNodes:      gn.HostNodes,
+			MemBackendType: backendType,
+			MemBackendPath: backendPath,
+		})
+	}
+
+	var dists []govmmQemu.NUMADist
+	hostDists := utils.GetHostNUMADistances(numaNodes)
+	for _, hd := range hostDists {
+		dists = append(dists, govmmQemu.NUMADist{
+			Src: hd.Src,
+			Dst: hd.Dst,
+			Val: hd.Val,
+		})
+	}
+
+	q.validateVFIODeviceNUMAPlacement(numaNodes)
+
+	return nodes, dists, nil
+}
+
+// buildCoveredHostNodes maps each host NUMA node ID to its
+// guest NUMA node index based on the GuestNUMANode HostNodes configuration.
+// Entries whose HostNodes cannot be parsed are ignored; when a host node is
+// listed by several guest nodes, the highest guest index wins.
+func buildCoveredHostNodes(numaNodes []types.GuestNUMANode) map[int]uint32 {
+	covered := make(map[int]uint32)
+	for guestIdx, gn := range numaNodes {
+		nodeSet, err := cpuset.Parse(gn.HostNodes)
+		if err != nil {
+			continue
+		}
+		for _, n := range nodeSet.ToSlice() {
+			covered[n] = uint32(guestIdx)
+		}
+	}
+	return covered
+}
+
+// validateVFIODeviceNUMAPlacement checks that every cold-plugged VFIO device
+// (e.g. GPU) resides on a host NUMA node that is covered by the guest NUMA
+// topology. A mismatch means the device will incur cross-NUMA memory accesses.
+// The check is advisory: results are only logged (Warn on mismatch, Debug on
+// match) and devices that cannot be resolved are skipped.
+func (q *qemu) validateVFIODeviceNUMAPlacement(numaNodes []types.GuestNUMANode) {
+	coveredHostNodes := buildCoveredHostNodes(numaNodes)
+
+	for _, dev := range q.config.VFIODevices {
+		hostPath, err := config.GetHostPath(dev, false, "")
+		if err != nil {
+			q.Logger().WithError(err).WithField("device", dev.HostPath).Warn("Failed to resolve VFIO device host path for NUMA placement validation")
+			continue
+		}
+		dev.HostPath = hostPath
+		var vfioDevs []*config.VFIODev
+		if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) {
+			vfioDevs, err = drivers.GetDeviceFromVFIODev(dev)
+		} else {
+			vfioDevs, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
+		}
+		if err != nil {
+			q.Logger().WithError(err).WithField("device", dev.HostPath).Warn("Failed to enumerate VFIO device(s) for NUMA placement validation")
+			continue
+		}
+		for _, vd := range vfioDevs {
+			if vd.NUMANode < 0 {
+				continue
+			}
+			guestNode, ok := coveredHostNodes[vd.NUMANode]
+			if !ok {
+				q.Logger().WithFields(logrus.Fields{
+					"bdf":           vd.BDF,
+					"host-numa":     vd.NUMANode,
+					"guest-numa":    "none",
+					"covered-nodes": coveredHostNodes,
+				}).Warn("VFIO device on host NUMA node not covered by guest NUMA topology; cross-NUMA memory accesses may occur")
+			} else {
+				q.Logger().WithFields(logrus.Fields{
+					"bdf":        vd.BDF,
+					"host-numa":  vd.NUMANode,
+					"guest-numa": guestNode,
+				}).Debug("VFIO device NUMA placement validated")
+			}
+		}
+	}
+}
+
 func (q *qemu) qmpSocketPath(id string) (string, error) {
 	return utils.BuildSocketPath(q.config.VMStorePath, id, qmpSocket)
 }
@@ -596,7 +1007,13 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
 		return err
 	}
 
-	smp := q.cpuTopology()
+	numaNodes, numaDists, err := q.buildNUMATopology()
+	if err != nil {
+		return err
+	}
+
+	effectiveNUMANodes := uint32(len(numaNodes))
+	smp := q.cpuTopology(effectiveNUMANodes)
 
 	memory, err := q.memoryTopology()
 	if err != nil {
@@ -717,6 +1134,8 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
 		QMPSockets:  qmpSockets,
 		Knobs:       knobs,
 		Incoming:    incoming,
+		NUMANodes:   numaNodes,
+		NUMADists:   numaDists,
 		VGA:         "none",
 		GlobalParam: "kvm-pit.lost_tick_policy=discard",
 		Bios:        firmwarePath,
@@ -881,6 +1300,15 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
 	if numOfPluggablePorts > maxPCIeRootPort {
 		return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
 	}
+
+	// Use pxb-pcie bridges only when guest NUMA nodes were actually emitted
+	// into qemuConfig: a pxb-pcie must reference a defined numa_node, and
+	// buildNUMATopology() can skip the topology (e.g. too few vCPUs).
+	if len(qemuConfig.NUMANodes) > 0 && len(hypervisorConfig.VFIODevices) > 0 {
+		qemuConfig.Devices = q.createNUMAPCIeTopology(qemuConfig.Devices, hypervisorConfig, numOfPluggablePorts)
+		return nil
+	}
+
 	qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts)
 	return nil
 }
@@ -2660,7 +3088,107 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
 	return memory
 }
 
-// genericAppendPCIeRootPort appends to devices the given pcie-root-port
+// numaPlacementActive reports whether the runtime should emit per-NUMA
+// pxb-pcie / memory-binding QEMU args.
+// True when there is more than one guest node, OR a single guest node with
+// an explicit HostNodes binding.
+//
+// The single-node case covers two scenarios that the runtime cannot tell
+// apart after right-sizing:
+//   - a multi-NUMA host whose workload was collapsed to one host node
+//     (e.g. GPU on host node 0) — pxb-pcie + host-nodes binding are
+//     required so the guest GPU reports the correct NUMA affinity;
+//   - a single-NUMA host with `enable_numa=true` — emitting the binding
+//     is a functional no-op (the only host node is node 0 anyway).
+//
+// Single node without a HostNodes value (no NUMA mapping at all) falls
+// through to the flat memdev path.
+func numaPlacementActive(nodes []types.GuestNUMANode) bool {
+	if len(nodes) > 1 {
+		return true
+	}
+	return len(nodes) == 1 && nodes[0].HostNodes != ""
+}
+
+// createNUMAPCIeTopology creates pxb-pcie bridges for NUMA nodes that have
+// VFIO devices, then creates root ports on each pxb bus. VFIO devices will
+// be assigned to these root ports during Attach() based on their host NUMA
+// node, giving the guest kernel correct NUMA affinity for the PCI devices.
+//
+// Exactly one pxb-pcie is emitted per guest NUMA node, in guest-node order,
+// even when that guest node spans several device-bearing host nodes; the
+// root ports created for each host node are recorded in
+// config.NUMARootPorts[hostNode].
+//
+// totalPorts is only used for the fallback: when no PCIe VFIO device reports
+// a NUMA node, the regular root ports are appended instead.
+//
+// NOTE(review): once at least one device has NUMA affinity, devices without
+// affinity (or on a host node no guest node covers) get no root port from
+// this function; confirm that Attach() handles them.
+func (q *qemu) createNUMAPCIeTopology(devices []govmmQemu.Device, hypervisorConfig *HypervisorConfig, totalPorts uint32) []govmmQemu.Device {
+	coveredHostNodes := buildCoveredHostNodes(q.config.GuestNUMANodes)
+
+	// Count PCIe VFIO devices per host NUMA node.
+	numaDevCount := make(map[int]int)
+	for _, dev := range hypervisorConfig.VFIODevices {
+		hostPath, err := config.GetHostPath(dev, false, "")
+		if err != nil {
+			q.Logger().WithError(err).WithField("device", dev.HostPath).Warn("Failed to resolve VFIO device host path for NUMA PCIe topology")
+			continue
+		}
+		dev.HostPath = hostPath
+		var vfioDevs []*config.VFIODev
+		if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) {
+			vfioDevs, err = drivers.GetDeviceFromVFIODev(dev)
+		} else {
+			vfioDevs, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
+		}
+		if err != nil {
+			q.Logger().WithError(err).WithField("device", dev.HostPath).Warn("Failed to enumerate VFIO device(s) for NUMA PCIe topology")
+			continue
+		}
+		for _, vd := range vfioDevs {
+			if vd.NUMANode >= 0 && drivers.IsPCIeDevice(vd.BDF) {
+				numaDevCount[vd.NUMANode]++
+			}
+		}
+	}
+
+	if len(numaDevCount) == 0 {
+		return q.arch.appendPCIeRootPortDevice(devices, totalPorts)
+	}
+
+	for hostNode := range numaDevCount {
+		if _, ok := coveredHostNodes[hostNode]; !ok {
+			q.Logger().WithField("host-numa", hostNode).Warn("VFIO device on uncovered NUMA node; skipping pxb-pcie")
+		}
+	}
+
+	// Each pxb-pcie gets its own bus number window; bus numbers are 8 bit,
+	// so only guest nodes whose window starts at or below 0xff can be used.
+	const busNrSpacing = 0x20
+	const maxBusNr = 0xff
+
+	// Walk guest nodes by index rather than ranging over numaDevCount, so
+	// the emitted device order does not depend on map iteration order.
+	for guestIdx, gn := range q.config.GuestNUMANodes {
+		guestNode := uint32(guestIdx)
+
+		// Device-bearing host nodes owned by this guest node. The ownership
+		// test mirrors buildCoveredHostNodes for host nodes that more than
+		// one guest node lists.
+		var hostNodes []int
+		for _, hostNode := range guestNodeHostIDs(gn) {
+			if coveredHostNodes[hostNode] == guestNode && numaDevCount[hostNode] > 0 {
+				hostNodes = append(hostNodes, hostNode)
+			}
+		}
+		if len(hostNodes) == 0 {
+			continue
+		}
+
+		if (guestNode+1)*busNrSpacing > maxBusNr {
+			q.Logger().WithFields(logrus.Fields{
+				"guest-numa": guestNode,
+				"host-numa":  hostNodes,
+			}).Warn("Guest NUMA node index too high for pxb-pcie bus numbering; skipping pxb-pcie")
+			continue
+		}
+
+		pxbID := fmt.Sprintf("pxb-numa%d", guestNode)
+		busNr := uint8((guestNode + 1) * busNrSpacing)
+
+		devices = append(devices, govmmQemu.PXBPCIeDevice{
+			ID:       pxbID,
+			BusNr:    busNr,
+			NUMANode: int(guestNode),
+		})
+
+		// Create root ports on this pxb bus for the VFIO devices. The slot
+		// counter runs across all host nodes of the guest node so that
+		// root port IDs and slots stay unique on the shared pxb bus.
+		var allRPIDs []string
+		slot := 0
+		for _, hostNode := range hostNodes {
+			var rpIDs []string
+			for i := 0; i < numaDevCount[hostNode]; i++ {
+				rpID := fmt.Sprintf("rp-numa%d-%d", guestNode, slot)
+				rpIDs = append(rpIDs, rpID)
+				devices = append(devices, govmmQemu.PCIeRootPortDevice{
+					ID:      rpID,
+					Bus:     pxbID,
+					Chassis: fmt.Sprintf("%d", 10+guestNode),
+					Slot:    fmt.Sprintf("%d", slot),
+				})
+				slot++
+			}
+			config.NUMARootPorts[hostNode] = rpIDs
+			allRPIDs = append(allRPIDs, rpIDs...)
+		}
+
+		q.Logger().WithFields(logrus.Fields{
+			"pxb-id":     pxbID,
+			"bus-nr":     busNr,
+			"guest-numa": guestNode,
+			"host-numa":  hostNodes,
+			"root-ports": allRPIDs,
+		}).Info("Created pxb-pcie with root ports for NUMA VFIO placement")
+	}
+
+	return devices
+}
+
+// genericAppendPCIeRootPort appends to devices the given pcie-root-port
 func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device {
 	var (
 		bus string
diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go
index aacb97b7cc..f3bba704ca 100644
--- a/src/runtime/virtcontainers/qemu_arch_base.go
+++ b/src/runtime/virtcontainers/qemu_arch_base.go
@@ -61,8 +61,9 @@ type qemuArch interface {
 	// bridges sets the number bridges for the machine type
 	bridges(number uint32)
 
-	// cpuTopology returns the CPU topology for the given amount of vcpus
-	cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP
+	// cpuTopology returns the CPU topology for the given amount of vcpus.
+	// numNUMANodes > 1 restructures the topology so vCPUs are grouped by socket per NUMA node.
+ cpuTopology(vcpus, maxvcpus uint32, numNUMANodes uint32) govmmQemu.SMP // cpuModel returns the CPU model for the machine type cpuModel() string @@ -324,16 +325,29 @@ func (q *qemuArchBase) bridges(number uint32) { } } -func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP { - smp := govmmQemu.SMP{ +func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32, numNUMANodes uint32) govmmQemu.SMP { + if numNUMANodes > 1 { + coresPerSocket := (maxvcpus + numNUMANodes - 1) / numNUMANodes + if coresPerSocket == 0 { + coresPerSocket = 1 + } + smpMaxCPUs := numNUMANodes * coresPerSocket * defaultThreads + return govmmQemu.SMP{ + CPUs: vcpus, + Sockets: numNUMANodes, + Cores: coresPerSocket, + Threads: defaultThreads, + MaxCPUs: smpMaxCPUs, + } + } + + return govmmQemu.SMP{ CPUs: vcpus, Sockets: maxvcpus, Cores: defaultCores, Threads: defaultThreads, MaxCPUs: maxvcpus, } - - return smp } func (q *qemuArchBase) cpuModel() string { diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index dfaebb8dab..c177ee44a8 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -189,7 +189,46 @@ func TestQemuArchBaseCPUTopology(t *testing.T) { MaxCPUs: defaultMaxVCPUs, } - smp := qemuArchBase.cpuTopology(vcpus, defaultMaxVCPUs) + smp := qemuArchBase.cpuTopology(vcpus, defaultMaxVCPUs, 0) + assert.Equal(expectedSMP, smp) +} + +func TestQemuArchBaseCPUTopologyNUMA(t *testing.T) { + assert := assert.New(t) + qemuArchBase := newQemuArchBase() + vcpus := uint32(2) + maxvcpus := uint32(8) + numNUMA := uint32(2) + + expectedSMP := govmmQemu.SMP{ + CPUs: vcpus, + Sockets: numNUMA, + Cores: maxvcpus / numNUMA, + Threads: defaultThreads, + MaxCPUs: maxvcpus, + } + + smp := qemuArchBase.cpuTopology(vcpus, maxvcpus, numNUMA) + assert.Equal(expectedSMP, smp) +} + +func TestQemuArchBaseCPUTopologyNUMAUneven(t *testing.T) { + assert := assert.New(t) + 
qemuArchBase := newQemuArchBase() + vcpus := uint32(2) + maxvcpus := uint32(5) + numNUMA := uint32(2) + + coresPerSocket := (maxvcpus + numNUMA - 1) / numNUMA + expectedSMP := govmmQemu.SMP{ + CPUs: vcpus, + Sockets: numNUMA, + Cores: coresPerSocket, + Threads: defaultThreads, + MaxCPUs: numNUMA * coresPerSocket * defaultThreads, + } + + smp := qemuArchBase.cpuTopology(vcpus, maxvcpus, numNUMA) assert.Equal(expectedSMP, smp) } diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index 5d4267f011..9fcb8dc1fa 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -19,6 +19,7 @@ import ( "os" "path" "path/filepath" + "runtime" "testing" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" @@ -29,6 +30,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" "github.com/pbnjay/memory" "github.com/pkg/errors" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" ) @@ -283,7 +285,7 @@ func TestQemuCPUTopology(t *testing.T) { MaxCPUs: uint32(vcpus), } - smp := q.cpuTopology() + smp := q.cpuTopology(0) assert.Exactly(smp, expectedOut) } @@ -1200,3 +1202,672 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) { // State should remain unchanged assert.Equal(100, q.state.HotpluggedMemory) } + +func TestNumaPlacementActive(t *testing.T) { + assert := assert.New(t) + cases := []struct { + name string + nodes []types.GuestNUMANode + want bool + }{ + {"empty", nil, false}, + {"single-node-no-binding", []types.GuestNUMANode{{}}, false}, + {"single-node-host-0", []types.GuestNUMANode{{HostNodes: "0"}}, true}, + {"single-node-host-1", []types.GuestNUMANode{{HostNodes: "1"}}, true}, + {"single-node-host-range", []types.GuestNUMANode{{HostNodes: "0-1"}}, true}, + {"two-nodes", []types.GuestNUMANode{{HostNodes: "0"}, {HostNodes: "1"}}, true}, + } + for _, c := range cases { + assert.Equal(c.want, 
numaPlacementActive(c.nodes), c.name) + } +} + +func TestBuildNUMATopologySingleNode(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // A single guest node mapped to a specific host node (e.g. produced + // by maybeRightSizeAutoNUMA() collapsing a multi-node sandbox to the + // GPU's host NUMA node) must still emit a one-node topology so that + // the memory backend gets a host-nodes= binding. + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 1) + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("0-3", nodes[0].CPUs) + assert.Equal("1024M", nodes[0].MemSize) + assert.Equal("0", nodes[0].HostNodes) + assert.Equal("memory-backend-ram", nodes[0].MemBackendType) +} + +func TestBuildNUMATopologySingleNodeNoHostBinding(t *testing.T) { + // A single guest node without a HostNodes value carries no NUMA + // binding intent; buildNUMATopology() must return nil so that the + // QEMU command line falls through to the flat memdev path. + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "", HostCPUs: "0-3"}, + }, + }, + } + nodes, dists, err := q.buildNUMATopology() + assert.NoError(err) + assert.Nil(nodes) + assert.Nil(dists) +} + +func TestBuildNUMATopologySingleNodeExplicitNonZeroHost(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // User explicitly mapped the only guest node to a non-zero host node + // (e.g. numa_mapping = ["1"]). 
buildNUMATopology() must propagate + // HostNodes verbatim so the memory backend ends up bound to host + // node 1 rather than the default node 0. + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + NUMAMapping: []string{"1"}, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "1", HostCPUs: "0-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 1) + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("1", nodes[0].HostNodes) +} + +func TestBuildNUMATopologyExplicitRangedHostNodes(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + // User explicitly mapped two guest nodes to disjoint host-node ranges + // (e.g. numa_mapping = ["0-1", "2-3"]). buildNUMATopology() must + // preserve the ranged HostNodes strings on each emitted NUMANode. + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 8, + MemorySize: 2048, + NUMAMapping: []string{"0-1", "2-3"}, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-3"}, + {HostNodes: "2-3", HostCPUs: "4-7"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("0-1", nodes[0].HostNodes) + assert.Equal("2-3", nodes[1].HostNodes) +} + +func TestBuildNUMATopologyTwoNodes(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("0-1", nodes[0].CPUs) + 
assert.Equal("512M", nodes[0].MemSize) + assert.Equal("memory-backend-ram", nodes[0].MemBackendType) + + assert.Equal(uint32(1), nodes[1].NodeID) + assert.Equal("2-3", nodes[1].CPUs) + assert.Equal("512M", nodes[1].MemSize) +} + +func TestBuildNUMATopologyHugePages(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + if _, err := os.Stat("/dev/hugepages"); err != nil { + t.Skip("skipping: /dev/hugepages not available") + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal("/dev/hugepages", nodes[0].MemBackendPath) + assert.Equal("512M", nodes[0].MemSize) +} + +func TestBuildNUMATopologyVirtioFS(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + SharedFS: config.VirtioFS, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal(fallbackFileBackedMemDir, nodes[0].MemBackendPath) +} + +func TestBuildNUMATopologyFileBackedMem(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + tmpDir := t.TempDir() + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + 
MemorySize: 1024, + FileBackedMemRootDir: tmpDir, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal(tmpDir, nodes[0].MemBackendPath) +} + +func TestBuildNUMATopologyTooFewVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0"}, + {HostNodes: "1", HostCPUs: "1"}, + }, + }, + } + nodes, dists, err := q.buildNUMATopology() + assert.NoError(err) + assert.Nil(nodes) + assert.Nil(dists) +} + +func TestBuildNUMATopologyUnevenVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 5, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-4"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + // cpuTopology() rounds MaxCPUs to ceil(5/2)*2=6, so 6 CPU slots + // are distributed proportionally: 2 host CPUs → 2 vCPUs, + // 3 host CPUs → 4 vCPUs (3 proportional + 1 remainder). 
+ assert.Equal("0-1", nodes[0].CPUs) + assert.Equal("2-5", nodes[1].CPUs) +} + +func TestBuildNUMATopologyMemMisaligned(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + _, _, err := q.buildNUMATopology() + assert.Error(err) + assert.Contains(err.Error(), "cannot be evenly distributed") +} + +func TestBuildNUMATopologyMemMisalignedRemainder(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 6, + MemorySize: 1025, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-2"}, + {HostNodes: "1", HostCPUs: "3-5"}, + }, + }, + } + _, _, err := q.buildNUMATopology() + assert.Error(err) + assert.Contains(err.Error(), "cannot be evenly distributed") +} + +func TestBuildNUMATopologyEvenMemory(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 6, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-2"}, + {HostNodes: "1", HostCPUs: "3-5"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + + assert.Equal("0-2", nodes[0].CPUs) + assert.Equal("512M", nodes[0].MemSize) + + assert.Equal("3-5", nodes[1].CPUs) + assert.Equal("512M", nodes[1].MemSize) +} + +func TestBuildNUMATopologyProportionalVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != 
"arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 10, + MemorySize: 1000, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-7"}, + {HostNodes: "1", HostCPUs: "8-9"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + // 8 out of 10 host CPUs on node 0 → 8 vCPUs + assert.Equal("0-7", nodes[0].CPUs) + assert.Equal("800M", nodes[0].MemSize) + // 2 out of 10 host CPUs on node 1 → 2 vCPUs + assert.Equal("8-9", nodes[1].CPUs) + assert.Equal("200M", nodes[1].MemSize) +} + +func TestBuildCoveredHostNodes(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + }) + assert.Len(covered, 2) + assert.Equal(uint32(0), covered[0]) + assert.Equal(uint32(1), covered[1]) +} + +func TestBuildCoveredHostNodesRange(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-7"}, + }) + assert.Len(covered, 2) + assert.Equal(uint32(0), covered[0]) + assert.Equal(uint32(0), covered[1]) +} + +func TestBuildCoveredHostNodesEmpty(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes(nil) + assert.Len(covered, 0) +} + +func TestBuildCoveredHostNodesInvalidParse(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "invalid", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + }) + assert.Len(covered, 1) + assert.Equal(uint32(1), covered[1]) +} + +// silentLogger returns a logrus.Entry that discards all output, suitable +// for use in unit tests that exercise NUMA right-sizing decisions. 
+func silentLogger() *logrus.Entry { + l := logrus.New() + l.Out = io.Discard + return logrus.NewEntry(l) +} + +// fakeCapFn returns a hostNUMACapFn backed by a static map. Unknown nodes +// produce an error so we exercise the "skip unknown" branch in +// sumNUMACapacity when intended. +func fakeCapFn(caps map[int]struct { + cpus int + memMB uint64 +}) hostNUMACapFn { + return func(nodeID int) (int, uint64, error) { + if c, ok := caps[nodeID]; ok { + return c.cpus, c.memMB, nil + } + return 0, 0, fmt.Errorf("unknown host NUMA node %d", nodeID) + } +} + +// twoNodeHostCaps describes a typical 2-socket host: 32 CPUs and 128 GiB +// per node. +func twoNodeHostCaps() map[int]struct { + cpus int + memMB uint64 +} { + return map[int]struct { + cpus int + memMB uint64 + }{ + 0: {cpus: 32, memMB: 128 * 1024}, + 1: {cpus: 32, memMB: 128 * 1024}, + } +} + +func twoNodeAutoTopology() []types.GuestNUMANode { + return []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-31"}, + {HostNodes: "1", HostCPUs: "32-63"}, + } +} + +func TestSumNUMACapacity(t *testing.T) { + assert := assert.New(t) + + cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(twoNodeHostCaps())) + assert.Equal(64, cpus) + assert.Equal(uint64(256*1024), memMB) +} + +func TestSumNUMACapacityDeduplicatesHostNodes(t *testing.T) { + assert := assert.New(t) + + // Two guest entries that both reference host node 0 must only count + // once. The merged "0-1" entry adds host node 1. 
+ nodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-31"}, + {HostNodes: "0-1", HostCPUs: "0-63"}, + } + cpus, memMB := sumNUMACapacity(nodes, fakeCapFn(twoNodeHostCaps())) + assert.Equal(64, cpus) + assert.Equal(uint64(256*1024), memMB) +} + +func TestSumNUMACapacitySkipsUnknown(t *testing.T) { + assert := assert.New(t) + + caps := map[int]struct { + cpus int + memMB uint64 + }{ + 0: {cpus: 16, memMB: 32 * 1024}, + // host node 1 missing on purpose + } + cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(caps)) + assert.Equal(16, cpus) + assert.Equal(uint64(32*1024), memMB) +} + +func TestSelectNUMANodesPassthroughForSingleNode(t *testing.T) { + assert := assert.New(t) + + in := []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}} + out := selectNUMANodes(in, 4, 1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesNoVFIOFitsOneNode(t *testing.T) { + // Small sandbox (8 vCPUs / 16 GiB) fits comfortably in one host node: + // expect collapse to the first guest node. + assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 8, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Len(out, 1) + assert.Equal("0", out[0].HostNodes) +} + +func TestSelectNUMANodesNoVFIOExceedsOneNode(t *testing.T) { + // 64 vCPUs needs both 32-CPU nodes: expect full topology. + assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 64, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesNoVFIOMemoryExceedsOneNode(t *testing.T) { + // CPU fits in one node but memory does not: expect full topology. 
+ assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 8, 200*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIOSubsetFits(t *testing.T) { + // VFIO device on host node 1; sandbox fits in one node: expect + // collapse to the guest node covering host node 1. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{1: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Len(out, 1) + assert.Equal("1", out[0].HostNodes) +} + +func TestSelectNUMANodesVFIOSubsetTooSmall(t *testing.T) { + // VFIO device on host node 1, but sandbox needs more than one node's + // worth of memory: expect the full topology so the sandbox actually + // fits, even at the cost of cross-NUMA traffic. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{1: {}} + out := selectNUMANodes(in, 8, 200*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIOSpansAllNodes(t *testing.T) { + // One VFIO device per host node: VFIO subset == full topology, no + // collapse possible. Result is the input unchanged. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{0: {}, 1: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIONoCoverage(t *testing.T) { + // VFIO host node not represented in the guest topology (rare, but can + // happen if numa_mapping has been customized). Keep the full topology + // rather than dropping all nodes. 
+ assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{2: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +// rightSizeNUMAWithFakeCaps mirrors maybeRightSizeAutoNUMA but lets tests +// inject a synthetic capacity oracle in place of realHostNUMACapFn so the +// decision is hermetic. +func rightSizeNUMAWithFakeCaps(hc *HypervisorConfig, capFn hostNUMACapFn) { + if hc == nil || len(hc.NUMAMapping) > 0 || len(hc.GuestNUMANodes) <= 1 { + return + } + hc.GuestNUMANodes = selectNUMANodes( + hc.GuestNUMANodes, + hc.DefaultMaxVCPUs, + uint64(hc.MemorySize), + nil, // no VFIO devices in this test + capFn, + silentLogger(), + ) +} + +func TestMaybeRightSizeAutoNUMACollapsesToOneNode(t *testing.T) { + // Empty NUMAMapping (auto) + sandbox fits in one host node: + // GuestNUMANodes is trimmed to a single entry. + assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 1) + assert.Equal("0", hc.GuestNUMANodes[0].HostNodes) +} + +func TestMaybeRightSizeAutoNUMAExplicitMappingHonored(t *testing.T) { + // Non-empty NUMAMapping (user-provided) is left untouched, even if + // the sandbox would fit in a single node. + assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1, + NUMAMapping: []string{"0", "1"}, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 2) +} + +func TestMaybeRightSizeAutoNUMAKeepsFullWhenSandboxSpansNodes(t *testing.T) { + // Sandbox needs more CPUs than a single host node has: full topology + // is preserved. 
+ assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 64, // > one node's 32 CPUs + MemorySize: 1024, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 2) +} + +func TestMaybeRightSizeAutoNUMANoOpForFlatTopology(t *testing.T) { + // A topology with ≤ 1 node is a no-op regardless of NUMAMapping or + // budget. + assert := assert.New(t) + + for _, tc := range []struct { + name string + hc *HypervisorConfig + }{ + { + name: "nil config", + hc: nil, + }, + { + name: "single node", + hc: &HypervisorConfig{ + GuestNUMANodes: []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}}, + }, + }, + { + name: "empty", + hc: &HypervisorConfig{}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + before := 0 + if tc.hc != nil { + before = len(tc.hc.GuestNUMANodes) + } + rightSizeNUMAWithFakeCaps(tc.hc, fakeCapFn(twoNodeHostCaps())) + after := 0 + if tc.hc != nil { + after = len(tc.hc.GuestNUMANodes) + } + assert.Equal(before, after) + }) + } +} diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 6ceae42de9..1d7004e441 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -2961,9 +2961,26 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error { // checkVCPUsPinning is used to support CPUSet mode of kata container. // CPUSet mode is on when Sandbox.HypervisorConfig.EnableVCPUsPinning -// is set to true. Then it fetches sandbox's number of vCPU threads -// and number of CPUs in CPUSet. If the two are equal, each vCPU thread -// is then pinned to one fixed CPU in CPUSet. +// is set to true. +// +// When NUMA topology is configured (GuestNUMANodes is non-empty), vCPU +// threads are pinned to host CPUs belonging to the same host NUMA node +// as the vCPU's assigned guest NUMA node, preserving memory locality. 
+// vCPUs are distributed proportionally across nodes and each vCPU is +// pinned round-robin to the host CPUs within its NUMA node; the 1:1 +// count equality check does not apply. +// +// This is true for both multi-node sandboxes and right-sized +// single-node sandboxes: when buildNUMATopology()/maybeRightSizeAutoNUMA +// collapses the topology to one node, that single node still carries a +// meaningful HostCPUs subset (the CPUs of the chosen host NUMA node), +// and pinning to that subset is what makes right-sizing actually deliver +// host-thread locality, not just guest-topology locality. +// +// In the non-NUMA path (GuestNUMANodes is empty, e.g. enable_numa=false), +// it fetches the sandbox's number of vCPU threads and number of CPUs in +// CPUSet. If the two are equal, each vCPU thread is pinned 1:1 to the +// CPUs in CPUSet; otherwise pinning is skipped. func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { if s.config == nil { return fmt.Errorf("no sandbox config found") @@ -2972,11 +2989,39 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { return nil } - // fetch vCPU thread ids and CPUSet + expectedVCPUs := int(s.config.HypervisorConfig.NumVCPUs()) + vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx) if err != nil { return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err) } + + // QEMU may not have spawned all vCPU threads yet. Retry with + // exponential backoff until we see the expected count. 
+ if len(vCPUThreadsMap.vcpus) < expectedVCPUs { + const maxAttempts = 10 + backoff := 50 * time.Millisecond + for attempt := 2; attempt <= maxAttempts && len(vCPUThreadsMap.vcpus) < expectedVCPUs; attempt++ { + s.Logger().WithFields(logrus.Fields{ + "have": len(vCPUThreadsMap.vcpus), + "want": expectedVCPUs, + "attempt": attempt, + }).Debug("waiting for all vCPU threads to be available") + time.Sleep(backoff) + backoff *= 2 + vCPUThreadsMap, err = s.hypervisor.GetThreadIDs(ctx) + if err != nil { + return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err) + } + } + if len(vCPUThreadsMap.vcpus) < expectedVCPUs { + s.Logger().WithFields(logrus.Fields{ + "have": len(vCPUThreadsMap.vcpus), + "want": expectedVCPUs, + }).Warn("not all vCPU threads available after retries; pinning available ones") + } + } + cpuSetStr, _, err := s.getSandboxCPUSet() if err != nil { return fmt.Errorf("failed to get CPUSet config: %v", err) @@ -2987,9 +3032,42 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { } cpuSetSlice := cpuSet.ToSlice() - // check if vCPU thread numbers and CPU numbers are equal + numaNodes := s.config.HypervisorConfig.GuestNUMANodes + + if len(cpuSetSlice) == 0 { + if len(numaNodes) >= 1 { + // No cpuset constraint (e.g. ctr without k8s, or a Burstable + // pod with cpuManagerPolicy=none). Build an effective cpuset + // from the NUMA nodes' HostCPUs so pinning works using the + // (possibly right-sized) host NUMA topology. Even a single + // NUMA node here meaningfully constrains pinning to that + // node's host CPUs. 
+ for _, gn := range numaNodes { + hostCPUs, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + continue + } + cpuSet = cpuSet.Union(hostCPUs) + } + cpuSetSlice = cpuSet.ToSlice() + if len(cpuSetSlice) == 0 { + s.Logger().Warn("sandbox CPUSet is empty and cannot derive from NUMA HostCPUs; skipping vCPU pinning") + s.isVCPUsPinningOn = false + return nil + } + s.Logger().WithField("effective-cpuset", cpuSet.String()).Debug("derived cpuset from NUMA HostCPUs for pinning") + } else { + s.Logger().Warn("sandbox CPUSet is empty; skipping vCPU pinning") + s.isVCPUsPinningOn = false + return nil + } + } + + if len(numaNodes) >= 1 { + return s.checkVCPUsPinningNUMA(ctx, vCPUThreadsMap, numaNodes, cpuSetSlice) + } + numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice) - // if not equal, we should reset threads scheduling to random pattern if numVCPUs != numCPUs { if s.isVCPUsPinningOn { s.isVCPUsPinningOn = false @@ -2997,7 +3075,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { } return nil } - // if equal, we can use vCPU thread pinning for i, tid := range vCPUThreadsMap.vcpus { if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil { if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { @@ -3010,6 +3087,68 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { return nil } +// checkVCPUsPinningNUMA pins vCPU threads to host CPUs that belong to the +// same NUMA node as the vCPU's guest NUMA node assignment. vCPUs are +// distributed proportionally to the host CPU count per NUMA node +// (matching buildNUMATopology). It handles any non-empty numaNodes +// slice — including the right-sized single-node case, where every vCPU +// is pinned within the single chosen host NUMA node's CPU set. 
+func (s *Sandbox) checkVCPUsPinningNUMA(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, numaNodes []types.GuestNUMANode, cpuSetSlice []int) error { + numVCPUs := uint32(len(vCPUThreadsMap.vcpus)) + numNodes := uint32(len(numaNodes)) + if numVCPUs < numNodes { + return fmt.Errorf("number of vCPUs (%d) must be >= NUMA node count (%d) for NUMA pinning", numVCPUs, numNodes) + } + + vcpusPerNode, err := utils.DistributeVCPUsProportionally(numaNodes, numVCPUs) + if err != nil { + return fmt.Errorf("failed to compute NUMA vCPU distribution for pinning: %v", err) + } + + cpuSetAll := cpuset.NewCPUSet(cpuSetSlice...) + + var cpuOffset uint32 + for i, gn := range numaNodes { + hostCPUs, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + return fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %v", i, err) + } + allowedCPUs := hostCPUs.Intersection(cpuSetAll).ToSlice() + if len(allowedCPUs) == 0 { + s.Logger().WithFields(logrus.Fields{ + "numa-node": i, + "host-cpus": gn.HostCPUs, + "sandbox-cpus": cpuSetSlice, + }).Warn("NUMA node HostCPUs do not intersect sandbox CPUSet; pinning vCPUs to full cpuset for this node") + allowedCPUs = cpuSetSlice + } + + startVCPU := cpuOffset + endVCPU := startVCPU + vcpusPerNode[i] + cpuOffset = endVCPU + + for vcpuIdx := startVCPU; vcpuIdx < endVCPU; vcpuIdx++ { + tid, ok := vCPUThreadsMap.vcpus[int(vcpuIdx)] + if !ok { + if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { + return err + } + return fmt.Errorf("missing vcpu thread id for vcpu index %d", vcpuIdx) + } + pinIdx := int(vcpuIdx-startVCPU) % len(allowedCPUs) + if err := resCtrl.SetThreadAffinity(tid, allowedCPUs[pinIdx:pinIdx+1]); err != nil { + if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { + return err + } + return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d (NUMA node %d): %v", tid, allowedCPUs[pinIdx], i, err) + } + } + } + + s.isVCPUsPinningOn = true + return nil +} + // resetVCPUsPinning 
cancels current pinning and restores default random vCPU threads scheduling func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error { for _, tid := range vCPUThreadsMap.vcpus { diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index 7e521f3842..50115c7a5b 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -1679,3 +1679,29 @@ func TestSandboxHugepageLimit(t *testing.T) { err = s.updateResources(context.Background()) assert.NoError(t, err) } + +func TestCheckVCPUsPinningNUMATooFewVCPUs(t *testing.T) { + assert := assert.New(t) + s := &Sandbox{} + vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100}} + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7}) + assert.Error(err) + assert.Contains(err.Error(), "must be >= NUMA node count") +} + +func TestCheckVCPUsPinningNUMABadHostCPUs(t *testing.T) { + assert := assert.New(t) + s := &Sandbox{} + vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100, 1: 101, 2: 102, 3: 103}} + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "not-valid"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7}) + assert.Error(err) + assert.Contains(err.Error(), "failed to parse HostCPUs") +} diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 39bcfde8f4..5e1ff51ae3 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -13,6 +13,7 @@ import ( "os/exec" "path/filepath" "regexp" + "strconv" "strings" "syscall" "time" @@ -623,3 +624,205 @@ func GetGuestNUMANodes(numaMapping []string) 
([]types.GuestNUMANode, error) { return numaNodes, nil } + +// FilterNUMANodesByCPUSet returns only those guest NUMA nodes whose HostCPUs +// intersect with the given sandbox cpuset. If sandboxCPUs is empty (size 0) +// or no node intersects it, the original slice is returned unchanged. +func FilterNUMANodesByCPUSet(nodes []types.GuestNUMANode, sandboxCPUs cpuset.CPUSet) []types.GuestNUMANode { + if sandboxCPUs.Size() == 0 { + return nodes + } + var filtered []types.GuestNUMANode + for _, n := range nodes { + hostCPUs, err := cpuset.Parse(n.HostCPUs) + if err != nil { + continue + } + if hostCPUs.Intersection(sandboxCPUs).Size() > 0 { + filtered = append(filtered, n) + } + } + if len(filtered) == 0 { + return nodes + } + return filtered +} + +// NUMADistEntry represents a single NUMA distance measurement between two nodes. +type NUMADistEntry struct { + Src uint32 + Dst uint32 + Val uint32 +} + +// GetHostNUMADistances reads the host NUMA distance matrix for the nodes +// referenced by the given GuestNUMANode list and returns off-diagonal +// pairwise entries (skipping self-distance src==dst). +// The distance row from sysfs is indexed by host NUMA node ID, so we parse +// each guest node's HostNodes to find the representative host node ID and +// use that to index into the distance row.
+func GetHostNUMADistances(nodes []types.GuestNUMANode) []NUMADistEntry { + hostNodeIDs := make([]int, len(nodes)) + for i, n := range nodes { + nodeSet, err := cpuset.Parse(n.HostNodes) + if err != nil { + hostNodeIDs[i] = -1 + continue + } + ids := nodeSet.ToSlice() + if len(ids) == 0 { + hostNodeIDs[i] = -1 + continue + } + hostNodeIDs[i] = ids[0] + } + + var dists []NUMADistEntry + for srcIdx, srcNode := range nodes { + if hostNodeIDs[srcIdx] < 0 { + continue + } + distStr := getHostNUMADistance(srcNode.HostNodes) + if distStr == "" { + continue + } + fields := strings.Fields(distStr) + for dstIdx := range nodes { + if srcIdx == dstIdx { + continue + } + hostID := hostNodeIDs[dstIdx] + if hostID < 0 || hostID >= len(fields) { + continue + } + val, err := strconv.ParseUint(fields[hostID], 10, 32) + if err != nil { + continue + } + dists = append(dists, NUMADistEntry{ + Src: uint32(srcIdx), + Dst: uint32(dstIdx), + Val: uint32(val), + }) + } + } + return dists +} + +// HostNUMANodeCapacity describes the CPU and memory capacity of a single +// host NUMA node, as seen via sysfs. +type HostNUMANodeCapacity struct { + NodeID int + CPUs int + MemMB uint64 +} + +// GetHostNUMANodeCapacity returns the CPU count and memory size (in MiB) +// of the given host NUMA node. +func GetHostNUMANodeCapacity(nodeID int) (HostNUMANodeCapacity, error) { + cap := HostNUMANodeCapacity{NodeID: nodeID} + cpuList, err := getHostNUMANodeCPUs(nodeID) + if err != nil { + return cap, err + } + cs, err := cpuset.Parse(cpuList) + if err != nil { + return cap, fmt.Errorf("parse host node %d cpulist %q: %w", nodeID, cpuList, err) + } + cap.CPUs = cs.Size() + memMB, err := getHostNUMANodeMemoryMB(nodeID) + if err != nil { + return cap, err + } + cap.MemMB = memMB + return cap, nil +} + +// GetHostNUMANodeCapacities returns the capacities of the given host NUMA +// node IDs in the same order. 
Reading stops at the first node that +// cannot be read: the capacities gathered so far are returned together with +// the wrapped error (so the slice may be shorter than the input). +func GetHostNUMANodeCapacities(nodeIDs []int) ([]HostNUMANodeCapacity, error) { + out := make([]HostNUMANodeCapacity, 0, len(nodeIDs)) + for _, id := range nodeIDs { + c, err := GetHostNUMANodeCapacity(id) + if err != nil { + return out, fmt.Errorf("read host NUMA node %d capacity: %w", id, err) + } + out = append(out, c) + } + return out, nil +} + +// DistributeVCPUsProportionally distributes totalVCPUs across NUMA nodes +// proportionally to the number of host CPUs available on each node. +// Each node is guaranteed at least 1 vCPU. Remainder vCPUs go to nodes +// with the most host CPUs. +func DistributeVCPUsProportionally(numaNodes []types.GuestNUMANode, totalVCPUs uint32) ([]uint32, error) { + numNodes := len(numaNodes) + if numNodes == 0 { + return nil, fmt.Errorf("no NUMA nodes") + } + if totalVCPUs < uint32(numNodes) { + return nil, fmt.Errorf("totalVCPUs (%d) must be >= NUMA node count (%d)", totalVCPUs, numNodes) + } + + hostCPUCounts := make([]int, numNodes) + totalHostCPUs := 0 + for i, gn := range numaNodes { + parsed, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + return nil, fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %w", i, err) + } + if parsed.Size() == 0 { + return nil, fmt.Errorf("HostCPUs for NUMA node %d must not be empty", i) + } + hostCPUCounts[i] = parsed.Size() + totalHostCPUs += hostCPUCounts[i] + } + if totalHostCPUs == 0 { + return nil, fmt.Errorf("total host CPU count is 0") + } + + vcpusPerNode := make([]uint32, numNodes) + var assigned uint32 + for i := range numaNodes { + vcpusPerNode[i] = uint32(int(totalVCPUs) * hostCPUCounts[i] / totalHostCPUs) + if vcpusPerNode[i] == 0 { + vcpusPerNode[i] = 1 + } + assigned += vcpusPerNode[i] + } + + // Use a copy for remainder distribution to avoid mutating the original counts.
+ weights := make([]int, numNodes) + copy(weights, hostCPUCounts) + + for assigned < totalVCPUs { + bestIdx := 0 + for i := 1; i < numNodes; i++ { + if weights[i] > weights[bestIdx] { + bestIdx = i + } + } + vcpusPerNode[bestIdx]++ + assigned++ + weights[bestIdx]-- + } + + for assigned > totalVCPUs { + bestIdx := 0 + for i := 1; i < numNodes; i++ { + if vcpusPerNode[i] > vcpusPerNode[bestIdx] { + bestIdx = i + } + } + if vcpusPerNode[bestIdx] <= 1 { + break + } + vcpusPerNode[bestIdx]-- + assigned-- + } + + return vcpusPerNode, nil +} diff --git a/src/runtime/virtcontainers/utils/utils_darwin.go b/src/runtime/virtcontainers/utils/utils_darwin.go index 4a64c921b1..a29d0378a2 100644 --- a/src/runtime/virtcontainers/utils/utils_darwin.go +++ b/src/runtime/virtcontainers/utils/utils_darwin.go @@ -22,3 +22,11 @@ func getHostNUMANodes() ([]int, error) { func getHostNUMANodeCPUs(nodeId int) (string, error) { return "", nil } + +func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) { + return 0, nil +} + +func getHostNUMADistance(hostNodes string) string { + return "" +} diff --git a/src/runtime/virtcontainers/utils/utils_linux.go b/src/runtime/virtcontainers/utils/utils_linux.go index 0ddb4dd5a9..11ae66b202 100644 --- a/src/runtime/virtcontainers/utils/utils_linux.go +++ b/src/runtime/virtcontainers/utils/utils_linux.go @@ -12,6 +12,8 @@ import ( "io" "math/big" "os" + "regexp" + "strconv" "strings" "syscall" "time" @@ -23,6 +25,8 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" ) +var nodeMemTotalRegexp = regexp.MustCompile(`Node\s+\d+\s+MemTotal:\s+(\d+)\s+kB`) + var ioctlFunc = Ioctl // maxUInt represents the maximum valid value for the context ID. 
@@ -220,3 +224,41 @@ func getHostNUMANodeCPUs(nodeId int) (string, error) { } return strings.TrimSuffix(string(data), "\n"), nil } + +// getHostNUMANodeMemoryMB returns the total memory in MiB for the given +// host NUMA node, parsed from /sys/devices/system/node/nodeN/meminfo. +func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) { + fileName := fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeId) + data, err := os.ReadFile(fileName) + if err != nil { + return 0, err + } + m := nodeMemTotalRegexp.FindSubmatch(data) + if m == nil { + return 0, fmt.Errorf("MemTotal not found in %s", fileName) + } + kb, err := strconv.ParseUint(string(m[1]), 10, 64) + if err != nil { + return 0, err + } + return kb / 1024, nil +} + +// getHostNUMADistance reads the distance row for the first host NUMA node +// in the given hostNodes specifier (e.g. "0" or "0-1"). +func getHostNUMADistance(hostNodes string) string { + nodeSet, err := cpuset.Parse(hostNodes) + if err != nil { + return "" + } + ids := nodeSet.ToSlice() + if len(ids) == 0 { + return "" + } + fileName := fmt.Sprintf("/sys/devices/system/node/node%d/distance", ids[0]) + data, err := os.ReadFile(fileName) + if err != nil { + return "" + } + return strings.TrimSuffix(string(data), "\n") +} diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go index 8361caa1ee..90663e64b5 100644 --- a/src/runtime/virtcontainers/utils/utils_test.go +++ b/src/runtime/virtcontainers/utils/utils_test.go @@ -19,6 +19,9 @@ import ( "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" + + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" ) const waitLocalProcessTimeoutSecs = 3 @@ -754,3 +757,102 @@ func TestDockerNetnsPath(t *testing.T) { } assert.Equal("", DockerNetnsPath(spec)) } + +func 
TestDistributeVCPUsProportionallySymmetric(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-3"}, + {HostCPUs: "4-7"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 8) + assert.NoError(err) + assert.Equal([]uint32{4, 4}, dist) +} + +func TestDistributeVCPUsProportionallyAsymmetric(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-7"}, + {HostCPUs: "8-9"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 10) + assert.NoError(err) + assert.Equal([]uint32{8, 2}, dist) +} + +func TestDistributeVCPUsProportionallyMinOnePerNode(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-99"}, + {HostCPUs: "100"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 2) + assert.NoError(err) + assert.Equal(uint32(1), dist[0]) + assert.Equal(uint32(1), dist[1]) +} + +func TestDistributeVCPUsProportionallyThreeNodes(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-5"}, + {HostCPUs: "6-8"}, + {HostCPUs: "9"}, + } + // 6+3+1=10 host CPUs, 10 vCPUs: proportional = 6, 3, 1 + dist, err := DistributeVCPUsProportionally(nodes, 10) + assert.NoError(err) + assert.Equal([]uint32{6, 3, 1}, dist) +} + +func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0"}, + {HostCPUs: "1"}, + {HostCPUs: "2"}, + } + _, err := DistributeVCPUsProportionally(nodes, 2) + assert.Error(err) + assert.Contains(err.Error(), "must be >= NUMA node count") +} + +func TestFilterNUMANodesByCPUSet(t *testing.T) { + assert := assert.New(t) + + nodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-55,112-167"}, + {HostNodes: "1", HostCPUs: "56-111,168-223"}, + } + + // Sandbox cpuset only from node 0 -> should return 1 node + sandboxCPUs, _ := cpuset.Parse("1-40,113-152") + filtered := FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + 
assert.Len(filtered, 1) + assert.Equal("0", filtered[0].HostNodes) + + // Sandbox cpuset from both nodes -> should return 2 nodes + sandboxCPUs, _ = cpuset.Parse("1-40,56-80") + filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + assert.Len(filtered, 2) + + // Sandbox cpuset only from node 1 -> should return 1 node + sandboxCPUs, _ = cpuset.Parse("60-70,170-180") + filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + assert.Len(filtered, 1) + assert.Equal("1", filtered[0].HostNodes) + + // Empty cpuset -> no filtering, return all + emptyCPUs := cpuset.NewCPUSet() + filtered = FilterNUMANodesByCPUSet(nodes, emptyCPUs) + assert.Len(filtered, 2) + + // Single-node host (1 NUMA node) -> returns 1 regardless + singleNode := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-7"}, + } + sandboxCPUs, _ = cpuset.Parse("0-3") + filtered = FilterNUMANodesByCPUSet(singleNode, sandboxCPUs) + assert.Len(filtered, 1) + assert.Equal("0", filtered[0].HostNodes) +} diff --git a/tests/integration/kubernetes/k8s-nvidia-numa.bats b/tests/integration/kubernetes/k8s-nvidia-numa.bats new file mode 100644 index 0000000000..dd695e6811 --- /dev/null +++ b/tests/integration/kubernetes/k8s-nvidia-numa.bats @@ -0,0 +1,745 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# NUMA topology and vCPU pinning verification tests for Kata Containers. +# +# Five tests cover the main paths in the runtime's NUMA logic: +# 1. Multi-node sandbox: a workload that does NOT fit in a single host +# NUMA node should be balanced across host nodes — the guest sees +# multiple NUMA nodes with even vCPU/memory distribution and host +# vCPU pinning is balanced as well. +# 2. Right-sized single-node sandbox: a workload that DOES fit in a +# single host NUMA node should be collapsed to one node — the guest +# sees exactly one NUMA node with all vCPUs in it AND all host +# QEMU vCPU threads are pinned to that one host NUMA node. +# 3. 
GPU passthrough (VFIO), multi-node: when a GPU is attached via +# VFIO and the workload spans every host NUMA node, the runtime +# creates pxb-pcie bridges and the guest GPU reports the same NUMA +# node as the host GPU. +# 4. GPU passthrough (VFIO), right-sized single-node: when a small +# workload + GPU fits on a single host NUMA node, the runtime +# collapses the topology to the GPU's host NUMA node (memory and +# vCPUs land on the same node as the GPU, not just any fitting node). +# 5. Explicit numa_mapping in the runtime TOML: when the user pins the +# guest topology to a specific host node via numa_mapping = ["1"], +# maybeRightSizeAutoNUMA() must be a no-op and buildNUMATopology() +# must propagate the binding (memory + vCPU pinning land on the +# chosen host node, regardless of how small the workload is). +# +# Guest-side checks use the quay.io/kata-containers/numa container image +# which reads sysfs and prints results to stdout. The bats test reads +# the output via "kubectl logs" — no kubectl exec, no CoCo policy +# overrides needed. +# +# WARNING: The host-side pinning check runs numa-pinning-check.sh directly +# on the host (not inside a container). This requires the bats runner to +# execute on the k8s node with privileged access to /proc, /sys, crictl, +# and taskset. If the test environment changes so that bats no longer +# runs on the node, these calls must be reworked to use exec_host or +# equivalent. + +load "${BATS_TEST_DIRNAME}/lib.sh" +load "${BATS_TEST_DIRNAME}/confidential_common.sh" + +export KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu-nvidia-gpu-snp}" + +# Hypervisors where NUMA is configured and supported by default. +# Only qemu-nvidia-gpu variants ship enable_numa=true in their base config. +# runtime-rs does not yet implement NUMA; non-QEMU hypervisors lack support. 
NUMA_CONFIGURED_SUPPORTED_BY_DEFAULT=(
	"qemu-nvidia-gpu"
	"qemu-nvidia-gpu-snp"
	"qemu-nvidia-gpu-tdx"
)

# Multi-node test: large enough to span every host NUMA node.
NUMA_TEST_VCPUS_LARGE="${NUMA_TEST_VCPUS_LARGE:-64}"
NUMA_TEST_MEMORY_LARGE="${NUMA_TEST_MEMORY_LARGE:-64Gi}"

# Right-sizing test: small enough to fit in a single host NUMA node on
# any reasonable production-class server.
NUMA_TEST_VCPUS_SMALL="${NUMA_TEST_VCPUS_SMALL:-4}"
NUMA_TEST_MEMORY_SMALL="${NUMA_TEST_MEMORY_SMALL:-4Gi}"

# GPU test: same sizing as the large test, plus a GPU.
NUMA_TEST_VCPUS_GPU="${NUMA_TEST_VCPUS_GPU:-64}"
NUMA_TEST_MEMORY_GPU="${NUMA_TEST_MEMORY_GPU:-64Gi}"

# Small GPU test: fits in a single host NUMA node, exercises the
# right-sizing path with VFIO (sandbox should land on the GPU's host
# NUMA node, not just any node that fits).
NUMA_TEST_VCPUS_GPU_SMALL="${NUMA_TEST_VCPUS_GPU_SMALL:-4}"
NUMA_TEST_MEMORY_GPU_SMALL="${NUMA_TEST_MEMORY_GPU_SMALL:-4Gi}"

# POD_NAME_NUMA is exported because the pod template is rendered with
# envsubst, which only sees exported variables.
export POD_NAME_NUMA="numa-topology-test"
POD_NAME_NUMA_GPU="numa-topology-gpu-test"

POD_WAIT_TIMEOUT="${POD_WAIT_TIMEOUT:-600s}"
export POD_WAIT_TIMEOUT

# Host-side pinning poll used by wait_for_host_pinning: number of attempts
# and the pause (seconds) between them. Overridable from the environment
# like the sizing knobs above, e.g. for slower CI hosts.
HOST_PINNING_RETRIES="${HOST_PINNING_RETRIES:-20}"
HOST_PINNING_SLEEP="${HOST_PINNING_SLEEP:-0.5}"

# setup runs before every @test: common suite setup, then the paths of the
# pod template / rendered pod yaml and a fresh policy-settings directory
# that additionally allows ReadStreamRequest.
# NOTE(review): pod_config_dir is not set in this file — presumably
# provided by setup_common; confirm.
setup() {
	setup_common || die "setup_common failed"

	pod_yaml_in="${pod_config_dir}/${POD_NAME_NUMA}.yaml.in"
	pod_yaml="${pod_config_dir}/${POD_NAME_NUMA}.yaml"

	policy_settings_dir="$(create_tmp_policy_settings_dir "${pod_config_dir}")"
	add_requests_to_policy_settings "${policy_settings_dir}" "ReadStreamRequest"
}

# -----------------------------------------------------------------------------
# Skip / topology helpers
# -----------------------------------------------------------------------------

# numa_skip_reason returns a non-empty skip reason on stdout when the
# current test should be skipped (hypervisor lacks default NUMA support
# OR host has fewer than 2 NUMA nodes). Empty stdout means run.
# Callers must invoke `skip` themselves — bats `skip` inside command
# substitution does not propagate.
#
# A host NUMA node count that cannot be determined (numactl missing or
# printing nothing parsable) also yields a skip reason, stated explicitly
# rather than as a "Host has only  NUMA node(s)" message with a blank count.
numa_skip_reason() {
	# shellcheck disable=SC2076
	if [[ ! " ${NUMA_CONFIGURED_SUPPORTED_BY_DEFAULT[*]} " =~ " ${KATA_HYPERVISOR} " ]]; then
		echo "NUMA not configured by default on ${KATA_HYPERVISOR} (only qemu-nvidia-gpu variants)"
		return 0
	fi

	local nodes
	# "|| true": a failing probe must surface as a skip reason below, not
	# as a non-zero status of this function.
	nodes=$(host_numa_node_count) || true
	if [[ ! "${nodes}" =~ ^[0-9]+$ ]]; then
		echo "Could not determine the host NUMA node count (is numactl installed?)"
		return 0
	fi
	if [[ "${nodes}" -lt 2 ]]; then
		echo "Host has only ${nodes} NUMA node(s), need >= 2 for this test"
	fi
}

# host_numa_node_count echoes the number of NUMA nodes on the host, taken
# from the "available: N nodes" line of `numactl --hardware`.
# WARNING: numactl runs directly on the host, not via exec_host.
host_numa_node_count() {
	numactl --hardware | grep -oP 'available:\s+\K\d+'
}

# -----------------------------------------------------------------------------
# Pod lifecycle helpers
# -----------------------------------------------------------------------------

# render_pod renders the pod yaml with the given vCPU and memory limits
# and runs auto_generate_policy against it. Each @test calls this with
# its own sizing so the same template can serve multiple scenarios.
#   $1  value substituted for NUMA_TEST_VCPUS in the template
#   $2  value substituted for NUMA_TEST_MEMORY in the template
render_pod() {
	local vcpus="${1}" memory="${2}"
	NUMA_TEST_VCPUS="${vcpus}" NUMA_TEST_MEMORY="${memory}" \
		envsubst < "${pod_yaml_in}" > "${pod_yaml}"
	auto_generate_policy "${policy_settings_dir}" "${pod_yaml}"
}

# deploy_and_get_guest_logs renders, applies, waits for Ready, then
# echoes the pod's stdout (the test image prints NUMA topology then
# sleeps). The brief sleep gives the entrypoint time to print before
# we read.
deploy_and_get_guest_logs() {
	local vcpus="${1}" memory="${2}"
	# Callers capture this function's stdout as "the guest logs", so every
	# status line from the deployment steps is sent to stderr; only the
	# final `kubectl logs` writes to stdout.
	render_pod "${vcpus}" "${memory}" >&2
	kubectl apply -f "${pod_yaml}" >&2
	kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA}" >&2
	sleep 2
	kubectl logs "${POD_NAME_NUMA}"
}

# -----------------------------------------------------------------------------
# Guest-log parsers (operate on stdout from the test container)
# -----------------------------------------------------------------------------

# guest_online_count ONLINE
# Parses a "numa_online:" payload and echoes the number of online NUMA
# nodes it implies. Accepts a single id ("0"), a range ("0-1", "0-7"),
# or a comma-separated list of ids and ranges ("0,2-3"). Any other input,
# including an empty string, fails the test via die.
guest_online_count() {
	local online="${1}"
	local list_re='^[0-9]+(-[0-9]+)?(,[0-9]+(-[0-9]+)?)*$'
	[[ "${online}" =~ ${list_re} ]] \
		|| die "Unexpected format for guest NUMA online nodes: ${online}"

	local -a parts
	local part total=0
	IFS=',' read -ra parts <<< "${online}"
	for part in "${parts[@]}"; do
		if [[ "${part}" =~ ^([0-9]+)-([0-9]+)$ ]]; then
			total=$(( total + ${BASH_REMATCH[2]} - ${BASH_REMATCH[1]} + 1 ))
		else
			# The whole-string check above guarantees a bare id here.
			total=$(( total + 1 ))
		fi
	done
	echo "${total}"
}

# guest_field LOGS KEY
# Echoes the non-blank token following "KEY:" in LOGS. E.g.
# guest_field "$logs" numa_online -> "0-1"
# One line is printed per match; grep's non-zero status is returned when
# KEY is absent.
guest_field() {
	printf '%s\n' "${1}" | grep -oP "${2}:\s*\K\S+"
}

# guest_per_node_values LOGS SUFFIX
# Emits one value per line for "node<N>SUFFIX: <int>" entries
# (e.g. SUFFIX = _cpus or _mem_kb). Suitable for `mapfile -t`.
guest_per_node_values() {
	printf '%s\n' "${1}" | grep -oP "node\d+${2}:\s*\K\d+"
}

# -----------------------------------------------------------------------------
# Host-side pinning helpers
# -----------------------------------------------------------------------------

# get_qemu_pid_for_numa_pod resolves the running pod's sandbox via crictl
# and returns the QEMU PID via pgrep. Fails the test if either lookup
# turns up empty.
get_qemu_pid_for_numa_pod() {
	local sandbox_id qemu_pid
	sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \
		pods --name "${POD_NAME_NUMA}" -q | head -1)
	[[ -n "${sandbox_id}" ]] || die "no sandbox id found for pod ${POD_NAME_NUMA}"

	# "[q]emu" instead of "qemu": pgrep excludes only itself, not the
	# parent `sudo pgrep -f ...` process, whose own command line contains
	# the literal pattern text. The bracket expression still matches a real
	# "qemu..." command line but not the pattern string, so a missing QEMU
	# can no longer be masked by the sudo wrapper's PID.
	qemu_pid=$(sudo pgrep -f "[q]emu.*${sandbox_id}" | head -1)
	[[ -n "${qemu_pid}" ]] || die "no QEMU PID found for sandbox ${sandbox_id}"
	echo "${qemu_pid}"
}

# pinning_thread_total OUTPUT
# Sums the per-bucket counts in numa-pinning-check.sh output
# ("nodeN: COUNT" lines) and echoes the total; lines that do not start
# with "node<digits>:" are ignored, and the result is 0 when none match.
pinning_thread_total() {
	printf '%s\n' "${1}" | awk -F: '/^node[0-9]+:/ {sum+=$2} END {print sum+0}'
}

# wait_for_host_pinning QEMU_PID EXPECTED_THREADS
# Polls numa-pinning-check.sh until at least EXPECTED_THREADS threads
# report per-CPU affinity, or until HOST_PINNING_RETRIES is exhausted.
# Echoes the final script output regardless of whether convergence was
# reached, so callers can inspect/assert on the bucket distribution.
# Progress of each unsuccessful attempt is reported on stderr.
wait_for_host_pinning() {
	local qemu_pid="${1}" expected="${2}"
	local script="${BATS_TEST_DIRNAME}/numa-pinning-check.sh"
	local output total attempt
	for ((attempt = 1; attempt <= HOST_PINNING_RETRIES; attempt++)); do
		output=$(sudo bash "${script}" "${qemu_pid}")
		total=$(pinning_thread_total "${output}")
		if (( total >= expected )); then
			echo "${output}"
			return 0
		fi
		echo "# Host pinning attempt ${attempt}/${HOST_PINNING_RETRIES}: ${total}/${expected} threads pinned" >&2
		sleep "${HOST_PINNING_SLEEP}"
	done
	echo "${output}"
}

# minmax_diff INT...
# Echoes (max - min) for the given non-empty integer list.
minmax_diff() {
	local lo=$1 hi=$1 v
	shift
	for v in "$@"; do
		if (( v > hi )); then hi=$v; fi
		if (( v < lo )); then lo=$v; fi
	done
	echo $((hi - lo))
}

# get_qemu_cmdline QEMU_PID
# Reads the QEMU process command line from /proc, replacing null bytes
# with spaces. Runs directly on the host via sudo.
get_qemu_cmdline() {
	# cmdline is NUL-separated; sudo is applied to the read because the
	# file belongs to another user's process.
	sudo cat "/proc/${1}/cmdline" | tr '\0' ' '
}

# host_has_pgpu returns 0 if at least one node in the cluster has
# allocatable nvidia.com/pgpu resources, 1 otherwise (including when
# kubectl fails).
host_has_pgpu() {
	local counts count
	counts=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/pgpu}' 2>/dev/null) \
		|| return 1
	# The jsonpath yields one value per node that advertises the resource,
	# separated by spaces ("0 8" on a two-node cluster). Check each value:
	# comparing the whole string with -gt is an arithmetic error as soon
	# as there is more than one node.
	for count in ${counts}; do
		if [[ "${count}" =~ ^[0-9]+$ ]] && (( 10#${count} > 0 )); then
			return 0
		fi
	done
	return 1
}

# gpu_numa_skip_reason extends numa_skip_reason with a check for GPU
# availability. Like numa_skip_reason it prints the reason on stdout
# (empty means "run the test") and leaves calling `skip` to the caller.
gpu_numa_skip_reason() {
	local reason
	reason=$(numa_skip_reason)
	if [[ -n "${reason}" ]]; then
		echo "${reason}"
		return 0
	fi
	if ! host_has_pgpu; then
		echo "No nvidia.com/pgpu resources available on the cluster"
	fi
}

# -----------------------------------------------------------------------------
# Explicit numa_mapping config helpers (drop-in based)
# -----------------------------------------------------------------------------
#
# Both kata-runtime (Go) and runtime-rs (Rust) read TOML fragments from a
# `config.d/` directory next to the active configuration-SHIM.toml file
# and merge them into the loaded config on every sandbox start. These
# helpers drop in a single override fragment so the main config file is
# never edited — teardown just deletes the fragment.
#
# WARNING: must run on the k8s node (sudo required) and patch/restore must
# be paired — a leaked drop-in would silently affect every subsequent pod
# on the same node.

# kata_runtime_config_dir echoes the per-shim runtime config directory
# (the one that holds configuration-SHIM.toml and config.d/). Handles
# both the Go layout (.../runtimes/SHIM) and the runtime-rs layout
# (.../runtime-rs/runtimes/SHIM) by probing the filesystem rather than
# parsing the shim name (some Rust shims like `dragonball` lack the
# `-runtime-rs` suffix).
+kata_runtime_config_dir() { + local base="/opt/kata/share/defaults/kata-containers" + local rs_dir="${base}/runtime-rs/runtimes/${KATA_HYPERVISOR}" + local go_dir="${base}/runtimes/${KATA_HYPERVISOR}" + if [[ -d "${rs_dir}" ]]; then + echo "${rs_dir}" + elif [[ -d "${go_dir}" ]]; then + echo "${go_dir}" + else + die "no Kata runtime config dir for ${KATA_HYPERVISOR} (looked in ${rs_dir} and ${go_dir})" + fi +} + +# kata_hypervisor_section echoes the [hypervisor.X] header from the active +# config so the drop-in fragment targets the right table. Discovering it +# at runtime keeps us hypervisor-agnostic (qemu / clh / firecracker / ...). +kata_hypervisor_section() { + local dir + dir=$(kata_runtime_config_dir) + local cfg="${dir}/configuration-${KATA_HYPERVISOR}.toml" + [[ -f "${cfg}" ]] || die "Kata config not found at ${cfg}" + local section + section=$(sudo grep -oE '^\[hypervisor\.[a-z0-9_-]+\]' "${cfg}" | head -1) + [[ -n "${section}" ]] || die "no [hypervisor.X] section in ${cfg}" + echo "${section}" +} + +# patch_kata_numa_mapping +# Writes a config.d/ drop-in that sets numa_mapping = under +# the active hypervisor section. Example values: '["1"]', '["0-1","2-3"]'. +# Records the file path in KATA_NUMA_DROPIN_PATH so teardown() can remove +# it. No restart needed — the next sandbox start picks it up. +patch_kata_numa_mapping() { + local value="${1}" + local dir section + dir=$(kata_runtime_config_dir) + section=$(kata_hypervisor_section) + + KATA_NUMA_DROPIN_PATH="${dir}/config.d/99-numa-test.toml" + export KATA_NUMA_DROPIN_PATH + + sudo mkdir -p "${dir}/config.d" + sudo tee "${KATA_NUMA_DROPIN_PATH}" >/dev/null < +# Returns the host PCI BDF of the first vfio-pci device passed through. +# E.g. "vfio-pci,host=0000:41:00.0,..." -> "0000:41:00.0". +extract_vfio_host_bdf() { + echo "${1}" | grep -oP 'vfio-pci,host=\K[0-9a-fA-F:.]+' | head -1 +} + +# host_gpu_numa +# Returns the NUMA node ID of a host PCI device from sysfs. 
+# Reads /sys/bus/pci/devices//numa_node on the host (via sudo +# since the bats runner may not have read access by default). +host_gpu_numa() { + sudo cat "/sys/bus/pci/devices/${1}/numa_node" +} + +# ----------------------------------------------------------------------------- +# Tests +# ----------------------------------------------------------------------------- + +@test "NUMA: guest topology and host pinning are balanced" { + # Skip checks must live inside @test (not setup) to avoid bats + # "Executed 0 instead of expected 1 tests" warnings. + local skip_reason + skip_reason=$(numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + local host_nodes + host_nodes=$(host_numa_node_count) + + local guest_logs + guest_logs=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_LARGE}" "${NUMA_TEST_MEMORY_LARGE}") + echo "# Guest NUMA output:" + echo "# ${guest_logs}" + + # --- Guest topology matches host --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s); host has ${host_nodes}" + [[ "${guest_count}" -eq "${host_nodes}" ]] \ + || die "guest NUMA node count (${guest_count}) != host (${host_nodes})" + + # --- Guest vCPU balance --- + mapfile -t guest_cpus < <(guest_per_node_values "${guest_logs}" _cpus) + echo "# Guest vCPUs per node: ${guest_cpus[*]}" + [[ ${#guest_cpus[@]} -ge 2 ]] \ + || die "expected >= 2 guest NUMA buckets, got ${#guest_cpus[@]}" + local diff + diff=$(minmax_diff "${guest_cpus[@]}") + echo "# Guest vCPU balance diff: ${diff}" + [[ "${diff}" -le 1 ]] || die "guest vCPU imbalance: ${guest_cpus[*]}" + + # --- Guest memory presence per node --- + mapfile -t guest_mem < <(guest_per_node_values "${guest_logs}" _mem_kb) + echo "# Guest memory per node (kB): ${guest_mem[*]}" + [[ ${#guest_mem[@]} -ge 2 ]] || die "expected >= 2 guest memory nodes" + + # --- Host-side vCPU pinning balance --- + local 
qemu_pid host_output + qemu_pid=$(get_qemu_pid_for_numa_pod) + echo "# QEMU PID: ${qemu_pid}" + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_LARGE}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -ge 2 ]] \ + || die "expected >= 2 host NUMA buckets, got ${#host_counts[@]}: ${host_output}" + diff=$(minmax_diff "${host_counts[@]}") + echo "# Host pinning diff: ${diff}" + [[ "${diff}" -le 1 ]] || die "host pinning imbalance: ${host_output}" +} + +@test "NUMA: small workload right-sizes to a single guest NUMA node" { + # When the sandbox CPU + memory budget fits comfortably on a single + # host NUMA node and no explicit numa_mapping is provided, the + # runtime should collapse the auto-derived multi-node topology to a + # single node to preserve memory locality. This test exercises + # selectNUMANodes()'s right-sizing path on a multi-NUMA host: + # 1. The guest sees exactly one NUMA node with all vCPUs in it. + # 2. The host-side QEMU vCPU threads are all pinned to that one + # host NUMA node (delivered by checkVCPUsPinningNUMA, which + # handles single-node sandboxes too). 
+ local skip_reason + skip_reason=$(numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + local guest_logs + guest_logs=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_SMALL}" "${NUMA_TEST_MEMORY_SMALL}") + echo "# Guest NUMA output:" + echo "# ${guest_logs}" + + # --- Guest topology collapsed to a single node --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s)" + [[ "${guest_count}" -eq 1 ]] \ + || die "right-sized sandbox should expose 1 NUMA node, got ${guest_count}" + + mapfile -t guest_cpus < <(guest_per_node_values "${guest_logs}" _cpus) + echo "# Guest vCPUs per node: ${guest_cpus[*]}" + [[ ${#guest_cpus[@]} -eq 1 ]] \ + || die "expected 1 guest NUMA bucket, got ${#guest_cpus[@]}: ${guest_cpus[*]}" + # The runtime may add a default vCPU on top of the workload request, + # so the guest can see slightly more than the pod spec asked for. 
+ [[ "${guest_cpus[0]}" -ge "${NUMA_TEST_VCPUS_SMALL}" ]] \ + || die "expected at least ${NUMA_TEST_VCPUS_SMALL} vCPUs on the single node, got ${guest_cpus[0]}" + + mapfile -t guest_mem < <(guest_per_node_values "${guest_logs}" _mem_kb) + echo "# Guest memory per node (kB): ${guest_mem[*]}" + [[ ${#guest_mem[@]} -eq 1 ]] \ + || die "expected 1 guest memory node, got ${#guest_mem[@]}" + + # --- Host-side vCPU pinning collapsed to a single node --- + local qemu_pid host_output + qemu_pid=$(get_qemu_pid_for_numa_pod) + echo "# QEMU PID: ${qemu_pid}" + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_SMALL}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -eq 1 ]] \ + || die "right-sized sandbox vCPU threads should land on a single host NUMA node, got ${#host_counts[@]} buckets: ${host_output}" + [[ "${host_counts[0]}" -ge "${NUMA_TEST_VCPUS_SMALL}" ]] \ + || die "expected at least ${NUMA_TEST_VCPUS_SMALL} vCPU threads pinned, got ${host_counts[0]}: ${host_output}" +} + +@test "NUMA: GPU passthrough with VFIO has correct NUMA placement" { + local skip_reason + skip_reason=$(gpu_numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + local host_nodes + host_nodes=$(host_numa_node_count) + + local gpu_yaml_in="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml.in" + local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml" + + POD_NAME_NUMA="${POD_NAME_NUMA_GPU}" NUMA_TEST_VCPUS="${NUMA_TEST_VCPUS_GPU}" \ + NUMA_TEST_MEMORY="${NUMA_TEST_MEMORY_GPU}" \ + envsubst < "${gpu_yaml_in}" > "${gpu_yaml}" + auto_generate_policy "${policy_settings_dir}" "${gpu_yaml}" + + kubectl apply -f "${gpu_yaml}" + kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA_GPU}" + sleep 2 + + local guest_logs + guest_logs=$(kubectl logs "${POD_NAME_NUMA_GPU}") + echo "# GPU pod guest NUMA output:" + echo "# 
${guest_logs}" + + # --- Guest NUMA topology matches host --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s); host has ${host_nodes}" + [[ "${guest_count}" -eq "${host_nodes}" ]] \ + || die "GPU pod guest NUMA node count (${guest_count}) != host (${host_nodes})" + + # --- Guest vCPU balance --- + mapfile -t guest_cpus < <(guest_per_node_values "${guest_logs}" _cpus) + echo "# Guest vCPUs per node: ${guest_cpus[*]}" + [[ ${#guest_cpus[@]} -ge 2 ]] \ + || die "expected >= 2 guest NUMA buckets, got ${#guest_cpus[@]}" + local diff + diff=$(minmax_diff "${guest_cpus[@]}") + echo "# Guest vCPU balance diff: ${diff}" + [[ "${diff}" -le 1 ]] || die "GPU pod guest vCPU imbalance: ${guest_cpus[*]}" + + # --- Guest memory presence per node --- + mapfile -t guest_mem < <(guest_per_node_values "${guest_logs}" _mem_kb) + echo "# Guest memory per node (kB): ${guest_mem[*]}" + [[ ${#guest_mem[@]} -ge 2 ]] || die "expected >= 2 guest memory nodes" + + # --- Host-side QEMU lookup (needed for the GPU NUMA assertion) --- + local sandbox_id qemu_pid qemu_cmd host_bdf host_node + sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \ + pods --name "${POD_NAME_NUMA_GPU}" -q | head -1) + [[ -n "${sandbox_id}" ]] || die "no sandbox id found for GPU pod" + + qemu_pid=$(sudo pgrep -f "qemu.*${sandbox_id}" | head -1) + [[ -n "${qemu_pid}" ]] || die "no QEMU PID found for GPU sandbox ${sandbox_id}" + echo "# QEMU PID: ${qemu_pid}" + + qemu_cmd=$(get_qemu_cmdline "${qemu_pid}") + host_bdf=$(extract_vfio_host_bdf "${qemu_cmd}") + [[ -n "${host_bdf}" ]] || die "no vfio-pci host BDF found in QEMU cmdline" + host_node=$(host_gpu_numa "${host_bdf}") + echo "# Host GPU ${host_bdf} on NUMA node ${host_node}" + + # --- Guest GPU NUMA affinity --- + # With pxb-pcie and default numa_mapping (1:1), the guest GPU's NUMA + # node must 
equal the host GPU's NUMA node. + mapfile -t gpu_numas < <(echo "${guest_logs}" | grep -oP 'gpu_.*_numa:\s*\K-?\d+') + echo "# Guest GPU NUMA nodes: ${gpu_numas[*]}" + [[ ${#gpu_numas[@]} -ge 1 ]] \ + || die "no GPU detected in guest sysfs (expected gpu_*_numa: lines)" + for gn in "${gpu_numas[@]}"; do + [[ "${gn}" -eq "${host_node}" ]] \ + || die "guest GPU on node ${gn} but host GPU ${host_bdf} is on node ${host_node}" + done + + # --- Host-side vCPU pinning balance --- + local host_output + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_GPU}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -ge 2 ]] \ + || die "expected >= 2 host NUMA buckets for GPU pod, got ${#host_counts[@]}: ${host_output}" + diff=$(minmax_diff "${host_counts[@]}") + echo "# Host pinning diff: ${diff}" + [[ "${diff}" -le 1 ]] || die "GPU pod host pinning imbalance: ${host_output}" + + # --- QEMU command line: pxb-pcie and NUMA binding --- + echo "# Checking QEMU cmdline for pxb-pcie..." + [[ "${qemu_cmd}" == *"pxb-pcie"* ]] \ + || die "QEMU command line does not contain 'pxb-pcie' — NUMA PCIe topology not active" + + echo "# Checking QEMU cmdline for NUMA memory binding..." + [[ "${qemu_cmd}" == *"policy=bind"* ]] \ + || die "QEMU command line does not contain 'policy=bind' — NUMA memory binding not active" +} + +@test "NUMA: small GPU workload right-sizes to the GPU's host NUMA node" { + # When a GPU is attached and the sandbox CPU + memory budget fits on + # a single host NUMA node, the runtime's right-sizing path + # (selectNUMANodes with VFIO awareness) should collapse the topology + # to the GPU's host NUMA node — not just any fitting node — so that + # GPU and memory access stay NUMA-local. 
+ local skip_reason + skip_reason=$(gpu_numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + local gpu_yaml_in="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml.in" + local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml" + + POD_NAME_NUMA="${POD_NAME_NUMA_GPU}" NUMA_TEST_VCPUS="${NUMA_TEST_VCPUS_GPU_SMALL}" \ + NUMA_TEST_MEMORY="${NUMA_TEST_MEMORY_GPU_SMALL}" \ + envsubst < "${gpu_yaml_in}" > "${gpu_yaml}" + auto_generate_policy "${policy_settings_dir}" "${gpu_yaml}" + + kubectl apply -f "${gpu_yaml}" + kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA_GPU}" + sleep 2 + + local guest_logs + guest_logs=$(kubectl logs "${POD_NAME_NUMA_GPU}") + echo "# Small GPU pod guest NUMA output:" + echo "# ${guest_logs}" + + # --- Host-side QEMU lookup --- + local sandbox_id qemu_pid qemu_cmd host_bdf host_node + sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \ + pods --name "${POD_NAME_NUMA_GPU}" -q | head -1) + [[ -n "${sandbox_id}" ]] || die "no sandbox id found for GPU pod" + + qemu_pid=$(sudo pgrep -f "qemu.*${sandbox_id}" | head -1) + [[ -n "${qemu_pid}" ]] || die "no QEMU PID found for GPU sandbox ${sandbox_id}" + + qemu_cmd=$(get_qemu_cmdline "${qemu_pid}") + host_bdf=$(extract_vfio_host_bdf "${qemu_cmd}") + [[ -n "${host_bdf}" ]] || die "no vfio-pci host BDF found in QEMU cmdline" + host_node=$(host_gpu_numa "${host_bdf}") + echo "# Host GPU ${host_bdf} on NUMA node ${host_node}" + + # --- Guest collapsed to a single NUMA node --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s)" + [[ "${guest_count}" -eq 1 ]] \ + || die "right-sized GPU sandbox should expose 1 NUMA node, got ${guest_count}" + + # --- Guest GPU sees the (single) node --- + mapfile -t gpu_numas < <(echo "${guest_logs}" | grep -oP 'gpu_.*_numa:\s*\K-?\d+') + echo "# 
Guest GPU NUMA nodes: ${gpu_numas[*]}" + [[ ${#gpu_numas[@]} -ge 1 ]] \ + || die "no GPU detected in guest sysfs (expected gpu_*_numa: lines)" + # In a single-node guest, the GPU is on node 0. + for gn in "${gpu_numas[@]}"; do + [[ "${gn}" -eq 0 ]] \ + || die "guest GPU on node ${gn} but right-sized sandbox has only node 0" + done + + # --- QEMU memory backend bound to the GPU's host NUMA node --- + # The right-sizing path should pick the GPU's host node, not just + # any node that fits. With pxb-pcie + right-sizing, the single + # memory-backend-ram for the sandbox must have host-nodes=${host_node}. + echo "# Checking QEMU cmdline for memory binding on host node ${host_node}..." + [[ "${qemu_cmd}" == *"host-nodes=${host_node}"* ]] \ + || die "right-sized GPU sandbox memory not bound to GPU's host NUMA node ${host_node}: cmdline=${qemu_cmd}" + + # --- Host-side vCPU pinning collapsed to the GPU's host node --- + local host_output + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_GPU_SMALL}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -eq 1 ]] \ + || die "right-sized GPU sandbox vCPU threads should land on a single host NUMA node, got ${#host_counts[@]} buckets: ${host_output}" + + local pinned_node + pinned_node=$(echo "${host_output}" | grep -oP '^node\K[0-9]+' | head -1) + [[ "${pinned_node}" -eq "${host_node}" ]] \ + || die "right-sized GPU sandbox vCPUs pinned to node ${pinned_node} but GPU is on host node ${host_node}" +} + +@test "NUMA: explicit numa_mapping in TOML pins the sandbox to the chosen host node" { + # When the user sets numa_mapping = ["1"] in the runtime TOML, the + # right-sizing path must be skipped (maybeRightSizeAutoNUMA bails out + # for non-empty NUMAMapping) and buildNUMATopology must propagate the + # binding verbatim, regardless of how small the workload is. 
+ # + # Verifies end-to-end that: + # - guest sees exactly 1 NUMA node; + # - the QEMU memory backend is bound to host node 1 (not 0); + # - host-side vCPU threads land on host node 1. + # + # QEMU-only: this test asserts on the QEMU command line (host-nodes=, + # policy=bind) and on the kata-runtime (Go) NUMA logic. runtime-rs + # does not yet implement NUMA, so even if numa_skip_reason were + # widened later we'd still want to gate this case explicitly. + [[ "${KATA_HYPERVISOR}" == qemu-* ]] \ + || skip "explicit numa_mapping test is QEMU-only (got ${KATA_HYPERVISOR})" + + local skip_reason + skip_reason=$(numa_skip_reason) + [[ -z "${skip_reason}" ]] || skip "${skip_reason}" + + # Need at least 2 host nodes so "host node 1" is a non-trivial pick. + local host_nodes + host_nodes=$(host_numa_node_count) + [[ "${host_nodes}" -ge 2 ]] || skip "explicit-mapping test needs >=2 host NUMA nodes" + + # Patch the active runtime config; teardown() restores it. + patch_kata_numa_mapping '["1"]' + + local guest_logs + guest_logs=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_SMALL}" "${NUMA_TEST_MEMORY_SMALL}") + echo "# Guest NUMA output:" + echo "# ${guest_logs}" + + # --- Guest: explicit mapping always yields exactly one node --- + local online guest_count + online=$(guest_field "${guest_logs}" numa_online) + guest_count=$(guest_online_count "${online}") + echo "# Guest NUMA online: ${online} -> ${guest_count} node(s)" + [[ "${guest_count}" -eq 1 ]] \ + || die "explicit numa_mapping=[1] should expose 1 guest NUMA node, got ${guest_count}" + + # --- QEMU memory backend bound to host node 1 --- + local qemu_pid qemu_cmd + qemu_pid=$(get_qemu_pid_for_numa_pod) + qemu_cmd=$(get_qemu_cmdline "${qemu_pid}") + echo "# Checking QEMU cmdline for memory binding on host node 1..." 
+ [[ "${qemu_cmd}" == *"host-nodes=1"* ]] \ + || die "explicit numa_mapping=[1] did not pin QEMU memory to host node 1: cmdline=${qemu_cmd}" + [[ "${qemu_cmd}" == *"policy=bind"* ]] \ + || die "explicit numa_mapping=[1] missing policy=bind in QEMU cmdline: cmdline=${qemu_cmd}" + + # --- Host-side vCPU pinning lands on host node 1 --- + local host_output + host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_SMALL}") + echo "# Host pinning per NUMA node: ${host_output}" + + mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+') + [[ ${#host_counts[@]} -eq 1 ]] \ + || die "explicit numa_mapping=[1] should pin vCPUs to a single host NUMA node, got ${#host_counts[@]} buckets: ${host_output}" + + local pinned_node + pinned_node=$(echo "${host_output}" | grep -oP '^node\K[0-9]+' | head -1) + [[ "${pinned_node}" -eq 1 ]] \ + || die "explicit numa_mapping=[1] pinned vCPUs to node ${pinned_node}, expected 1" +} + +teardown() { + echo "=== NUMA test pod describe ===" + kubectl describe pod "${POD_NAME_NUMA}" || true + kubectl describe pod "${POD_NAME_NUMA_GPU}" 2>/dev/null || true + + echo "=== NUMA test pod logs ===" + kubectl logs "${POD_NAME_NUMA}" || true + kubectl logs "${POD_NAME_NUMA_GPU}" 2>/dev/null || true + + # Always restore the Kata config (no-op if no patch was applied). 
+ restore_kata_numa_mapping || true + + delete_tmp_policy_settings_dir "${policy_settings_dir}" + + [ -f "${pod_yaml}" ] && kubectl delete -f "${pod_yaml}" --ignore-not-found=true + local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml" + [ -f "${gpu_yaml}" ] && kubectl delete -f "${gpu_yaml}" --ignore-not-found=true + + print_node_journal_since_test_start "${node}" "${node_start_time:-}" "${BATS_TEST_COMPLETED:-}" +} diff --git a/tests/integration/kubernetes/numa-pinning-check.sh b/tests/integration/kubernetes/numa-pinning-check.sh new file mode 100755 index 0000000000..ead2476e6a --- /dev/null +++ b/tests/integration/kubernetes/numa-pinning-check.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# WARNING: This script runs directly on the host, NOT inside a container. +# It requires privileged access to /proc and /sys to inspect QEMU vCPU +# thread affinities and map them to host NUMA nodes. +# +# Usage: numa-pinning-check.sh <qemu-pid> +# +# Output: one line per NUMA node with the count of pinned vCPU threads. +# node0: 32 +# node1: 32 +# +# A vCPU thread is counted only when taskset reports it pinned to a single +# CPU (bare number, no ranges or commas). Threads with broad affinity +# masks are silently skipped — the caller is expected to retry until the +# runtime has finished per-vCPU pinning. + +set -o pipefail + +QEMU_PID="${1:?Usage: $0 <qemu-pid>}" + +if [[ ! 
-d "/proc/${QEMU_PID}/task" ]]; then + echo "ERROR: /proc/${QEMU_PID}/task not found" >&2 + exit 1 +fi + +for tid in "/proc/${QEMU_PID}/task/"*; do + tid="${tid##*/}" + list=$(taskset -pc "${tid}" 2>/dev/null | sed 's/.*: //') + if [[ "${list}" =~ ^[0-9]+$ ]]; then + # Map the CPU to its NUMA node via the sysfs topology symlink + for node_link in "/sys/devices/system/cpu/cpu${list}/node"*; do + if [[ -d "${node_link}" ]]; then + numa_node="${node_link##*node}" + echo "node${numa_node}" + break + fi + done + fi +done | sort | uniq -c | awk '{print $2 ": " $1}' diff --git a/tests/integration/kubernetes/run_kubernetes_nv_tests.sh b/tests/integration/kubernetes/run_kubernetes_nv_tests.sh index eda7934858..901b97779b 100644 --- a/tests/integration/kubernetes/run_kubernetes_nv_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_nv_tests.sh @@ -72,6 +72,7 @@ if [[ -n "${K8S_TEST_NV:-}" ]]; then mapfile -d " " -t K8S_TEST_NV <<< "${K8S_TEST_NV}" else K8S_TEST_NV=("k8s-confidential-attestation.bats" \ + "k8s-nvidia-numa.bats" \ "k8s-nvidia-cuda.bats" \ "k8s-nvidia-nim.bats" \ "k8s-nvidia-nim-service.bats") diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in new file mode 100644 index 0000000000..7167fa271c --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in @@ -0,0 +1,24 @@ +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +apiVersion: v1 +kind: Pod +metadata: + name: ${POD_NAME_NUMA} + labels: + app: ${POD_NAME_NUMA} +spec: + runtimeClassName: kata + containers: + - name: numa-check + image: "quay.io/kata-containers/numa:2026-05-15@sha256:a863fcf95fcbbf63352b0555a61a62537f74399dc4bca826a2e42d001e26accb" + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + cpu: "${NUMA_TEST_VCPUS}" + memory: "${NUMA_TEST_MEMORY}" + nvidia.com/pgpu: "1" diff 
--git a/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in new file mode 100644 index 0000000000..731e75a32d --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in @@ -0,0 +1,23 @@ +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +apiVersion: v1 +kind: Pod +metadata: + name: ${POD_NAME_NUMA} + labels: + app: ${POD_NAME_NUMA} +spec: + runtimeClassName: kata + containers: + - name: numa-check + image: "quay.io/kata-containers/numa:2026-05-15@sha256:a863fcf95fcbbf63352b0555a61a62537f74399dc4bca826a2e42d001e26accb" + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + cpu: "${NUMA_TEST_VCPUS}" + memory: "${NUMA_TEST_MEMORY}" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile b/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile new file mode 100644 index 0000000000..7e9f541ae8 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile @@ -0,0 +1,17 @@ +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Minimal image that reads guest NUMA topology from sysfs. +# Multi-arch: linux/amd64, linux/arm64 +# +# Build & push: +# docker buildx build --platform linux/amd64,linux/arm64 \ +# -t quay.io/kata-containers/numa:$(date +%Y-%m-%d) --push . 
+ +FROM alpine:3.23 + +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md b/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md new file mode 100644 index 0000000000..8bed127cb8 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md @@ -0,0 +1,36 @@ +# NUMA Topology Check Container + +Minimal container image that reads guest NUMA topology from sysfs and +prints structured output to stdout. Used by `k8s-nvidia-numa.bats` to +verify guest NUMA node count, vCPU distribution, and memory layout +without needing `kubectl exec` (which requires CoCo policy overrides). + +## Image + +`quay.io/kata-containers/numa:<YYYY-MM-DD>` + +## Build and push (multi-arch) + +```bash +cd tests/integration/kubernetes/runtimeclass_workloads/numa/ + +docker buildx build --platform linux/amd64,linux/arm64 \ + -t quay.io/kata-containers/numa:$(date +%Y-%m-%d) --push . +``` + +After pushing, update the image reference (including digest) in +`numa-topology-test.yaml.in` and `numa-topology-gpu-test.yaml.in`. + +## Output format + +The entrypoint prints one `key: value` pair per line: + +``` +numa_online: 0-1 +node0_cpus: 32 +node1_cpus: 32 +node0_mem_kb: 37078332 +node1_mem_kb: 37125524 +``` + +GPU pods also print one `gpu_<BDF>_numa: <node>` line per GPU. The bats test parses this output from `kubectl logs`. diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh b/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh new file mode 100755 index 0000000000..1a8f970305 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh @@ -0,0 +1,73 @@ +#!/bin/sh +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Reads guest NUMA topology from sysfs and prints structured output. +# Designed to run inside a kata VM as the container entrypoint. 
+# +# Output format (one key: value per line): +# numa_online: 0-1 +# node0_cpus: 32 +# node1_cpus: 32 +# node0_mem_kb: 37078332 +# node1_mem_kb: 37125524 +# gpu_0000:41:00.0_numa: 1 (only if GPUs are present) + +set -e + +# Print results to stdout (readable via "kubectl logs"), then sleep to +# keep the pod alive so the host-side pinning check can inspect the +# QEMU process. The bats test deletes the pod when done. + +# NUMA nodes online (e.g. "0-1" or "0") +online=$(cat /sys/devices/system/node/online) +echo "numa_online: ${online}" + +# Per-node vCPU count: the number of CPUs listed in each node's cpulist +for cpulist in /sys/devices/system/node/node*/cpulist; do + node_name=$(basename "$(dirname "${cpulist}")") + cpus=$(cat "${cpulist}") + count=0 + # Split on commas; a "lo-hi" range adds hi-lo+1, a bare CPU adds 1 (e.g. "0-31,64-95") + IFS="," + for range in ${cpus}; do + case "${range}" in + *-*) + lo=${range%-*} + hi=${range#*-} + count=$((count + hi - lo + 1)) + ;; + *) + count=$((count + 1)) + ;; + esac + done + unset IFS + echo "${node_name}_cpus: ${count}" +done + +# Per-node memory: 4th field of the MemTotal line in each node's meminfo +for meminfo in /sys/devices/system/node/node*/meminfo; do + node_name=$(basename "$(dirname "${meminfo}")") + mem_kb=$(awk '/MemTotal/ {print $4}' "${meminfo}") + echo "${node_name}_mem_kb: ${mem_kb}" +done + +# GPU NUMA affinity (if any GPUs are present via VFIO passthrough). Devices whose class cannot be read are skipped. +# Only PCI class 0x030200 (3D controller: NVIDIA data center GPUs such as A100, H100) is reported; the vendor ID is not checked. +for numa_file in /sys/bus/pci/devices/*/numa_node; do + dev_dir=$(dirname "${numa_file}") + class=$(cat "${dev_dir}/class" 2>/dev/null) || continue + case "${class}" in + 0x030200) + bdf=$(basename "${dev_dir}") + node=$(cat "${numa_file}") + echo "gpu_${bdf}_numa: ${node}" + ;; + esac +done + +# Keep the pod alive for host-side pinning verification.
+exec sleep infinity diff --git a/tests/spellcheck/kata-dictionary.txt b/tests/spellcheck/kata-dictionary.txt index 66fb7076b1..e5701c4b82 100644 --- a/tests/spellcheck/kata-dictionary.txt +++ b/tests/spellcheck/kata-dictionary.txt @@ -20,6 +20,7 @@ materialx # Hardware & Architecture AMD APQN +chiplet cpuid DCAP DGPU @@ -78,6 +79,7 @@ ttrpc vsock # Container, Runtime & Misc terms +Burstable cgroupsv1 coredump CPUSET diff --git a/tools/packaging/qemu/patches/11.0.x/no_patches.txt b/tools/packaging/qemu/patches/11.0.x/no_patches.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch b/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch new file mode 100644 index 0000000000..b80adaa58f --- /dev/null +++ b/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch @@ -0,0 +1,94 @@ +From 6b0eaa20aa91e9d82e0bf72b4ade6e83d18a4c9f Mon Sep 17 00:00:00 2001 +From: Ashish Kalra +Date: Thu, 18 Sep 2025 22:10:35 +0000 +Subject: [PATCH] accel/kvm: Fix kvm_convert_memory calls crossing memory + regions + +Page conversion call can span multiple memory regions, potentially +resulting in a conversion failure if the memory range being converted +extends beyond the boundaries of the referenced memory region. + +Handle the case of page conversion call straddling across memory +regions. 
+ +Signed-off-by: Ashish Kalra +Signed-off-by: Michael Roth +--- + accel/kvm/kvm-all.c | 27 ++++++++++++++++++++------- + 1 file changed, 20 insertions(+), 7 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 63230743d0..a1b2c3e5f4 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -3342,6 +3342,7 @@ static void kvm_eat_signals(CPUState *cpu) + int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + { + MemoryRegionSection section; ++ hwaddr convert_size; + ram_addr_t offset; + MemoryRegion *mr; + RAMBlock *rb; +@@ -3359,6 +3360,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + return ret; + } + ++ /* ++ * Page conversions can span multiple memory regions, for example, if two ++ * memory backends are added to support two different NUMA nodes/policies. ++ */ ++next_memory_region: + section = memory_region_find(get_system_memory(), start, size); + mr = section.mr; + if (!mr) { +@@ -3397,10 +3403,13 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + goto out_unref; + } + ++ convert_size = (section.offset_within_region + size > mr->size) ? ++ mr->size - section.offset_within_region : size; ++ + if (to_private) { +- ret = kvm_set_memory_attributes_private(start, size); ++ ret = kvm_set_memory_attributes_private(start, convert_size); + } else { +- ret = kvm_set_memory_attributes_shared(start, size); ++ ret = kvm_set_memory_attributes_shared(start, convert_size); + } + if (ret) { + goto out_unref; +@@ -3410,11 +3419,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + rb = qemu_ram_block_from_host(addr, false, &offset); + + ret = ram_block_attributes_state_change(rb->attributes, +- offset, size, to_private); ++ offset, convert_size, to_private); + if (ret) { + error_report("Failed to notify the listener the state change of " + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", +- start, size, to_private ? 
"private" : "shared"); ++ start, convert_size, to_private ? "private" : "shared"); + goto out_unref; + } + +@@ -3426,9 +3435,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + */ + goto out_unref; + } +- ret = ram_block_discard_range(rb, offset, size); ++ ret = ram_block_discard_range(rb, offset, convert_size); + } else { +- ret = ram_block_discard_guest_memfd_range(rb, offset, size); ++ ret = ram_block_discard_guest_memfd_range(rb, offset, convert_size); ++ } ++ ++ if (size - convert_size) { ++ start += convert_size; ++ size -= convert_size; ++ goto next_memory_region; + } + + out_unref: +-- +2.43.0 + diff --git a/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch b/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch new file mode 100644 index 0000000000..b80adaa58f --- /dev/null +++ b/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch @@ -0,0 +1,94 @@ +From 6b0eaa20aa91e9d82e0bf72b4ade6e83d18a4c9f Mon Sep 17 00:00:00 2001 +From: Ashish Kalra +Date: Thu, 18 Sep 2025 22:10:35 +0000 +Subject: [PATCH] accel/kvm: Fix kvm_convert_memory calls crossing memory + regions + +Page conversion call can span multiple memory regions, potentially +resulting in a conversion failure if the memory range being converted +extends beyond the boundaries of the referenced memory region. + +Handle the case of page conversion call straddling across memory +regions. 
+ +Signed-off-by: Ashish Kalra +Signed-off-by: Michael Roth +--- + accel/kvm/kvm-all.c | 27 ++++++++++++++++++++------- + 1 file changed, 20 insertions(+), 7 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 63230743d0..a1b2c3e5f4 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -3342,6 +3342,7 @@ static void kvm_eat_signals(CPUState *cpu) + int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + { + MemoryRegionSection section; ++ hwaddr convert_size; + ram_addr_t offset; + MemoryRegion *mr; + RAMBlock *rb; +@@ -3359,6 +3360,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + return ret; + } + ++ /* ++ * Page conversions can span multiple memory regions, for example, if two ++ * memory backends are added to support two different NUMA nodes/policies. ++ */ ++next_memory_region: + section = memory_region_find(get_system_memory(), start, size); + mr = section.mr; + if (!mr) { +@@ -3397,10 +3403,13 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + goto out_unref; + } + ++ convert_size = (section.offset_within_region + size > mr->size) ? ++ mr->size - section.offset_within_region : size; ++ + if (to_private) { +- ret = kvm_set_memory_attributes_private(start, size); ++ ret = kvm_set_memory_attributes_private(start, convert_size); + } else { +- ret = kvm_set_memory_attributes_shared(start, size); ++ ret = kvm_set_memory_attributes_shared(start, convert_size); + } + if (ret) { + goto out_unref; +@@ -3410,11 +3419,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + rb = qemu_ram_block_from_host(addr, false, &offset); + + ret = ram_block_attributes_state_change(rb->attributes, +- offset, size, to_private); ++ offset, convert_size, to_private); + if (ret) { + error_report("Failed to notify the listener the state change of " + "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s", +- start, size, to_private ? 
"private" : "shared"); ++ start, convert_size, to_private ? "private" : "shared"); + goto out_unref; + } + +@@ -3426,9 +3435,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + */ + goto out_unref; + } +- ret = ram_block_discard_range(rb, offset, size); ++ ret = ram_block_discard_range(rb, offset, convert_size); + } else { +- ret = ram_block_discard_guest_memfd_range(rb, offset, size); ++ ret = ram_block_discard_guest_memfd_range(rb, offset, convert_size); ++ } ++ ++ if (size - convert_size) { ++ start += convert_size; ++ size -= convert_size; ++ goto next_memory_region; + } + + out_unref: +-- +2.43.0 + diff --git a/tools/packaging/qemu/patches/tag_patches/v11.0.0/no_patches.txt b/tools/packaging/qemu/patches/tag_patches/v11.0.0/no_patches.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/versions.yaml b/versions.yaml index 93cad9e9f1..69fb34b0ba 100644 --- a/versions.yaml +++ b/versions.yaml @@ -88,8 +88,8 @@ assets: qemu: description: "VMM that uses KVM" url: "https://github.com/qemu/qemu" - version: "v10.2.1" - tag: "v10.2.1" + version: "v11.0.0" + tag: "v11.0.0" # Do not include any non-full release versions # Break the line *without CR or space being appended*, to appease # yamllint, and note the deliberate ' ' at the end of the expression. @@ -107,12 +107,12 @@ assets: qemu-snp-experimental: description: "QEMU with GPU+SNP support" url: "https://github.com/confidential-containers/qemu.git" - tag: "gpu-snp-20260107" + tag: "gpu-snp-20260430" qemu-tdx-experimental: description: "QEMU with GPU+TDX support" url: "https://github.com/confidential-containers/qemu.git" - tag: "gpu-tdx-20260107" + tag: "gpu-tdx-20260430" stratovirt: description: "StratoVirt is an lightweight opensource VMM"