diff --git a/docs/.nav.yml b/docs/.nav.yml
index 7dc1b12238..fa96b03a40 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -16,6 +16,8 @@ nav:
       - NVIDIA GPU Passthrough: use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md
       - NVIDIA vGPU: use-cases/NVIDIA-GPU-passthrough-and-Kata.md
       - Intel QAT: use-cases/using-Intel-QAT-and-kata.md
+    - How To:
+      - NUMA Support: how-to/how-to-use-numa-with-kata.md
     - Contributing:
       - Documentation: doc-contributing.md
   - Misc:
diff --git a/docs/how-to/README.md b/docs/how-to/README.md
index e2742ef374..134dd35e67 100644
--- a/docs/how-to/README.md
+++ b/docs/how-to/README.md
@@ -52,4 +52,5 @@
 - [How to use seccomp with runtime-rs](how-to-use-seccomp-with-runtime-rs.md)
 - [How to use passthroughfd-IO with runtime-rs and Dragonball](how-to-use-passthroughfd-io-within-runtime-rs.md)
 - [How to use EROFS snapshotter with Kata Containers](how-to-use-erofs-snapshotter-with-kata.md)
+- [How to use NUMA with Kata Containers](how-to-use-numa-with-kata.md)
 
diff --git a/docs/how-to/how-to-use-numa-with-kata.md b/docs/how-to/how-to-use-numa-with-kata.md
new file mode 100644
index 0000000000..248dec80ea
--- /dev/null
+++ b/docs/how-to/how-to-use-numa-with-kata.md
@@ -0,0 +1,633 @@
+# NUMA Support for Kata Containers with QEMU
+
+## Overview
+
+Non-Uniform Memory Access (NUMA) is a memory architecture where access
+latency depends on which CPU is accessing which memory region. On
+multi-socket or multi-chiplet systems, each NUMA node has local memory that
+its CPUs can access faster than remote memory belonging to other nodes.
+
+When running performance-sensitive workloads — particularly GPU passthrough
+via VFIO — cross-NUMA memory access can significantly degrade throughput.
+Kata Containers can expose the host NUMA topology to the guest VM so that
+vCPUs, memory, and devices are all placed on the correct NUMA node, preserving
+memory locality.
+
+This guide walks through the full setup end-to-end: host inspection,
+Kubernetes configuration, Kata configuration, pod deployment, and
+verification.
+
+> **Note:**
+>
+> NUMA support is currently available only for the **Go runtime** with the
+> **QEMU hypervisor** on **amd64** and **arm64** architectures. The Rust
+> runtime (`runtime-rs`) does not yet support NUMA topology.
+
+## Step 1: Inspect the Host NUMA Topology
+
+Before configuring anything, understand your host. Run on each worker node:
+
+```bash
+$ numactl --hardware
+```
+
+Example output on a 2-socket system with 8 CPUs per socket:
+
+```
+available: 2 nodes (0-1)
+node 0 cpus: 0 1 2 3 4 5 6 7
+node 0 size: 65536 MB
+node 1 cpus: 8 9 10 11 12 13 14 15
+node 1 size: 65536 MB
+node distances:
+node   0   1
+  0:  10  21
+  1:  21  10
+```
+
+Take note of:
+- How many NUMA nodes exist (here: 2)
+- Which CPUs belong to each node (here: 0-7 on node 0, 8-15 on node 1)
+- The distance matrix (here: 10 local, 21 remote)
+
+If you have GPUs, check which NUMA node each GPU is attached to:
+
+```bash
+$ lspci -nnk -d 10de: | grep -A2 "NVIDIA"
+$ cat /sys/bus/pci/devices/0000:41:00.0/numa_node
+```
+
+Replace `0000:41:00.0` with your GPU's PCI address. The output (`0` or `1`)
+tells you which NUMA node the GPU sits on.
+
+On a single-NUMA host (only node 0), enabling NUMA is a harmless no-op —
+the runtime detects one node and skips multi-NUMA topology.
+
+## Step 2: Kubernetes CPU Manager Policy
+
+Kata's NUMA-aware vCPU pinning works **without** `cpuManagerPolicy: static`.
+The recommended policy is the default (`none`):
+
+```yaml
+apiVersion: kubelet.config.k8s.io/v1beta1
+kind: KubeletConfiguration
+cpuManagerPolicy: "none"
+```
+
+> **Why not `static`?**
+>
+> With `cpuManagerPolicy: static`, Kubernetes assigns dedicated CPUs to
+> Guaranteed QoS pods. On a multi-NUMA host, those CPUs are often all from
+> a **single** NUMA node (depending on the topology manager policy). This
+> causes the sandbox CPUSet to cover only one NUMA node, which defeats the
+> purpose of multi-NUMA guest topology.
+>
+> With `cpuManagerPolicy: none` (the default), the pod inherits the full
+> node CPUSet spanning all NUMA nodes, and Kata's NUMA-aware pinning
+> distributes vCPU threads proportionally across host NUMA nodes.
+
+### 2.1 Check the current policy
+
+```bash
+$ grep cpuManagerPolicy /var/lib/kubelet/config.yaml
+```
+
+If it shows `static`, switch to `none`:
+
+```bash
+$ sudo sed -i 's/cpuManagerPolicy:.*/cpuManagerPolicy: "none"/' /var/lib/kubelet/config.yaml
+$ sudo rm -f /var/lib/kubelet/cpu_manager_state
+$ sudo systemctl restart kubelet
+```
+
+## Step 3: Configure Kata Containers for NUMA
+
+> **Note:**
+>
+> If you are using the NVIDIA GPU runtime classes
+> (`kata-qemu-nvidia-gpu`, `kata-qemu-nvidia-gpu-snp`,
+> `kata-qemu-nvidia-gpu-tdx`), NUMA is already enabled by default in their
+> configuration templates. You only need the steps below for the base
+> `kata-qemu` runtime class or custom configurations.
+
+Never edit the base `configuration-qemu.toml` directly — use a
+**configuration drop-in** so your customizations survive upgrades.
+
+### 3.1 Via kata-deploy Helm chart (recommended)
+
+Add a custom runtime with a NUMA drop-in in your Helm values file:
+
+```yaml
+customRuntimes:
+  enabled: true
+  runtimes:
+    numa:
+      baseConfig: qemu
+      runtimeClass: |
+        apiVersion: node.k8s.io/v1
+        kind: RuntimeClass
+        metadata:
+          name: kata-qemu-numa
+        handler: kata-qemu-numa
+      dropIn: |
+        [hypervisor.qemu]
+        enable_numa = true
+        numa_mapping = []
+
+        [runtime]
+        static_sandbox_resource_mgmt = true
+        enable_vcpus_pinning = true
+```
+
+Then install (or upgrade) the Helm chart:
+
+```bash
+$ helm upgrade --install kata-deploy \
+    --namespace kata-system \
+    -f my-values.yaml \
+    "${CHART}" --version "${VERSION}"
+```
+
+Pods using `runtimeClassName: kata-qemu-numa` will get the NUMA-enabled
+configuration.
+
+With `numa_mapping = []` (empty), the runtime auto-discovers host NUMA nodes
+and creates a 1:1 guest-to-host mapping, then **right-sizes** the resulting
+topology: if the sandbox's CPU and memory budget fits on a single host
+NUMA node — and any cold-plugged VFIO devices live on that same node —
+the guest topology collapses to that one node so the workload keeps full
+memory locality without paying a multi-node penalty. Sandboxes that
+genuinely span multiple host nodes keep the auto-derived multi-node
+topology. An explicit `numa_mapping` opts out of right-sizing and is
+honored verbatim — useful when you want a specific layout regardless of
+sandbox size, or to group multiple host nodes into fewer guest nodes
+(e.g., on a 4-socket system):
+
+```yaml
+      dropIn: |
+        [hypervisor.qemu]
+        enable_numa = true
+        numa_mapping = ["0-1", "2-3"]
+```
+
+Each entry is a cpuset-style string (ranges like `0-3` and lists like
+`0,2,4` are both valid).
+
+### 3.2 Via manual drop-in on the node
+
+If you manage nodes directly (without kata-deploy), create a drop-in file
+under the `config.d/` directory. Use a `50-*` prefix (the reserved range
+for user customizations):
+
+```bash
+$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-numa.toml <<'EOF'
+[hypervisor.qemu]
+enable_numa = true
+numa_mapping = []
+
+[runtime]
+static_sandbox_resource_mgmt = true
+enable_vcpus_pinning = true
+EOF
+```
+
+The drop-in is merged on top of the base `configuration-qemu.toml`
+automatically. No restart is needed — the shim reads the configuration
+at pod creation time.
+
+> **Note:**
+>
+> For details on the drop-in mechanism, reserved prefix ranges, and
+> additional Helm examples, see the
+> [Helm configuration guide](../helm-configuration.md).
+
+### 3.3 Verify the effective configuration
+
+After applying the drop-in, verify the merged configuration on the node:
+
+```bash
+$ grep -rE "enable_numa|numa_mapping|static_sandbox_resource_mgmt|enable_vcpus_pinning" \
+    /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/
+```
+
+## Step 4: Deploy a NUMA-Aware Pod
+
+### 4.1 Basic NUMA pod
+
+Create a pod that requests enough CPUs to span both NUMA nodes. Use the
+runtime class matching your NUMA configuration from Step 3 (e.g.,
+`kata-qemu-numa` if you created a custom runtime, or `kata-qemu` if you
+applied a drop-in to the base config). Kata sizes the VM based on
+`limits`, so set `limits.cpu` to the desired vCPU count:
+
+```bash
+$ cat <<'EOF' | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: numa-test
+spec:
+  runtimeClassName: kata-qemu-numa
+  containers:
+  - name: numa-check
+    image: ubuntu:24.04
+    command: ["sleep", "infinity"]
+    resources:
+      requests:
+        cpu: "1"
+        memory: "1Gi"
+      limits:
+        cpu: "80"
+        memory: "64Gi"
+EOF
+```
+
+> **Note:**
+>
+> Kata sizes the VM based on `limits` (not `requests`). Using different
+> values for `requests` and `limits` makes the pod **Burstable** QoS,
+> which avoids Kubernetes CPU manager interference with NUMA-aware
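+> pinning. The large `limits.cpu` value tells Kata to create a VM with
+> that many vCPUs distributed across NUMA nodes. With `numa_mapping = []`, limits that fit on one host NUMA node trigger right-sizing (Step 3.1) and yield a single-node guest.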
+
+### 4.2 GPU passthrough pod with NUMA
+
+For GPU workloads, use the NVIDIA GPU runtime class. NUMA is enabled by
+default in the GPU configuration templates:
+
+```bash
+$ cat <<'EOF' | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-numa-test
+spec:
+  runtimeClassName: kata-qemu-nvidia-gpu
+  containers:
+  - name: cuda-test
+    image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04
+    resources:
+      limits:
+        cpu: "4"
+        memory: "8Gi"
+        nvidia.com/pgpu: "1"
+EOF
+```
+
+## Step 5: Verify NUMA Inside the Guest
+
+### 5.1 Check guest NUMA topology
+
+Exec into the running pod and inspect the NUMA layout:
+
+```bash
+$ kubectl exec -it numa-test -- bash
+```
+
+Inside the pod:
+
+```bash
+$ apt-get update && apt-get install -y numactl
+$ numactl --hardware
+```
+
+Example output on a 2-NUMA-node guest (CPU and memory figures are illustrative; actual values follow the pod's `limits`):
+
+```
+available: 2 nodes (0-1)
+node 0 cpus: 0 1
+node 0 size: 2048 MB
+node 1 cpus: 2 3
+node 1 size: 2048 MB
+node distances:
+node   0   1
+  0:  10  21
+  1:  21  10
+```
+
+Key things to verify:
+- **Number of nodes** matches your host (or `numa_mapping` configuration).
+- **CPUs** are distributed across nodes (not all on node 0).
+- **Memory** is split across nodes (not all on node 0).
+- **Distances** mirror the host distances.
+
+### 5.2 Check CPU-to-NUMA mapping
+
+```bash
+$ lscpu | grep -i numa
+```
+
+Expected:
+
+```
+NUMA node(s):          2
+NUMA node0 CPU(s):     0,1
+NUMA node1 CPU(s):     2,3
+```
+
+### 5.3 Check from /proc and /sys inside the guest
+
+```bash
+$ cat /sys/devices/system/node/node*/cpulist
+```
+
+Expected:
+
+```
+0-1
+2-3
+```
+
+```bash
+$ cat /sys/devices/system/node/node*/meminfo | grep MemTotal
+```
+
+Expected (values will vary based on your pod's memory limit):
+
+```
+Node 0 MemTotal:     2097152 kB
+Node 1 MemTotal:     2097152 kB
+```
+
+## Step 6: Verify NUMA on the Host
+
+### 6.1 Check vCPU pinning
+
+From the host, find the QEMU process and check its thread affinities:
+
+```bash
+$ QEMU_PID=$(pgrep -f "qemu.*$(crictl pods --name numa-test -q)")
+$ ls /proc/${QEMU_PID}/task/ | while read tid; do
+    echo "TID ${tid}: $(taskset -p ${tid} 2>/dev/null)"
+  done
+```
+
+With NUMA pinning enabled, you should see vCPU threads pinned to specific
+CPUs (not the full CPU mask). For example, on a 2-NUMA-node host with
+CPUs 0-7 on node 0 and CPUs 8-15 on node 1:
+
+```
+TID 12345: pid 12345's current affinity mask: 1    # CPU 0
+TID 12346: pid 12346's current affinity mask: 2    # CPU 1
+TID 12347: pid 12347's current affinity mask: 100  # CPU 8
+TID 12348: pid 12348's current affinity mask: 200  # CPU 9
+```
+
+### 6.2 Check the shim logs for NUMA configuration
+
+```bash
+$ POD_SANDBOX_ID=$(crictl pods --name numa-test -q)
+$ journalctl -t kata | grep "${POD_SANDBOX_ID}" | grep -i numa
+```
+
+Look for lines like:
+
+```
+buildNUMATopology: creating 2 guest NUMA nodes
+VFIO device NUMA placement validated  bdf=0000:41:00.0 host-numa=1 guest-numa=1
+```
+
+### 6.3 Check the QEMU command line
+
+```bash
+$ cat /proc/${QEMU_PID}/cmdline | tr '\0' '\n' | grep -E "numa|memory-backend"
+```
+
+Expected output (varies by configuration):
+
+```
+-object
+memory-backend-ram,id=numa-mem0,size=2048M,host-nodes=0,policy=bind,share=on
+-numa
+node,nodeid=0,memdev=numa-mem0,cpus=0-1
+-object
+memory-backend-ram,id=numa-mem1,size=2048M,host-nodes=1,policy=bind,share=on
+-numa
+node,nodeid=1,memdev=numa-mem1,cpus=2-3
+-numa
+dist,src=0,dst=1,val=21
+-numa
+dist,src=1,dst=0,val=21
+```
+
+Key things to verify:
+- Each `-object memory-backend-*` has `host-nodes=N` and `policy=bind`
+  matching the correct host NUMA node.
+- Each `-numa node` has a `cpus=` range and `memdev=` pointing to the
+  correct memory backend.
+- `-numa dist` entries mirror the host distances.
+
+## Step 7: Verify GPU NUMA Placement (GPU Passthrough Only)
+
+If using GPU passthrough, verify the device landed on the correct NUMA node:
+
+### 7.1 Check host-side GPU NUMA node
+
+```bash
+$ GPU_BDF="0000:41:00.0"  # Replace with your GPU's PCI address
+$ cat /sys/bus/pci/devices/${GPU_BDF}/numa_node
+```
+
+### 7.2 Check shim logs for VFIO placement validation
+
+```bash
+$ journalctl -t kata | grep -i "VFIO device NUMA"
+```
+
+Healthy output:
+
+```
+VFIO device NUMA placement validated  bdf=0000:41:00.0 host-numa=1 guest-numa=1
+```
+
+Warning output (indicates misconfiguration):
+
+```
+VFIO device on host NUMA node not covered by guest NUMA topology  bdf=0000:41:00.0 host-numa=2 covered-nodes=map[0:0 1:1]
+```
+
+If you see the warning, extend your `numa_mapping` to include the GPU's host
+NUMA node.
+
+### 7.3 Check GPU NUMA inside the guest
+
+Inside the GPU pod:
+
+```bash
+$ nvidia-smi topo --matrix
+```
+
+This shows the GPU's relationship to NUMA nodes from the guest perspective.
+
+## How It Works
+
+When a VM is created with NUMA enabled, the runtime:
+
+1. **Discovers host NUMA**: Reads
+   `/sys/devices/system/node/node*/distance` to build the host distance
+   matrix.
+
+2. **Right-sizes the topology** (auto-discovery only): When `numa_mapping`
+   is empty, the runtime compares the sandbox's vCPU and memory budget
+   against per-node host capacity (read from
+   `/sys/devices/system/node/node*/meminfo` and `cpulist`). If any
+   cold-plugged VFIO device pins the sandbox to specific host nodes, the
+   chosen subset must cover those; otherwise the smallest single host
+   node that fits the workload is picked. When the resulting subset has
+   one node, the topology collapses to a flat (no `-numa`) layout so QEMU
+   uses a single memory backend. Sandboxes that exceed any single node
+   keep the full auto-derived multi-node topology. An explicit
+   `numa_mapping` opts out of this step entirely and is honored verbatim.
+
+3. **Builds guest topology**: Creates guest NUMA nodes with per-node memory
+   backends (`policy=bind` to lock memory to host NUMA nodes), distributes
+   vCPUs proportionally to host CPU counts, and mirrors distances. For
+   confidential guests (SEV-SNP, TDX), QEMU automatically enables
+   `guest_memfd` on each memory backend for private/shared memory
+   attribute tracking (requires the cross-region conversion patch).
+
+4. **Restructures SMP**: Sets `sockets = num_NUMA_nodes` and
+   `cores = ceil(maxvcpus / num_NUMA_nodes)` so QEMU groups vCPUs by socket
+   per NUMA node.
+
+5. **Pins vCPUs** (when enabled): Each vCPU thread is pinned to a host CPU
+   belonging to the same NUMA node. Right-sized single-node sandboxes
+   also go through this NUMA-aware path, so all vCPUs land on the chosen
+   host NUMA node's CPUs.
+
+6. **Validates VFIO devices**: Checks each cold-plugged device's host NUMA
+   node against the guest topology and logs placement status.
+
+7. **Translates cpuset.mems**: Converts host NUMA node IDs to guest node IDs
+   before forwarding to the agent.
+
+## Troubleshooting
+
+### Guest reports a single NUMA node on a multi-NUMA host
+
+**Symptom:** Inside a small pod on a 2+ NUMA-node host, `numactl --hardware`
+shows only one NUMA node, and the QEMU command line has no `-numa`
+arguments.
+
+**Cause:** Right-sizing collapsed the auto-derived topology because the
+sandbox's vCPU + memory budget fits on one host NUMA node. This is the
+intended optimization — the pod gets full memory locality without paying
+the cross-node penalty for a workload that does not need it.
+
+**Fix (only if you really want the multi-node layout):** either
+- set an explicit `numa_mapping = ["0", "1"]` (or similar) — explicit
+  mappings skip right-sizing and are honored verbatim, or
+- raise the pod's `limits.cpu` / `limits.memory` so the sandbox truly
+  exceeds any single host node's capacity.
+
+### Multi-NUMA topology is skipped (too few vCPUs)
+
+**Symptom:** The shim logs show:
+
+```
+DefaultMaxVCPUs < NUMA node count; skipping multi-NUMA topology  vcpus=1 numa-nodes=2
+```
+
+**Cause:** The pod requested fewer CPUs than there are NUMA nodes. Each
+NUMA node needs at least one vCPU.
+
+**Fix:** Request at least as many CPUs as NUMA nodes in the pod spec:
+
+```yaml
+resources:
+  limits:
+    cpu: "2"   # At least 2 for a 2-NUMA-node host
+```
+
+Or increase `default_vcpus` via a drop-in:
+
+```bash
+$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-default-vcpus.toml <<'EOF'
+[hypervisor.qemu]
+default_vcpus = 2
+EOF
+```
+
+### vCPU pinning is skipped (empty CPUSet)
+
+**Symptom:** The shim logs show:
+
+```
+sandbox CPUSet is empty; skipping vCPU pinning
+```
+
+**Cause:** The runtime could not determine a CPUSet for pinning. With
+`cpuManagerPolicy: none` and multi-NUMA enabled, the runtime derives the
+CPUSet from the guest NUMA nodes' `HostCPUs`. This message indicates no
+NUMA topology was built (e.g., the host has only one NUMA node).
+
+**Fix:** Verify:
+
+1. The host has multiple NUMA nodes (`numactl --hardware`)
+2. `enable_numa = true` is set in the Kata configuration
+3. `enable_vcpus_pinning = true` is set in the Kata configuration
+4. `static_sandbox_resource_mgmt = true` is set (so all vCPUs boot at start)
+
+### NUMA pinning fallback warning
+
+**Symptom:** The shim logs show:
+
+```
+NUMA node HostCPUs do not intersect sandbox CPUSet; falling back to full cpuset
+```
+
+**Cause:** The CPUs Kubernetes assigned to the pod do not overlap with the
+host CPUs on the NUMA node. This means NUMA locality is lost for that node.
+
+**Fix:** Verify that your `numa_mapping` matches the actual host topology:
+
+```bash
+$ numactl --hardware  # Check which CPUs are on which nodes
+```
+
+Ensure the Kubernetes node has CPUs from all mapped NUMA nodes available
+for scheduling.
+
+### Configuration validation error at startup
+
+**Symptom:**
+
+```
+NUMA support requires static_sandbox_resource_mgmt to be enabled
+```
+
+**Fix:** Add `static_sandbox_resource_mgmt` via a drop-in:
+
+```bash
+$ cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-static-resources.toml <<'EOF'
+[runtime]
+static_sandbox_resource_mgmt = true
+EOF
+```
+
+## Configuration Reference
+
+| Option | Section | Default | Description |
+|--------|---------|---------|-------------|
+| `enable_numa` | `[hypervisor.qemu]` | `false` | Enable guest NUMA topology |
+| `numa_mapping` | `[hypervisor.qemu]` | `[]` | Map guest NUMA nodes to host nodes. Empty = auto-discover with right-sizing (small sandboxes collapse to one node); non-empty = honored verbatim |
+| `static_sandbox_resource_mgmt` | `[runtime]` | varies | Size VM at boot (required for NUMA) |
+| `enable_vcpus_pinning` | `[runtime]` | `false` | Pin vCPU threads to host CPUs (NUMA-aware when NUMA enabled) |
+
+## Limitations
+
+- NUMA is only supported with the **Go runtime** and **QEMU** hypervisor.
+- Only **amd64** and **arm64** architectures are supported.
+- NUMA requires `static_sandbox_resource_mgmt = true` (no dynamic
+  CPU/memory hotplug).
+- The VM needs at least as many vCPUs as NUMA nodes. If fewer vCPUs are
+  available, multi-NUMA is skipped and the shim logs a message (see Troubleshooting).
+- vCPU pinning with NUMA works best with `cpuManagerPolicy: none` (the
+  default). Using `static` may restrict the pod's CPUSet to a single NUMA
+  node, preventing balanced pinning across nodes.
+- Confidential guests (SEV-SNP, TDX) with NUMA require a QEMU patch
+  ([accel/kvm: Fix kvm_convert_memory calls crossing memory regions](https://github.com/AMDESE/qemu/commit/6b0eaa20))
+  to handle page conversions that span multiple NUMA memory backends.
+  The GPU-experimental QEMU builds (`gpu-snp`, `gpu-tdx`) include this
+  patch. Without it, QEMU crashes with
+  `ram_block_attributes_state_change, invalid range`.
diff --git a/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md b/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md
index b20a9d9d3e..118cf16919 100644
--- a/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md
+++ b/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md
@@ -506,6 +506,17 @@ To stop the pod, run: `kubectl delete pod cuda-vectoradd-kata`.
 
 ### Next steps
 
+#### NUMA topology for GPU locality
+
+On multi-NUMA hosts, enabling NUMA support ensures GPU memory accesses stay
+local to the NUMA node where the GPU is physically attached, avoiding
+cross-NUMA latency. The NVIDIA GPU configuration templates ship with
+`enable_numa = true` by default.
+
+For details on NUMA configuration, topology verification, and
+troubleshooting, see the
+[NUMA support guide](../how-to/how-to-use-numa-with-kata.md).
+
 #### Use multi-GPU passthrough
 
 If you have machines supporting multi-GPU passthrough, use a pod deployment
diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs
index cb5c6bc3f0..a89454263e 100644
--- a/src/agent/src/linux_abi.rs
+++ b/src/agent/src/linux_abi.rs
@@ -26,15 +26,29 @@ pub fn create_pci_root_bus_path(root_complex: &str) -> String {
     format!("/devices/pci0000:{root_complex}")
 }
 
-// This is used in several modules, let's create a helper function to parse the
-// qom path and switch easily once the shim sends us the full NUMA path
+// Parses a device tree path into a (root_complex, PCI path) pair, keyed on
+// the number of '/'-separated segments:
+//   - 3 or more ("root_complex/bus/device", e.g. "10/00/02"): the first segment
+//     is the root complex, the remainder is parsed as the PCI path.
+//   - fewer (legacy "bus/device", e.g. "00/02"): root complex defaults to "00".
+// NOTE(review): a legacy path with 3+ segments would be read as the NUMA form; confirm the shim never sends one.
 pub fn pcipath_from_dev_tree_path(dev_tree_path: &str) -> Result<(&str, pci::Path)> {
-    // Placeholder until the shim send us the full NUMA path
-    // via shim in the form of root_complex/bus/device  10/00/02
-    // Currently the shim only sends us the bus/device 00/02
-    let pci_path = pci::Path::from_str(dev_tree_path)
-        .with_context(|| format!("Failed to parse PCI path from QOM path '{}'", dev_tree_path))?;
-    Ok(("00", pci_path))
+    let segments: Vec<&str> = dev_tree_path.split('/').collect();
+    if segments.len() >= 3 {
+        let root_complex = segments[0];
+        let pci_part = &dev_tree_path[root_complex.len() + 1..];
+        let pci_path = pci::Path::from_str(pci_part).with_context(|| {
+            format!(
+                "Failed to parse PCI path from NUMA path '{}'",
+                dev_tree_path
+            )
+        })?;
+        Ok((root_complex, pci_path))
+    } else {
+        let pci_path = pci::Path::from_str(dev_tree_path)
+            .with_context(|| format!("Failed to parse PCI path from '{}'", dev_tree_path))?;
+        Ok(("00", pci_path))
+    }
 }
 
 #[cfg(target_arch = "aarch64")]
diff --git a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs
index 7fbcdb2e06..d72dc73efe 100644
--- a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs
+++ b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs
@@ -7,6 +7,7 @@
 use std::collections::{HashMap, HashSet};
 use std::process;
 use std::str::FromStr;
+use std::time::Duration;
 
 use anyhow::{anyhow, Context, Result};
 use cgroups_rs::manager::is_systemd_cgroup;
@@ -16,6 +17,7 @@ use kata_types::cpu::CpuSet;
 use nix::sched::{sched_setaffinity, CpuSet as NixCpuSet};
 use nix::unistd::Pid;
 use oci_spec::runtime::{LinuxCpu, LinuxCpuBuilder, LinuxResources, LinuxResourcesBuilder};
+use tokio::time::sleep;
 
 use crate::cgroups::utils::get_tgid_from_pid;
 use crate::cgroups::CgroupConfig;
@@ -187,12 +189,46 @@ impl CgroupsResourceInner {
         let needs_thread_ids = self.overhead_cgroup.is_some() || self.enable_vcpus_pinning;
 
         let thread_ids = if needs_thread_ids {
-            Some(
-                hypervisor
-                    .get_thread_ids()
-                    .await
-                    .context("get vCPU thread IDs")?,
-            )
+            let mut tids = hypervisor
+                .get_thread_ids()
+                .await
+                .context("get vCPU thread IDs")?;
+
+            // QEMU may not have spawned all vCPU threads yet. Retry with
+            // exponential backoff until we see the expected count.
+            let expected = hypervisor.hypervisor_config().await.cpu_info.default_vcpus.ceil() as usize;
+            if expected > 0 && tids.vcpus.len() < expected {
+                const MAX_ATTEMPTS: u32 = 10;
+                let mut backoff = Duration::from_millis(50);
+                for attempt in 2..=MAX_ATTEMPTS {
+                    if tids.vcpus.len() >= expected {
+                        break;
+                    }
+                    info!(
+                        sl!(),
+                        "waiting for all vCPU threads: have {}, want {}, attempt {}",
+                        tids.vcpus.len(),
+                        expected,
+                        attempt
+                    );
+                    sleep(backoff).await;
+                    backoff *= 2;
+                    tids = hypervisor
+                        .get_thread_ids()
+                        .await
+                        .context("get vCPU thread IDs (retry)")?;
+                }
+                if tids.vcpus.len() < expected {
+                    warn!(
+                        sl!(),
+                        "not all vCPU threads available after retries: have {}, want {}; pinning available ones",
+                        tids.vcpus.len(),
+                        expected
+                    );
+                }
+            }
+
+            Some(tids)
         } else {
             None
         };
diff --git a/src/runtime/Makefile b/src/runtime/Makefile
index 91d3eb976c..88ef8077ff 100644
--- a/src/runtime/Makefile
+++ b/src/runtime/Makefile
@@ -511,6 +511,8 @@ ifneq (,$(QEMUCMD))
 
     DEFENABLEVCPUPINNING_NV = true
 
+    DEFENABLENUMA_NV = true
+
     # NVIDIA profile: rootfs filesystem type (erofs for read-only, compressed images)
     DEFROOTFSTYPE_NV := $(ROOTFSTYPE_EROFS)
 
@@ -689,6 +691,7 @@ USER_VARS += DEFAULTTIMEOUT_NV
 USER_VARS += DEFAULTLAUNCHPROCESSTIMEOUT_NV
 USER_VARS += DEFSANDBOXCGROUPONLY_NV
 USER_VARS += DEFENABLEVCPUPINNING_NV
+USER_VARS += DEFENABLENUMA_NV
 USER_VARS += DEFROOTFSTYPE_NV
 USER_VARS += DEFROOTFSTYPE
 USER_VARS += MACHINETYPE
diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in
index 4dae978b9b..b15186867d 100644
--- a/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in
+++ b/src/runtime/config/configuration-qemu-nvidia-gpu-snp.toml.in
@@ -360,7 +360,12 @@ enable_iommu_platform = false
 # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs.
 # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should
 # also be enabled for memory pre-allocation.
-enable_numa = false
+#
+# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime
+# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA
+# node covered by the guest NUMA topology, ensuring memory locality. This
+# configuration therefore enables it by default (build variable DEFENABLENUMA_NV).
+enable_numa = @DEFENABLENUMA_NV@
 
 # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes.
 # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to.
diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in
index 1c1ce20b01..2928389b1c 100644
--- a/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in
+++ b/src/runtime/config/configuration-qemu-nvidia-gpu-tdx.toml.in
@@ -337,7 +337,12 @@ enable_iommu_platform = false
 # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs.
 # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should
 # also be enabled for memory pre-allocation.
-enable_numa = false
+#
+# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime
+# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA
+# node covered by the guest NUMA topology, ensuring memory locality. This
+# configuration therefore enables it by default (build variable DEFENABLENUMA_NV).
+enable_numa = @DEFENABLENUMA_NV@
 
 # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes.
 # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to.
diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
index 49f9db0d6e..f373082129 100644
--- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
+++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
@@ -319,7 +319,12 @@ enable_iommu_platform = false
 # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs.
 # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should
 # also be enabled for memory pre-allocation.
-enable_numa = false
+#
+# GPU workloads strongly benefit from NUMA awareness: when enabled, the runtime
+# validates that each cold-plugged VFIO device (GPU) resides on a host NUMA
+# node covered by the guest NUMA topology, ensuring memory locality. This
+# configuration therefore enables it by default (build variable DEFENABLENUMA_NV).
+enable_numa = @DEFENABLENUMA_NV@
 
 # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes.
 # Each entry defines a VM NUMA node and the host NUMA node(s) it maps to.
diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in
index 811884a088..5a51f628ca 100644
--- a/src/runtime/config/configuration-qemu.toml.in
+++ b/src/runtime/config/configuration-qemu.toml.in
@@ -314,6 +314,11 @@ enable_iommu_platform = false
 # as is: map VM NUMA nodes to host 1:1 and bind vCPUs to related CPUs.
 # Note: To take proper advantage of NUMA, static_sandbox_resource_mgmt should
 # also be enabled for memory pre-allocation.
+#
+# When VFIO devices (e.g. GPUs) are cold-plugged and NUMA is enabled, the
+# runtime validates that each device's host NUMA node is covered by the guest
+# NUMA topology. A warning is logged if a device falls outside the configured
+# nodes, indicating potential cross-NUMA memory access overhead.
 enable_numa = false
 
 # NUMA node mapping allows customizing how VM NUMA nodes map to host NUMA nodes.
diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go
index b912fc6377..489cf5e4dc 100644
--- a/src/runtime/pkg/device/config/config.go
+++ b/src/runtime/pkg/device/config/config.go
@@ -235,6 +235,17 @@ var (
 	// different types of PCI ports. We can deduces the Bus number from it
 	// and eliminate duplicates being assigned.
 	PCIeDevicesPerPort = map[PCIePort][]VFIODev{}
+
+	// NUMARootPorts maps host NUMA node IDs to root port IDs on pxb-pcie
+	// bridges.  When NUMA-aware PCIe topology is active (pxb-pcie),
+	// createPCIeTopology populates this so VFIODevice.Attach() can assign
+	// each device to the root port on its host NUMA node's pxb-pcie bus.
+	// Key: host NUMA node ID, Value: slice of root port IDs on that node's pxb.
+	NUMARootPorts = map[int][]string{}
+
+	// NUMARootPortDeviceCount tracks how many devices have been assigned
+	// to each host NUMA node's root ports (for round-robin assignment).
+	NUMARootPortDeviceCount = map[int]int{}
 )
 
 // DeviceInfo is an embedded type that contains device data common to all types of devices.
@@ -418,6 +429,10 @@ type VFIODev struct {
 	// Type of VFIO device
 	Type VFIODeviceType
 
+	// NUMANode is the host NUMA node this device is attached to.
+	// -1 means no affinity or unknown.
+	NUMANode int
+
 	// IsPCIe specifies device is PCIe or PCI
 	IsPCIe bool
 
diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go
index 1e7ba5f118..d111b9e2bb 100644
--- a/src/runtime/pkg/device/drivers/utils.go
+++ b/src/runtime/pkg/device/drivers/utils.go
@@ -46,6 +46,7 @@ var (
 	PCISysFsSlotsMaxBusSpeed PCISysFsProperty = "max_bus_speed" // /sys/bus/pci/slots/xxx/max_bus_speed
 	PCISysFsDevicesVendor    PCISysFsProperty = "vendor"        // /sys/bus/pci/devices/xxx/vendor
 	PCISysFsDevicesDevice    PCISysFsProperty = "device"        // /sys/bus/pci/devices/xxx/device
+	PCISysFsDevicesNUMANode  PCISysFsProperty = "numa_node"     // /sys/bus/pci/devices/xxx/numa_node
 )
 
 func deviceLogger() *logrus.Entry {
@@ -85,6 +86,20 @@ func GetPCIDeviceProperty(bdf string, property PCISysFsProperty) string {
 	return rlt
 }
 
+// GetPCIDeviceNUMANode returns the host NUMA node for a PCI device.
+// Returns -1 if the device has no NUMA affinity or the value cannot be read.
+func GetPCIDeviceNUMANode(bdf string) int {
+	raw := GetPCIDeviceProperty(bdf, PCISysFsDevicesNUMANode)
+	if raw == "" {
+		return -1
+	}
+	n, err := strconv.Atoi(raw)
+	if err != nil {
+		return -1
+	}
+	return n
+}
+
 func readPCIProperty(propertyPath string) (string, error) {
 	var (
 		buf []byte
@@ -240,6 +255,7 @@ func GetDeviceFromVFIODev(device config.DeviceInfo) ([]*config.VFIODev, error) {
 		Class:    pciClass,
 		VendorID: vendorID,
 		DeviceID: deviceID,
+		NUMANode: GetPCIDeviceNUMANode(deviceBDF),
 		Port:     device.Port,
 		HostPath: device.HostPath,
 	}
@@ -291,7 +307,6 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
 			vendorID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesVendor)
 			deviceID := GetPCIDeviceProperty(deviceBDF, PCISysFsDevicesDevice)
 
-			// Do not directly assign to `vfio` -- need to access field still
 			vfio = config.VFIODev{
 				ID:       id,
 				Type:     vfioDeviceType,
@@ -301,6 +316,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
 				Class:    pciClass,
 				VendorID: vendorID,
 				DeviceID: deviceID,
+				NUMANode: GetPCIDeviceNUMANode(deviceBDF),
 				Port:     device.Port,
 				HostPath: device.HostPath,
 			}
@@ -315,6 +331,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
 				SysfsDev:  deviceSysfsDev,
 				Type:      config.VFIOAPDeviceMediatedType,
 				APDevices: devices,
+				NUMANode:  -1,
 				Port:      device.Port,
 			}
 		default:
diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go
index 42d86e2dca..ff70c4ac76 100644
--- a/src/runtime/pkg/device/drivers/vfio.go
+++ b/src/runtime/pkg/device/drivers/vfio.go
@@ -90,11 +90,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
 		}
 
 		if vfio.IsPCIe {
-			busIndex := len(config.PCIeDevicesPerPort[vfio.Port])
-			vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
-			// We need to keep track the number of devices per port to deduce
-			// the corectu bus number, additionally we can use the VFIO device
-			// info to act upon different Vendor IDs and Device IDs.
+			// When pxb-pcie NUMA topology is active, assign the device
+			// to a root port on the pxb-pcie bridge for its host NUMA
+			// node instead of the default rp/swdp numbering.
+			if rpIDs, ok := config.NUMARootPorts[vfio.NUMANode]; ok && len(rpIDs) > 0 {
+				idx := config.NUMARootPortDeviceCount[vfio.NUMANode]
+				vfio.Bus = rpIDs[idx%len(rpIDs)]
+				config.NUMARootPortDeviceCount[vfio.NUMANode] = idx + 1
+			} else {
+				busIndex := len(config.PCIeDevicesPerPort[vfio.Port])
+				vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
+			}
 			config.PCIeDevicesPerPort[vfio.Port] = append(config.PCIeDevicesPerPort[vfio.Port], *vfio)
 		}
 	}
diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go
index 06f9117676..5726613e3a 100644
--- a/src/runtime/pkg/device/manager/manager.go
+++ b/src/runtime/pkg/device/manager/manager.go
@@ -71,6 +71,8 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
 	config.PCIeDevicesPerPort[config.RootPort] = make([]config.VFIODev, 0)
 	config.PCIeDevicesPerPort[config.SwitchPort] = make([]config.VFIODev, 0)
 	config.PCIeDevicesPerPort[config.BridgePort] = make([]config.VFIODev, 0)
+	config.NUMARootPorts = make(map[int][]string)
+	config.NUMARootPortDeviceCount = make(map[int]int)
 
 	for _, dev := range devices {
 		dm.devices[dev.DeviceID()] = dev
diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go
index 7cf6915df9..9dca1e959e 100644
--- a/src/runtime/pkg/govmm/qemu/qemu.go
+++ b/src/runtime/pkg/govmm/qemu/qemu.go
@@ -50,6 +50,20 @@ const (
 	qgsSocketPath string = "/var/run/tdx-qgs/qgs.socket"
 )
 
+// hasPCIeRoot reports whether the configured QEMU machine type exposes a
+// `pcie.0` root complex (q35 on x86, virt on arm64).  Machines such as
+// pseries (ppc64le -> pci.0), s390-ccw-virtio (s390x -> CCW transport)
+// and microvm (no PCI at all) do not have a `pcie.0` bus, so emitting
+// `bus=pcie.0` on virtio-pci leaf devices would fail to start QEMU.
+// Used to gate the bus= pin we apply to keep leaf devices off pxb-pcie.
+func hasPCIeRoot(config *Config) bool {
+	if config == nil {
+		return false
+	}
+	t := config.Machine.Type
+	return strings.HasPrefix(t, "q35") || strings.HasPrefix(t, "virt")
+}
+
 const (
 	// Well known vsock CID for host system.
 	// https://man7.org/linux/man-pages/man7/vsock.7.html
@@ -132,6 +146,10 @@ const (
 	// VHostVSockPCI is a generic Vsock vhost device with PCI transport.
 	VHostVSockPCI DeviceDriver = "vhost-vsock-pci"
 
+	// PXBPCIe is a PCIe Expander Bridge that creates a new PCI root
+	// complex with NUMA node affinity.
+	PXBPCIe DeviceDriver = "pxb-pcie"
+
 	// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
 	PCIeRootPort DeviceDriver = "pcie-root-port"
 
@@ -152,7 +170,7 @@ const (
 
 func isDimmSupported(config *Config) bool {
 	switch runtime.GOARCH {
-	case "amd64", "386", "ppc64le", "arm64":
+	case "amd64", "ppc64le", "arm64":
 		if config != nil && config.Machine.Type == MachineTypeMicrovm {
 			// microvm does not support NUMA
 			return false
@@ -1064,6 +1082,11 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string {
 
 	if netdev.Bus != "" {
 		deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus))
+	} else if netdev.Transport.isVirtioPCI(config) && hasPCIeRoot(config) {
+		// Pin to pcie.0 (when present) so pxb-pcie can't capture
+		// this leaf device as the default bus.  Skipped on machines
+		// without a `pcie.0` root (pseries, microvm, s390-ccw-virtio).
+		deviceParams = append(deviceParams, "bus=pcie.0")
 	}
 
 	if netdev.Addr != "" {
@@ -1586,8 +1609,15 @@ func (vhostuserDev VhostUserDevice) QemuNetParams(config *Config) []string {
 	deviceParams = append(deviceParams, fmt.Sprintf("netdev=%s", vhostuserDev.TypeDevID))
 	deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", vhostuserDev.Address))
 
-	if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
-		deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
+	if vhostuserDev.Transport.isVirtioPCI(config) {
+		// Pin to pcie.0 (when present) so pxb-pcie can't capture
+		// this leaf device.  See hasPCIeRoot() for skipped machines.
+		if hasPCIeRoot(config) {
+			deviceParams = append(deviceParams, "bus=pcie.0")
+		}
+		if vhostuserDev.ROMFile != "" {
+			deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
+		}
 	}
 
 	qemuParams = append(qemuParams, "-netdev")
@@ -1612,8 +1642,13 @@ func (vhostuserDev VhostUserDevice) QemuSCSIParams(config *Config) []string {
 	deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vhostuserDev.TypeDevID))
 	deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID))
 
-	if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
-		deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
+	if vhostuserDev.Transport.isVirtioPCI(config) {
+		if hasPCIeRoot(config) {
+			deviceParams = append(deviceParams, "bus=pcie.0")
+		}
+		if vhostuserDev.ROMFile != "" {
+			deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
+		}
 	}
 
 	qemuParams = append(qemuParams, "-device")
@@ -1637,8 +1672,13 @@ func (vhostuserDev VhostUserDevice) QemuBlkParams(config *Config) []string {
 	deviceParams = append(deviceParams, "size=512M")
 	deviceParams = append(deviceParams, fmt.Sprintf("chardev=%s", vhostuserDev.CharDevID))
 
-	if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
-		deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
+	if vhostuserDev.Transport.isVirtioPCI(config) {
+		if hasPCIeRoot(config) {
+			deviceParams = append(deviceParams, "bus=pcie.0")
+		}
+		if vhostuserDev.ROMFile != "" {
+			deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
+		}
 	}
 
 	qemuParams = append(qemuParams, "-device")
@@ -1674,8 +1714,13 @@ func (vhostuserDev VhostUserDevice) QemuFSParams(config *Config) []string {
 		}
 		deviceParams = append(deviceParams, fmt.Sprintf("devno=%s", vhostuserDev.DevNo))
 	}
-	if vhostuserDev.Transport.isVirtioPCI(config) && vhostuserDev.ROMFile != "" {
-		deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
+	if vhostuserDev.Transport.isVirtioPCI(config) {
+		if hasPCIeRoot(config) {
+			deviceParams = append(deviceParams, "bus=pcie.0")
+		}
+		if vhostuserDev.ROMFile != "" {
+			deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vhostuserDev.ROMFile))
+		}
 	}
 
 	qemuParams = append(qemuParams, "-device")
@@ -1738,6 +1783,36 @@ func (vhostuserDev VhostUserDevice) deviceName(config *Config) string {
 	}
 }
 
+// PXBPCIeDevice represents a PCIe Expander Bridge (pxb-pcie).
+// It creates a new PCI root complex with NUMA node affinity, allowing
+// devices attached to its bus hierarchy to inherit the NUMA association.
+// In QEMU the numa_node property lives on PCI expander bridges such as this one, not on endpoint devices.
+type PXBPCIeDevice struct {
+	// ID is the QEMU device identifier (e.g. "pxb-numa0").
+	ID string
+
+	// BusNr is the guest PCI bus number for this root complex.
+	// Use values spaced apart (e.g. 0x20, 0x40) to leave room for
+	// bridges beneath each pxb-pcie.
+	BusNr uint8
+
+	// NUMANode is the guest NUMA node index this root complex belongs to.
+	NUMANode int
+}
+
+// QemuParams returns the QEMU parameters for a pxb-pcie device.
+func (dev PXBPCIeDevice) QemuParams(_ *Config) []string {
+	return []string{
+		"-device",
+		fmt.Sprintf("pxb-pcie,id=%s,bus_nr=%d,numa_node=%d", dev.ID, dev.BusNr, dev.NUMANode),
+	}
+}
+
+// Valid returns true if the PXBPCIeDevice structure is valid and complete.
+func (dev PXBPCIeDevice) Valid() bool {
+	return dev.ID != ""
+}
+
 // PCIeRootPortDevice represents a memory balloon device.
 // nolint: govet
 type PCIeRootPortDevice struct {
@@ -2310,8 +2385,15 @@ func (vsock VSOCKDevice) QemuParams(config *Config) []string {
 	deviceParams = append(deviceParams, fmt.Sprintf("id=%s", vsock.ID))
 	deviceParams = append(deviceParams, fmt.Sprintf("%s=%d", VSOCKGuestCID, vsock.ContextID))
 
-	if vsock.Transport.isVirtioPCI(config) && vsock.ROMFile != "" {
-		deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
+	if vsock.Transport.isVirtioPCI(config) {
+		// Pin to pcie.0 (when present) so pxb-pcie can't capture
+		// this leaf device.  See hasPCIeRoot() for skipped machines.
+		if hasPCIeRoot(config) {
+			deviceParams = append(deviceParams, "bus=pcie.0")
+		}
+		if vsock.ROMFile != "" {
+			deviceParams = append(deviceParams, fmt.Sprintf("romfile=%s", vsock.ROMFile))
+		}
 	}
 
 	if vsock.Transport.isVirtioCCW(config) {
@@ -2689,7 +2771,8 @@ type SMP struct {
 	Sockets uint32
 
 	// MaxCPUs is the maximum number of VCPUs that a VM can have.
-	// This value, if non-zero, MUST BE equal to or greater than CPUs
+	// This value, if non-zero, MUST BE equal to or greater than CPUs. With more than
+	// one NUMA node it must also equal Sockets * Cores * Threads if all are non-zero.
 	MaxCPUs uint32
 }
 
@@ -2775,6 +2858,36 @@ func (fwcfg FwCfg) QemuParams(config *Config) []string {
 	return qemuParams
 }
 
+// NUMANode describes a guest NUMA node and its mapping to host resources.
+type NUMANode struct {
+	// NodeID is the guest NUMA node identifier (0-based).
+	NodeID uint32
+
+	// CPUs is the guest vCPU range assigned to this node (e.g. "0-3").
+	CPUs string
+
+	// MemSize is the amount of memory for this node (e.g. "512M", "1G").
+	MemSize string
+
+	// HostNodes is the host NUMA node(s) this guest node maps to (e.g. "0" or "0-1").
+	HostNodes string
+
+	// MemBackendType selects the QEMU memory backend object type.
+	// Typical values: "memory-backend-ram" or "memory-backend-file".
+	MemBackendType string
+
+	// MemBackendPath is the mem-path for memory-backend-file (e.g. a hugepages mount point).
+	// Empty when using memory-backend-ram.
+	MemBackendPath string
+}
+
+// NUMADist describes a NUMA distance entry for `-numa dist`.
+type NUMADist struct {
+	Src uint32
+	Dst uint32
+	Val uint32
+}
+
 // Knobs regroups a set of qemu boolean settings
 type Knobs struct {
 	// NoUserConfig prevents qemu from loading user config files.
@@ -2922,6 +3035,14 @@ type Config struct {
 
 	IOThreads []IOThread
 
+	// NUMANodes defines multi-NUMA guest topology. When non-empty (and DIMMs
+	// are supported), appendMemoryKnobs creates per-node memory backends and
+	// -numa entries instead of a single flat memory region.
+	NUMANodes []NUMANode
+
+	// NUMADists defines inter-node distance entries emitted as -numa dist.
+	NUMADists []NUMADist
+
 	// PidFile is the -pidfile parameter
 	PidFile string
 
@@ -3096,6 +3217,13 @@ func (config *Config) appendCPUs() error {
 				return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d",
 					config.SMP.MaxCPUs, config.SMP.CPUs)
 			}
+			if len(config.NUMANodes) > 1 && config.SMP.Sockets > 0 && config.SMP.Cores > 0 && config.SMP.Threads > 0 {
+				expected := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads
+				if config.SMP.MaxCPUs != expected {
+					return fmt.Errorf("MaxCPUs %d must equal Sockets(%d) * Cores(%d) * Threads(%d) = %d",
+						config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads, expected)
+				}
+			}
 			SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs))
 		}
 
@@ -3169,6 +3297,12 @@ func (config *Config) appendMemoryKnobs() {
 	if config.Memory.Size == "" {
 		return
 	}
+
+	if len(config.NUMANodes) > 0 && isDimmSupported(config) {
+		config.appendMultiNUMAMemoryKnobs()
+		return
+	}
+
 	var objMemParam, numaMemParam string
 	dimmName := "dimm1"
 	if config.Knobs.HugePages {
@@ -3200,6 +3334,49 @@ func (config *Config) appendMemoryKnobs() {
 	}
 }
 
+func (config *Config) appendMultiNUMAMemoryKnobs() {
+	for _, node := range config.NUMANodes {
+		memID := fmt.Sprintf("numa-mem%d", node.NodeID)
+
+		backendType := node.MemBackendType
+		if backendType == "" {
+			backendType = "memory-backend-ram"
+		}
+
+		objMemParam := fmt.Sprintf("%s,id=%s,size=%s", backendType, memID, node.MemSize)
+
+		if node.MemBackendPath != "" {
+			objMemParam += ",mem-path=" + node.MemBackendPath
+		}
+
+		if node.HostNodes != "" {
+			objMemParam += ",host-nodes=" + node.HostNodes + ",policy=bind"
+		}
+
+		if config.Knobs.MemShared {
+			objMemParam += ",share=on"
+		}
+		if config.Knobs.MemPrealloc {
+			objMemParam += ",prealloc=on"
+		}
+
+		config.qemuParams = append(config.qemuParams, "-object")
+		config.qemuParams = append(config.qemuParams, objMemParam)
+
+		numaParam := fmt.Sprintf("node,nodeid=%d,memdev=%s", node.NodeID, memID)
+		if node.CPUs != "" {
+			numaParam += ",cpus=" + node.CPUs
+		}
+		config.qemuParams = append(config.qemuParams, "-numa")
+		config.qemuParams = append(config.qemuParams, numaParam)
+	}
+
+	for _, dist := range config.NUMADists {
+		config.qemuParams = append(config.qemuParams, "-numa")
+		config.qemuParams = append(config.qemuParams, fmt.Sprintf("dist,src=%d,dst=%d,val=%d", dist.Src, dist.Dst, dist.Val))
+	}
+}
+
 func (config *Config) appendKnobs() {
 	if config.Knobs.NoUserConfig {
 		config.qemuParams = append(config.qemuParams, "-no-user-config")
diff --git a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go
index a14e0fb032..36e03254ae 100644
--- a/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go
+++ b/src/runtime/pkg/govmm/qemu/qemu_arch_base_test.go
@@ -14,8 +14,8 @@ var (
 	deviceNetworkString            = "-netdev tap,id=tap0,vhost=on,ifname=ceth0,downscript=no,script=no -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,romfile=efi-virtio.rom"
 	deviceNetworkStringMq          = "-netdev tap,id=tap0,vhost=on,fds=3:4 -device driver=virtio-net-pci,netdev=tap0,mac=01:02:de:ad:be:ef,bus=/pci-bus/pcie.0,addr=ff,disable-modern=true,mq=on,vectors=6,romfile=efi-virtio.rom"
 	deviceSerialString             = "-device virtio-serial-pci,disable-modern=true,id=serial0,romfile=efi-virtio.rom,max_ports=2"
-	deviceVhostUserNetString       = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,romfile=efi-virtio.rom"
-	deviceVSOCKString              = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=efi-virtio.rom"
+	deviceVhostUserNetString       = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -netdev type=vhost-user,id=net1,chardev=char1,vhostforce -device virtio-net-pci,netdev=net1,mac=00:11:22:33:44:55,bus=pcie.0,romfile=efi-virtio.rom"
+	deviceVSOCKString              = "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,bus=pcie.0,romfile=efi-virtio.rom"
 	deviceVFIOString               = "-device vfio-pci,host=02:10.0,x-pci-vendor-id=0x1234,x-pci-device-id=0x5678,romfile=efi-virtio.rom"
 	devicePCIeRootPortSimpleString = "-device pcie-root-port,id=rp1,bus=pcie.0,chassis=0x00,slot=0x00,multifunction=off"
 	devicePCIeRootPortFullString   = "-device pcie-root-port,id=rp2,bus=pcie.0,chassis=0x0,slot=0x1,addr=0x2,multifunction=on,bus-reserve=0x3,pref64-reserve=16G,mem-reserve=1G,io-reserve=512M,romfile=efi-virtio.rom"
@@ -23,8 +23,8 @@ var (
 	deviceVFIOPCIeFullString       = "-device vfio-pci,host=02:00.0,x-pci-vendor-id=0x10de,x-pci-device-id=0x15f8,romfile=efi-virtio.rom,bus=rp1"
 	deviceSCSIControllerStr        = "-device virtio-scsi-pci,id=foo,disable-modern=false,romfile=efi-virtio.rom"
 	deviceSCSIControllerBusAddrStr = "-device virtio-scsi-pci,id=foo,bus=pci.0,addr=00:04.0,disable-modern=true,iothread=iothread1,romfile=efi-virtio.rom"
-	deviceVhostUserSCSIString      = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,romfile=efi-virtio.rom"
-	deviceVhostUserBlkString       = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,romfile=efi-virtio.rom"
+	deviceVhostUserSCSIString      = "-chardev socket,id=char1,path=/tmp/nonexistentsocket.socket -device vhost-user-scsi-pci,id=scsi1,chardev=char1,bus=pcie.0,romfile=efi-virtio.rom"
+	deviceVhostUserBlkString       = "-chardev socket,id=char2,path=/tmp/nonexistentsocket.socket -device vhost-user-blk-pci,logical_block_size=4096,size=512M,chardev=char2,bus=pcie.0,romfile=efi-virtio.rom"
 	deviceBlockString              = "-device virtio-blk-pci,disable-modern=true,drive=hd0,config-wce=off,romfile=efi-virtio.rom,share-rw=on,serial=hd0 -drive id=hd0,file=/var/lib/vm.img,aio=threads,format=qcow2,if=none,readonly=on"
 	devicePCIBridgeString          = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=on,addr=ff,romfile=efi-virtio.rom"
 	devicePCIBridgeStringReserved  = "-device pci-bridge,bus=/pci-bus/pcie.0,id=mybridge,chassis_nr=5,shpc=off,addr=ff,romfile=efi-virtio.rom,io-reserve=4k,mem-reserve=1m,pref64-reserve=1m"
@@ -42,7 +42,8 @@ func TestAppendDeviceVhostUser(t *testing.T) {
 		VhostUserType: VhostUserBlk,
 		ROMFile:       romfile,
 	}
-	testAppend(vhostuserBlkDevice, deviceVhostUserBlkString, t)
+	// vhost-user-pci device strings include bus=pcie.0 — gated to q35/virt.
+	testAppendQ35(vhostuserBlkDevice, deviceVhostUserBlkString, t)
 
 	vhostuserSCSIDevice := VhostUserDevice{
 		SocketPath:    "/tmp/nonexistentsocket.socket",
@@ -52,7 +53,7 @@ func TestAppendDeviceVhostUser(t *testing.T) {
 		VhostUserType: VhostUserSCSI,
 		ROMFile:       romfile,
 	}
-	testAppend(vhostuserSCSIDevice, deviceVhostUserSCSIString, t)
+	testAppendQ35(vhostuserSCSIDevice, deviceVhostUserSCSIString, t)
 
 	vhostuserNetDevice := VhostUserDevice{
 		SocketPath:    "/tmp/nonexistentsocket.socket",
@@ -62,7 +63,7 @@ func TestAppendDeviceVhostUser(t *testing.T) {
 		VhostUserType: VhostUserNet,
 		ROMFile:       romfile,
 	}
-	testAppend(vhostuserNetDevice, deviceVhostUserNetString, t)
+	testAppendQ35(vhostuserNetDevice, deviceVhostUserNetString, t)
 }
 
 func TestAppendVirtioBalloon(t *testing.T) {
diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go
index 5d4c15ed9d..e4616a8231 100644
--- a/src/runtime/pkg/govmm/qemu/qemu_test.go
+++ b/src/runtime/pkg/govmm/qemu/qemu_test.go
@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"os"
 	"reflect"
+	"runtime"
 	"strings"
 	"testing"
 )
@@ -23,6 +24,15 @@ func testAppend(structure interface{}, expected string, t *testing.T) {
 	testConfigAppend(&config, structure, expected, t)
 }
 
+// testAppendQ35 is testAppend with Config.Machine.Type set to "q35" so
+// device emitters that gate on hasPCIeRoot() (e.g. virtio-pci leaves
+// pinned to bus=pcie.0) take the PCIe path.  Use this for tests whose
+// expected string contains "bus=pcie.0".
+func testAppendQ35(structure interface{}, expected string, t *testing.T) {
+	config := Config{Machine: Machine{Type: "q35"}}
+	testConfigAppend(&config, structure, expected, t)
+}
+
 func testConfigAppend(config *Config, structure interface{}, expected string, t *testing.T) {
 	switch s := structure.(type) {
 	case Machine:
@@ -342,7 +352,32 @@ func TestAppendVSOCK(t *testing.T) {
 		vsockDevice.DevNo = DevNo
 	}
 
-	testAppend(vsockDevice, deviceVSOCKString, t)
+	// deviceVSOCKString includes bus=pcie.0 — gated to q35/virt machines.
+	testAppendQ35(vsockDevice, deviceVSOCKString, t)
+}
+
+// TestAppendVSOCKNoPCIeRoot verifies that on machines without a `pcie.0`
+// root (e.g. ppc64le's pseries, microvm, s390-ccw-virtio), we do NOT
+// emit `bus=pcie.0` — doing so would crash QEMU with
+// "Bus 'pcie.0' not found".  Transport and ROMFile are set explicitly
+// rather than using the arch-conditional `romfile` constant (which is
+// "" on s390x via qemu_s390x_test.go), so the test exercises the
+// same code path on every architecture.
+func TestAppendVSOCKNoPCIeRoot(t *testing.T) {
+	const vsockRomfile = "efi-virtio.rom"
+	vsockDevice := VSOCKDevice{
+		ID:            "vhost-vsock-pci0",
+		ContextID:     4,
+		VHostFD:       nil,
+		DisableModern: true,
+		ROMFile:       vsockRomfile,
+		Transport:     TransportPCI,
+	}
+
+	// pseries -> hasPCIeRoot returns false -> no bus=pcie.0 emitted.
+	expected := "-device vhost-vsock-pci,disable-modern=true,id=vhost-vsock-pci0,guest-cid=4,romfile=" + vsockRomfile
+	config := Config{Machine: Machine{Type: "pseries"}}
+	testConfigAppend(&config, vsockDevice, expected, t)
 }
 
 func TestVSOCKValid(t *testing.T) {
@@ -1117,6 +1152,140 @@ func TestBadMemoryKnobs(t *testing.T) {
 	}
 }
 
+func TestAppendMultiNUMAMemoryKnobs(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	c := &Config{
+		Memory: Memory{
+			Size:   "2G",
+			Slots:  8,
+			MaxMem: "4G",
+		},
+		NUMANodes: []NUMANode{
+			{
+				NodeID:         0,
+				CPUs:           "0-3",
+				MemSize:        "1G",
+				HostNodes:      "0",
+				MemBackendType: "memory-backend-ram",
+			},
+			{
+				NodeID:         1,
+				CPUs:           "4-7",
+				MemSize:        "1G",
+				HostNodes:      "1",
+				MemBackendType: "memory-backend-ram",
+			},
+		},
+		Knobs: Knobs{
+			MemShared:   true,
+			MemPrealloc: true,
+		},
+	}
+
+	c.appendMemoryKnobs()
+
+	expected := []string{
+		"-object", "memory-backend-ram,id=numa-mem0,size=1G,host-nodes=0,policy=bind,share=on,prealloc=on",
+		"-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-3",
+		"-object", "memory-backend-ram,id=numa-mem1,size=1G,host-nodes=1,policy=bind,share=on,prealloc=on",
+		"-numa", "node,nodeid=1,memdev=numa-mem1,cpus=4-7",
+	}
+	if len(c.qemuParams) != len(expected) {
+		t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams)
+	}
+	for i, p := range expected {
+		if c.qemuParams[i] != p {
+			t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i])
+		}
+	}
+}
+
+func TestAppendMultiNUMAHugePages(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	c := &Config{
+		Memory: Memory{
+			Size:   "2G",
+			Slots:  8,
+			MaxMem: "4G",
+		},
+		NUMANodes: []NUMANode{
+			{
+				NodeID:         0,
+				CPUs:           "0-1",
+				MemSize:        "1G",
+				HostNodes:      "0",
+				MemBackendType: "memory-backend-file",
+				MemBackendPath: "/dev/hugepages",
+			},
+			{
+				NodeID:         1,
+				CPUs:           "2-3",
+				MemSize:        "1G",
+				HostNodes:      "1",
+				MemBackendType: "memory-backend-file",
+				MemBackendPath: "/dev/hugepages",
+			},
+		},
+		Knobs: Knobs{
+			MemShared: true,
+		},
+	}
+
+	c.appendMemoryKnobs()
+
+	expected := []string{
+		"-object", "memory-backend-file,id=numa-mem0,size=1G,mem-path=/dev/hugepages,host-nodes=0,policy=bind,share=on",
+		"-numa", "node,nodeid=0,memdev=numa-mem0,cpus=0-1",
+		"-object", "memory-backend-file,id=numa-mem1,size=1G,mem-path=/dev/hugepages,host-nodes=1,policy=bind,share=on",
+		"-numa", "node,nodeid=1,memdev=numa-mem1,cpus=2-3",
+	}
+	if len(c.qemuParams) != len(expected) {
+		t.Fatalf("Expected %d params, got %d: %v", len(expected), len(c.qemuParams), c.qemuParams)
+	}
+	for i, p := range expected {
+		if c.qemuParams[i] != p {
+			t.Errorf("Param %d: expected %q, got %q", i, p, c.qemuParams[i])
+		}
+	}
+}
+
+func TestAppendNUMADist(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	c := &Config{
+		Memory: Memory{
+			Size: "2G",
+		},
+		NUMANodes: []NUMANode{
+			{NodeID: 0, CPUs: "0-1", MemSize: "1G", MemBackendType: "memory-backend-ram"},
+			{NodeID: 1, CPUs: "2-3", MemSize: "1G", MemBackendType: "memory-backend-ram"},
+		},
+		NUMADists: []NUMADist{
+			{Src: 0, Dst: 1, Val: 20},
+			{Src: 1, Dst: 0, Val: 20},
+		},
+	}
+
+	c.appendMemoryKnobs()
+
+	expectedDist := []string{
+		"-numa", "dist,src=0,dst=1,val=20",
+		"-numa", "dist,src=1,dst=0,val=20",
+	}
+	params := c.qemuParams
+	distParams := params[len(params)-4:]
+	for i, p := range expectedDist {
+		if distParams[i] != p {
+			t.Errorf("Dist param %d: expected %q, got %q", i, p, distParams[i])
+		}
+	}
+}
+
 func TestBadBios(t *testing.T) {
 	c := &Config{}
 	c.appendBios()
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index 061bf8b2ed..c5c5f70c34 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -1071,6 +1071,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		IOMMU:                         h.IOMMU,
 		IOMMUPlatform:                 h.getIOMMUPlatform(),
 		GuestNUMANodes:                h.defaultGuestNUMANodes(),
+		NUMAMapping:                   append([]string(nil), h.NUMAMapping...),
 		FileBackedMemRootDir:          h.FileBackedMemRootDir,
 		FileBackedMemRootList:         h.FileBackedMemRootList,
 		Debug:                         h.Debug,
@@ -1994,6 +1995,10 @@ func checkConfig(config oci.RuntimeConfig) error {
 		return err
 	}
 
+	if err := checkNumaConfig(config); err != nil {
+		return err
+	}
+
 	hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
 	coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
 	machineType := config.HypervisorConfig.HypervisorMachineType
@@ -2005,6 +2010,25 @@ func checkConfig(config oci.RuntimeConfig) error {
 	return nil
 }
 
+func checkNumaConfig(config oci.RuntimeConfig) error {
+	if len(config.HypervisorConfig.GuestNUMANodes) <= 1 {
+		return nil
+	}
+
+	switch goruntime.GOARCH {
+	case "amd64", "arm64":
+	default:
+		return fmt.Errorf("multi-NUMA support is only available on amd64 and arm64, got %q", goruntime.GOARCH)
+	}
+
+	if !config.StaticSandboxResourceMgmt {
+		return fmt.Errorf("NUMA support requires static_sandbox_resource_mgmt to be enabled; " +
+			"NUMA topology is not compatible with dynamic CPU/memory hotplug")
+	}
+
+	return nil
+}
+
 // checkPCIeConfig ensures the PCIe configuration is valid.
 // Only allow one of the following settings for cold-plug:
 // no-port, root-port, switch-port
diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
index b09a97e994..229f065740 100644
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -794,11 +794,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
 	}
 
 	if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
-		guestNUMANodes, err := vcutils.GetGuestNUMANodes(strings.Fields(annotation))
+		mapping := strings.Fields(annotation)
+		guestNUMANodes, err := vcutils.GetGuestNUMANodes(mapping)
 		if err != nil {
 			return err
 		}
 		sbConfig.HypervisorConfig.GuestNUMANodes = guestNUMANodes
+		// Record the raw user-provided mapping so the hypervisor
+		// backend honors it verbatim instead of right-sizing.
+		sbConfig.HypervisorConfig.NUMAMapping = mapping
 	}
 
 	return nil
@@ -1457,7 +1461,7 @@ func (a *annotationConfiguration) setFloat32WithCheck(f func(float32) error) err
 // be added to the VM if sandbox annotations are provided with this sizing details
 func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32) {
 	var memory, quota int64
-	var period uint64
+	var shares, period uint64
 	var err error
 
 	if spec == nil || spec.Annotations == nil {
@@ -1488,6 +1492,15 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32)
 		}
 	}
 
+	annotation, ok = spec.Annotations[ctrAnnotations.SandboxCPUShares]
+	if ok {
+		shares, err = strconv.ParseUint(annotation, 10, 64)
+		if err != nil {
+			ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUShares: %s", annotation)
+			shares = 0
+		}
+	}
+
 	annotation, ok = spec.Annotations[ctrAnnotations.SandboxMem]
 	if ok {
 		memory, err = strconv.ParseInt(annotation, 10, 64)
@@ -1497,7 +1510,16 @@ func CalculateSandboxSizing(spec *specs.Spec) (numCPU float32, memSizeMB uint32)
 		}
 	}
 
-	return calculateVMResources(period, quota, memory)
+	numCPU, memSizeMB = calculateVMResources(period, quota, memory)
+
+	// When cpuManagerPolicy=static is in use, kubelet sets quota=-1
+	// (unconstrained) and assigns CPUs via cpuset instead. Fall back
+	// to deriving the CPU count from shares (1024 shares per CPU).
+	if numCPU == 0 && shares > 0 {
+		numCPU = float32(math.Ceil(float64(shares) / 1024.0))
+	}
+
+	return numCPU, memSizeMB
 }
 
 // CalculateContainerSizing will calculate the number of CPUs and amount of memory that is needed
diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go
index dc96e3cf39..c3784712f3 100644
--- a/src/runtime/virtcontainers/container.go
+++ b/src/runtime/virtcontainers/container.go
@@ -1742,12 +1742,17 @@ func (c *Container) update(ctx context.Context, resources specs.LinuxResources)
 		return err
 	}
 
-	// There currently isn't a notion of cpusets.cpus or mems being tracked
-	// inside of the guest. Make sure we clear these before asking agent to update
-	// the container's cgroups.
+	// Cpus/Mems in cgroup cpuset are host-relative; clear Cpus since vCPU
+	// numbering differs inside the guest. For Mems, translate host NUMA node
+	// IDs to guest node IDs when multi-NUMA is configured, otherwise clear.
 	if resources.CPU != nil {
-		resources.CPU.Mems = ""
 		resources.CPU.Cpus = ""
+		numaNodes := c.sandbox.config.HypervisorConfig.GuestNUMANodes
+		if len(numaNodes) > 1 && resources.CPU.Mems != "" {
+			resources.CPU.Mems = translateHostMemsToGuest(resources.CPU.Mems, numaNodes)
+		} else {
+			resources.CPU.Mems = ""
+		}
 	}
 
 	return c.sandbox.agent.updateContainer(ctx, c.sandbox, *c, resources)
diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index b631960f6b..8b93b31428 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -803,6 +803,15 @@ type HypervisorConfig struct {
 	// GuestNUMANodes defines guest NUMA topology and mapping to host NUMA nodes and CPUs.
 	GuestNUMANodes []types.GuestNUMANode
 
+	// NUMAMapping is the raw user-provided NUMA mapping (TOML
+	// `numa_mapping` or the io.katacontainers.config.hypervisor.numa_mapping
+	// annotation). When empty, GuestNUMANodes was auto-derived from the
+	// host topology and may be right-sized at sandbox creation (e.g.
+	// collapsed to a single host node when the sandbox fits, or
+	// restricted to host nodes containing attached VFIO devices). When
+	// non-empty, the topology is honored verbatim.
+	NUMAMapping []string
+
 	// DisableNestingChecks is used to override customizations performed
 	// when running on top of another VMM.
 	DisableNestingChecks bool
diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go
index 381f3c8f07..1e21e4867c 100644
--- a/src/runtime/virtcontainers/hypervisor_config_linux.go
+++ b/src/runtime/virtcontainers/hypervisor_config_linux.go
@@ -63,10 +63,6 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
 		conf.DefaultMaxVCPUs = defaultMaxVCPUs
 	}
 
-	if numNUMA := conf.NumGuestNUMANodes(); numNUMA > 1 {
-		conf.DefaultMaxVCPUs -= conf.DefaultMaxVCPUs % numNUMA
-	}
-
 	if conf.Msize9p == 0 && conf.SharedFS != config.VirtioFS {
 		conf.Msize9p = defaultMsize9p
 	}
diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go
index 452c64f9ce..8b34cb246a 100644
--- a/src/runtime/virtcontainers/kata_agent.go
+++ b/src/runtime/virtcontainers/kata_agent.go
@@ -34,6 +34,7 @@ import (
 	kataclient "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/client"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
 	vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
@@ -1018,7 +1019,36 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st
 	return nil
 }
 
-func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error {
+// translateHostMemsToGuest rewrites a host-relative cpuset.mems value (for
+// example "0,2") in terms of guest NUMA node IDs. A guest node's ID is its
+// index in numaNodes, and it is selected when its HostNodes lists any of the
+// requested host nodes. Returns "" on a parse failure or when nothing matches.
+func translateHostMemsToGuest(hostMems string, numaNodes []types.GuestNUMANode) string {
+	requested, parseErr := cpuset.Parse(hostMems)
+	if parseErr != nil {
+		return ""
+	}
+	wanted := requested.ToSlice()
+	selected := make([]int, 0, len(numaNodes))
+	for idx := range numaNodes {
+		backing, err := cpuset.Parse(numaNodes[idx].HostNodes)
+		if err != nil {
+			continue
+		}
+		for i := 0; i < len(wanted); i++ {
+			if backing.Contains(wanted[i]) {
+				selected = append(selected, idx)
+				break
+			}
+		}
+	}
+	if len(selected) > 0 {
+		return cpuset.NewCPUSet(selected...).String()
+	}
+	return ""
+}
+
+func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool, numaNodes []types.GuestNUMANode) error {
 	// Disable Hooks since they have been handled on the host and there is
 	// no reason to send them to the agent. It would make no sense to try
 	// to apply them on the guest.
@@ -1060,7 +1090,6 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis
 		}
 	}
 
-	// By now only CPU constraints are supported
 	// Issue: https://github.com/kata-containers/runtime/issues/158
 	// Issue: https://github.com/kata-containers/runtime/issues/204
 	grpcSpec.Linux.Resources.Devices = nil
@@ -1069,7 +1098,12 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis
 	grpcSpec.Linux.Resources.Network = nil
 	if grpcSpec.Linux.Resources.CPU != nil {
 		grpcSpec.Linux.Resources.CPU.Cpus = ""
-		grpcSpec.Linux.Resources.CPU.Mems = ""
+		if len(numaNodes) > 1 && grpcSpec.Linux.Resources.CPU.Mems != "" {
+			guestMems := translateHostMemsToGuest(grpcSpec.Linux.Resources.CPU.Mems, numaNodes)
+			grpcSpec.Linux.Resources.CPU.Mems = guestMems
+		} else {
+			grpcSpec.Linux.Resources.CPU.Mems = ""
+		}
 	}
 
 	// Disable network and time namespaces since they are handled on the host
@@ -1495,7 +1529,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
 
 	// We need to constrain the spec to make sure we're not
 	// passing irrelevant information to the agent.
-	err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel)
+	err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel, sandbox.config.HypervisorConfig.GuestNUMANodes)
 	if err != nil {
 		return nil, err
 	}
diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go
index 62bdd76eac..4b27f0c07e 100644
--- a/src/runtime/virtcontainers/kata_agent_test.go
+++ b/src/runtime/virtcontainers/kata_agent_test.go
@@ -638,7 +638,7 @@ func TestConstrainGRPCSpec(t *testing.T) {
 	}
 
 	k := kataAgent{}
-	k.constrainGRPCSpec(g, true, true, "", true)
+	k.constrainGRPCSpec(g, true, true, "", true, nil)
 
 	// Check nil fields
 	assert.Nil(g.Hooks)
@@ -1370,3 +1370,51 @@ func TestKataAgentCreateContainerVFIODevices(t *testing.T) {
 		})
 	}
 }
+
+func TestTranslateHostMemsToGuest(t *testing.T) {
+	assert := assert.New(t)
+
+	// One host node per guest node: guest IDs mirror host IDs.
+	numaNodes := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-3"},
+		{HostNodes: "1", HostCPUs: "4-7"},
+	}
+
+	testCases := []struct {
+		hostMems string
+		expected string
+	}{
+		{hostMems: "0", expected: "0"},
+		{hostMems: "1", expected: "1"},
+		{hostMems: "0-1", expected: "0-1"},
+		{hostMems: "0,1", expected: "0-1"},
+		// Host node without a guest counterpart.
+		{hostMems: "42", expected: ""},
+		// Unparsable and empty inputs.
+		{hostMems: "invalid", expected: ""},
+		{hostMems: "", expected: ""},
+	}
+
+	for _, tc := range testCases {
+		got := translateHostMemsToGuest(tc.hostMems, numaNodes)
+		assert.Equal(tc.expected, got)
+	}
+}
+
+func TestTranslateHostMemsToGuestRangeNodes(t *testing.T) {
+	assert := assert.New(t)
+
+	numaNodes := []types.GuestNUMANode{
+		{HostNodes: "0-1", HostCPUs: "0-7"},
+		{HostNodes: "2-3", HostCPUs: "8-15"},
+	}
+
+	// Each pair is {host cpuset.mems, expected guest cpuset.mems}.
+	for _, pair := range [][2]string{
+		{"1", "0"},
+		{"2", "1"},
+		{"0,3", "0-1"},
+	} {
+		assert.Equal(pair[1], translateHostMemsToGuest(pair[0], numaNodes))
+	}
+}
diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go
index 4066c85e48..74818ff5d6 100644
--- a/src/runtime/virtcontainers/qemu.go
+++ b/src/runtime/virtcontainers/qemu.go
@@ -21,6 +21,7 @@ import (
 	"os/user"
 	"path/filepath"
 	"regexp"
+	goruntime "runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -44,6 +45,7 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
 	pkgUtils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
 	"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
 )
@@ -250,6 +252,14 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
 	span, _ := katatrace.Trace(ctx, q.Logger(), "setup", qemuTracingTags, map[string]string{"sandbox_id": q.id})
 	defer span.End()
 
+	// Right-size auto-derived NUMA topology before snapshotting the config.
+	// We mutate the caller-owned pointer so the sandbox's shared
+	// HypervisorConfig (used by vCPU pinning and cpuset.mems forwarding)
+	// observes the same trimmed topology that QEMU is launched with.
+	// No-op when numa_mapping was set explicitly or when the topology
+	// already has one or zero nodes.
+	maybeRightSizeAutoNUMA(hypervisorConfig, q.Logger())
+
 	if err := q.setConfig(hypervisorConfig); err != nil {
 		return err
 	}
@@ -325,8 +335,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
 	return nil
 }
 
-func (q *qemu) cpuTopology() govmmQemu.SMP {
-	return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs)
+func (q *qemu) cpuTopology(effectiveNUMANodes uint32) govmmQemu.SMP {
+	return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, effectiveNUMANodes)
 }
 
 func (q *qemu) memoryTopology() (govmmQemu.Memory, error) {
@@ -339,6 +349,407 @@ func (q *qemu) memoryTopology() (govmmQemu.Memory, error) {
 	return q.arch.memoryTopology(memMb, 0, 0), nil
 }
 
+// vfioHostNUMANodes returns the set of host NUMA node IDs reported by the
+// VFIO devices behind the given device list. Paths under
+// pkgDevice.IommufdDevPath are enumerated with GetDeviceFromVFIODev, all
+// others with GetAllVFIODevicesFromIOMMUGroup. Devices reporting a negative
+// NUMA node are ignored; host-path and enumeration failures are logged as
+// warnings and contribute nothing. It is a free function (not a method) so
+// it can be invoked before q.config is populated, e.g. during
+// pre-setConfig right-sizing.
+func vfioHostNUMANodes(devices []config.DeviceInfo, log *logrus.Entry) map[int]struct{} {
+	hostNodes := map[int]struct{}{}
+	for _, devInfo := range devices {
+		resolved, err := config.GetHostPath(devInfo, false, "")
+		if err != nil {
+			log.WithError(err).WithField("device", devInfo.HostPath).Warn("Failed to resolve VFIO device host path for NUMA placement")
+			continue
+		}
+		devInfo.HostPath = resolved
+		var members []*config.VFIODev
+		if !strings.HasPrefix(devInfo.HostPath, pkgDevice.IommufdDevPath) {
+			members, err = drivers.GetAllVFIODevicesFromIOMMUGroup(devInfo)
+		} else {
+			members, err = drivers.GetDeviceFromVFIODev(devInfo)
+		}
+		if err != nil {
+			log.WithError(err).WithField("device", devInfo.HostPath).Warn("Failed to enumerate VFIO device(s) for NUMA placement")
+			continue
+		}
+		for i := range members {
+			if node := members[i].NUMANode; node >= 0 {
+				hostNodes[node] = struct{}{}
+			}
+		}
+	}
+	return hostNodes
+}
+
+// guestNodeCoversAny reports whether at least one host NUMA ID listed in
+// guestNode.HostNodes is a member of hostSet; parse failures yield false.
+func guestNodeCoversAny(guestNode types.GuestNUMANode, hostSet map[int]struct{}) bool {
+	if len(hostSet) == 0 {
+		return false
+	}
+	covered := false
+	if ids, err := cpuset.Parse(guestNode.HostNodes); err == nil {
+		for _, hostID := range ids.ToSlice() {
+			if _, present := hostSet[hostID]; present {
+				covered = true
+				break
+			}
+		}
+	}
+	return covered
+}
+
+// guestNodeHostIDs lists the host NUMA IDs named by gn.HostNodes, or nil
+// when that value cannot be parsed as a cpuset list.
+func guestNodeHostIDs(gn types.GuestNUMANode) []int {
+	if hostSet, err := cpuset.Parse(gn.HostNodes); err == nil {
+		return hostSet.ToSlice()
+	}
+	return nil
+}
+
+// hostNUMACapFn returns the (cpu_count, mem_mb) capacity of a host NUMA
+// node. Used to inject sysfs reads for testability.
+type hostNUMACapFn func(nodeID int) (cpus int, memMB uint64, err error)
+
+// realHostNUMACapFn adapts utils.GetHostNUMANodeCapacity to hostNUMACapFn.
+func realHostNUMACapFn(nodeID int) (int, uint64, error) {
+	capacity, err := utils.GetHostNUMANodeCapacity(nodeID)
+	if err == nil {
+		return capacity.CPUs, capacity.MemMB, nil
+	}
+	return 0, 0, err
+}
+
+// sumNUMACapacity adds up the CPU count and memory (MB) reported by capFn
+// for every distinct host NUMA node referenced by nodes. A host node is
+// counted once even if listed repeatedly; capFn failures contribute nothing.
+func sumNUMACapacity(nodes []types.GuestNUMANode, capFn hostNUMACapFn) (int, uint64) {
+	var (
+		cpuSum  int
+		memSum  uint64
+		visited = make(map[int]struct{})
+	)
+	for i := range nodes {
+		for _, hostID := range guestNodeHostIDs(nodes[i]) {
+			if _, dup := visited[hostID]; dup {
+				continue
+			}
+			visited[hostID] = struct{}{}
+			if cpus, memMB, err := capFn(hostID); err == nil {
+				cpuSum += cpus
+				memSum += memMB
+			}
+		}
+	}
+	return cpuSum, memSum
+}
+
+// selectNUMANodes is the right-sizing decision: given an auto-derived guest
+// NUMA topology, the sandbox's CPU/memory budget, the set of host NUMA
+// nodes containing an attached VFIO device, and a capacity oracle, it
+// returns either a subset of numaNodes or numaNodes unchanged.
+//
+// Heuristic, in order:
+//
+//  1. If vfioHostSet is non-empty, select the guest nodes whose HostNodes
+//     intersect it. Return that subset when its summed capacity fits the
+//     sandbox; otherwise (or when no guest node matches) return the input.
+//  2. With no VFIO devices, take the minimum CPU count and the minimum
+//     memory over all guest nodes (each minimum independently). If both
+//     cover the sandbox, return only the first guest node.
+//  3. Otherwise, return the input unchanged.
+//
+// Apart from logging, all host state is read through capFn, which keeps the
+// function unit-testable; production code passes realHostNUMACapFn.
+func selectNUMANodes(
+	numaNodes []types.GuestNUMANode,
+	vcpus uint32,
+	memMB uint64,
+	vfioHostSet map[int]struct{},
+	capFn hostNUMACapFn,
+	log *logrus.Entry,
+) []types.GuestNUMANode {
+	if len(numaNodes) <= 1 {
+		return numaNodes
+	}
+
+	// 1) VFIO-aware: keep the guest nodes covering device-bearing host nodes.
+	if len(vfioHostSet) > 0 {
+		var covered []types.GuestNUMANode
+		for _, gn := range numaNodes {
+			if guestNodeCoversAny(gn, vfioHostSet) {
+				covered = append(covered, gn)
+			}
+		}
+		if len(covered) == 0 {
+			log.WithField("vfio-host-nodes", vfioHostSet).
+				Warn("No guest NUMA node covers VFIO device host nodes; keeping full topology")
+			return numaNodes
+		}
+		cpus, memCap := sumNUMACapacity(covered, capFn)
+		if uint32(cpus) >= vcpus && memCap >= memMB {
+			log.WithFields(logrus.Fields{
+				"selected-nodes":  len(covered),
+				"input-nodes":     len(numaNodes),
+				"vfio-host-nodes": vfioHostSet,
+				"vcpus":           vcpus,
+				"mem-mb":          memMB,
+			}).Info("Right-sized NUMA topology to VFIO-aligned subset")
+			return covered
+		}
+		log.WithFields(logrus.Fields{
+			"vfio-host-nodes":  vfioHostSet,
+			"covered-cpus":     cpus,
+			"covered-mem-mb":   memCap,
+			"requested-vcpus":  vcpus,
+			"requested-mem-mb": memMB,
+		}).Info("VFIO-aligned NUMA subset too small for sandbox; keeping full topology")
+		return numaNodes
+	}
+
+	// 2) No VFIO constraints: collapse when the sandbox fits within the
+	// smallest per-guest-node CPU count and memory capacity.
+	var smallestCPUs int = -1
+	var smallestMem uint64 = math.MaxUint64
+	for _, gn := range numaNodes {
+		cpus, memCap := sumNUMACapacity([]types.GuestNUMANode{gn}, capFn)
+		if smallestCPUs < 0 || cpus < smallestCPUs {
+			smallestCPUs = cpus
+		}
+		if memCap < smallestMem {
+			smallestMem = memCap
+		}
+	}
+	if smallestCPUs > 0 && uint32(smallestCPUs) >= vcpus && smallestMem >= memMB {
+		log.WithFields(logrus.Fields{
+			"input-nodes":         len(numaNodes),
+			"vcpus":               vcpus,
+			"mem-mb":              memMB,
+			"smallest-node-cpus":  smallestCPUs,
+			"smallest-node-memMB": smallestMem,
+		}).Info("Right-sized NUMA topology: sandbox fits in a single host node")
+		return numaNodes[:1]
+	}
+
+	// 3) Sandbox spans multiple nodes; preserve the auto-derived topology.
+	return numaNodes
+}
+
+// maybeRightSizeAutoNUMA trims an auto-derived guest NUMA topology in place
+// on hc via selectNUMANodes. It does nothing when hc is nil, when an
+// explicit numa_mapping was recorded in hc.NUMAMapping (TOML or
+// annotation), or when the topology has at most one node.
+//
+// This must run before the config is consumed by the rest of the runtime
+// (sandbox vCPU pinning, cpuset.mems forwarding, QEMU command-line build),
+// so callers should invoke it on the *shared* HypervisorConfig pointer
+// owned by the sandbox, not on a local copy.
+func maybeRightSizeAutoNUMA(hc *HypervisorConfig, log *logrus.Entry) {
+	if hc == nil {
+		return
+	}
+	if len(hc.NUMAMapping) > 0 || len(hc.GuestNUMANodes) <= 1 {
+		return
+	}
+	// The budget is DefaultMaxVCPUs and MemorySize as currently set on hc.
+	budgetVCPUs, budgetMemMB := hc.DefaultMaxVCPUs, uint64(hc.MemorySize)
+	vfioNodes := vfioHostNUMANodes(hc.VFIODevices, log)
+	rightSized := selectNUMANodes(hc.GuestNUMANodes, budgetVCPUs, budgetMemMB, vfioNodes, realHostNUMACapFn, log)
+	hc.GuestNUMANodes = rightSized
+}
+
+// buildNUMATopology translates q.config.GuestNUMANodes into the govmm NUMA
+// node and distance lists. It returns (nil, nil, nil) when NUMA placement
+// is inactive or when there are fewer vCPUs than guest NUMA nodes.
+func (q *qemu) buildNUMATopology() ([]govmmQemu.NUMANode, []govmmQemu.NUMADist, error) {
+	// GuestNUMANodes was right-sized (when applicable) by maybeRightSizeAutoNUMA()
+	// at setup time; a single node with a HostNodes binding must still be emitted.
+	numaNodes := q.config.GuestNUMANodes
+	if !numaPlacementActive(numaNodes) {
+		return nil, nil, nil
+	}
+
+	switch goruntime.GOARCH {
+	case "amd64", "arm64":
+	default:
+		return nil, nil, fmt.Errorf("multi-NUMA not supported on architecture %s", goruntime.GOARCH)
+	}
+
+	// NUMA requires static_sandbox_resource_mgmt=true, which guarantees
+	// NumVCPUs == DefaultMaxVCPUs (set in oci/utils.go), so every vCPU in
+	// the per-node CPU ranges below exists at VM start.
+	// cpuTopology() rounds MaxCPUs up to numNUMANodes * coresPerSocket; the
+	// same ceiling is applied here so every CPU slot belongs to a node.
+	// NOTE(review): the early return below drops the NUMA nodes, but
+	// createPCIeTopology checks numaPlacementActive(GuestNUMANodes) on its
+	// own -- confirm no pxb-pcie is emitted without matching NUMA nodes.
+	numNodes := uint32(len(numaNodes))
+	if q.config.DefaultMaxVCPUs < numNodes {
+		hvLogger.WithFields(logrus.Fields{
+			"vcpus":      q.config.DefaultMaxVCPUs,
+			"numa-nodes": numNodes,
+		}).Warn("DefaultMaxVCPUs < NUMA node count; skipping multi-NUMA topology")
+		return nil, nil, nil
+	}
+	coresPerSocket := (q.config.DefaultMaxVCPUs + numNodes - 1) / numNodes
+	maxVCPUs := numNodes * coresPerSocket
+
+	vcpusPerNode, err := utils.DistributeVCPUsProportionally(numaNodes, maxVCPUs)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to distribute vCPUs across NUMA nodes: %w", err)
+	}
+
+	memMb := uint64(q.config.MemorySize)
+
+	var memAlign uint64 = 1
+	if q.config.HugePages {
+		memAlign = 2
+	}
+
+	backendType := "memory-backend-ram"
+	backendPath := ""
+	if q.config.HugePages {
+		backendType = "memory-backend-file"
+		backendPath = "/dev/hugepages"
+	} else if q.config.SharedFS == config.VirtioFS || q.config.SharedFS == config.VirtioFSNydus ||
+		q.config.FileBackedMemRootDir != "" {
+		backendType = "memory-backend-file"
+		if q.config.FileBackedMemRootDir != "" {
+			backendPath = q.config.FileBackedMemRootDir
+		} else {
+			backendPath = fallbackFileBackedMemDir
+		}
+	}
+	if backendPath != "" {
+		if _, err := os.Stat(backendPath); err != nil {
+			return nil, nil, fmt.Errorf("NUMA memory backend path %q does not exist: %w", backendPath, err)
+		}
+	}
+
+	// Distribute memory proportionally to vCPU counts, aligned to memAlign.
+	memPerNode := make([]uint64, numNodes)
+	var memAssigned uint64
+	for i := uint32(0); i < numNodes; i++ {
+		raw := memMb * uint64(vcpusPerNode[i]) / uint64(maxVCPUs)
+		memPerNode[i] = (raw / memAlign) * memAlign
+		if memPerNode[i] == 0 {
+			memPerNode[i] = memAlign
+		}
+		memAssigned += memPerNode[i]
+	}
+	// Give the remainder to the last node (must also be aligned).
+	if memAssigned < memMb {
+		remainder := memMb - memAssigned
+		if remainder%memAlign != 0 {
+			return nil, nil, fmt.Errorf("MemorySize (%d MiB) cannot be evenly distributed across %d NUMA nodes with %d MiB alignment",
+				memMb, numNodes, memAlign)
+		}
+		memPerNode[numNodes-1] += remainder
+	} else if memAssigned > memMb {
+		return nil, nil, fmt.Errorf("MemorySize (%d MiB) cannot be evenly distributed across %d NUMA nodes with %d MiB alignment",
+			memMb, numNodes, memAlign)
+	}
+
+	var nodes []govmmQemu.NUMANode
+	var cpuOffset uint32
+	for i, gn := range numaNodes {
+		startCPU := cpuOffset
+		endCPU := startCPU + vcpusPerNode[i] - 1
+		cpuOffset = endCPU + 1
+		cpuRange := fmt.Sprintf("%d-%d", startCPU, endCPU)
+
+		nodes = append(nodes, govmmQemu.NUMANode{
+			NodeID:         uint32(i),
+			CPUs:           cpuRange,
+			MemSize:        fmt.Sprintf("%dM", memPerNode[i]),
+			HostNodes:      gn.HostNodes,
+			MemBackendType: backendType,
+			MemBackendPath: backendPath,
+		})
+	}
+
+	var dists []govmmQemu.NUMADist
+	hostDists := utils.GetHostNUMADistances(numaNodes)
+	for _, hd := range hostDists {
+		dists = append(dists, govmmQemu.NUMADist{
+			Src: hd.Src,
+			Dst: hd.Dst,
+			Val: hd.Val,
+		})
+	}
+
+	q.validateVFIODeviceNUMAPlacement(numaNodes)
+
+	return nodes, dists, nil
+}
+
+// buildCoveredHostNodes returns a lookup from host NUMA node ID to the index
+// of the guest NUMA node whose HostNodes lists it (the last such node wins).
+func buildCoveredHostNodes(numaNodes []types.GuestNUMANode) map[int]uint32 {
+	hostToGuest := map[int]uint32{}
+	for idx := range numaNodes {
+		hostIDs, err := cpuset.Parse(numaNodes[idx].HostNodes)
+		// Guest nodes with an unparsable HostNodes value are ignored.
+		if err == nil {
+			for _, hostID := range hostIDs.ToSlice() {
+				hostToGuest[hostID] = uint32(idx)
+			}
+		}
+	}
+	return hostToGuest
+}
+
+// validateVFIODeviceNUMAPlacement logs, for every VFIO device reachable from
+// q.config.VFIODevices, whether its host NUMA node is covered by numaNodes:
+// a warning if not, a debug entry if so. It returns nothing and only logs.
+func (q *qemu) validateVFIODeviceNUMAPlacement(numaNodes []types.GuestNUMANode) {
+	coveredHostNodes := buildCoveredHostNodes(numaNodes)
+
+	for _, dev := range q.config.VFIODevices {
+		hostPath, err := config.GetHostPath(dev, false, "")
+		if err != nil {
+			q.Logger().WithError(err).WithField("device", dev.HostPath).Warn("Failed to resolve VFIO device host path for NUMA placement validation")
+			continue
+		}
+		dev.HostPath = hostPath
+		var vfioDevs []*config.VFIODev
+		if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) {
+			vfioDevs, err = drivers.GetDeviceFromVFIODev(dev)
+		} else {
+			vfioDevs, err = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
+		}
+		if err != nil {
+			q.Logger().WithError(err).WithField("device", dev.HostPath).Warn("Failed to enumerate VFIO device(s) for NUMA placement validation")
+			continue
+		}
+		for _, vd := range vfioDevs {
+			if vd.NUMANode < 0 {
+				continue
+			}
+			guestNode, ok := coveredHostNodes[vd.NUMANode]
+			if !ok {
+				q.Logger().WithFields(logrus.Fields{
+					"bdf":           vd.BDF,
+					"host-numa":     vd.NUMANode,
+					"guest-numa":    "none",
+					"covered-nodes": coveredHostNodes,
+				}).Warn("VFIO device on host NUMA node not covered by guest NUMA topology; cross-NUMA memory accesses may occur")
+			} else {
+				q.Logger().WithFields(logrus.Fields{
+					"bdf":        vd.BDF,
+					"host-numa":  vd.NUMANode,
+					"guest-numa": guestNode,
+				}).Debug("VFIO device NUMA placement validated")
+			}
+		}
+	}
+}
+
 func (q *qemu) qmpSocketPath(id string) (string, error) {
 	return utils.BuildSocketPath(q.config.VMStorePath, id, qmpSocket)
 }
@@ -596,7 +1007,13 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
 		return err
 	}
 
-	smp := q.cpuTopology()
+	numaNodes, numaDists, err := q.buildNUMATopology()
+	if err != nil {
+		return err
+	}
+
+	effectiveNUMANodes := uint32(len(numaNodes))
+	smp := q.cpuTopology(effectiveNUMANodes)
 
 	memory, err := q.memoryTopology()
 	if err != nil {
@@ -717,6 +1134,8 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
 		QMPSockets:     qmpSockets,
 		Knobs:          knobs,
 		Incoming:       incoming,
+		NUMANodes:      numaNodes,
+		NUMADists:      numaDists,
 		VGA:            "none",
 		GlobalParam:    "kvm-pit.lost_tick_policy=discard",
 		Bios:           firmwarePath,
@@ -881,6 +1300,15 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
 		if numOfPluggablePorts > maxPCIeRootPort {
 			return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
 		}
+
+		// When NUMA is active (multi-node OR a single node right-sized to a
+		// specific host node), create pxb-pcie bridges so cold-plugged VFIO
+		// devices inherit the correct guest NUMA affinity.
+		if numaPlacementActive(q.config.GuestNUMANodes) && len(hypervisorConfig.VFIODevices) > 0 {
+			qemuConfig.Devices = q.createNUMAPCIeTopology(qemuConfig.Devices, hypervisorConfig, numOfPluggablePorts)
+			return nil
+		}
+
 		qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts)
 		return nil
 	}
@@ -2660,7 +3088,107 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
 	return memory
 }
 
-// genericAppendPCIeRootPort appends to devices the given pcie-root-port
+// numaPlacementActive reports whether the runtime should emit per-NUMA
+// pxb-pcie / memory-binding QEMU args: true for more than one guest node,
+// or for a single guest node with a non-empty HostNodes binding.
+//
+// After right-sizing, the single-node case may be either a multi-NUMA host
+// collapsed to one host node (binding required so the guest device reports
+// the correct NUMA affinity) or a single-NUMA host with `enable_numa=true`
+// (binding is a functional no-op); the runtime cannot tell them apart.
+// A single node without a HostNodes value, or no nodes at all, falls
+// through to the flat memdev path.
+func numaPlacementActive(nodes []types.GuestNUMANode) bool {
+	switch len(nodes) {
+	case 0:
+		return false
+	case 1:
+		return nodes[0].HostNodes != ""
+	default:
+		return true
+	}
+}
+
+// createNUMAPCIeTopology creates one pxb-pcie bridge per guest NUMA node
+// backing a VFIO PCIe device, with one root port per device, and records the
+// root port IDs in config.NUMARootPorts keyed by host NUMA node. Guest nodes
+// are visited in index order and a guest node spanning several host nodes
+// gets a single bridge, so the output is deterministic and IDs stay unique.
+func (q *qemu) createNUMAPCIeTopology(devices []govmmQemu.Device, hypervisorConfig *HypervisorConfig, totalPorts uint32) []govmmQemu.Device {
+	coveredHostNodes := buildCoveredHostNodes(q.config.GuestNUMANodes)
+
+	// Count VFIO devices per host NUMA node.
+	numaDevCount := make(map[int]int)
+	for _, dev := range hypervisorConfig.VFIODevices {
+		hostPath, err := config.GetHostPath(dev, false, "")
+		if err != nil {
+			continue
+		}
+		dev.HostPath = hostPath
+		var vfioDevs []*config.VFIODev
+		if strings.HasPrefix(dev.HostPath, pkgDevice.IommufdDevPath) {
+			vfioDevs, _ = drivers.GetDeviceFromVFIODev(dev)
+		} else {
+			vfioDevs, _ = drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
+		}
+		for _, vd := range vfioDevs {
+			if vd.NUMANode >= 0 && drivers.IsPCIeDevice(vd.BDF) {
+				numaDevCount[vd.NUMANode]++
+			}
+		}
+	}
+
+	if len(numaDevCount) == 0 {
+		return q.arch.appendPCIeRootPortDevice(devices, totalPorts)
+	}
+
+	for hostNode := range numaDevCount {
+		if _, ok := coveredHostNodes[hostNode]; !ok {
+			q.Logger().WithField("host-numa", hostNode).Warn("VFIO device on uncovered NUMA node; skipping pxb-pcie")
+		}
+	}
+	const busNrSpacing, maxPXBNodes = 0x20, 0xff / 0x20 // bus_nr is 8 bits: only guest nodes 0-6 fit
+	for guestNode, gn := range q.config.GuestNUMANodes {
+		hostNodes, err := cpuset.Parse(gn.HostNodes)
+		if err != nil {
+			continue
+		}
+		pxbID := fmt.Sprintf("pxb-numa%d", guestNode)
+		slot := 0 // next free root port slot on this guest node's bridge
+		for _, hostNode := range hostNodes.ToSlice() {
+			devCount := numaDevCount[hostNode]
+			if devCount == 0 || coveredHostNodes[hostNode] != uint32(guestNode) {
+				continue
+			}
+			if guestNode >= maxPXBNodes {
+				q.Logger().WithField("guest-numa", guestNode).Warn("Guest NUMA node exceeds pxb-pcie bus number range; skipping pxb-pcie")
+				break
+			}
+			busNr := uint8(busNrSpacing * (guestNode + 1))
+			if slot == 0 {
+				devices = append(devices, govmmQemu.PXBPCIeDevice{ID: pxbID, BusNr: busNr, NUMANode: guestNode})
+			}
+			var rpIDs []string
+			for i := 0; i < devCount; i, slot = i+1, slot+1 {
+				rpID := fmt.Sprintf("rp-numa%d-%d", guestNode, slot)
+				rpIDs = append(rpIDs, rpID)
+				devices = append(devices, govmmQemu.PCIeRootPortDevice{
+					ID:      rpID,
+					Bus:     pxbID,
+					Chassis: fmt.Sprintf("%d", 10+guestNode),
+					Slot:    fmt.Sprintf("%d", slot),
+				})
+			}
+			config.NUMARootPorts[hostNode] = rpIDs
+			q.Logger().WithFields(logrus.Fields{
+				"pxb-id": pxbID, "bus-nr": busNr, "guest-numa": guestNode, "host-numa": hostNode, "root-ports": rpIDs,
+			}).Info("Created pxb-pcie with root ports for NUMA VFIO placement")
+		}
+	}
+	return devices
+}
+
+// genericAppendPCIeRootPort appends to devices the given pcie-root-port
 func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device {
 	var (
 		bus           string
diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go
index aacb97b7cc..f3bba704ca 100644
--- a/src/runtime/virtcontainers/qemu_arch_base.go
+++ b/src/runtime/virtcontainers/qemu_arch_base.go
@@ -61,8 +61,9 @@ type qemuArch interface {
 	// bridges sets the number bridges for the machine type
 	bridges(number uint32)
 
-	// cpuTopology returns the CPU topology for the given amount of vcpus
-	cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP
+	// cpuTopology returns the CPU topology for the given amount of vcpus.
+	// numNUMANodes > 1 restructures the topology so vCPUs are grouped by socket per NUMA node.
+	cpuTopology(vcpus, maxvcpus uint32, numNUMANodes uint32) govmmQemu.SMP
 
 	// cpuModel returns the CPU model for the machine type
 	cpuModel() string
@@ -324,16 +325,29 @@ func (q *qemuArchBase) bridges(number uint32) {
 	}
 }
 
-func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP {
-	smp := govmmQemu.SMP{
+func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32, numNUMANodes uint32) govmmQemu.SMP {
+	if numNUMANodes > 1 { // NUMA layout: one socket per NUMA node
+		coresPerSocket := (maxvcpus + numNUMANodes - 1) / numNUMANodes // ceil(maxvcpus / numNUMANodes)
+		if coresPerSocket == 0 { // e.g. maxvcpus == 0; keep at least one core per socket
+			coresPerSocket = 1
+		}
+		smpMaxCPUs := numNUMANodes * coresPerSocket * defaultThreads // may exceed maxvcpus because of the rounding above
+		return govmmQemu.SMP{
+			CPUs:    vcpus,
+			Sockets: numNUMANodes,
+			Cores:   coresPerSocket,
+			Threads: defaultThreads,
+			MaxCPUs: smpMaxCPUs,
+		}
+	}
+
+	return govmmQemu.SMP{ // flat layout: Sockets is set to maxvcpus
 		CPUs:    vcpus,
 		Sockets: maxvcpus,
 		Cores:   defaultCores,
 		Threads: defaultThreads,
 		MaxCPUs: maxvcpus,
 	}
-
-	return smp
 }
 
 func (q *qemuArchBase) cpuModel() string {
diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go
index dfaebb8dab..c177ee44a8 100644
--- a/src/runtime/virtcontainers/qemu_arch_base_test.go
+++ b/src/runtime/virtcontainers/qemu_arch_base_test.go
@@ -189,7 +189,46 @@ func TestQemuArchBaseCPUTopology(t *testing.T) {
 		MaxCPUs: defaultMaxVCPUs,
 	}
 
-	smp := qemuArchBase.cpuTopology(vcpus, defaultMaxVCPUs)
+	smp := qemuArchBase.cpuTopology(vcpus, defaultMaxVCPUs, 0)
+	assert.Equal(expectedSMP, smp)
+}
+
+func TestQemuArchBaseCPUTopologyNUMA(t *testing.T) {
+	// 8 max vCPUs over 2 NUMA nodes divide evenly: 2 sockets, 4 cores each.
+	const (
+		bootCPUs  uint32 = 2
+		maxCPUs   uint32 = 8
+		nodeCount uint32 = 2
+	)
+	want := govmmQemu.SMP{
+		CPUs:    bootCPUs,
+		Sockets: nodeCount,
+		Cores:   maxCPUs / nodeCount,
+		Threads: defaultThreads,
+		MaxCPUs: maxCPUs,
+	}
+	arch := newQemuArchBase()
+	got := arch.cpuTopology(bootCPUs, maxCPUs, nodeCount)
+	assert.New(t).Equal(want, got)
+}
+
+func TestQemuArchBaseCPUTopologyNUMAUneven(t *testing.T) {
+	// 5 max vCPUs do not divide evenly over 2 nodes, so cores per socket
+	// round up and the resulting MaxCPUs exceeds the requested maximum.
+	assert := assert.New(t)
+	var bootCPUs, maxCPUs, nodeCount uint32 = 2, 5, 2
+
+	perSocket := (maxCPUs + nodeCount - 1) / nodeCount // ceiling division
+	expectedSMP := govmmQemu.SMP{
+		CPUs:    bootCPUs,
+		Sockets: nodeCount,
+		Cores:   perSocket,
+		Threads: defaultThreads,
+		MaxCPUs: nodeCount * perSocket * defaultThreads,
+	}
+
+	arch := newQemuArchBase()
+	smp := arch.cpuTopology(bootCPUs, maxCPUs, nodeCount)
 	assert.Equal(expectedSMP, smp)
 }
 
diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go
index 5d4267f011..9fcb8dc1fa 100644
--- a/src/runtime/virtcontainers/qemu_test.go
+++ b/src/runtime/virtcontainers/qemu_test.go
@@ -19,6 +19,7 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"runtime"
 	"testing"
 
 	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
@@ -29,6 +30,7 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
 	"github.com/pbnjay/memory"
 	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"
 )
 
@@ -283,7 +285,7 @@ func TestQemuCPUTopology(t *testing.T) {
 		MaxCPUs: uint32(vcpus),
 	}
 
-	smp := q.cpuTopology()
+	smp := q.cpuTopology(0)
 	assert.Exactly(smp, expectedOut)
 }
 
@@ -1200,3 +1202,672 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) {
 	// State should remain unchanged
 	assert.Equal(100, q.state.HotpluggedMemory)
 }
+
+func TestNumaPlacementActive(t *testing.T) {
+	// Table-driven: each row is a guest topology and the expected verdict.
+	check := assert.New(t)
+	for _, tc := range []struct {
+		name  string
+		nodes []types.GuestNUMANode
+		want  bool
+	}{
+		{"empty", nil, false},
+		{"single-node-no-binding", []types.GuestNUMANode{{}}, false},
+		{"single-node-host-0", []types.GuestNUMANode{{HostNodes: "0"}}, true},
+		{"single-node-host-1", []types.GuestNUMANode{{HostNodes: "1"}}, true},
+		{"single-node-host-range", []types.GuestNUMANode{{HostNodes: "0-1"}}, true},
+		{"two-nodes", []types.GuestNUMANode{{HostNodes: "0"}, {HostNodes: "1"}}, true},
+	} {
+		check.Equal(tc.want, numaPlacementActive(tc.nodes), tc.name)
+	}
+}
+
+func TestBuildNUMATopologySingleNode(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	// Even a lone guest node must yield a one-node topology when it is
+	// bound to a specific host node (for instance after
+	// maybeRightSizeAutoNUMA() collapsed a multi-node sandbox onto the
+	// GPU's host NUMA node): the memory backend needs host-nodes=.
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-3"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 1)
+	check.Equal(uint32(0), topo[0].NodeID)
+	check.Equal("0-3", topo[0].CPUs)
+	check.Equal("1024M", topo[0].MemSize)
+	check.Equal("0", topo[0].HostNodes)
+	check.Equal("memory-backend-ram", topo[0].MemBackendType)
+}
+
+func TestBuildNUMATopologySingleNodeNoHostBinding(t *testing.T) {
+	// With an empty HostNodes the lone guest node expresses no NUMA
+	// binding intent, so buildNUMATopology() has to return nil and let the
+	// QEMU command line fall through to the flat memdev path.
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "", HostCPUs: "0-3"},
+			},
+		},
+	}
+	topo, distances, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Nil(topo)
+	check.Nil(distances)
+}
+
+func TestBuildNUMATopologySingleNodeExplicitNonZeroHost(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	// The only guest node is explicitly mapped to a non-zero host node
+	// (as with numa_mapping = ["1"]). HostNodes has to be carried over
+	// verbatim, so that the memory backend is bound to host node 1 and
+	// not to the default node 0.
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			NUMAMapping:     []string{"1"},
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "1", HostCPUs: "0-3"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 1)
+	check.Equal(uint32(0), topo[0].NodeID)
+	check.Equal("1", topo[0].HostNodes)
+}
+
+func TestBuildNUMATopologyExplicitRangedHostNodes(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	// Two guest nodes are explicitly mapped to disjoint host-node ranges
+	// (as with numa_mapping = ["0-1", "2-3"]); each emitted NUMANode has
+	// to keep its ranged HostNodes string untouched.
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 8,
+			MemorySize:      2048,
+			NUMAMapping:     []string{"0-1", "2-3"},
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0-1", HostCPUs: "0-3"},
+				{HostNodes: "2-3", HostCPUs: "4-7"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 2)
+	check.Equal("0-1", topo[0].HostNodes)
+	check.Equal("2-3", topo[1].HostNodes)
+}
+
+func TestBuildNUMATopologyTwoNodes(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 2)
+	// First guest node: vCPUs 0-1 and half of the 1024M.
+	check.Equal(uint32(0), topo[0].NodeID)
+	check.Equal("0-1", topo[0].CPUs)
+	check.Equal("512M", topo[0].MemSize)
+	check.Equal("memory-backend-ram", topo[0].MemBackendType)
+	// Second guest node: vCPUs 2-3 and the other half.
+	check.Equal(uint32(1), topo[1].NodeID)
+	check.Equal("2-3", topo[1].CPUs)
+	check.Equal("512M", topo[1].MemSize)
+}
+
+func TestBuildNUMATopologyHugePages(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	if _, statErr := os.Stat("/dev/hugepages"); statErr != nil {
+		t.Skip("skipping: /dev/hugepages not available")
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			HugePages:       true,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 2)
+	check.Equal("memory-backend-file", topo[0].MemBackendType)
+	check.Equal("/dev/hugepages", topo[0].MemBackendPath)
+	check.Equal("512M", topo[0].MemSize)
+}
+
+func TestBuildNUMATopologyVirtioFS(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			SharedFS:        config.VirtioFS,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 2)
+	check.Equal("memory-backend-file", topo[0].MemBackendType)
+	check.Equal(fallbackFileBackedMemDir, topo[0].MemBackendPath)
+}
+
+func TestBuildNUMATopologyFileBackedMem(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	backingDir := t.TempDir()
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs:      4,
+			MemorySize:           1024,
+			FileBackedMemRootDir: backingDir,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 2)
+	check.Equal("memory-backend-file", topo[0].MemBackendType)
+	check.Equal(backingDir, topo[0].MemBackendPath)
+}
+
+func TestBuildNUMATopologyTooFewVCPUs(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 1,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0"},
+				{HostNodes: "1", HostCPUs: "1"},
+			},
+		},
+	}
+	topo, distances, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Nil(topo)
+	check.Nil(distances)
+}
+
+func TestBuildNUMATopologyUnevenVCPUs(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 5,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-4"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 2)
+	// cpuTopology() rounds MaxCPUs up to ceil(5/2)*2 = 6, and those 6 CPU
+	// slots are shared out proportionally: the node with 2 host CPUs gets
+	// 2 vCPUs, the node with 3 host CPUs gets 4 (3 proportional + 1 remainder).
+	check.Equal("0-1", topo[0].CPUs)
+	check.Equal("2-5", topo[1].CPUs)
+}
+
+func TestBuildNUMATopologyMemMisaligned(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1,
+			HugePages:       true,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	_, _, buildErr := hv.buildNUMATopology()
+	check.Error(buildErr)
+	check.Contains(buildErr.Error(), "cannot be evenly distributed")
+}
+
+func TestBuildNUMATopologyMemMisalignedRemainder(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 6,
+			MemorySize:      1025,
+			HugePages:       true,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-2"},
+				{HostNodes: "1", HostCPUs: "3-5"},
+			},
+		},
+	}
+	_, _, buildErr := hv.buildNUMATopology()
+	check.Error(buildErr)
+	check.Contains(buildErr.Error(), "cannot be evenly distributed")
+}
+
+func TestBuildNUMATopologyEvenMemory(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 6,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-2"},
+				{HostNodes: "1", HostCPUs: "3-5"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 2)
+	// Both nodes have three host CPUs, so vCPUs and memory split in half.
+	check.Equal("0-2", topo[0].CPUs)
+	check.Equal("512M", topo[0].MemSize)
+
+	check.Equal("3-5", topo[1].CPUs)
+	check.Equal("512M", topo[1].MemSize)
+}
+
+func TestBuildNUMATopologyProportionalVCPUs(t *testing.T) {
+	if arch := runtime.GOARCH; arch != "amd64" && arch != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", arch)
+	}
+	check := assert.New(t)
+	hv := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 10,
+			MemorySize:      1000,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-7"},
+				{HostNodes: "1", HostCPUs: "8-9"},
+			},
+		},
+	}
+	topo, _, err := hv.buildNUMATopology()
+	check.NoError(err)
+	check.Len(topo, 2)
+	// Node 0 owns 8 of the 10 host CPUs, hence 8 vCPUs.
+	check.Equal("0-7", topo[0].CPUs)
+	check.Equal("800M", topo[0].MemSize)
+	// Node 1 owns the remaining 2 host CPUs, hence 2 vCPUs.
+	check.Equal("8-9", topo[1].CPUs)
+	check.Equal("200M", topo[1].MemSize)
+}
+
+func TestBuildCoveredHostNodes(t *testing.T) {
+	guestNodes := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-3"},
+		{HostNodes: "1", HostCPUs: "4-7"},
+	}
+	byHost := buildCoveredHostNodes(guestNodes)
+	check := assert.New(t)
+	check.Len(byHost, 2)
+	check.Equal(uint32(0), byHost[0])
+	check.Equal(uint32(1), byHost[1])
+}
+
+func TestBuildCoveredHostNodesRange(t *testing.T) {
+	// A single guest node spans host nodes 0-1, so both host nodes are
+	// expected to resolve to guest node 0.
+	guestNodes := []types.GuestNUMANode{{HostNodes: "0-1", HostCPUs: "0-7"}}
+	byHost := buildCoveredHostNodes(guestNodes)
+	check := assert.New(t)
+	check.Len(byHost, 2)
+	check.Equal(uint32(0), byHost[0])
+	check.Equal(uint32(0), byHost[1])
+}
+
+func TestBuildCoveredHostNodesEmpty(t *testing.T) {
+	// A nil topology covers no host node at all.
+	byHost := buildCoveredHostNodes(nil)
+
+	assert.New(t).Len(byHost, 0)
+}
+
+func TestBuildCoveredHostNodesInvalidParse(t *testing.T) {
+	// The unparsable first entry is expected to be left out of the result.
+	check := assert.New(t)
+	byHost := buildCoveredHostNodes([]types.GuestNUMANode{
+		{HostNodes: "invalid", HostCPUs: "0-3"},
+		{HostNodes: "1", HostCPUs: "4-7"},
+	})
+	check.Len(byHost, 1)
+	check.Equal(uint32(1), byHost[1])
+}
+
+// silentLogger builds a logrus.Entry whose logger writes to io.Discard, for
+// unit tests that exercise NUMA right-sizing decisions.
+func silentLogger() *logrus.Entry {
+	quiet := logrus.New()
+	quiet.Out = io.Discard
+	return logrus.NewEntry(quiet)
+}
+
+// fakeCapFn wraps a static map as a hostNUMACapFn. Nodes missing from the map
+// yield an error, which lets tests hit sumNUMACapacity's "skip unknown" branch.
+func fakeCapFn(caps map[int]struct {
+	cpus  int
+	memMB uint64
+}) hostNUMACapFn {
+	return func(nodeID int) (int, uint64, error) {
+		entry, known := caps[nodeID]
+		if !known {
+			return 0, 0, fmt.Errorf("unknown host NUMA node %d", nodeID)
+		}
+		return entry.cpus, entry.memMB, nil
+	}
+}
+
+// twoNodeHostCaps models a typical 2-socket host: 32 CPUs and 128 GiB per node.
+func twoNodeHostCaps() map[int]struct {
+	cpus  int
+	memMB uint64
+} {
+	const nodeCPUs, nodeMemMB = 32, 128 * 1024
+	return map[int]struct {
+		cpus  int
+		memMB uint64
+	}{
+		0: {cpus: nodeCPUs, memMB: nodeMemMB},
+		1: {cpus: nodeCPUs, memMB: nodeMemMB},
+	}
+}
+
+// twoNodeAutoTopology returns two guest nodes, bound to host nodes 0 and 1.
+func twoNodeAutoTopology() []types.GuestNUMANode {
+	node0 := types.GuestNUMANode{HostNodes: "0", HostCPUs: "0-31"}
+	node1 := types.GuestNUMANode{HostNodes: "1", HostCPUs: "32-63"}
+	return []types.GuestNUMANode{node0, node1}
+}
+
+func TestSumNUMACapacity(t *testing.T) {
+	capOf := fakeCapFn(twoNodeHostCaps())
+	totalCPUs, totalMemMB := sumNUMACapacity(twoNodeAutoTopology(), capOf)
+	check := assert.New(t)
+	check.Equal(64, totalCPUs)
+	check.Equal(uint64(256*1024), totalMemMB)
+}
+
+func TestSumNUMACapacityDeduplicatesHostNodes(t *testing.T) {
+	// Host node 0 is referenced by both guest entries and must be counted
+	// only once; the merged "0-1" entry contributes host node 1.
+	guest := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-31"},
+		{HostNodes: "0-1", HostCPUs: "0-63"},
+	}
+	totalCPUs, totalMemMB := sumNUMACapacity(guest, fakeCapFn(twoNodeHostCaps()))
+
+	check := assert.New(t)
+	check.Equal(64, totalCPUs)
+	check.Equal(uint64(256*1024), totalMemMB)
+}
+
+func TestSumNUMACapacitySkipsUnknown(t *testing.T) {
+	partial := map[int]struct {
+		cpus  int
+		memMB uint64
+	}{
+		0: {cpus: 16, memMB: 32 * 1024},
+		// no entry for host node 1: the capacity lookup fails for it
+	}
+	totalCPUs, totalMemMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(partial))
+
+	check := assert.New(t)
+	check.Equal(16, totalCPUs)
+	check.Equal(uint64(32*1024), totalMemMB)
+}
+
+func TestSelectNUMANodesPassthroughForSingleNode(t *testing.T) {
+	// A one-node input is expected back unchanged.
+	single := []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}}
+	capOf := fakeCapFn(twoNodeHostCaps())
+	got := selectNUMANodes(single, 4, 1024, nil, capOf, silentLogger())
+	assert.New(t).Equal(single, got)
+}
+
+func TestSelectNUMANodesNoVFIOFitsOneNode(t *testing.T) {
+	// An 8 vCPU / 16 GiB sandbox fits comfortably inside a single host
+	// node, so the topology should collapse to the first guest node.
+	full := twoNodeAutoTopology()
+	capOf := fakeCapFn(twoNodeHostCaps())
+	got := selectNUMANodes(full, 8, 16*1024, nil, capOf, silentLogger())
+	check := assert.New(t)
+	check.Len(got, 1)
+	check.Equal("0", got[0].HostNodes)
+}
+
+func TestSelectNUMANodesNoVFIOExceedsOneNode(t *testing.T) {
+	// 64 vCPUs cannot fit in one 32-CPU node, so both nodes must be kept.
+	full := twoNodeAutoTopology()
+	capOf := fakeCapFn(twoNodeHostCaps())
+
+	got := selectNUMANodes(full, 64, 16*1024, nil, capOf, silentLogger())
+	assert.New(t).Equal(full, got)
+}
+
+func TestSelectNUMANodesNoVFIOMemoryExceedsOneNode(t *testing.T) {
+	// The CPUs fit in one node but 200 GiB of memory does not: keep both.
+	full := twoNodeAutoTopology()
+	capOf := fakeCapFn(twoNodeHostCaps())
+
+	got := selectNUMANodes(full, 8, 200*1024, nil, capOf, silentLogger())
+	assert.New(t).Equal(full, got)
+}
+
+func TestSelectNUMANodesVFIOSubsetFits(t *testing.T) {
+	// The VFIO device sits on host node 1 and the sandbox fits in one
+	// node, so the result should be the guest node covering host node 1.
+	full := twoNodeAutoTopology()
+	vfioNodes := map[int]struct{}{1: {}}
+	capOf := fakeCapFn(twoNodeHostCaps())
+	got := selectNUMANodes(full, 8, 16*1024, vfioNodes, capOf, silentLogger())
+	check := assert.New(t)
+	check.Len(got, 1)
+	check.Equal("1", got[0].HostNodes)
+}
+
+func TestSelectNUMANodesVFIOSubsetTooSmall(t *testing.T) {
+	// The VFIO device sits on host node 1, yet the sandbox needs more
+	// memory than one node offers. The full topology must be kept so the
+	// sandbox actually fits, even at the cost of cross-NUMA traffic.
+	full := twoNodeAutoTopology()
+	vfioNodes := map[int]struct{}{1: {}}
+	capOf := fakeCapFn(twoNodeHostCaps())
+
+	got := selectNUMANodes(full, 8, 200*1024, vfioNodes, capOf, silentLogger())
+	assert.New(t).Equal(full, got)
+}
+
+func TestSelectNUMANodesVFIOSpansAllNodes(t *testing.T) {
+	// With a VFIO device on every host node the VFIO subset equals the
+	// full topology; nothing can collapse and the input comes back as is.
+	full := twoNodeAutoTopology()
+	vfioNodes := map[int]struct{}{0: {}, 1: {}}
+	capOf := fakeCapFn(twoNodeHostCaps())
+
+	got := selectNUMANodes(full, 8, 16*1024, vfioNodes, capOf, silentLogger())
+	assert.New(t).Equal(full, got)
+}
+
+func TestSelectNUMANodesVFIONoCoverage(t *testing.T) {
+	// The VFIO host node (2) is absent from the guest topology, which is
+	// rare but possible with a customized numa_mapping. The full topology
+	// must be kept instead of dropping every node.
+	full := twoNodeAutoTopology()
+	vfioNodes := map[int]struct{}{2: {}}
+	capOf := fakeCapFn(twoNodeHostCaps())
+
+	got := selectNUMANodes(full, 8, 16*1024, vfioNodes, capOf, silentLogger())
+	assert.New(t).Equal(full, got)
+}
+
+// rightSizeNUMAWithFakeCaps is a test stand-in for maybeRightSizeAutoNUMA:
+// the capacity oracle is injected instead of realHostNUMACapFn, which keeps
+// the right-sizing decision hermetic.
+func rightSizeNUMAWithFakeCaps(hc *HypervisorConfig, capFn hostNUMACapFn) {
+	if hc == nil {
+		return
+	}
+	if len(hc.NUMAMapping) > 0 || len(hc.GuestNUMANodes) <= 1 {
+		return
+	}
+
+	vcpus, memMB := hc.DefaultMaxVCPUs, uint64(hc.MemorySize)
+	// The nil argument is the VFIO host-node set: these tests involve no
+	// VFIO devices.
+	hc.GuestNUMANodes = selectNUMANodes(hc.GuestNUMANodes, vcpus, memMB, nil, capFn, silentLogger())
+}
+
+func TestMaybeRightSizeAutoNUMACollapsesToOneNode(t *testing.T) {
+	// With no NUMAMapping (auto mode) and a sandbox small enough for one
+	// host node, GuestNUMANodes must shrink to a single entry.
+	cfg := &HypervisorConfig{
+		DefaultMaxVCPUs: 1,
+		MemorySize:      1,
+		GuestNUMANodes:  twoNodeAutoTopology(),
+	}
+	rightSizeNUMAWithFakeCaps(cfg, fakeCapFn(twoNodeHostCaps()))
+
+	check := assert.New(t)
+	check.Len(cfg.GuestNUMANodes, 1)
+	check.Equal("0", cfg.GuestNUMANodes[0].HostNodes)
+}
+
+func TestMaybeRightSizeAutoNUMAExplicitMappingHonored(t *testing.T) {
+	// A user-provided (non-empty) NUMAMapping must be left alone even when
+	// the sandbox would fit inside a single node.
+	cfg := &HypervisorConfig{
+		DefaultMaxVCPUs: 1,
+		MemorySize:      1,
+		NUMAMapping:     []string{"0", "1"},
+		GuestNUMANodes:  twoNodeAutoTopology(),
+	}
+	rightSizeNUMAWithFakeCaps(cfg, fakeCapFn(twoNodeHostCaps()))
+
+	check := assert.New(t)
+	check.Len(cfg.GuestNUMANodes, 2)
+}
+
+func TestMaybeRightSizeAutoNUMAKeepsFullWhenSandboxSpansNodes(t *testing.T) {
+	// The sandbox asks for more CPUs than one host node provides, so the
+	// full topology has to survive.
+	cfg := &HypervisorConfig{
+		DefaultMaxVCPUs: 64, // exceeds the 32 CPUs of a single node
+		MemorySize:      1024,
+		GuestNUMANodes:  twoNodeAutoTopology(),
+	}
+	rightSizeNUMAWithFakeCaps(cfg, fakeCapFn(twoNodeHostCaps()))
+
+	check := assert.New(t)
+	check.Len(cfg.GuestNUMANodes, 2)
+}
+
+func TestMaybeRightSizeAutoNUMANoOpForFlatTopology(t *testing.T) {
+	// A topology with ≤ 1 node is a no-op regardless of NUMAMapping or
+	// budget.
+	for _, tc := range []struct {
+		name string
+		hc   *HypervisorConfig
+	}{
+		{
+			name: "nil config",
+			hc:   nil,
+		},
+		{
+			name: "single node",
+			hc: &HypervisorConfig{
+				GuestNUMANodes: []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}},
+			},
+		},
+		{
+			name: "empty",
+			hc:   &HypervisorConfig{},
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			// Assert against the subtest's own t so failures are reported per case.
+			assert := assert.New(t)
+			before := 0
+			if tc.hc != nil {
+				before = len(tc.hc.GuestNUMANodes)
+			}
+			rightSizeNUMAWithFakeCaps(tc.hc, fakeCapFn(twoNodeHostCaps()))
+			after := 0
+			if tc.hc != nil {
+				after = len(tc.hc.GuestNUMANodes)
+			}
+			assert.Equal(before, after)
+		})
+	}
+}
diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index 6ceae42de9..1d7004e441 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -2961,9 +2961,26 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
 
 // checkVCPUsPinning is used to support CPUSet mode of kata container.
 // CPUSet mode is on when Sandbox.HypervisorConfig.EnableVCPUsPinning
-// is set to true. Then it fetches sandbox's number of vCPU threads
-// and number of CPUs in CPUSet. If the two are equal, each vCPU thread
-// is then pinned to one fixed CPU in CPUSet.
+// is set to true.
+//
+// When NUMA topology is configured (GuestNUMANodes is non-empty), vCPU
+// threads are pinned to host CPUs belonging to the same host NUMA node
+// as the vCPU's assigned guest NUMA node, preserving memory locality.
+// vCPUs are distributed proportionally across nodes and each vCPU is
+// pinned round-robin to the host CPUs within its NUMA node; the 1:1
+// count equality check does not apply.
+//
+// This is true for both multi-node sandboxes and right-sized
+// single-node sandboxes: when buildNUMATopology()/maybeRightSizeAutoNUMA
+// collapses the topology to one node, that single node still carries a
+// meaningful HostCPUs subset (the CPUs of the chosen host NUMA node),
+// and pinning to that subset is what makes right-sizing actually deliver
+// host-thread locality, not just guest-topology locality.
+//
+// In the non-NUMA path (GuestNUMANodes is empty, e.g. enable_numa=false),
+// it fetches the sandbox's number of vCPU threads and number of CPUs in
+// CPUSet. If the two are equal, each vCPU thread is pinned 1:1 to the
+// CPUs in CPUSet; otherwise pinning is skipped.
 func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	if s.config == nil {
 		return fmt.Errorf("no sandbox config found")
@@ -2972,11 +2989,39 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 		return nil
 	}
 
-	// fetch vCPU thread ids and CPUSet
+	expectedVCPUs := int(s.config.HypervisorConfig.NumVCPUs())
+
 	vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx)
 	if err != nil {
 		return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err)
 	}
+
+	// QEMU may not have spawned all vCPU threads yet. Retry with
+	// exponential backoff until we see the expected count.
+	if len(vCPUThreadsMap.vcpus) < expectedVCPUs {
+		const maxAttempts = 10
+		backoff := 50 * time.Millisecond
+		for attempt := 2; attempt <= maxAttempts && len(vCPUThreadsMap.vcpus) < expectedVCPUs; attempt++ {
+			s.Logger().WithFields(logrus.Fields{
+				"have":    len(vCPUThreadsMap.vcpus),
+				"want":    expectedVCPUs,
+				"attempt": attempt,
+			}).Debug("waiting for all vCPU threads to be available")
+			time.Sleep(backoff)
+			backoff *= 2
+			vCPUThreadsMap, err = s.hypervisor.GetThreadIDs(ctx)
+			if err != nil {
+				return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err)
+			}
+		}
+		if len(vCPUThreadsMap.vcpus) < expectedVCPUs {
+			s.Logger().WithFields(logrus.Fields{
+				"have": len(vCPUThreadsMap.vcpus),
+				"want": expectedVCPUs,
+			}).Warn("not all vCPU threads available after retries; pinning available ones")
+		}
+	}
+
 	cpuSetStr, _, err := s.getSandboxCPUSet()
 	if err != nil {
 		return fmt.Errorf("failed to get CPUSet config: %v", err)
@@ -2987,9 +3032,42 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	}
 	cpuSetSlice := cpuSet.ToSlice()
 
-	// check if vCPU thread numbers and CPU numbers are equal
+	numaNodes := s.config.HypervisorConfig.GuestNUMANodes
+
+	if len(cpuSetSlice) == 0 {
+		if len(numaNodes) >= 1 {
+			// No cpuset constraint (e.g. ctr without k8s, or a Burstable
+			// pod with cpuManagerPolicy=none). Build an effective cpuset
+			// from the NUMA nodes' HostCPUs so pinning works using the
+			// (possibly right-sized) host NUMA topology. Even a single
+			// NUMA node here meaningfully constrains pinning to that
+			// node's host CPUs.
+			for _, gn := range numaNodes {
+				hostCPUs, err := cpuset.Parse(gn.HostCPUs)
+				if err != nil {
+					continue
+				}
+				cpuSet = cpuSet.Union(hostCPUs)
+			}
+			cpuSetSlice = cpuSet.ToSlice()
+			if len(cpuSetSlice) == 0 {
+				s.Logger().Warn("sandbox CPUSet is empty and cannot derive from NUMA HostCPUs; skipping vCPU pinning")
+				s.isVCPUsPinningOn = false
+				return nil
+			}
+			s.Logger().WithField("effective-cpuset", cpuSet.String()).Debug("derived cpuset from NUMA HostCPUs for pinning")
+		} else {
+			s.Logger().Warn("sandbox CPUSet is empty; skipping vCPU pinning")
+			s.isVCPUsPinningOn = false
+			return nil
+		}
+	}
+
+	if len(numaNodes) >= 1 {
+		return s.checkVCPUsPinningNUMA(ctx, vCPUThreadsMap, numaNodes, cpuSetSlice)
+	}
+
 	numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
-	// if not equal, we should reset threads scheduling to random pattern
 	if numVCPUs != numCPUs {
 		if s.isVCPUsPinningOn {
 			s.isVCPUsPinningOn = false
@@ -2997,7 +3075,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 		}
 		return nil
 	}
-	// if equal, we can use vCPU thread pinning
 	for i, tid := range vCPUThreadsMap.vcpus {
 		if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil {
 			if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
@@ -3010,6 +3087,68 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	return nil
 }
 
+// checkVCPUsPinningNUMA pins each vCPU thread to one host CPU drawn from the
+// HostCPUs of a guest NUMA node. vCPU indexes are split into consecutive
+// per-node ranges sized by utils.DistributeVCPUsProportionally; within a node
+// CPUs are handed out round-robin. A single-node slice is handled as well.
+// NOTE(review): ranges are sized from the running thread count — confirm this
+// agrees with the guest-side vCPU-to-node layout from buildNUMATopology.
+func (s *Sandbox) checkVCPUsPinningNUMA(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, numaNodes []types.GuestNUMANode, cpuSetSlice []int) error {
+	threadCount := uint32(len(vCPUThreadsMap.vcpus))
+	nodeCount := uint32(len(numaNodes))
+	if nodeCount > threadCount {
+		return fmt.Errorf("number of vCPUs (%d) must be >= NUMA node count (%d) for NUMA pinning", threadCount, nodeCount)
+	}
+
+	perNode, err := utils.DistributeVCPUsProportionally(numaNodes, threadCount)
+	if err != nil {
+		return fmt.Errorf("failed to compute NUMA vCPU distribution for pinning: %v", err)
+	}
+
+	sandboxCPUs := cpuset.NewCPUSet(cpuSetSlice...)
+
+	var next uint32 // index of the first vCPU not yet assigned to a node
+	for nodeIdx, node := range numaNodes {
+		nodeCPUs, err := cpuset.Parse(node.HostCPUs)
+		if err != nil {
+			return fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %v", nodeIdx, err)
+		}
+		candidates := nodeCPUs.Intersection(sandboxCPUs).ToSlice()
+		if len(candidates) == 0 {
+			s.Logger().WithFields(logrus.Fields{
+				"numa-node":    nodeIdx,
+				"host-cpus":    node.HostCPUs,
+				"sandbox-cpus": cpuSetSlice,
+			}).Warn("NUMA node HostCPUs do not intersect sandbox CPUSet; pinning vCPUs to full cpuset for this node")
+			candidates = cpuSetSlice
+		}
+
+		first := next
+		limit := first + perNode[nodeIdx]
+		next = limit
+
+		for vcpu := first; vcpu < limit; vcpu++ {
+			tid, found := vCPUThreadsMap.vcpus[int(vcpu)]
+			if !found {
+				if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
+					return err
+				}
+				return fmt.Errorf("missing vcpu thread id for vcpu index %d", vcpu)
+			}
+			slot := int(vcpu-first) % len(candidates) // round-robin within the node
+			if err := resCtrl.SetThreadAffinity(tid, candidates[slot:slot+1]); err != nil {
+				if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
+					return err
+				}
+				return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d (NUMA node %d): %v", tid, candidates[slot], nodeIdx, err)
+			}
+		}
+	}
+
+	s.isVCPUsPinningOn = true
+	return nil
+}
+
 // resetVCPUsPinning cancels current pinning and restores default random vCPU threads scheduling
 func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error {
 	for _, tid := range vCPUThreadsMap.vcpus {
diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go
index 7e521f3842..50115c7a5b 100644
--- a/src/runtime/virtcontainers/sandbox_test.go
+++ b/src/runtime/virtcontainers/sandbox_test.go
@@ -1679,3 +1679,29 @@ func TestSandboxHugepageLimit(t *testing.T) {
 	err = s.updateResources(context.Background())
 	assert.NoError(t, err)
 }
+
+// TestCheckVCPUsPinningNUMATooFewVCPUs checks that NUMA pinning is rejected
+// when there are fewer vCPUs than guest NUMA nodes.
+func TestCheckVCPUsPinningNUMATooFewVCPUs(t *testing.T) {
+	assert := assert.New(t)
+	s := &Sandbox{}
+	vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100}}
+	numaNodes := []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-3"}, {HostNodes: "1", HostCPUs: "4-7"}}
+	err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7})
+	if assert.Error(err) { // guard: calling err.Error() on a nil error would panic
+		assert.Contains(err.Error(), "must be >= NUMA node count")
+	}
+}
+
+// TestCheckVCPUsPinningNUMABadHostCPUs checks that an unparsable HostCPUs
+// value on a guest NUMA node is reported as an error rather than ignored.
+func TestCheckVCPUsPinningNUMABadHostCPUs(t *testing.T) {
+	assert := assert.New(t)
+	s := &Sandbox{}
+	vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100, 1: 101, 2: 102, 3: 103}}
+	numaNodes := []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "not-valid"}, {HostNodes: "1", HostCPUs: "4-7"}}
+	err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7})
+	if assert.Error(err) { // guard: calling err.Error() on a nil error would panic
+		assert.Contains(err.Error(), "failed to parse HostCPUs")
+	}
+}
diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go
index 39bcfde8f4..5e1ff51ae3 100644
--- a/src/runtime/virtcontainers/utils/utils.go
+++ b/src/runtime/virtcontainers/utils/utils.go
@@ -13,6 +13,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"regexp"
+	"strconv"
 	"strings"
 	"syscall"
 	"time"
@@ -623,3 +624,205 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) {
 
 	return numaNodes, nil
 }
+
+// FilterNUMANodesByCPUSet selects the guest NUMA nodes whose HostCPUs share
+// at least one CPU with sandboxCPUs. Nodes whose HostCPUs cannot be parsed
+// are treated as non-matching. The input slice is returned as-is when
+// sandboxCPUs is empty or when no node matches, so callers never receive
+// an empty result for a non-empty input.
+func FilterNUMANodesByCPUSet(nodes []types.GuestNUMANode, sandboxCPUs cpuset.CPUSet) []types.GuestNUMANode {
+	if sandboxCPUs.Size() == 0 {
+		return nodes
+	}
+	var matching []types.GuestNUMANode
+	for _, node := range nodes {
+		nodeCPUs, err := cpuset.Parse(node.HostCPUs)
+		if err != nil || nodeCPUs.Intersection(sandboxCPUs).Size() == 0 {
+			continue
+		}
+		matching = append(matching, node)
+	}
+	if len(matching) > 0 {
+		return matching
+	}
+	return nodes
+}
+
+// NUMADistEntry represents a single NUMA distance measurement between two nodes.
+type NUMADistEntry struct {
+	Src uint32 // index of the source node in the GuestNUMANode list
+	Dst uint32 // index of the destination node in the GuestNUMANode list
+	Val uint32 // distance value parsed from the source node's host distance row
+}
+
+// GetHostNUMADistances reads the host NUMA distance rows for the nodes in the
+// given GuestNUMANode list and returns the off-diagonal pairwise entries
+// (src==dst is skipped), with Src/Dst set to indices into that list.
+// Each guest node is represented by the first ID in its HostNodes; that ID
+// selects the column of the distance row. Nodes or columns that cannot be
+// read or parsed are silently omitted, so the result may be incomplete.
+func GetHostNUMADistances(nodes []types.GuestNUMANode) []NUMADistEntry {
+	hostNodeIDs := make([]int, len(nodes)) // representative host node ID per guest node; -1 = unusable
+	for i, n := range nodes {
+		nodeSet, err := cpuset.Parse(n.HostNodes)
+		if err != nil {
+			hostNodeIDs[i] = -1
+			continue
+		}
+		ids := nodeSet.ToSlice()
+		if len(ids) == 0 {
+			hostNodeIDs[i] = -1
+			continue
+		}
+		hostNodeIDs[i] = ids[0]
+	}
+
+	var dists []NUMADistEntry
+	for srcIdx, srcNode := range nodes {
+		if hostNodeIDs[srcIdx] < 0 {
+			continue
+		}
+		distStr := getHostNUMADistance(srcNode.HostNodes) // distance row of the source's first host node
+		if distStr == "" {
+			continue
+		}
+		fields := strings.Fields(distStr)
+		for dstIdx := range nodes {
+			if srcIdx == dstIdx {
+				continue
+			}
+			hostID := hostNodeIDs[dstIdx] // NOTE(review): used as a column index; assumes row position == host node ID — confirm for sparse IDs
+			if hostID < 0 || hostID >= len(fields) {
+				continue
+			}
+			val, err := strconv.ParseUint(fields[hostID], 10, 32)
+			if err != nil {
+				continue
+			}
+			dists = append(dists, NUMADistEntry{
+				Src: uint32(srcIdx),
+				Dst: uint32(dstIdx),
+				Val: uint32(val),
+			})
+		}
+	}
+	return dists
+}
+
+// HostNUMANodeCapacity describes the CPU and memory capacity of a single
+// host NUMA node, as seen via sysfs.
+type HostNUMANodeCapacity struct {
+	NodeID int    // host NUMA node ID this entry describes
+	CPUs   int    // number of CPUs in the node's CPU list
+	MemMB  uint64 // total node memory in MiB
+}
+
+// GetHostNUMANodeCapacity gathers the CPU count and total memory (in MiB) of
+// host NUMA node nodeID. On failure the partially filled value is returned
+// together with the error.
+func GetHostNUMANodeCapacity(nodeID int) (HostNUMANodeCapacity, error) {
+	capacity := HostNUMANodeCapacity{NodeID: nodeID}
+	cpuList, err := getHostNUMANodeCPUs(nodeID)
+	if err != nil {
+		return capacity, err
+	}
+	nodeCPUs, err := cpuset.Parse(cpuList)
+	if err != nil {
+		return capacity, fmt.Errorf("parse host node %d cpulist %q: %w", nodeID, cpuList, err)
+	}
+	capacity.CPUs = nodeCPUs.Size()
+	memMB, err := getHostNUMANodeMemoryMB(nodeID)
+	if err == nil {
+		capacity.MemMB = memMB
+	}
+	return capacity, err
+}
+
+// GetHostNUMANodeCapacities returns the capacities of the given host NUMA
+// node IDs in the same order. It stops at the first node that fails to be
+// read and returns that error together with the capacities collected so
+// far (so on error the slice is shorter than the input).
+func GetHostNUMANodeCapacities(nodeIDs []int) ([]HostNUMANodeCapacity, error) {
+	out := make([]HostNUMANodeCapacity, 0, len(nodeIDs))
+	for _, id := range nodeIDs {
+		c, err := GetHostNUMANodeCapacity(id)
+		if err != nil {
+			return out, fmt.Errorf("read host NUMA node %d capacity: %w", id, err)
+		}
+		out = append(out, c)
+	}
+	return out, nil
+}
+
+// DistributeVCPUsProportionally distributes totalVCPUs across NUMA nodes
+// proportionally to the size of each node's HostCPUs set, with at least
+// 1 vCPU per node. Leftover vCPUs go to the nodes with the most host CPUs;
+// overshoot caused by the 1-vCPU minimum is trimmed from the largest shares.
+func DistributeVCPUsProportionally(numaNodes []types.GuestNUMANode, totalVCPUs uint32) ([]uint32, error) {
+	numNodes := len(numaNodes)
+	if numNodes == 0 {
+		return nil, fmt.Errorf("no NUMA nodes")
+	}
+	if totalVCPUs < uint32(numNodes) {
+		return nil, fmt.Errorf("totalVCPUs (%d) must be >= NUMA node count (%d)", totalVCPUs, numNodes)
+	}
+
+	hostCPUCounts := make([]int, numNodes)
+	totalHostCPUs := 0
+	for i, gn := range numaNodes {
+		parsed, err := cpuset.Parse(gn.HostCPUs)
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %w", i, err)
+		}
+		if parsed.Size() == 0 {
+			return nil, fmt.Errorf("HostCPUs for NUMA node %d must not be empty", i)
+		}
+		hostCPUCounts[i] = parsed.Size()
+		totalHostCPUs += hostCPUCounts[i]
+	}
+	if totalHostCPUs == 0 { // defensive: cannot trigger, every node above has >= 1 host CPU
+		return nil, fmt.Errorf("total host CPU count is 0")
+	}
+
+	vcpusPerNode := make([]uint32, numNodes)
+	var assigned uint32
+	for i := range numaNodes {
+		vcpusPerNode[i] = uint32(int(totalVCPUs) * hostCPUCounts[i] / totalHostCPUs) // floor of the proportional share
+		if vcpusPerNode[i] == 0 { // enforce the 1-vCPU minimum; may push assigned above totalVCPUs
+			vcpusPerNode[i] = 1
+		}
+		assigned += vcpusPerNode[i]
+	}
+
+	// Use a copy for remainder distribution to avoid mutating the original counts.
+	weights := make([]int, numNodes)
+	copy(weights, hostCPUCounts)
+
+	for assigned < totalVCPUs { // hand out leftovers one at a time to the heaviest node (lowest index wins ties)
+		bestIdx := 0
+		for i := 1; i < numNodes; i++ {
+			if weights[i] > weights[bestIdx] {
+				bestIdx = i
+			}
+		}
+		vcpusPerNode[bestIdx]++
+		assigned++
+		weights[bestIdx]-- // lower this node's priority for the next leftover
+	}
+
+	for assigned > totalVCPUs { // undo overshoot from the 1-vCPU minimum, taking from the largest share
+		bestIdx := 0
+		for i := 1; i < numNodes; i++ {
+			if vcpusPerNode[i] > vcpusPerNode[bestIdx] {
+				bestIdx = i
+			}
+		}
+		if vcpusPerNode[bestIdx] <= 1 { // never trim a node below 1 vCPU
+			break
+		}
+		vcpusPerNode[bestIdx]--
+		assigned--
+	}
+
+	return vcpusPerNode, nil
+}
diff --git a/src/runtime/virtcontainers/utils/utils_darwin.go b/src/runtime/virtcontainers/utils/utils_darwin.go
index 4a64c921b1..a29d0378a2 100644
--- a/src/runtime/virtcontainers/utils/utils_darwin.go
+++ b/src/runtime/virtcontainers/utils/utils_darwin.go
@@ -22,3 +22,11 @@ func getHostNUMANodes() ([]int, error) {
 func getHostNUMANodeCPUs(nodeId int) (string, error) {
 	return "", nil
 }
+
+func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) {
+	return 0, nil // stub for this platform: always reports 0 MiB and no error
+}
+
+func getHostNUMADistance(hostNodes string) string {
+	return "" // stub for this platform: always reports an empty distance row
+}
diff --git a/src/runtime/virtcontainers/utils/utils_linux.go b/src/runtime/virtcontainers/utils/utils_linux.go
index 0ddb4dd5a9..11ae66b202 100644
--- a/src/runtime/virtcontainers/utils/utils_linux.go
+++ b/src/runtime/virtcontainers/utils/utils_linux.go
@@ -12,6 +12,8 @@ import (
 	"io"
 	"math/big"
 	"os"
+	"regexp"
+	"strconv"
 	"strings"
 	"syscall"
 	"time"
@@ -23,6 +25,8 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
 )
 
+var nodeMemTotalRegexp = regexp.MustCompile(`Node\s+\d+\s+MemTotal:\s+(\d+)\s+kB`) // captures the kB value of a "Node N MemTotal:" meminfo line
+
 var ioctlFunc = Ioctl
 
 // maxUInt represents the maximum valid value for the context ID.
@@ -220,3 +224,41 @@ func getHostNUMANodeCPUs(nodeId int) (string, error) {
 	}
 	return strings.TrimSuffix(string(data), "\n"), nil
 }
+
+// getHostNUMANodeMemoryMB reads /sys/devices/system/node/node<nodeId>/meminfo
+// and reports the node's MemTotal figure, converted from kB to MiB.
+func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) {
+	meminfoPath := fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeId)
+	contents, err := os.ReadFile(meminfoPath)
+	if err != nil {
+		return 0, err
+	}
+	match := nodeMemTotalRegexp.FindSubmatch(contents)
+	if len(match) == 0 {
+		return 0, fmt.Errorf("MemTotal not found in %s", meminfoPath)
+	}
+	totalKB, err := strconv.ParseUint(string(match[1]), 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	return totalKB / 1024, nil // integer division: rounds down to whole MiB
+}
+
+// getHostNUMADistance reads the distance row for the first host NUMA node
+// in the given hostNodes specifier (e.g. "0" or "0-1").
+func getHostNUMADistance(hostNodes string) string {
+	nodeSet, err := cpuset.Parse(hostNodes)
+	if err != nil {
+		return ""
+	}
+	ids := nodeSet.ToSlice()
+	if len(ids) == 0 {
+		return ""
+	}
+	fileName := fmt.Sprintf("/sys/devices/system/node/node%d/distance", ids[0])
+	data, err := os.ReadFile(fileName)
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSuffix(string(data), "\n")
+}
diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go
index 8361caa1ee..90663e64b5 100644
--- a/src/runtime/virtcontainers/utils/utils_test.go
+++ b/src/runtime/virtcontainers/utils/utils_test.go
@@ -19,6 +19,9 @@ import (
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"
+
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 )
 
 const waitLocalProcessTimeoutSecs = 3
@@ -754,3 +757,102 @@ func TestDockerNetnsPath(t *testing.T) {
 	}
 	assert.Equal("", DockerNetnsPath(spec))
 }
+
+func TestDistributeVCPUsProportionallySymmetric(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0-3"},
+		{HostCPUs: "4-7"},
+	}
+	dist, err := DistributeVCPUsProportionally(nodes, 8) // 4+4 host CPUs, 8 vCPUs: proportional = 4, 4
+	assert.NoError(err)
+	assert.Equal([]uint32{4, 4}, dist)
+}
+
+func TestDistributeVCPUsProportionallyAsymmetric(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0-7"},
+		{HostCPUs: "8-9"},
+	}
+	dist, err := DistributeVCPUsProportionally(nodes, 10) // 8+2 host CPUs, 10 vCPUs: proportional = 8, 2
+	assert.NoError(err)
+	assert.Equal([]uint32{8, 2}, dist)
+}
+
+func TestDistributeVCPUsProportionallyMinOnePerNode(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0-99"},
+		{HostCPUs: "100"},
+	}
+	// 100+1 host CPUs, 2 vCPUs: node 1's floor share is 0, so it is raised to the 1-vCPU minimum.
+	dist, err := DistributeVCPUsProportionally(nodes, 2)
+	assert.NoError(err)
+	assert.Equal([]uint32{1, 1}, dist) // whole-slice compare: no index panic on nil, length checked too
+}
+
+func TestDistributeVCPUsProportionallyThreeNodes(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0-5"},
+		{HostCPUs: "6-8"},
+		{HostCPUs: "9"},
+	}
+	// 6+3+1=10 host CPUs, 10 vCPUs: every share divides exactly, so the result is 6, 3, 1
+	dist, err := DistributeVCPUsProportionally(nodes, 10)
+	assert.NoError(err)
+	assert.Equal([]uint32{6, 3, 1}, dist)
+}
+
+// TestDistributeVCPUsProportionallyTooFewVCPUs checks that asking for fewer
+// vCPUs (2) than NUMA nodes (3) is rejected, since every node must get at
+// least one vCPU.
+func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{{HostCPUs: "0"}, {HostCPUs: "1"}, {HostCPUs: "2"}}
+	_, err := DistributeVCPUsProportionally(nodes, 2)
+	if assert.Error(err) { // guard: calling err.Error() on a nil error would panic
+		assert.Contains(err.Error(), "must be >= NUMA node count")
+	}
+}
+
+func TestFilterNUMANodesByCPUSet(t *testing.T) {
+	assert := assert.New(t)
+
+	nodes := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-55,112-167"},
+		{HostNodes: "1", HostCPUs: "56-111,168-223"},
+	}
+
+	// Sandbox cpuset only from node 0 -> should return 1 node
+	sandboxCPUs, _ := cpuset.Parse("1-40,113-152") // parse errors are ignored for these fixed literals
+	filtered := FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
+	assert.Len(filtered, 1)
+	assert.Equal("0", filtered[0].HostNodes)
+
+	// Sandbox cpuset from both nodes -> should return 2 nodes
+	sandboxCPUs, _ = cpuset.Parse("1-40,56-80")
+	filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
+	assert.Len(filtered, 2)
+
+	// Sandbox cpuset only from node 1 -> should return 1 node
+	sandboxCPUs, _ = cpuset.Parse("60-70,170-180")
+	filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
+	assert.Len(filtered, 1)
+	assert.Equal("1", filtered[0].HostNodes)
+
+	// Empty cpuset -> no filtering, return all
+	emptyCPUs := cpuset.NewCPUSet()
+	filtered = FilterNUMANodesByCPUSet(nodes, emptyCPUs)
+	assert.Len(filtered, 2)
+
+	// Single guest node overlapping the sandbox cpuset -> that node is returned
+	singleNode := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-7"},
+	}
+	sandboxCPUs, _ = cpuset.Parse("0-3")
+	filtered = FilterNUMANodesByCPUSet(singleNode, sandboxCPUs)
+	assert.Len(filtered, 1)
+	assert.Equal("0", filtered[0].HostNodes)
+}
diff --git a/tests/integration/kubernetes/k8s-nvidia-numa.bats b/tests/integration/kubernetes/k8s-nvidia-numa.bats
new file mode 100644
index 0000000000..dd695e6811
--- /dev/null
+++ b/tests/integration/kubernetes/k8s-nvidia-numa.bats
@@ -0,0 +1,745 @@
+#!/usr/bin/env bats
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# NUMA topology and vCPU pinning verification tests for Kata Containers.
+#
+# Five tests cover the main paths in the runtime's NUMA logic:
+#   1. Multi-node sandbox: a workload that does NOT fit in a single host
+#      NUMA node should be balanced across host nodes — the guest sees
+#      multiple NUMA nodes with even vCPU/memory distribution and host
+#      vCPU pinning is balanced as well.
+#   2. Right-sized single-node sandbox: a workload that DOES fit in a
+#      single host NUMA node should be collapsed to one node — the guest
+#      sees exactly one NUMA node with all vCPUs in it AND all host
+#      QEMU vCPU threads are pinned to that one host NUMA node.
+#   3. GPU passthrough (VFIO), multi-node: when a GPU is attached via
+#      VFIO and the workload spans every host NUMA node, the runtime
+#      creates pxb-pcie bridges and the guest GPU reports the same NUMA
+#      node as the host GPU.
+#   4. GPU passthrough (VFIO), right-sized single-node: when a small
+#      workload + GPU fits on a single host NUMA node, the runtime
+#      collapses the topology to the GPU's host NUMA node (memory and
+#      vCPUs land on the same node as the GPU, not just any fitting node).
+#   5. Explicit numa_mapping in the runtime TOML: when the user pins the
+#      guest topology to a specific host node via numa_mapping = ["1"],
+#      maybeRightSizeAutoNUMA() must be a no-op and buildNUMATopology()
+#      must propagate the binding (memory + vCPU pinning land on the
+#      chosen host node, regardless of how small the workload is).
+#
+# Guest-side checks use the quay.io/kata-containers/numa container image
+# which reads sysfs and prints results to stdout.  The bats test reads
+# the output via "kubectl logs" — no kubectl exec, no CoCo policy
+# overrides needed.
+#
+# WARNING: The host-side pinning check runs numa-pinning-check.sh directly
+# on the host (not inside a container).  This requires the bats runner to
+# execute on the k8s node with privileged access to /proc, /sys, crictl,
+# and taskset.  If the test environment changes so that bats no longer
+# runs on the node, these calls must be reworked to use exec_host or
+# equivalent.
+
+load "${BATS_TEST_DIRNAME}/lib.sh"
+load "${BATS_TEST_DIRNAME}/confidential_common.sh"
+
+export KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu-nvidia-gpu-snp}"
+
+# Hypervisors where NUMA is configured and supported by default.
+# Only qemu-nvidia-gpu variants ship enable_numa=true in their base config.
+# runtime-rs does not yet implement NUMA; non-QEMU hypervisors lack support.
+NUMA_CONFIGURED_SUPPORTED_BY_DEFAULT=(
+    "qemu-nvidia-gpu"
+    "qemu-nvidia-gpu-snp"
+    "qemu-nvidia-gpu-tdx"
+)
+
+# Multi-node test: large enough to span every host NUMA node.
+NUMA_TEST_VCPUS_LARGE="${NUMA_TEST_VCPUS_LARGE:-64}"
+NUMA_TEST_MEMORY_LARGE="${NUMA_TEST_MEMORY_LARGE:-64Gi}"
+
+# Right-sizing test: small enough to fit in a single host NUMA node on
+# any reasonable production-class server.
+NUMA_TEST_VCPUS_SMALL="${NUMA_TEST_VCPUS_SMALL:-4}"
+NUMA_TEST_MEMORY_SMALL="${NUMA_TEST_MEMORY_SMALL:-4Gi}"
+
+# GPU test: same sizing as the large test, plus a GPU.
+NUMA_TEST_VCPUS_GPU="${NUMA_TEST_VCPUS_GPU:-64}"
+NUMA_TEST_MEMORY_GPU="${NUMA_TEST_MEMORY_GPU:-64Gi}"
+
+# Small GPU test: fits in a single host NUMA node, exercises the
+# right-sizing path with VFIO (sandbox should land on the GPU's host
+# NUMA node, not just any node that fits).
+NUMA_TEST_VCPUS_GPU_SMALL="${NUMA_TEST_VCPUS_GPU_SMALL:-4}"
+NUMA_TEST_MEMORY_GPU_SMALL="${NUMA_TEST_MEMORY_GPU_SMALL:-4Gi}"
+
+export POD_NAME_NUMA="numa-topology-test"
+POD_NAME_NUMA_GPU="numa-topology-gpu-test"
+
+POD_WAIT_TIMEOUT=${POD_WAIT_TIMEOUT:-600s}
+export POD_WAIT_TIMEOUT
+
+HOST_PINNING_RETRIES=20  # max polls of numa-pinning-check.sh in wait_for_host_pinning
+HOST_PINNING_SLEEP=0.5   # seconds slept between two polls
+
+setup() {
+    setup_common || die "setup_common failed"
+
+    pod_yaml_in="${pod_config_dir}/${POD_NAME_NUMA}.yaml.in"  # template consumed by render_pod
+    pod_yaml="${pod_config_dir}/${POD_NAME_NUMA}.yaml"        # rendered manifest that gets applied
+
+    policy_settings_dir="$(create_tmp_policy_settings_dir "${pod_config_dir}")"
+    add_requests_to_policy_settings "${policy_settings_dir}" "ReadStreamRequest"  # presumably what lets "kubectl logs" read stdout — confirm
+}
+
+# -----------------------------------------------------------------------------
+# Skip / topology helpers
+# -----------------------------------------------------------------------------
+
+# numa_skip_reason echoes a non-empty skip reason on stdout when the
+# current test should be skipped (KATA_HYPERVISOR is not listed in
+# NUMA_CONFIGURED_SUPPORTED_BY_DEFAULT OR the host reports fewer than 2
+# NUMA nodes).  Empty stdout means run.  Callers must invoke `skip`
+# themselves — bats `skip` inside command substitution does not propagate.
+numa_skip_reason() {
+    # shellcheck disable=SC2076
+    if [[ ! " ${NUMA_CONFIGURED_SUPPORTED_BY_DEFAULT[*]} " =~ " ${KATA_HYPERVISOR} " ]]; then  # quoted RHS: literal, space-delimited match
+        echo "NUMA not configured by default on ${KATA_HYPERVISOR} (only qemu-nvidia-gpu variants)"
+        return 0
+    fi
+    local nodes
+    nodes=$(host_numa_node_count)
+    if [[ "${nodes}" -lt 2 ]]; then
+        echo "Host has only ${nodes} NUMA node(s), need >= 2 for this test"
+    fi
+}
+
+# host_numa_node_count echoes the host NUMA node count (the "available: N"
+# figure of numactl --hardware). WARNING: numactl runs on the host, not via exec_host.
+host_numa_node_count() {
+    numactl --hardware | grep -oP 'available:\s+\K\d+'
+}
+
+# -----------------------------------------------------------------------------
+# Pod lifecycle helpers
+# -----------------------------------------------------------------------------
+
+# render_pod <vcpus> <memory> renders ${pod_yaml_in} into ${pod_yaml} via
+# envsubst with NUMA_TEST_VCPUS / NUMA_TEST_MEMORY set to the arguments,
+# then runs auto_generate_policy against the rendered yaml.
+render_pod() {
+    local vcpus="${1}" memory="${2}"
+    NUMA_TEST_VCPUS="${vcpus}" NUMA_TEST_MEMORY="${memory}" \
+        envsubst < "${pod_yaml_in}" > "${pod_yaml}"
+    auto_generate_policy "${policy_settings_dir}" "${pod_yaml}"
+}
+
+# deploy_and_get_guest_logs renders, applies, waits for Ready, then
+# echoes the pod's stdout (the test image prints NUMA topology then
+# sleeps).  Setup chatter goes to stderr so that stdout carries only the
+# pod logs; the brief sleep lets the entrypoint print before we read.
+deploy_and_get_guest_logs() {
+    local vcpus="${1}" memory="${2}"
+    render_pod "${vcpus}" "${memory}" >&2
+    kubectl apply -f "${pod_yaml}" >&2
+    kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA}" >&2
+    sleep 2
+    kubectl logs "${POD_NAME_NUMA}"
+}
+
+# -----------------------------------------------------------------------------
+# Guest-log parsers (operate on stdout from the test container)
+# -----------------------------------------------------------------------------
+
+# guest_online_count parses a "numa_online: <value>" payload in sysfs
+# node-list form (e.g. "0", "0-1", "0,2-3") and echoes how many nodes it names.
+guest_online_count() {
+    local online="${1}" part total=0
+    [[ "${online}" =~ ^[0-9]+(-[0-9]+)?(,[0-9]+(-[0-9]+)?)*$ ]] \
+        || die "Unexpected format for guest NUMA online nodes: ${online}"
+    for part in ${online//,/ }; do  # a "lo-hi" range names hi-lo+1 nodes, a bare ID names one
+        [[ "${part}" == *-* ]] && total=$(( total + ${part#*-} - ${part%-*} ))
+        total=$(( total + 1 ))
+    done
+    echo "${total}"
+}
+
+# guest_field <logs> <field>
+# Echoes the token following each "<field>:" in <logs>, one per line
+# (non-zero exit if absent).  E.g. guest_field "$logs" numa_online -> "0-1"
+guest_field() {
+    echo "${1}" | grep -oP "${2}:\s*\K\S+"
+}
+
+# guest_per_node_values <logs> <suffix>
+# Emits one integer per line for each "node<N><suffix>: <digits>" entry
+# (e.g. suffix _cpus or _mem_kb).  Suitable for `mapfile -t`.
+guest_per_node_values() {
+    echo "${1}" | grep -oP "node\d+${2}:\s*\K\d+"
+}
+
+# -----------------------------------------------------------------------------
+# Host-side pinning helpers
+# -----------------------------------------------------------------------------
+
+# get_qemu_pid_for_numa_pod resolves the running pod's sandbox via crictl
+# and returns the QEMU PID via pgrep.  Fails the test if either lookup
+# turns up empty.
+get_qemu_pid_for_numa_pod() {
+    local sandbox_id qemu_pid
+    sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \
+        pods --name "${POD_NAME_NUMA}" -q | head -1)
+    [[ -n "${sandbox_id}" ]] || die "no sandbox id found for pod ${POD_NAME_NUMA}"
+    # "[q]emu" matches the same processes as "qemu" but not sudo's own command line.
+    qemu_pid=$(sudo pgrep -f "[q]emu.*${sandbox_id}" | head -1)
+    [[ -n "${qemu_pid}" ]] || die "no QEMU PID found for sandbox ${sandbox_id}"
+    echo "${qemu_pid}"
+}
+
+# pinning_thread_total sums the counts of the "nodeN: <count>" lines in
+# numa-pinning-check.sh output and echoes the total (0 if there are none).
+pinning_thread_total() {
+    echo "${1}" | awk -F: '/^node[0-9]+:/ {sum+=$2} END {print sum+0}'
+}
+
+# wait_for_host_pinning <qemu_pid> <expected_vcpus>
+# Polls numa-pinning-check.sh until its per-node counts sum to at least
+# <expected_vcpus>, or until HOST_PINNING_RETRIES polls have been made.
+# Progress goes to stderr; stdout gets the last script output whether or
+# not convergence was reached, so callers can assert on the buckets.
+wait_for_host_pinning() {
+    local qemu_pid="${1}" expected="${2}"
+    local script="${BATS_TEST_DIRNAME}/numa-pinning-check.sh"
+    local output total
+    local attempt
+    for ((attempt = 1; attempt <= HOST_PINNING_RETRIES; attempt++)); do
+        output=$(sudo bash "${script}" "${qemu_pid}")
+        total=$(pinning_thread_total "${output}")
+        if (( total >= expected )); then
+            echo "${output}"
+            return 0
+        fi
+        echo "# Host pinning attempt ${attempt}/${HOST_PINNING_RETRIES}: ${total}/${expected} threads pinned" >&2
+        sleep "${HOST_PINNING_SLEEP}"
+    done
+    echo "${output}"  # retries exhausted: still hand back the last observation
+}
+
+# minmax_diff <values...>
+# Echoes (max - min) for the given non-empty integer list.
+minmax_diff() {
+    local lo=$1 hi=$1 v  # seed both extremes with the first value
+    shift
+    for v in "$@"; do
+        (( v > hi )) && hi=$v
+        (( v < lo )) && lo=$v
+    done
+    echo $((hi - lo))
+}
+
+# get_qemu_cmdline <qemu_pid>
+# Echoes /proc/<qemu_pid>/cmdline with every NUL byte turned into a
+# space.  The file is read on the host through sudo.
+get_qemu_cmdline() {
+    sudo cat "/proc/${1}/cmdline" | tr '\0' ' '
+}
+
+# host_has_pgpu returns 0 if the node has allocatable nvidia.com/pgpu
+# resources, 1 otherwise.
+host_has_pgpu() {
+    local count
+    count=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/pgpu}' 2>/dev/null)
+    [[ -n "${count}" && "${count}" -gt 0 ]] 2>/dev/null
+}
+
+# gpu_numa_skip_reason extends numa_skip_reason with a check for GPU
+# availability.
+gpu_numa_skip_reason() {
+    local reason
+    reason=$(numa_skip_reason)
+    if [[ -n "${reason}" ]]; then
+        echo "${reason}"
+        return 0
+    fi
+    if ! host_has_pgpu; then
+        echo "No nvidia.com/pgpu resources available on the cluster"
+    fi
+}
+
+# -----------------------------------------------------------------------------
+# Explicit numa_mapping config helpers (drop-in based)
+# -----------------------------------------------------------------------------
+#
+# Both kata-runtime (Go) and runtime-rs (Rust) read TOML fragments from a
+# `config.d/` directory next to the active configuration-<shim>.toml file
+# and merge them into the loaded config on every sandbox start.  These
+# helpers drop in a single override fragment so the main config file is
+# never edited — teardown just deletes the fragment.
+#
+# WARNING: must run on the k8s node (sudo required) and patch/restore must
+# be paired — a leaked drop-in would silently affect every subsequent pod
+# on the same node.
+
+# kata_runtime_config_dir echoes the per-shim runtime config directory
+# (the one that holds configuration-<shim>.toml and config.d/).  Probes
+# the runtime-rs layout (.../runtime-rs/runtimes/<shim>) first, then the
+# Go layout (.../runtimes/<shim>), and dies if neither directory exists.
+# Probing the filesystem avoids parsing the shim name (some Rust shims
+# like `dragonball` lack the `-runtime-rs` suffix).
+kata_runtime_config_dir() {
+    local base="/opt/kata/share/defaults/kata-containers"
+    local rs_dir="${base}/runtime-rs/runtimes/${KATA_HYPERVISOR}"
+    local go_dir="${base}/runtimes/${KATA_HYPERVISOR}"
+    if [[ -d "${rs_dir}" ]]; then
+        echo "${rs_dir}"
+    elif [[ -d "${go_dir}" ]]; then
+        echo "${go_dir}"
+    else
+        die "no Kata runtime config dir for ${KATA_HYPERVISOR} (looked in ${rs_dir} and ${go_dir})"
+    fi
+}
+
+# kata_hypervisor_section echoes the first [hypervisor.X] header found in
+# the active config so the drop-in fragment targets the right table.  Reading
+# it at runtime keeps us hypervisor-agnostic (qemu / clh / firecracker / ...).
+kata_hypervisor_section() {
+    local dir
+    dir=$(kata_runtime_config_dir)
+    local cfg="${dir}/configuration-${KATA_HYPERVISOR}.toml"
+    [[ -f "${cfg}" ]] || die "Kata config not found at ${cfg}"
+    local section
+    section=$(sudo grep -oE '^\[hypervisor\.[a-z0-9_-]+\]' "${cfg}" | head -1)
+    [[ -n "${section}" ]] || die "no [hypervisor.X] section in ${cfg}"
+    echo "${section}"
+}
+
+# patch_kata_numa_mapping <toml_value>
+# Writes config.d/99-numa-test.toml setting numa_mapping = <toml_value> under
+# the active hypervisor section.  Example values: '["1"]', '["0-1","2-3"]'.
+# Exports the file path as KATA_NUMA_DROPIN_PATH so restore_kata_numa_mapping
+# can remove it.  Per the notes above, no restart is needed afterwards.
+patch_kata_numa_mapping() {
+    local value="${1}"
+    local dir section
+    dir=$(kata_runtime_config_dir)
+    section=$(kata_hypervisor_section)
+
+    KATA_NUMA_DROPIN_PATH="${dir}/config.d/99-numa-test.toml"
+    export KATA_NUMA_DROPIN_PATH
+
+    sudo mkdir -p "${dir}/config.d"
+    sudo tee "${KATA_NUMA_DROPIN_PATH}" >/dev/null <<EOF
+${section}
+numa_mapping = ${value}
+EOF
+    echo "# Wrote drop-in ${KATA_NUMA_DROPIN_PATH}:"
+    sudo cat "${KATA_NUMA_DROPIN_PATH}" | sed 's/^/#   /'
+}
+
+# restore_kata_numa_mapping removes the drop-in file recorded in
+# KATA_NUMA_DROPIN_PATH and unsets that variable (no-op if it is empty).
+restore_kata_numa_mapping() {
+    [[ -n "${KATA_NUMA_DROPIN_PATH:-}" ]] || return 0
+    sudo rm -f "${KATA_NUMA_DROPIN_PATH}"
+    echo "# Removed drop-in ${KATA_NUMA_DROPIN_PATH}"
+    unset KATA_NUMA_DROPIN_PATH
+}
+
+# extract_vfio_host_bdf <qemu_cmdline>
+# Echoes the host PCI BDF of the first "vfio-pci,host=" device found.
+# E.g. "vfio-pci,host=0000:41:00.0,..." -> "0000:41:00.0".
+extract_vfio_host_bdf() {
+    echo "${1}" | grep -oP 'vfio-pci,host=\K[0-9a-fA-F:.]+' | head -1
+}
+
+# host_gpu_numa <host_bdf>
+# Echoes the NUMA node ID of a host PCI device as recorded in sysfs.
+# Reads /sys/bus/pci/devices/<BDF>/numa_node on the host (via sudo
+# since the bats runner may not have read access by default).
+host_gpu_numa() {
+    sudo cat "/sys/bus/pci/devices/${1}/numa_node"
+}
+
+# -----------------------------------------------------------------------------
+# Tests
+# -----------------------------------------------------------------------------
+
+@test "NUMA: guest topology and host pinning are balanced" {
+    # Skip checks must live inside @test (not setup) to avoid bats
+    # "Executed 0 instead of expected 1 tests" warnings.
+    local skip_reason
+    skip_reason=$(numa_skip_reason)
+    [[ -z "${skip_reason}" ]] || skip "${skip_reason}"
+
+    local host_nodes
+    host_nodes=$(host_numa_node_count)
+
+    local guest_logs
+    guest_logs=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_LARGE}" "${NUMA_TEST_MEMORY_LARGE}")
+    echo "# Guest NUMA output:"
+    echo "# ${guest_logs}"  # only the first line of a multi-line log gets the "# " prefix
+
+    # --- Guest topology matches host (same number of NUMA nodes) ---
+    local online guest_count
+    online=$(guest_field "${guest_logs}" numa_online)
+    guest_count=$(guest_online_count "${online}")
+    echo "# Guest NUMA online: ${online} -> ${guest_count} node(s); host has ${host_nodes}"
+    [[ "${guest_count}" -eq "${host_nodes}" ]] \
+        || die "guest NUMA node count (${guest_count}) != host (${host_nodes})"
+
+    # --- Guest vCPU balance (per-node counts may differ by at most 1) ---
+    mapfile -t guest_cpus < <(guest_per_node_values "${guest_logs}" _cpus)
+    echo "# Guest vCPUs per node: ${guest_cpus[*]}"
+    [[ ${#guest_cpus[@]} -ge 2 ]] \
+        || die "expected >= 2 guest NUMA buckets, got ${#guest_cpus[@]}"
+    local diff
+    diff=$(minmax_diff "${guest_cpus[@]}")
+    echo "# Guest vCPU balance diff: ${diff}"
+    [[ "${diff}" -le 1 ]] || die "guest vCPU imbalance: ${guest_cpus[*]}"
+
+    # --- Guest memory presence per node (only the entry count is checked) ---
+    mapfile -t guest_mem < <(guest_per_node_values "${guest_logs}" _mem_kb)
+    echo "# Guest memory per node (kB): ${guest_mem[*]}"
+    [[ ${#guest_mem[@]} -ge 2 ]] || die "expected >= 2 guest memory nodes"
+
+    # --- Host-side vCPU pinning balance (per-node counts may differ by at most 1) ---
+    local qemu_pid host_output
+    qemu_pid=$(get_qemu_pid_for_numa_pod)
+    echo "# QEMU PID: ${qemu_pid}"
+    host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_LARGE}")
+    echo "# Host pinning per NUMA node: ${host_output}"
+
+    mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+')
+    [[ ${#host_counts[@]} -ge 2 ]] \
+        || die "expected >= 2 host NUMA buckets, got ${#host_counts[@]}: ${host_output}"
+    diff=$(minmax_diff "${host_counts[@]}")
+    echo "# Host pinning diff: ${diff}"
+    [[ "${diff}" -le 1 ]] || die "host pinning imbalance: ${host_output}"
+}
+
+@test "NUMA: small workload right-sizes to a single guest NUMA node" {
+    # Scenario: no explicit numa_mapping is set and the sandbox CPU +
+    # memory budget fits comfortably on one host NUMA node.  The runtime
+    # is then expected to collapse the auto-derived multi-node topology
+    # into a single node to preserve memory locality, i.e. the
+    # right-sizing path of selectNUMANodes() on a multi-NUMA host:
+    #   1. The guest reports exactly one NUMA node holding all vCPUs.
+    #   2. All host-side QEMU vCPU threads are pinned to that one host
+    #      NUMA node (done by checkVCPUsPinningNUMA, which also
+    #      handles single-node sandboxes).
+    local why_skip
+    why_skip=$(numa_skip_reason)
+    if [[ -n "${why_skip}" ]]; then skip "${why_skip}"; fi
+
+    local guest_report
+    guest_report=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_SMALL}" "${NUMA_TEST_MEMORY_SMALL}")
+    printf '%s\n' "# Guest NUMA output:"
+    printf '# %s\n' "${guest_report}"
+
+    # --- Guest side: topology must be down to one node ---
+    local online_nodes node_total
+    online_nodes=$(guest_field "${guest_report}" numa_online)
+    node_total=$(guest_online_count "${online_nodes}")
+    echo "# Guest NUMA online: ${online_nodes} -> ${node_total} node(s)"
+    [[ "${node_total}" -eq 1 ]] \
+        || die "right-sized sandbox should expose 1 NUMA node, got ${node_total}"
+
+    mapfile -t guest_cpus < <(guest_per_node_values "${guest_report}" _cpus)
+    echo "# Guest vCPUs per node: ${guest_cpus[*]}"
+    [[ ${#guest_cpus[@]} -eq 1 ]] \
+        || die "expected 1 guest NUMA bucket, got ${#guest_cpus[@]}: ${guest_cpus[*]}"
+    # A default vCPU may be added by the runtime on top of the request,
+    # hence ">=": the guest may see a few more vCPUs than the pod spec.
+    [[ "${guest_cpus[0]}" -ge "${NUMA_TEST_VCPUS_SMALL}" ]] \
+        || die "expected at least ${NUMA_TEST_VCPUS_SMALL} vCPUs on the single node, got ${guest_cpus[0]}"
+
+    mapfile -t guest_mem < <(guest_per_node_values "${guest_report}" _mem_kb)
+    echo "# Guest memory per node (kB): ${guest_mem[*]}"
+    [[ ${#guest_mem[@]} -eq 1 ]] \
+        || die "expected 1 guest memory node, got ${#guest_mem[@]}"
+
+    # --- Host side: vCPU pinning must be down to one node ---
+    local vmm_pid pinning
+    vmm_pid=$(get_qemu_pid_for_numa_pod)
+    echo "# QEMU PID: ${vmm_pid}"
+    pinning=$(wait_for_host_pinning "${vmm_pid}" "${NUMA_TEST_VCPUS_SMALL}")
+    echo "# Host pinning per NUMA node: ${pinning}"
+
+    mapfile -t host_counts < <(grep -oP '^node[0-9]+:\s*\K\d+' <<< "${pinning}")
+    [[ ${#host_counts[@]} -eq 1 ]] \
+        || die "right-sized sandbox vCPU threads should land on a single host NUMA node, got ${#host_counts[@]} buckets: ${pinning}"
+    [[ "${host_counts[0]}" -ge "${NUMA_TEST_VCPUS_SMALL}" ]] \
+        || die "expected at least ${NUMA_TEST_VCPUS_SMALL} vCPU threads pinned, got ${host_counts[0]}: ${pinning}"
+}
+
+@test "NUMA: GPU passthrough with VFIO has correct NUMA placement" {
+    # Full-size GPU pod: the guest NUMA node count must equal the host's,
+    # the passed-through GPU must report its host NUMA node, and vCPUs
+    # must be balanced across nodes both in the guest and on the host.
+    local skip_reason host_nodes guest_logs diff host_output gn
+    skip_reason=$(gpu_numa_skip_reason)
+    [[ -z "${skip_reason}" ]] || skip "${skip_reason}"
+
+    host_nodes=$(host_numa_node_count)
+
+    local gpu_yaml_in="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml.in"
+    local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml"
+
+    POD_NAME_NUMA="${POD_NAME_NUMA_GPU}" NUMA_TEST_VCPUS="${NUMA_TEST_VCPUS_GPU}" \
+        NUMA_TEST_MEMORY="${NUMA_TEST_MEMORY_GPU}" \
+        envsubst < "${gpu_yaml_in}" > "${gpu_yaml}"
+    auto_generate_policy "${policy_settings_dir}" "${gpu_yaml}"
+
+    kubectl apply -f "${gpu_yaml}"
+    kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA_GPU}"
+    sleep 2
+
+    guest_logs=$(kubectl logs "${POD_NAME_NUMA_GPU}")
+    echo "# GPU pod guest NUMA output:"
+    echo "# ${guest_logs}"
+
+    # --- Guest NUMA topology matches host ---
+    local online guest_count
+    online=$(guest_field "${guest_logs}" numa_online)
+    guest_count=$(guest_online_count "${online}")
+    echo "# Guest NUMA online: ${online} -> ${guest_count} node(s); host has ${host_nodes}"
+    [[ "${guest_count}" -eq "${host_nodes}" ]] \
+        || die "GPU pod guest NUMA node count (${guest_count}) != host (${host_nodes})"
+
+    # --- Guest vCPU balance ---
+    mapfile -t guest_cpus < <(guest_per_node_values "${guest_logs}" _cpus)
+    echo "# Guest vCPUs per node: ${guest_cpus[*]}"
+    [[ ${#guest_cpus[@]} -ge 2 ]] \
+        || die "expected >= 2 guest NUMA buckets, got ${#guest_cpus[@]}"
+    diff=$(minmax_diff "${guest_cpus[@]}")
+    echo "# Guest vCPU balance diff: ${diff}"
+    [[ "${diff}" -le 1 ]] || die "GPU pod guest vCPU imbalance: ${guest_cpus[*]}"
+
+    # --- Guest memory presence per node ---
+    mapfile -t guest_mem < <(guest_per_node_values "${guest_logs}" _mem_kb)
+    echo "# Guest memory per node (kB): ${guest_mem[*]}"
+    [[ ${#guest_mem[@]} -ge 2 ]] || die "expected >= 2 guest memory nodes"
+
+    # --- Host-side QEMU lookup (needed for the GPU NUMA assertion) ---
+    local sandbox_id qemu_pid qemu_cmd host_bdf host_node
+    sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \
+        pods --name "${POD_NAME_NUMA_GPU}" -q | head -1)
+    [[ -n "${sandbox_id}" ]] || die "no sandbox id found for GPU pod"
+
+    # "[q]emu" matches QEMU but not the sudo wrapper carrying this very pattern.
+    qemu_pid=$(sudo pgrep -f "[q]emu.*${sandbox_id}" | head -1)
+    [[ -n "${qemu_pid}" ]] || die "no QEMU PID found for GPU sandbox ${sandbox_id}"
+    echo "# QEMU PID: ${qemu_pid}"
+
+    qemu_cmd=$(get_qemu_cmdline "${qemu_pid}")
+    host_bdf=$(extract_vfio_host_bdf "${qemu_cmd}")
+    [[ -n "${host_bdf}" ]] || die "no vfio-pci host BDF found in QEMU cmdline"
+    host_node=$(host_gpu_numa "${host_bdf}")
+    echo "# Host GPU ${host_bdf} on NUMA node ${host_node}"
+
+    # --- Guest GPU NUMA affinity ---
+    # With pxb-pcie and default numa_mapping (1:1), the guest GPU's NUMA
+    # node must equal the host GPU's NUMA node.
+    mapfile -t gpu_numas < <(echo "${guest_logs}" | grep -oP 'gpu_.*_numa:\s*\K-?\d+')
+    echo "# Guest GPU NUMA nodes: ${gpu_numas[*]}"
+    [[ ${#gpu_numas[@]} -ge 1 ]] \
+        || die "no GPU detected in guest sysfs (expected gpu_*_numa: lines)"
+    for gn in "${gpu_numas[@]}"; do
+        [[ "${gn}" -eq "${host_node}" ]] \
+            || die "guest GPU on node ${gn} but host GPU ${host_bdf} is on node ${host_node}"
+    done
+
+    # --- Host-side vCPU pinning balance ---
+    host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_GPU}")
+    echo "# Host pinning per NUMA node: ${host_output}"
+
+    mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+')
+    [[ ${#host_counts[@]} -ge 2 ]] \
+        || die "expected >= 2 host NUMA buckets for GPU pod, got ${#host_counts[@]}: ${host_output}"
+    diff=$(minmax_diff "${host_counts[@]}")
+    echo "# Host pinning diff: ${diff}"
+    [[ "${diff}" -le 1 ]] || die "GPU pod host pinning imbalance: ${host_output}"
+
+    # --- QEMU command line: pxb-pcie and NUMA binding ---
+    echo "# Checking QEMU cmdline for pxb-pcie..."
+    [[ "${qemu_cmd}" == *"pxb-pcie"* ]] \
+        || die "QEMU command line does not contain 'pxb-pcie' — NUMA PCIe topology not active"
+
+    echo "# Checking QEMU cmdline for NUMA memory binding..."
+    [[ "${qemu_cmd}" == *"policy=bind"* ]] \
+        || die "QEMU command line does not contain 'policy=bind' — NUMA memory binding not active"
+}
+
+@test "NUMA: small GPU workload right-sizes to the GPU's host NUMA node" {
+    # When a GPU is attached and the sandbox CPU + memory budget fits on
+    # a single host NUMA node, the runtime's right-sizing path
+    # (selectNUMANodes with VFIO awareness) should collapse the topology
+    # to the GPU's host NUMA node — not just any fitting node — so that
+    # GPU and memory access stay NUMA-local.
+    local skip_reason guest_logs host_output pinned_node gn
+    skip_reason=$(gpu_numa_skip_reason)
+    [[ -z "${skip_reason}" ]] || skip "${skip_reason}"
+
+    local gpu_yaml_in="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml.in"
+    local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml"
+
+    POD_NAME_NUMA="${POD_NAME_NUMA_GPU}" NUMA_TEST_VCPUS="${NUMA_TEST_VCPUS_GPU_SMALL}" \
+        NUMA_TEST_MEMORY="${NUMA_TEST_MEMORY_GPU_SMALL}" \
+        envsubst < "${gpu_yaml_in}" > "${gpu_yaml}"
+    auto_generate_policy "${policy_settings_dir}" "${gpu_yaml}"
+
+    kubectl apply -f "${gpu_yaml}"
+    kubectl wait --for=condition=Ready --timeout="${POD_WAIT_TIMEOUT}" pod "${POD_NAME_NUMA_GPU}"
+    sleep 2
+
+    guest_logs=$(kubectl logs "${POD_NAME_NUMA_GPU}")
+    echo "# Small GPU pod guest NUMA output:"
+    echo "# ${guest_logs}"
+
+    # --- Host-side QEMU lookup ---
+    local sandbox_id qemu_pid qemu_cmd host_bdf host_node
+    sandbox_id=$(sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock \
+        pods --name "${POD_NAME_NUMA_GPU}" -q | head -1)
+    [[ -n "${sandbox_id}" ]] || die "no sandbox id found for GPU pod"
+
+    # "[q]emu" matches QEMU but not the sudo wrapper carrying this very pattern.
+    qemu_pid=$(sudo pgrep -f "[q]emu.*${sandbox_id}" | head -1)
+    [[ -n "${qemu_pid}" ]] || die "no QEMU PID found for GPU sandbox ${sandbox_id}"
+
+    qemu_cmd=$(get_qemu_cmdline "${qemu_pid}")
+    host_bdf=$(extract_vfio_host_bdf "${qemu_cmd}")
+    [[ -n "${host_bdf}" ]] || die "no vfio-pci host BDF found in QEMU cmdline"
+    host_node=$(host_gpu_numa "${host_bdf}")
+    echo "# Host GPU ${host_bdf} on NUMA node ${host_node}"
+
+    # --- Guest collapsed to a single NUMA node ---
+    local online guest_count
+    online=$(guest_field "${guest_logs}" numa_online)
+    guest_count=$(guest_online_count "${online}")
+    echo "# Guest NUMA online: ${online} -> ${guest_count} node(s)"
+    [[ "${guest_count}" -eq 1 ]] \
+        || die "right-sized GPU sandbox should expose 1 NUMA node, got ${guest_count}"
+
+    # --- Guest GPU sees the (single) node ---
+    mapfile -t gpu_numas < <(echo "${guest_logs}" | grep -oP 'gpu_.*_numa:\s*\K-?\d+')
+    echo "# Guest GPU NUMA nodes: ${gpu_numas[*]}"
+    [[ ${#gpu_numas[@]} -ge 1 ]] \
+        || die "no GPU detected in guest sysfs (expected gpu_*_numa: lines)"
+    # In a single-node guest, the GPU is on node 0.
+    for gn in "${gpu_numas[@]}"; do
+        [[ "${gn}" -eq 0 ]] \
+            || die "guest GPU on node ${gn} but right-sized sandbox has only node 0"
+    done
+
+    # --- QEMU memory backend bound to the GPU's host NUMA node ---
+    # The right-sizing path should pick the GPU's host node, not just
+    # any node that fits. With pxb-pcie + right-sizing, the single
+    # memory-backend-ram for the sandbox must have host-nodes=${host_node}.
+    # The node number is anchored so that e.g. host node 1 does not
+    # also match host-nodes=10..19 on hosts with many NUMA nodes.
+    echo "# Checking QEMU cmdline for memory binding on host node ${host_node}..."
+    [[ "${qemu_cmd}" =~ host-nodes=${host_node}([^0-9]|$) ]] \
+        || die "right-sized GPU sandbox memory not bound to GPU's host NUMA node ${host_node}: cmdline=${qemu_cmd}"
+
+    # --- Host-side vCPU pinning collapsed to the GPU's host node ---
+    host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_GPU_SMALL}")
+    echo "# Host pinning per NUMA node: ${host_output}"
+
+    mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+')
+    [[ ${#host_counts[@]} -eq 1 ]] \
+        || die "right-sized GPU sandbox vCPU threads should land on a single host NUMA node, got ${#host_counts[@]} buckets: ${host_output}"
+
+    pinned_node=$(echo "${host_output}" | grep -oP '^node\K[0-9]+' | head -1)
+    [[ "${pinned_node}" -eq "${host_node}" ]] \
+        || die "right-sized GPU sandbox vCPUs pinned to node ${pinned_node} but GPU is on host node ${host_node}"
+}
+
+@test "NUMA: explicit numa_mapping in TOML pins the sandbox to the chosen host node" {
+    # When the user sets numa_mapping = ["1"] in the runtime TOML, the
+    # right-sizing path must be skipped (maybeRightSizeAutoNUMA bails out
+    # for non-empty NUMAMapping) and buildNUMATopology must propagate the
+    # binding verbatim, regardless of how small the workload is.
+    #
+    # Verifies end-to-end that:
+    #   - guest sees exactly 1 NUMA node;
+    #   - the QEMU memory backend is bound to host node 1 (not 0);
+    #   - host-side vCPU threads land on host node 1.
+    #
+    # QEMU-only: this test asserts on the QEMU command line (host-nodes=,
+    # policy=bind) and on the kata-runtime (Go) NUMA logic.  runtime-rs
+    # does not yet implement NUMA, so even if numa_skip_reason were
+    # widened later we'd still want to gate this case explicitly.
+    [[ "${KATA_HYPERVISOR}" == qemu-* ]] \
+        || skip "explicit numa_mapping test is QEMU-only (got ${KATA_HYPERVISOR})"
+
+    local skip_reason
+    skip_reason=$(numa_skip_reason)
+    [[ -z "${skip_reason}" ]] || skip "${skip_reason}"
+
+    # Need at least 2 host nodes so "host node 1" is a non-trivial pick.
+    local host_nodes
+    host_nodes=$(host_numa_node_count)
+    [[ "${host_nodes}" -ge 2 ]] || skip "explicit-mapping test needs >=2 host NUMA nodes"
+
+    # Patch the active runtime config; teardown() restores it.
+    patch_kata_numa_mapping '["1"]'
+
+    local guest_logs
+    guest_logs=$(deploy_and_get_guest_logs "${NUMA_TEST_VCPUS_SMALL}" "${NUMA_TEST_MEMORY_SMALL}")
+    echo "# Guest NUMA output:"
+    echo "# ${guest_logs}"
+
+    # --- Guest: explicit mapping always yields exactly one node ---
+    local online guest_count
+    online=$(guest_field "${guest_logs}" numa_online)
+    guest_count=$(guest_online_count "${online}")
+    echo "# Guest NUMA online: ${online} -> ${guest_count} node(s)"
+    [[ "${guest_count}" -eq 1 ]] \
+        || die "explicit numa_mapping=[1] should expose 1 guest NUMA node, got ${guest_count}"
+
+    # --- QEMU memory backend bound to host node 1 ---
+    local qemu_pid qemu_cmd host_output pinned_node
+    qemu_pid=$(get_qemu_pid_for_numa_pod)
+    qemu_cmd=$(get_qemu_cmdline "${qemu_pid}")
+    # Anchor the node number: a plain substring test on "host-nodes=1"
+    # would also accept host-nodes=10..19 on hosts with many NUMA nodes.
+    echo "# Checking QEMU cmdline for memory binding on host node 1..."
+    [[ "${qemu_cmd}" =~ host-nodes=1([^0-9]|$) ]] \
+        || die "explicit numa_mapping=[1] did not pin QEMU memory to host node 1: cmdline=${qemu_cmd}"
+    [[ "${qemu_cmd}" == *"policy=bind"* ]] \
+        || die "explicit numa_mapping=[1] missing policy=bind in QEMU cmdline: cmdline=${qemu_cmd}"
+
+    # --- Host-side vCPU pinning lands on host node 1 ---
+    host_output=$(wait_for_host_pinning "${qemu_pid}" "${NUMA_TEST_VCPUS_SMALL}")
+    echo "# Host pinning per NUMA node: ${host_output}"
+
+    mapfile -t host_counts < <(echo "${host_output}" | grep -oP '^node[0-9]+:\s*\K\d+')
+    [[ ${#host_counts[@]} -eq 1 ]] \
+        || die "explicit numa_mapping=[1] should pin vCPUs to a single host NUMA node, got ${#host_counts[@]} buckets: ${host_output}"
+
+    pinned_node=$(echo "${host_output}" | grep -oP '^node\K[0-9]+' | head -1)
+    [[ "${pinned_node}" -eq 1 ]] \
+        || die "explicit numa_mapping=[1] pinned vCPUs to node ${pinned_node}, expected 1"
+}
+
+teardown() {
+    local gpu_yaml="${pod_config_dir}/${POD_NAME_NUMA_GPU}.yaml"
+    # Diagnostics first; the GPU pod may not exist, so its errors are muted.
+    echo "=== NUMA test pod describe ==="
+    kubectl describe pod "${POD_NAME_NUMA}" || true
+    kubectl describe pod "${POD_NAME_NUMA_GPU}" 2>/dev/null || true
+    echo "=== NUMA test pod logs ==="
+    kubectl logs "${POD_NAME_NUMA}" || true
+    kubectl logs "${POD_NAME_NUMA_GPU}" 2>/dev/null || true
+
+    # Undo any numa_mapping patch (harmless when none was applied).
+    restore_kata_numa_mapping || true
+    delete_tmp_policy_settings_dir "${policy_settings_dir}"
+
+    # Only delete pods whose manifest file is present on disk.
+    if [ -f "${pod_yaml}" ]; then kubectl delete -f "${pod_yaml}" --ignore-not-found=true; fi
+    if [ -f "${gpu_yaml}" ]; then kubectl delete -f "${gpu_yaml}" --ignore-not-found=true; fi
+
+    print_node_journal_since_test_start "${node}" "${node_start_time:-}" "${BATS_TEST_COMPLETED:-}"
+}
diff --git a/tests/integration/kubernetes/numa-pinning-check.sh b/tests/integration/kubernetes/numa-pinning-check.sh
new file mode 100755
index 0000000000..ead2476e6a
--- /dev/null
+++ b/tests/integration/kubernetes/numa-pinning-check.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# WARNING: This script runs directly on the host, NOT inside a container.
+# It requires privileged access to /proc and /sys to inspect QEMU vCPU
+# thread affinities and map them to host NUMA nodes.
+#
+# Usage: numa-pinning-check.sh <qemu_pid>
+#
+# Output: one line per NUMA node with the count of pinned vCPU threads.
+#   node0: 32
+#   node1: 32
+#
+# A vCPU thread is counted only when taskset reports it pinned to a single
+# CPU (bare number, no ranges or commas).  Threads with broad affinity
+# masks are silently skipped — the caller is expected to retry until the
+# runtime has finished per-vCPU pinning.
+
+set -o pipefail
+
+QEMU_PID="${1:?Usage: $0 <qemu_pid>}"
+
+task_dir="/proc/${QEMU_PID}/task"
+if ! [[ -d "${task_dir}" ]]; then
+    echo "ERROR: /proc/${QEMU_PID}/task not found" >&2
+    exit 1
+fi
+
+# Emit one "nodeN" token per thread pinned to exactly one CPU, then tally per node.
+for task_path in "${task_dir}/"*; do
+    affinity=$(taskset -pc "${task_path##*/}" 2>/dev/null | sed 's/.*: //')
+    # A bare number means a single-CPU pin; ranges and lists are skipped.
+    [[ "${affinity}" =~ ^[0-9]+$ ]] || continue
+    # Resolve the CPU's NUMA node through its sysfs "nodeN" entry.
+    for node_dir in "/sys/devices/system/cpu/cpu${affinity}/node"*; do
+        if [[ -d "${node_dir}" ]]; then
+            echo "node${node_dir##*node}"
+            break
+        fi
+    done
+done | sort | uniq -c | awk '{print $2 ": " $1}'
diff --git a/tests/integration/kubernetes/run_kubernetes_nv_tests.sh b/tests/integration/kubernetes/run_kubernetes_nv_tests.sh
index eda7934858..901b97779b 100644
--- a/tests/integration/kubernetes/run_kubernetes_nv_tests.sh
+++ b/tests/integration/kubernetes/run_kubernetes_nv_tests.sh
@@ -72,6 +72,7 @@ if [[ -n "${K8S_TEST_NV:-}" ]]; then
 	mapfile -d " " -t K8S_TEST_NV <<< "${K8S_TEST_NV}"
 else
 	K8S_TEST_NV=("k8s-confidential-attestation.bats" \
+		"k8s-nvidia-numa.bats" \
 		"k8s-nvidia-cuda.bats" \
 		"k8s-nvidia-nim.bats" \
 		"k8s-nvidia-nim-service.bats")
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in
new file mode 100644
index 0000000000..7167fa271c
--- /dev/null
+++ b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-gpu-test.yaml.in
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${POD_NAME_NUMA}
+  labels:
+    app: ${POD_NAME_NUMA}
+spec:
+  runtimeClassName: kata
+  containers:
+  - name: numa-check
+    image: "quay.io/kata-containers/numa:2026-05-15@sha256:a863fcf95fcbbf63352b0555a61a62537f74399dc4bca826a2e42d001e26accb"
+    resources:
+      requests:
+        cpu: "1"
+        memory: "1Gi"
+      limits:
+        cpu: "${NUMA_TEST_VCPUS}"
+        memory: "${NUMA_TEST_MEMORY}"
+        nvidia.com/pgpu: "1"
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in
new file mode 100644
index 0000000000..731e75a32d
--- /dev/null
+++ b/tests/integration/kubernetes/runtimeclass_workloads/numa-topology-test.yaml.in
@@ -0,0 +1,23 @@
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${POD_NAME_NUMA}
+  labels:
+    app: ${POD_NAME_NUMA}
+spec:
+  runtimeClassName: kata
+  containers:
+  - name: numa-check
+    image: "quay.io/kata-containers/numa:2026-05-15@sha256:a863fcf95fcbbf63352b0555a61a62537f74399dc4bca826a2e42d001e26accb"
+    resources:
+      requests:
+        cpu: "1"
+        memory: "1Gi"
+      limits:
+        cpu: "${NUMA_TEST_VCPUS}"
+        memory: "${NUMA_TEST_MEMORY}"
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile b/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile
new file mode 100644
index 0000000000..7e9f541ae8
--- /dev/null
+++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/Dockerfile
@@ -0,0 +1,17 @@
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Minimal image that reads guest NUMA topology from sysfs.
+# Multi-arch: linux/amd64, linux/arm64
+#
+# Build & push:
+#   docker buildx build --platform linux/amd64,linux/arm64 \
+#       -t quay.io/kata-containers/numa:$(date +%Y-%m-%d) --push .
+
+FROM alpine:3.23
+
+COPY entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md b/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md
new file mode 100644
index 0000000000..8bed127cb8
--- /dev/null
+++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/README.md
@@ -0,0 +1,36 @@
+# NUMA Topology Check Container
+
+Minimal container image that reads guest NUMA topology from sysfs and
+prints structured output to stdout.  Used by `k8s-nvidia-numa.bats` to
+verify guest NUMA node count, vCPU distribution, and memory layout
+without needing `kubectl exec` (which requires CoCo policy overrides).
+
+## Image
+
+`quay.io/kata-containers/numa:<date>`
+
+## Build and push (multi-arch)
+
+```bash
+cd tests/integration/kubernetes/runtimeclass_workloads/numa/
+
+docker buildx build --platform linux/amd64,linux/arm64 \
+    -t quay.io/kata-containers/numa:$(date +%Y-%m-%d) --push .
+```
+
+After pushing, update the image reference (including digest) in both
+`numa-topology-test.yaml.in` and `numa-topology-gpu-test.yaml.in`.
+
+## Output format
+
+The entrypoint prints one `key: value` pair per line:
+
+```
+numa_online: 0-1
+node0_cpus: 32
+node1_cpus: 32
+node0_mem_kb: 37078332
+node1_mem_kb: 37125524
+```
+
+The bats test parses this output from `kubectl logs`.
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh b/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh
new file mode 100755
index 0000000000..1a8f970305
--- /dev/null
+++ b/tests/integration/kubernetes/runtimeclass_workloads/numa/entrypoint.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+#
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Reads guest NUMA topology from sysfs and prints structured output.
+# Designed to run inside a kata VM as the container entrypoint.
+#
+# Output format (one key: value per line):
+#   numa_online: 0-1
+#   node0_cpus: 32
+#   node1_cpus: 32
+#   node0_mem_kb: 37078332
+#   node1_mem_kb: 37125524
+#   gpu_0000:41:00.0_numa: 1       (only if GPUs are present)
+
+set -e
+
+# Everything is written to stdout so it can be fetched with
+# "kubectl logs".  The script then sleeps forever so the pod stays up
+# while the host-side pinning check inspects the QEMU process; the
+# bats test is responsible for deleting the pod afterwards.
+# Keys printed: numa_online, nodeN_cpus, nodeN_mem_kb, gpu_<BDF>_numa.
+
+# Online NUMA node list as the kernel reports it (e.g. "0-1" or "0").
+online_nodes=$(cat /sys/devices/system/node/online)
+echo "numa_online: ${online_nodes}"
+
+# vCPU count of every node, derived from its cpulist.
+for cpulist_path in /sys/devices/system/node/node*/cpulist; do
+    node_dir=${cpulist_path%/*}
+    node_name=${node_dir##*/}
+    ranges=$(cat "${cpulist_path}")
+    total=0
+    # The list is comma-separated, e.g. "0-31,64-95".  Each piece is
+    # either "lo-hi" or a bare CPU number; for a bare number both
+    # expansions below leave the piece untouched, so it counts as 1.
+    IFS=","
+    for piece in ${ranges}; do
+        first=${piece%-*}
+        last=${piece#*-}
+        total=$((total + last - first + 1))
+    done
+    # Back to default field splitting for the rest of the script.
+    unset IFS
+    echo "${node_name}_cpus: ${total}"
+done
+
+# MemTotal of every node, taken from the node's meminfo.
+for meminfo_path in /sys/devices/system/node/node*/meminfo; do
+    node_dir=${meminfo_path%/*}
+    node_name=${node_dir##*/}
+    # Field 4 of the MemTotal line is the size in kB.
+    mem_kb=$(awk '/MemTotal/ {print $4}' "${meminfo_path}")
+    echo "${node_name}_mem_kb: ${mem_kb}"
+done
+
+# NUMA node of each GPU visible on the PCI bus (VFIO passthrough).
+# Only PCI class 0x030200 (3D controller, which is what NVIDIA data
+# center GPUs such as A100/H100 report) is considered.
+for numa_file in /sys/bus/pci/devices/*/numa_node; do
+    dev_dir=${numa_file%/*}
+    # Devices whose class cannot be read are skipped.
+    class=$(cat "${dev_dir}/class" 2>/dev/null) || continue
+    # Anything that is not a 3D controller is ignored.
+    [ "${class}" = "0x030200" ] || continue
+    node=$(cat "${numa_file}")
+    echo "gpu_${dev_dir##*/}_numa: ${node}"
+done
+
+# Block forever so the host side can still inspect the running VM.
+exec sleep infinity
diff --git a/tests/spellcheck/kata-dictionary.txt b/tests/spellcheck/kata-dictionary.txt
index 66fb7076b1..e5701c4b82 100644
--- a/tests/spellcheck/kata-dictionary.txt
+++ b/tests/spellcheck/kata-dictionary.txt
@@ -20,6 +20,7 @@ materialx
 # Hardware & Architecture
 AMD
 APQN
+chiplet
 cpuid
 DCAP
 DGPU
@@ -78,6 +79,7 @@ ttrpc
 vsock
 
 # Container, Runtime & Misc terms
+Burstable
 cgroupsv1
 coredump
 CPUSET
diff --git a/tools/packaging/qemu/patches/11.0.x/no_patches.txt b/tools/packaging/qemu/patches/11.0.x/no_patches.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch b/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch
new file mode 100644
index 0000000000..b80adaa58f
--- /dev/null
+++ b/tools/packaging/qemu/patches/tag_patches/gpu-snp-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch
@@ -0,0 +1,94 @@
+From 6b0eaa20aa91e9d82e0bf72b4ade6e83d18a4c9f Mon Sep 17 00:00:00 2001
+From: Ashish Kalra <ashish.kalra@amd.com>
+Date: Thu, 18 Sep 2025 22:10:35 +0000
+Subject: [PATCH] accel/kvm: Fix kvm_convert_memory calls crossing memory
+ regions
+
+Page conversion call can span multiple memory regions, potentially
+resulting in a conversion failure if the memory range being converted
+extends beyond the boundaries of the referenced memory region.
+
+Handle the case of page conversion call straddling across memory
+regions.
+
+Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
+Signed-off-by: Michael Roth <michael.roth@amd.com>
+---
+ accel/kvm/kvm-all.c | 27 ++++++++++++++++++++-------
+ 1 file changed, 20 insertions(+), 7 deletions(-)
+
+diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
+index 63230743d0..a1b2c3e5f4 100644
+--- a/accel/kvm/kvm-all.c
++++ b/accel/kvm/kvm-all.c
+@@ -3342,6 +3342,7 @@ static void kvm_eat_signals(CPUState *cpu)
+ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+ {
+     MemoryRegionSection section;
++    hwaddr convert_size;
+     ram_addr_t offset;
+     MemoryRegion *mr;
+     RAMBlock *rb;
+@@ -3359,6 +3360,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+         return ret;
+     }
+ 
++    /*
++     * Page conversions can span multiple memory regions, for example, if two
++     * memory backends are added to support two different NUMA nodes/policies.
++     */
++next_memory_region:
+     section = memory_region_find(get_system_memory(), start, size);
+     mr = section.mr;
+     if (!mr) {
+@@ -3397,10 +3403,13 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+         goto out_unref;
+     }
+ 
++    convert_size = (section.offset_within_region + size > mr->size) ?
++                   mr->size - section.offset_within_region : size;
++
+     if (to_private) {
+-        ret = kvm_set_memory_attributes_private(start, size);
++        ret = kvm_set_memory_attributes_private(start, convert_size);
+     } else {
+-        ret = kvm_set_memory_attributes_shared(start, size);
++        ret = kvm_set_memory_attributes_shared(start, convert_size);
+     }
+     if (ret) {
+         goto out_unref;
+@@ -3410,11 +3419,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+     rb = qemu_ram_block_from_host(addr, false, &offset);
+ 
+     ret = ram_block_attributes_state_change(rb->attributes,
+-                                            offset, size, to_private);
++                                            offset, convert_size, to_private);
+     if (ret) {
+         error_report("Failed to notify the listener the state change of "
+                      "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s",
+-                     start, size, to_private ? "private" : "shared");
++                     start, convert_size, to_private ? "private" : "shared");
+         goto out_unref;
+     }
+ 
+@@ -3426,9 +3435,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+              */
+             goto out_unref;
+         }
+-        ret = ram_block_discard_range(rb, offset, size);
++        ret = ram_block_discard_range(rb, offset, convert_size);
+     } else {
+-        ret = ram_block_discard_guest_memfd_range(rb, offset, size);
++        ret = ram_block_discard_guest_memfd_range(rb, offset, convert_size);
++    }
++
++    if (size - convert_size) {
++        start += convert_size;
++        size -= convert_size;
++        goto next_memory_region;
+     }
+ 
+ out_unref:
+-- 
+2.43.0
+
diff --git a/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch b/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch
new file mode 100644
index 0000000000..b80adaa58f
--- /dev/null
+++ b/tools/packaging/qemu/patches/tag_patches/gpu-tdx-20260430/0001-accel-kvm-Fix-kvm_convert_memory-calls-crossing-memo.patch
@@ -0,0 +1,94 @@
+From 6b0eaa20aa91e9d82e0bf72b4ade6e83d18a4c9f Mon Sep 17 00:00:00 2001
+From: Ashish Kalra <ashish.kalra@amd.com>
+Date: Thu, 18 Sep 2025 22:10:35 +0000
+Subject: [PATCH] accel/kvm: Fix kvm_convert_memory calls crossing memory
+ regions
+
+Page conversion call can span multiple memory regions, potentially
+resulting in a conversion failure if the memory range being converted
+extends beyond the boundaries of the referenced memory region.
+
+Handle the case of page conversion call straddling across memory
+regions.
+
+Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
+Signed-off-by: Michael Roth <michael.roth@amd.com>
+---
+ accel/kvm/kvm-all.c | 27 ++++++++++++++++++++-------
+ 1 file changed, 20 insertions(+), 7 deletions(-)
+
+diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
+index 63230743d0..a1b2c3e5f4 100644
+--- a/accel/kvm/kvm-all.c
++++ b/accel/kvm/kvm-all.c
+@@ -3342,6 +3342,7 @@ static void kvm_eat_signals(CPUState *cpu)
+ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+ {
+     MemoryRegionSection section;
++    hwaddr convert_size;
+     ram_addr_t offset;
+     MemoryRegion *mr;
+     RAMBlock *rb;
+@@ -3359,6 +3360,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+         return ret;
+     }
+ 
++    /*
++     * Page conversions can span multiple memory regions, for example, if two
++     * memory backends are added to support two different NUMA nodes/policies.
++     */
++next_memory_region:
+     section = memory_region_find(get_system_memory(), start, size);
+     mr = section.mr;
+     if (!mr) {
+@@ -3397,10 +3403,13 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+         goto out_unref;
+     }
+ 
++    convert_size = (section.offset_within_region + size > mr->size) ?
++                   mr->size - section.offset_within_region : size;
++
+     if (to_private) {
+-        ret = kvm_set_memory_attributes_private(start, size);
++        ret = kvm_set_memory_attributes_private(start, convert_size);
+     } else {
+-        ret = kvm_set_memory_attributes_shared(start, size);
++        ret = kvm_set_memory_attributes_shared(start, convert_size);
+     }
+     if (ret) {
+         goto out_unref;
+@@ -3410,11 +3419,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+     rb = qemu_ram_block_from_host(addr, false, &offset);
+ 
+     ret = ram_block_attributes_state_change(rb->attributes,
+-                                            offset, size, to_private);
++                                            offset, convert_size, to_private);
+     if (ret) {
+         error_report("Failed to notify the listener the state change of "
+                      "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s",
+-                     start, size, to_private ? "private" : "shared");
++                     start, convert_size, to_private ? "private" : "shared");
+         goto out_unref;
+     }
+ 
+@@ -3426,9 +3435,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+              */
+             goto out_unref;
+         }
+-        ret = ram_block_discard_range(rb, offset, size);
++        ret = ram_block_discard_range(rb, offset, convert_size);
+     } else {
+-        ret = ram_block_discard_guest_memfd_range(rb, offset, size);
++        ret = ram_block_discard_guest_memfd_range(rb, offset, convert_size);
++    }
++
++    if (size - convert_size) {
++        start += convert_size;
++        size -= convert_size;
++        goto next_memory_region;
+     }
+ 
+ out_unref:
+-- 
+2.43.0
+
diff --git a/tools/packaging/qemu/patches/tag_patches/v11.0.0/no_patches.txt b/tools/packaging/qemu/patches/tag_patches/v11.0.0/no_patches.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/versions.yaml b/versions.yaml
index 93cad9e9f1..69fb34b0ba 100644
--- a/versions.yaml
+++ b/versions.yaml
@@ -88,8 +88,8 @@ assets:
     qemu:
       description: "VMM that uses KVM"
       url: "https://github.com/qemu/qemu"
-      version: "v10.2.1"
-      tag: "v10.2.1"
+      version: "v11.0.0"
+      tag: "v11.0.0"
       # Do not include any non-full release versions
       # Break the line *without CR or space being appended*, to appease
       # yamllint, and note the deliberate ' ' at the end of the expression.
@@ -107,12 +107,12 @@ assets:
     qemu-snp-experimental:
       description: "QEMU with GPU+SNP support"
       url: "https://github.com/confidential-containers/qemu.git"
-      tag: "gpu-snp-20260107"
+      tag: "gpu-snp-20260430"
 
     qemu-tdx-experimental:
       description: "QEMU with GPU+TDX support"
       url: "https://github.com/confidential-containers/qemu.git"
-      tag: "gpu-tdx-20260107"
+      tag: "gpu-tdx-20260430"
 
     stratovirt:
       description: "StratoVirt is an lightweight opensource VMM"