runtime: Add host NUMA distance discovery and build guest NUMA topology

Add sysfs-based host NUMA distance reading (GetHostNUMADistances) that parses /sys/devices/system/node/nodeN/distance to mirror the host NUMA distance matrix into the guest via -numa dist entries. Implement buildNUMATopology() which translates the GuestNUMANodes configuration into govmm NUMANode and NUMADist slices. Each guest NUMA node gets a share of vCPUs proportional to its host CPU count (at least one vCPU per node, with remainder vCPUs going to the nodes with the most host CPUs), and guest memory is split across the nodes to match. This handles the common Kata case of +1 VMM overhead vCPU gracefully. Memory backends are selected based on hugepages/virtio-fs/file-backed-mem configuration. Guard multi-NUMA topology generation to amd64 and arm64 only, since other architectures (s390x, riscv64) do not support QEMU NUMA/DIMM. Wire buildNUMATopology() into CreateVM so the QEMU config includes NUMA nodes and distances. Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com> Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-07-01 06:28:11 +00:00 · 2026-04-14 15:03:49 +02:00
parent 447e2a3faf
commit d0d7deb262
9 changed files with 919 additions and 5 deletions
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -1071,6 +1071,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		IOMMU:                         h.IOMMU,
 		IOMMUPlatform:                 h.getIOMMUPlatform(),
 		GuestNUMANodes:                h.defaultGuestNUMANodes(),
+		NUMAMapping:                   append([]string(nil), h.NUMAMapping...),
 		FileBackedMemRootDir:          h.FileBackedMemRootDir,
 		FileBackedMemRootList:         h.FileBackedMemRootList,
 		Debug:                         h.Debug,
@@ -1994,6 +1995,10 @@ func checkConfig(config oci.RuntimeConfig) error {
 		return err
 	}

+	if err := checkNumaConfig(config); err != nil {
+		return err
+	}
+
 	hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
 	coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
 	machineType := config.HypervisorConfig.HypervisorMachineType
@@ -2005,6 +2010,25 @@ func checkConfig(config oci.RuntimeConfig) error {
 	return nil
 }

+func checkNumaConfig(config oci.RuntimeConfig) error {
+	if len(config.HypervisorConfig.GuestNUMANodes) <= 1 {
+		return nil // zero or one guest NUMA node: nothing to validate
+	}
+	// More than one guest NUMA node is only accepted on amd64 and arm64 builds.
+	switch goruntime.GOARCH {
+	case "amd64", "arm64":
+	default:
+		return fmt.Errorf("multi-NUMA support is only available on amd64 and arm64, got %q", goruntime.GOARCH)
+	}
+	// It is also rejected unless static sandbox resource management is enabled.
+	if !config.StaticSandboxResourceMgmt {
+		return fmt.Errorf("NUMA support requires static_sandbox_resource_mgmt to be enabled; " +
+			"NUMA topology is not compatible with dynamic CPU/memory hotplug")
+	}
+
+	return nil
+}
+
 // checkPCIeConfig ensures the PCIe configuration is valid.
 // Only allow one of the following settings for cold-plug:
 // no-port, root-port, switch-port
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -794,11 +794,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
 	}

 	if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
-		guestNUMANodes, err := vcutils.GetGuestNUMANodes(strings.Fields(annotation))
+		mapping := strings.Fields(annotation)
+		guestNUMANodes, err := vcutils.GetGuestNUMANodes(mapping)
 		if err != nil {
 			return err
 		}
 		sbConfig.HypervisorConfig.GuestNUMANodes = guestNUMANodes
+		// Record the raw user-provided mapping so the hypervisor
+		// backend honors it verbatim instead of right-sizing.
+		sbConfig.HypervisorConfig.NUMAMapping = mapping
 	}

 	return nil
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -803,6 +803,15 @@ type HypervisorConfig struct {
 	// GuestNUMANodes defines guest NUMA topology and mapping to host NUMA nodes and CPUs.
 	GuestNUMANodes []types.GuestNUMANode

+	// NUMAMapping is the raw user-provided NUMA mapping (TOML
+	// `numa_mapping` or the io.katacontainers.config.hypervisor.numa_mapping
+	// annotation). When empty, GuestNUMANodes was auto-derived from the
+	// host topology and may be right-sized at sandbox creation (e.g.
+	// collapsed to a single host node when the sandbox fits, or
+	// restricted to host nodes containing attached VFIO devices). When
+	// non-empty, the topology is honored verbatim.
+	NUMAMapping []string
+
 	// DisableNestingChecks is used to override customizations performed
 	// when running on top of another VMM.
 	DisableNestingChecks bool
--- a/src/runtime/virtcontainers/qemu.go
+++ b/src/runtime/virtcontainers/qemu.go
@@ -21,6 +21,7 @@ import (
 	"os/user"
 	"path/filepath"
 	"regexp"
+	goruntime "runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -251,6 +252,14 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
 	span, _ := katatrace.Trace(ctx, q.Logger(), "setup", qemuTracingTags, map[string]string{"sandbox_id": q.id})
 	defer span.End()

+	// Right-size auto-derived NUMA topology before snapshotting the config.
+	// We mutate the caller-owned pointer so the sandbox's shared
+	// HypervisorConfig (used by vCPU pinning and cpuset.mems forwarding)
+	// observes the same trimmed topology that QEMU is launched with.
+	// No-op when numa_mapping was set explicitly or when the topology
+	// already has one or zero nodes.
+	maybeRightSizeAutoNUMA(hypervisorConfig, q.Logger())
+
 	if err := q.setConfig(hypervisorConfig); err != nil {
 		return err
 	}
@@ -326,8 +335,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
 	return nil
 }

-func (q *qemu) cpuTopology() govmmQemu.SMP {
-	return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, q.config.NumGuestNUMANodes())
+func (q *qemu) cpuTopology(effectiveNUMANodes uint32) govmmQemu.SMP {
+	return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, effectiveNUMANodes)
 }

 func (q *qemu) memoryTopology() (govmmQemu.Memory, error) {
@@ -996,7 +1005,13 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
 		return err
 	}

-	smp := q.cpuTopology()
+	numaNodes, numaDists, err := q.buildNUMATopology()
+	if err != nil {
+		return err
+	}
+
+	effectiveNUMANodes := uint32(len(numaNodes))
+	smp := q.cpuTopology(effectiveNUMANodes)

 	memory, err := q.memoryTopology()
 	if err != nil {
@@ -1117,6 +1132,8 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
 		QMPSockets:     qmpSockets,
 		Knobs:          knobs,
 		Incoming:       incoming,
+		NUMANodes:      numaNodes,
+		NUMADists:      numaDists,
 		VGA:            "none",
 		GlobalParam:    "kvm-pit.lost_tick_policy=discard",
 		Bios:           firmwarePath,
--- a/src/runtime/virtcontainers/qemu_test.go
+++ b/src/runtime/virtcontainers/qemu_test.go
@@ -19,6 +19,7 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"runtime"
 	"testing"

 	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
@@ -29,6 +30,7 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
 	"github.com/pbnjay/memory"
 	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"
 )

@@ -283,7 +285,7 @@ func TestQemuCPUTopology(t *testing.T) {
 		MaxCPUs: uint32(vcpus),
 	}

-	smp := q.cpuTopology()
+	smp := q.cpuTopology(0)
 	assert.Exactly(smp, expectedOut)
 }

@@ -1200,3 +1202,570 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) {
 	// State should remain unchanged
 	assert.Equal(100, q.state.HotpluggedMemory)
 }
+
+func TestBuildNUMATopologySingleNode(t *testing.T) {
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-3"},
+			},
+		},
+	}
+	nodes, dists, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Nil(nodes)
+	assert.Nil(dists)
+}
+
+func TestBuildNUMATopologyTwoNodes(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	nodes, _, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Len(nodes, 2)
+
+	assert.Equal(uint32(0), nodes[0].NodeID)
+	assert.Equal("0-1", nodes[0].CPUs)
+	assert.Equal("512M", nodes[0].MemSize)
+	assert.Equal("memory-backend-ram", nodes[0].MemBackendType)
+
+	assert.Equal(uint32(1), nodes[1].NodeID)
+	assert.Equal("2-3", nodes[1].CPUs)
+	assert.Equal("512M", nodes[1].MemSize)
+}
+
+func TestBuildNUMATopologyHugePages(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	if _, err := os.Stat("/dev/hugepages"); err != nil {
+		t.Skip("skipping: /dev/hugepages not available")
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			HugePages:       true,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	nodes, _, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Len(nodes, 2)
+	assert.Equal("memory-backend-file", nodes[0].MemBackendType)
+	assert.Equal("/dev/hugepages", nodes[0].MemBackendPath)
+	assert.Equal("512M", nodes[0].MemSize)
+}
+
+func TestBuildNUMATopologyVirtioFS(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1024,
+			SharedFS:        config.VirtioFS,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	nodes, _, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Len(nodes, 2)
+	assert.Equal("memory-backend-file", nodes[0].MemBackendType)
+	assert.Equal(fallbackFileBackedMemDir, nodes[0].MemBackendPath)
+}
+
+func TestBuildNUMATopologyFileBackedMem(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	tmpDir := t.TempDir()
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs:      4,
+			MemorySize:           1024,
+			FileBackedMemRootDir: tmpDir,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	nodes, _, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Len(nodes, 2)
+	assert.Equal("memory-backend-file", nodes[0].MemBackendType)
+	assert.Equal(tmpDir, nodes[0].MemBackendPath)
+}
+
+func TestBuildNUMATopologyTooFewVCPUs(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 1,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0"},
+				{HostNodes: "1", HostCPUs: "1"},
+			},
+		},
+	}
+	nodes, dists, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Nil(nodes)
+	assert.Nil(dists)
+}
+
+func TestBuildNUMATopologyUnevenVCPUs(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 5,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-4"},
+			},
+		},
+	}
+	nodes, _, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Len(nodes, 2)
+	// cpuTopology() rounds MaxCPUs to ceil(5/2)*2=6, so 6 CPU slots
+	// are distributed proportionally: 2 host CPUs → 2 vCPUs,
+	// 3 host CPUs → 4 vCPUs (3 proportional + 1 remainder).
+	assert.Equal("0-1", nodes[0].CPUs)
+	assert.Equal("2-5", nodes[1].CPUs)
+}
+
+func TestBuildNUMATopologyMemMisaligned(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 4,
+			MemorySize:      1,
+			HugePages:       true,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-1"},
+				{HostNodes: "1", HostCPUs: "2-3"},
+			},
+		},
+	}
+	_, _, err := q.buildNUMATopology()
+	assert.Error(err)
+	assert.Contains(err.Error(), "cannot be evenly distributed")
+}
+
+func TestBuildNUMATopologyMemMisalignedRemainder(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 6,
+			MemorySize:      1025,
+			HugePages:       true,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-2"},
+				{HostNodes: "1", HostCPUs: "3-5"},
+			},
+		},
+	}
+	_, _, err := q.buildNUMATopology()
+	assert.Error(err)
+	assert.Contains(err.Error(), "cannot be evenly distributed")
+}
+
+func TestBuildNUMATopologyEvenMemory(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 6,
+			MemorySize:      1024,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-2"},
+				{HostNodes: "1", HostCPUs: "3-5"},
+			},
+		},
+	}
+	nodes, _, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Len(nodes, 2)
+
+	assert.Equal("0-2", nodes[0].CPUs)
+	assert.Equal("512M", nodes[0].MemSize)
+
+	assert.Equal("3-5", nodes[1].CPUs)
+	assert.Equal("512M", nodes[1].MemSize)
+}
+
+func TestBuildNUMATopologyProportionalVCPUs(t *testing.T) {
+	if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
+		t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
+	}
+	assert := assert.New(t)
+	q := &qemu{
+		config: HypervisorConfig{
+			DefaultMaxVCPUs: 10,
+			MemorySize:      1000,
+			GuestNUMANodes: []types.GuestNUMANode{
+				{HostNodes: "0", HostCPUs: "0-7"},
+				{HostNodes: "1", HostCPUs: "8-9"},
+			},
+		},
+	}
+	nodes, _, err := q.buildNUMATopology()
+	assert.NoError(err)
+	assert.Len(nodes, 2)
+	// 8 out of 10 host CPUs on node 0 → 8 vCPUs
+	assert.Equal("0-7", nodes[0].CPUs)
+	assert.Equal("800M", nodes[0].MemSize)
+	// 2 out of 10 host CPUs on node 1 → 2 vCPUs
+	assert.Equal("8-9", nodes[1].CPUs)
+	assert.Equal("200M", nodes[1].MemSize)
+}
+
+func TestBuildCoveredHostNodes(t *testing.T) {
+	assert := assert.New(t)
+
+	covered := buildCoveredHostNodes([]types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-3"},
+		{HostNodes: "1", HostCPUs: "4-7"},
+	})
+	assert.Len(covered, 2)
+	assert.Equal(uint32(0), covered[0])
+	assert.Equal(uint32(1), covered[1])
+}
+
+func TestBuildCoveredHostNodesRange(t *testing.T) {
+	assert := assert.New(t)
+	// One guest node spanning host nodes 0-1 is expected to yield an entry for each host node, both with value 0.
+	covered := buildCoveredHostNodes([]types.GuestNUMANode{
+		{HostNodes: "0-1", HostCPUs: "0-7"},
+	})
+	assert.Len(covered, 2)
+	assert.Equal(uint32(0), covered[0])
+	assert.Equal(uint32(0), covered[1])
+}
+
+func TestBuildCoveredHostNodesEmpty(t *testing.T) {
+	assert := assert.New(t)
+
+	covered := buildCoveredHostNodes(nil)
+	assert.Len(covered, 0)
+}
+
+func TestBuildCoveredHostNodesInvalidParse(t *testing.T) {
+	assert := assert.New(t)
+	// The unparsable first entry is expected to be dropped; only the second entry (input index 1, host node 1) remains.
+	covered := buildCoveredHostNodes([]types.GuestNUMANode{
+		{HostNodes: "invalid", HostCPUs: "0-3"},
+		{HostNodes: "1", HostCPUs: "4-7"},
+	})
+	assert.Len(covered, 1)
+	assert.Equal(uint32(1), covered[1])
+}
+
+// silentLogger returns a logrus.Entry that discards all output, suitable
+// for use in unit tests that exercise NUMA right-sizing decisions.
+func silentLogger() *logrus.Entry {
+	l := logrus.New()
+	l.Out = io.Discard
+	return logrus.NewEntry(l)
+}
+
+// fakeCapFn returns a hostNUMACapFn backed by a static map. Unknown nodes
+// produce an error so we exercise the "skip unknown" branch in
+// sumNUMACapacity when intended.
+func fakeCapFn(caps map[int]struct {
+	cpus  int
+	memMB uint64
+}) hostNUMACapFn {
+	return func(nodeID int) (int, uint64, error) {
+		if c, ok := caps[nodeID]; ok {
+			return c.cpus, c.memMB, nil
+		}
+		return 0, 0, fmt.Errorf("unknown host NUMA node %d", nodeID)
+	}
+}
+
+// twoNodeHostCaps describes a typical 2-socket host: 32 CPUs and 128 GiB
+// per node.
+func twoNodeHostCaps() map[int]struct {
+	cpus  int
+	memMB uint64
+} {
+	return map[int]struct {
+		cpus  int
+		memMB uint64
+	}{
+		0: {cpus: 32, memMB: 128 * 1024},
+		1: {cpus: 32, memMB: 128 * 1024},
+	}
+}
+
+func twoNodeAutoTopology() []types.GuestNUMANode {
+	return []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-31"},
+		{HostNodes: "1", HostCPUs: "32-63"},
+	}
+}
+
+func TestSumNUMACapacity(t *testing.T) {
+	assert := assert.New(t)
+
+	cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(twoNodeHostCaps()))
+	assert.Equal(64, cpus)
+	assert.Equal(uint64(256*1024), memMB)
+}
+
+func TestSumNUMACapacityDeduplicatesHostNodes(t *testing.T) {
+	assert := assert.New(t)
+
+	// Two guest entries that both reference host node 0 must only count
+	// once. The merged "0-1" entry adds host node 1.
+	nodes := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-31"},
+		{HostNodes: "0-1", HostCPUs: "0-63"},
+	}
+	cpus, memMB := sumNUMACapacity(nodes, fakeCapFn(twoNodeHostCaps()))
+	assert.Equal(64, cpus)
+	assert.Equal(uint64(256*1024), memMB)
+}
+
+func TestSumNUMACapacitySkipsUnknown(t *testing.T) {
+	assert := assert.New(t)
+
+	caps := map[int]struct {
+		cpus  int
+		memMB uint64
+	}{
+		0: {cpus: 16, memMB: 32 * 1024},
+		// host node 1 missing on purpose
+	}
+	cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(caps))
+	assert.Equal(16, cpus)
+	assert.Equal(uint64(32*1024), memMB)
+}
+
+func TestSelectNUMANodesPassthroughForSingleNode(t *testing.T) {
+	assert := assert.New(t)
+
+	in := []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}}
+	out := selectNUMANodes(in, 4, 1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger())
+	assert.Equal(in, out)
+}
+
+func TestSelectNUMANodesNoVFIOFitsOneNode(t *testing.T) {
+	// Small sandbox (8 vCPUs / 16 GiB) fits comfortably in one host node:
+	// expect collapse to the first guest node.
+	assert := assert.New(t)
+
+	in := twoNodeAutoTopology()
+	out := selectNUMANodes(in, 8, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger())
+	assert.Len(out, 1)
+	assert.Equal("0", out[0].HostNodes)
+}
+
+func TestSelectNUMANodesNoVFIOExceedsOneNode(t *testing.T) {
+	// 64 vCPUs needs both 32-CPU nodes: expect full topology.
+	assert := assert.New(t)
+
+	in := twoNodeAutoTopology()
+	out := selectNUMANodes(in, 64, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger())
+	assert.Equal(in, out)
+}
+
+func TestSelectNUMANodesNoVFIOMemoryExceedsOneNode(t *testing.T) {
+	// CPU fits in one node but memory does not: expect full topology.
+	assert := assert.New(t)
+
+	in := twoNodeAutoTopology()
+	out := selectNUMANodes(in, 8, 200*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger())
+	assert.Equal(in, out)
+}
+
+func TestSelectNUMANodesVFIOSubsetFits(t *testing.T) {
+	// VFIO device on host node 1; sandbox fits in one node: expect
+	// collapse to the guest node covering host node 1.
+	assert := assert.New(t)
+
+	in := twoNodeAutoTopology()
+	vfio := map[int]struct{}{1: {}}
+	out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger())
+	assert.Len(out, 1)
+	assert.Equal("1", out[0].HostNodes)
+}
+
+func TestSelectNUMANodesVFIOSubsetTooSmall(t *testing.T) {
+	// VFIO device on host node 1, but sandbox needs more than one node's
+	// worth of memory: expect the full topology so the sandbox actually
+	// fits, even at the cost of cross-NUMA traffic.
+	assert := assert.New(t)
+
+	in := twoNodeAutoTopology()
+	vfio := map[int]struct{}{1: {}}
+	out := selectNUMANodes(in, 8, 200*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger())
+	assert.Equal(in, out)
+}
+
+func TestSelectNUMANodesVFIOSpansAllNodes(t *testing.T) {
+	// One VFIO device per host node: VFIO subset == full topology, no
+	// collapse possible. Result is the input unchanged.
+	assert := assert.New(t)
+
+	in := twoNodeAutoTopology()
+	vfio := map[int]struct{}{0: {}, 1: {}}
+	out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger())
+	assert.Equal(in, out)
+}
+
+func TestSelectNUMANodesVFIONoCoverage(t *testing.T) {
+	// VFIO host node not represented in the guest topology (rare, but can
+	// happen if numa_mapping has been customized). Keep the full topology
+	// rather than dropping all nodes.
+	assert := assert.New(t)
+
+	in := twoNodeAutoTopology()
+	vfio := map[int]struct{}{2: {}}
+	out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger())
+	assert.Equal(in, out)
+}
+
+// rightSizeNUMAWithFakeCaps mirrors maybeRightSizeAutoNUMA but lets tests
+// inject a synthetic capacity oracle in place of realHostNUMACapFn so the
+// decision is hermetic.
+func rightSizeNUMAWithFakeCaps(hc *HypervisorConfig, capFn hostNUMACapFn) {
+	if hc == nil || len(hc.NUMAMapping) > 0 || len(hc.GuestNUMANodes) <= 1 {
+		return
+	}
+	hc.GuestNUMANodes = selectNUMANodes(
+		hc.GuestNUMANodes,
+		hc.DefaultMaxVCPUs,
+		uint64(hc.MemorySize),
+		nil, // no VFIO devices in this test
+		capFn,
+		silentLogger(),
+	)
+}
+
+func TestMaybeRightSizeAutoNUMACollapsesToOneNode(t *testing.T) {
+	// Empty NUMAMapping (auto) + sandbox fits in one host node:
+	// GuestNUMANodes is trimmed to a single entry.
+	assert := assert.New(t)
+
+	hc := &HypervisorConfig{
+		DefaultMaxVCPUs: 1,
+		MemorySize:      1,
+		GuestNUMANodes:  twoNodeAutoTopology(),
+	}
+	rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps()))
+	assert.Len(hc.GuestNUMANodes, 1)
+	assert.Equal("0", hc.GuestNUMANodes[0].HostNodes)
+}
+
+func TestMaybeRightSizeAutoNUMAExplicitMappingHonored(t *testing.T) {
+	// Non-empty NUMAMapping (user-provided) is left untouched, even if
+	// the sandbox would fit in a single node.
+	assert := assert.New(t)
+
+	hc := &HypervisorConfig{
+		DefaultMaxVCPUs: 1,
+		MemorySize:      1,
+		NUMAMapping:     []string{"0", "1"},
+		GuestNUMANodes:  twoNodeAutoTopology(),
+	}
+	rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps()))
+	assert.Len(hc.GuestNUMANodes, 2)
+}
+
+func TestMaybeRightSizeAutoNUMAKeepsFullWhenSandboxSpansNodes(t *testing.T) {
+	// Sandbox needs more CPUs than a single host node has: full topology
+	// is preserved.
+	assert := assert.New(t)
+
+	hc := &HypervisorConfig{
+		DefaultMaxVCPUs: 64, // > one node's 32 CPUs
+		MemorySize:      1024,
+		GuestNUMANodes:  twoNodeAutoTopology(),
+	}
+	rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps()))
+	assert.Len(hc.GuestNUMANodes, 2)
+}
+
+func TestMaybeRightSizeAutoNUMANoOpForFlatTopology(t *testing.T) {
+	// A topology with ≤ 1 node is a no-op regardless of NUMAMapping or
+	// budget.
+	assert := assert.New(t)
+
+	for _, tc := range []struct {
+		name string
+		hc   *HypervisorConfig
+	}{
+		{
+			name: "nil config",
+			hc:   nil,
+		},
+		{
+			name: "single node",
+			hc: &HypervisorConfig{
+				GuestNUMANodes: []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}},
+			},
+		},
+		{
+			name: "empty",
+			hc:   &HypervisorConfig{},
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			before := 0
+			if tc.hc != nil {
+				before = len(tc.hc.GuestNUMANodes)
+			}
+			rightSizeNUMAWithFakeCaps(tc.hc, fakeCapFn(twoNodeHostCaps()))
+			after := 0
+			if tc.hc != nil {
+				after = len(tc.hc.GuestNUMANodes)
+			}
+			assert.Equal(before, after)
+		})
+	}
+}
--- a/src/runtime/virtcontainers/utils/utils.go
+++ b/src/runtime/virtcontainers/utils/utils.go
@@ -13,6 +13,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"regexp"
+	"strconv"
 	"strings"
 	"syscall"
 	"time"
@@ -623,3 +624,182 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) {

 	return numaNodes, nil
 }
+
+// NUMADistEntry represents a single NUMA distance measurement between two nodes.
+type NUMADistEntry struct {
+	Src uint32 // node the distance is measured from
+	Dst uint32 // node the distance is measured to
+	Val uint32 // distance value between Src and Dst
+}
+
+// GetHostNUMADistances mirrors host NUMA distances for the given guest nodes.
+// Every guest node is represented by the first host node ID in its HostNodes
+// set; that node's host distance row is read and the column at each other
+// guest node's representative host ID becomes one entry. Src and Dst are
+// indices into nodes, self-distances are omitted, and nodes or values that
+// cannot be parsed or read are skipped silently instead of failing the call.
+func GetHostNUMADistances(nodes []types.GuestNUMANode) []NUMADistEntry {
+	hostNodeIDs := make([]int, len(nodes))
+	for i, n := range nodes {
+		nodeSet, err := cpuset.Parse(n.HostNodes)
+		if err != nil {
+			hostNodeIDs[i] = -1
+			continue
+		}
+		ids := nodeSet.ToSlice()
+		if len(ids) == 0 {
+			hostNodeIDs[i] = -1
+			continue
+		}
+		hostNodeIDs[i] = ids[0]
+	}
+	// NOTE(review): columns are looked up by host node ID below; confirm the sysfs row is not compacted when host node IDs are sparse.
+	var dists []NUMADistEntry
+	for srcIdx, srcNode := range nodes {
+		if hostNodeIDs[srcIdx] < 0 {
+			continue
+		}
+		distStr := getHostNUMADistance(srcNode.HostNodes)
+		if distStr == "" {
+			continue
+		}
+		fields := strings.Fields(distStr)
+		for dstIdx := range nodes {
+			if srcIdx == dstIdx {
+				continue
+			}
+			hostID := hostNodeIDs[dstIdx]
+			if hostID < 0 || hostID >= len(fields) {
+				continue
+			}
+			val, err := strconv.ParseUint(fields[hostID], 10, 32)
+			if err != nil {
+				continue
+			}
+			dists = append(dists, NUMADistEntry{
+				Src: uint32(srcIdx),
+				Dst: uint32(dstIdx),
+				Val: uint32(val),
+			})
+		}
+	}
+	return dists
+}
+
+// HostNUMANodeCapacity describes the CPU and memory capacity of a single
+// host NUMA node, as seen via sysfs.
+type HostNUMANodeCapacity struct {
+	NodeID int    // host NUMA node ID this capacity belongs to
+	CPUs   int    // number of CPUs in the node's CPU list
+	MemMB  uint64 // memory size of the node in MiB
+}
+
+// GetHostNUMANodeCapacity returns the CPU count and memory size (in MiB) of
+// the given host NUMA node; on error the returned value is only partly filled.
+func GetHostNUMANodeCapacity(nodeID int) (HostNUMANodeCapacity, error) {
+	cap := HostNUMANodeCapacity{NodeID: nodeID}
+	cpuList, err := getHostNUMANodeCPUs(nodeID)
+	if err != nil {
+		return cap, err
+	}
+	cs, err := cpuset.Parse(cpuList)
+	if err != nil {
+		return cap, fmt.Errorf("parse host node %d cpulist %q: %w", nodeID, cpuList, err)
+	}
+	cap.CPUs = cs.Size()
+	memMB, err := getHostNUMANodeMemoryMB(nodeID)
+	if err != nil {
+		return cap, err
+	}
+	cap.MemMB = memMB
+	return cap, nil
+}
+
+// GetHostNUMANodeCapacities returns the capacities of the given host NUMA
+// node IDs in the same order. It stops at the first node that cannot be
+// read and returns the capacities collected so far (so the slice may be
+// shorter than the input) together with an error naming the failing node.
+func GetHostNUMANodeCapacities(nodeIDs []int) ([]HostNUMANodeCapacity, error) {
+	out := make([]HostNUMANodeCapacity, 0, len(nodeIDs))
+	for _, id := range nodeIDs {
+		c, err := GetHostNUMANodeCapacity(id)
+		if err != nil {
+			return out, fmt.Errorf("read host NUMA node %d capacity: %w", id, err)
+		}
+		out = append(out, c)
+	}
+	return out, nil
+}
+
+// DistributeVCPUsProportionally splits totalVCPUs across numaNodes in
+// proportion to the size of each node's HostCPUs set and returns one count
+// per node, in input order. Every node gets at least 1 vCPU. It fails when
+// there are no nodes, fewer vCPUs than nodes, or an unparsable/empty HostCPUs.
+func DistributeVCPUsProportionally(numaNodes []types.GuestNUMANode, totalVCPUs uint32) ([]uint32, error) {
+	numNodes := len(numaNodes)
+	if numNodes == 0 {
+		return nil, fmt.Errorf("no NUMA nodes")
+	}
+	if totalVCPUs < uint32(numNodes) {
+		return nil, fmt.Errorf("totalVCPUs (%d) must be >= NUMA node count (%d)", totalVCPUs, numNodes)
+	}
+
+	hostCPUCounts := make([]int, numNodes)
+	totalHostCPUs := 0
+	for i, gn := range numaNodes {
+		parsed, err := cpuset.Parse(gn.HostCPUs)
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %w", i, err)
+		}
+		if parsed.Size() == 0 {
+			return nil, fmt.Errorf("HostCPUs for NUMA node %d must not be empty", i)
+		}
+		hostCPUCounts[i] = parsed.Size()
+		totalHostCPUs += hostCPUCounts[i]
+	}
+	if totalHostCPUs == 0 {
+		return nil, fmt.Errorf("total host CPU count is 0")
+	}
+	// First pass: floor of the proportional share, raised to 1 so no node ends up without a vCPU.
+	vcpusPerNode := make([]uint32, numNodes)
+	var assigned uint32
+	for i := range numaNodes {
+		vcpusPerNode[i] = uint32(int(totalVCPUs) * hostCPUCounts[i] / totalHostCPUs)
+		if vcpusPerNode[i] == 0 {
+			vcpusPerNode[i] = 1
+		}
+		assigned += vcpusPerNode[i]
+	}
+
+	// Use a copy for remainder distribution to avoid mutating the original counts.
+	weights := make([]int, numNodes)
+	copy(weights, hostCPUCounts)
+	// Hand out vCPUs lost to integer division one at a time to the node with the largest remaining weight (first wins ties).
+	for assigned < totalVCPUs {
+		bestIdx := 0
+		for i := 1; i < numNodes; i++ {
+			if weights[i] > weights[bestIdx] {
+				bestIdx = i
+			}
+		}
+		vcpusPerNode[bestIdx]++
+		assigned++
+		weights[bestIdx]--
+	}
+	// The 1-vCPU floor can over-assign; take vCPUs back from the node holding the most, never going below 1.
+	for assigned > totalVCPUs {
+		bestIdx := 0
+		for i := 1; i < numNodes; i++ {
+			if vcpusPerNode[i] > vcpusPerNode[bestIdx] {
+				bestIdx = i
+			}
+		}
+		if vcpusPerNode[bestIdx] <= 1 {
+			break
+		}
+		vcpusPerNode[bestIdx]--
+		assigned--
+	}
+
+	return vcpusPerNode, nil
+}
--- a/src/runtime/virtcontainers/utils/utils_darwin.go
+++ b/src/runtime/virtcontainers/utils/utils_darwin.go
@@ -22,3 +22,11 @@ func getHostNUMANodes() ([]int, error) {
 func getHostNUMANodeCPUs(nodeId int) (string, error) {
 	return "", nil
 }
+
+func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) {
+	return 0, nil // stub: always reports 0 MiB and no error, whatever nodeId is
+}
+
+func getHostNUMADistance(hostNodes string) string {
+	return "" // stub: always reports an empty distance row, whatever hostNodes is
+}
--- a/src/runtime/virtcontainers/utils/utils_linux.go
+++ b/src/runtime/virtcontainers/utils/utils_linux.go
@@ -12,6 +12,8 @@ import (
 	"io"
 	"math/big"
 	"os"
+	"regexp"
+	"strconv"
 	"strings"
 	"syscall"
 	"time"
@@ -23,6 +25,8 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
 )

+var nodeMemTotalRegexp = regexp.MustCompile(`Node\s+\d+\s+MemTotal:\s+(\d+)\s+kB`) // group 1 captures the MemTotal value in kB
+
 var ioctlFunc = Ioctl

 // maxUInt represents the maximum valid value for the context ID.
@@ -220,3 +224,41 @@ func getHostNUMANodeCPUs(nodeId int) (string, error) {
 	}
 	return strings.TrimSuffix(string(data), "\n"), nil
 }
+
+// getHostNUMANodeMemoryMB returns the total memory in MiB for the given
+// host NUMA node, parsed from /sys/devices/system/node/nodeN/meminfo.
+func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) {
+	fileName := fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeId)
+	data, err := os.ReadFile(fileName)
+	if err != nil {
+		return 0, err
+	}
+	m := nodeMemTotalRegexp.FindSubmatch(data)
+	if m == nil {
+		return 0, fmt.Errorf("MemTotal not found in %s", fileName)
+	}
+	kb, err := strconv.ParseUint(string(m[1]), 10, 64)
+	if err != nil {
+		return 0, err
+	}
+	return kb / 1024, nil // kB to MiB, rounding down
+}
+
+// getHostNUMADistance reads the sysfs distance row of the first host NUMA node
+// in hostNodes (e.g. "0" or "0-1"); it returns "" if parsing or reading fails.
+func getHostNUMADistance(hostNodes string) string {
+	nodeSet, err := cpuset.Parse(hostNodes)
+	if err != nil {
+		return ""
+	}
+	ids := nodeSet.ToSlice()
+	if len(ids) == 0 {
+		return ""
+	}
+	fileName := fmt.Sprintf("/sys/devices/system/node/node%d/distance", ids[0])
+	data, err := os.ReadFile(fileName)
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSuffix(string(data), "\n")
+}
--- a/src/runtime/virtcontainers/utils/utils_test.go
+++ b/src/runtime/virtcontainers/utils/utils_test.go
@@ -19,6 +19,8 @@ import (
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"
+
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 )

 const waitLocalProcessTimeoutSecs = 3
@@ -754,3 +756,62 @@ func TestDockerNetnsPath(t *testing.T) {
 	}
 	assert.Equal("", DockerNetnsPath(spec))
 }
+
+func TestDistributeVCPUsProportionallySymmetric(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0-3"},
+		{HostCPUs: "4-7"},
+	}
+	dist, err := DistributeVCPUsProportionally(nodes, 8)
+	assert.NoError(err)
+	assert.Equal([]uint32{4, 4}, dist)
+}
+
+func TestDistributeVCPUsProportionallyAsymmetric(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0-7"},
+		{HostCPUs: "8-9"},
+	}
+	dist, err := DistributeVCPUsProportionally(nodes, 10)
+	assert.NoError(err)
+	assert.Equal([]uint32{8, 2}, dist)
+}
+
+func TestDistributeVCPUsProportionallyMinOnePerNode(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0-99"},
+		{HostCPUs: "100"},
+	}
+	dist, err := DistributeVCPUsProportionally(nodes, 2)
+	assert.NoError(err)
+	assert.Equal(uint32(1), dist[0])
+	assert.Equal(uint32(1), dist[1])
+}
+
+func TestDistributeVCPUsProportionallyThreeNodes(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0-5"},
+		{HostCPUs: "6-8"},
+		{HostCPUs: "9"},
+	}
+	// 6+3+1=10 host CPUs, 10 vCPUs: proportional = 6, 3, 1
+	dist, err := DistributeVCPUsProportionally(nodes, 10)
+	assert.NoError(err)
+	assert.Equal([]uint32{6, 3, 1}, dist)
+}
+
+func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) {
+	assert := assert.New(t)
+	nodes := []types.GuestNUMANode{
+		{HostCPUs: "0"},
+		{HostCPUs: "1"},
+		{HostCPUs: "2"},
+	}
+	_, err := DistributeVCPUsProportionally(nodes, 2)
+	assert.Error(err)
+	assert.Contains(err.Error(), "must be >= NUMA node count")
+}