runtime: Add host NUMA distance discovery and build guest NUMA topology

Add sysfs-based host NUMA distance reading (GetHostNUMADistances) that
parses /sys/devices/system/node/nodeN/distance to mirror the host NUMA
distance matrix into the guest via -numa dist entries.

Implement buildNUMATopology() which translates the GuestNUMANodes
configuration into govmm NUMANode and NUMADist slices. Each guest NUMA
node gets a share of vCPUs and memory weighted by its host CPU count;
every node gets at least one vCPU and remainder vCPUs go to the nodes
with the most host CPUs. This handles the common Kata case of +1 VMM
overhead vCPU gracefully. Memory backends are selected based on
hugepages/virtio-fs/file-backed-mem configuration.

Guard multi-NUMA topology generation to amd64 and arm64 only, since
other architectures (s390x, riscv64) do not support QEMU NUMA/DIMM.

Wire buildNUMATopology() into CreateVM so the QEMU config includes NUMA
nodes and distances.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
Fabiano Fidêncio
2026-04-14 15:03:49 +02:00
committed by Fabiano Fidêncio
parent 447e2a3faf
commit d0d7deb262
9 changed files with 919 additions and 5 deletions

View File

@@ -1071,6 +1071,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
IOMMU: h.IOMMU,
IOMMUPlatform: h.getIOMMUPlatform(),
GuestNUMANodes: h.defaultGuestNUMANodes(),
NUMAMapping: append([]string(nil), h.NUMAMapping...),
FileBackedMemRootDir: h.FileBackedMemRootDir,
FileBackedMemRootList: h.FileBackedMemRootList,
Debug: h.Debug,
@@ -1994,6 +1995,10 @@ func checkConfig(config oci.RuntimeConfig) error {
return err
}
if err := checkNumaConfig(config); err != nil {
return err
}
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
machineType := config.HypervisorConfig.HypervisorMachineType
@@ -2005,6 +2010,25 @@ func checkConfig(config oci.RuntimeConfig) error {
return nil
}
// checkNumaConfig validates the guest NUMA settings of the runtime
// configuration. Topologies with at most one guest NUMA node are always
// accepted. A multi-node topology is rejected unless the runtime is built
// for amd64 or arm64 and static sandbox resource management is enabled.
func checkNumaConfig(config oci.RuntimeConfig) error {
	// A flat (zero- or single-node) topology needs no further validation.
	if len(config.HypervisorConfig.GuestNUMANodes) <= 1 {
		return nil
	}

	arch := goruntime.GOARCH
	if arch != "amd64" && arch != "arm64" {
		return fmt.Errorf("multi-NUMA support is only available on amd64 and arm64, got %q", arch)
	}

	if config.StaticSandboxResourceMgmt {
		return nil
	}
	return fmt.Errorf("NUMA support requires static_sandbox_resource_mgmt to be enabled; " +
		"NUMA topology is not compatible with dynamic CPU/memory hotplug")
}
// checkPCIeConfig ensures the PCIe configuration is valid.
// Only allow one of the following settings for cold-plug:
// no-port, root-port, switch-port

View File

@@ -794,11 +794,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
}
if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
guestNUMANodes, err := vcutils.GetGuestNUMANodes(strings.Fields(annotation))
mapping := strings.Fields(annotation)
guestNUMANodes, err := vcutils.GetGuestNUMANodes(mapping)
if err != nil {
return err
}
sbConfig.HypervisorConfig.GuestNUMANodes = guestNUMANodes
// Record the raw user-provided mapping so the hypervisor
// backend honors it verbatim instead of right-sizing.
sbConfig.HypervisorConfig.NUMAMapping = mapping
}
return nil

View File

@@ -803,6 +803,15 @@ type HypervisorConfig struct {
// GuestNUMANodes defines guest NUMA topology and mapping to host NUMA nodes and CPUs.
GuestNUMANodes []types.GuestNUMANode
// NUMAMapping is the raw user-provided NUMA mapping (TOML
// `numa_mapping` or the io.katacontainers.config.hypervisor.numa_mapping
// annotation). When empty, GuestNUMANodes was auto-derived from the
// host topology and may be right-sized at sandbox creation (e.g.
// collapsed to a single host node when the sandbox fits, or
// restricted to host nodes containing attached VFIO devices). When
// non-empty, the topology is honored verbatim.
NUMAMapping []string
// DisableNestingChecks is used to override customizations performed
// when running on top of another VMM.
DisableNestingChecks bool

View File

@@ -21,6 +21,7 @@ import (
"os/user"
"path/filepath"
"regexp"
goruntime "runtime"
"strconv"
"strings"
"sync"
@@ -251,6 +252,14 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
span, _ := katatrace.Trace(ctx, q.Logger(), "setup", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
// Right-size auto-derived NUMA topology before snapshotting the config.
// We mutate the caller-owned pointer so the sandbox's shared
// HypervisorConfig (used by vCPU pinning and cpuset.mems forwarding)
// observes the same trimmed topology that QEMU is launched with.
// No-op when numa_mapping was set explicitly or when the topology
// already has one or zero nodes.
maybeRightSizeAutoNUMA(hypervisorConfig, q.Logger())
if err := q.setConfig(hypervisorConfig); err != nil {
return err
}
@@ -326,8 +335,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
return nil
}
func (q *qemu) cpuTopology() govmmQemu.SMP {
return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, q.config.NumGuestNUMANodes())
// cpuTopology asks the architecture backend for the SMP topology, passing
// the configured vCPU count, DefaultMaxVCPUs and effectiveNUMANodes, the
// NUMA node count supplied by the caller.
func (q *qemu) cpuTopology(effectiveNUMANodes uint32) govmmQemu.SMP {
return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, effectiveNUMANodes)
}
func (q *qemu) memoryTopology() (govmmQemu.Memory, error) {
@@ -996,7 +1005,13 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
return err
}
smp := q.cpuTopology()
numaNodes, numaDists, err := q.buildNUMATopology()
if err != nil {
return err
}
effectiveNUMANodes := uint32(len(numaNodes))
smp := q.cpuTopology(effectiveNUMANodes)
memory, err := q.memoryTopology()
if err != nil {
@@ -1117,6 +1132,8 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
QMPSockets: qmpSockets,
Knobs: knobs,
Incoming: incoming,
NUMANodes: numaNodes,
NUMADists: numaDists,
VGA: "none",
GlobalParam: "kvm-pit.lost_tick_policy=discard",
Bios: firmwarePath,

View File

@@ -19,6 +19,7 @@ import (
"os"
"path"
"path/filepath"
"runtime"
"testing"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
@@ -29,6 +30,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
"github.com/pbnjay/memory"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
)
@@ -283,7 +285,7 @@ func TestQemuCPUTopology(t *testing.T) {
MaxCPUs: uint32(vcpus),
}
smp := q.cpuTopology()
smp := q.cpuTopology(0)
assert.Exactly(smp, expectedOut)
}
@@ -1200,3 +1202,570 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) {
// State should remain unchanged
assert.Equal(100, q.state.HotpluggedMemory)
}
func TestBuildNUMATopologySingleNode(t *testing.T) {
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 4,
MemorySize: 1024,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-3"},
},
},
}
nodes, dists, err := q.buildNUMATopology()
assert.NoError(err)
assert.Nil(nodes)
assert.Nil(dists)
}
func TestBuildNUMATopologyTwoNodes(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 4,
MemorySize: 1024,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-1"},
{HostNodes: "1", HostCPUs: "2-3"},
},
},
}
nodes, _, err := q.buildNUMATopology()
assert.NoError(err)
assert.Len(nodes, 2)
assert.Equal(uint32(0), nodes[0].NodeID)
assert.Equal("0-1", nodes[0].CPUs)
assert.Equal("512M", nodes[0].MemSize)
assert.Equal("memory-backend-ram", nodes[0].MemBackendType)
assert.Equal(uint32(1), nodes[1].NodeID)
assert.Equal("2-3", nodes[1].CPUs)
assert.Equal("512M", nodes[1].MemSize)
}
func TestBuildNUMATopologyHugePages(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
if _, err := os.Stat("/dev/hugepages"); err != nil {
t.Skip("skipping: /dev/hugepages not available")
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 4,
MemorySize: 1024,
HugePages: true,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-1"},
{HostNodes: "1", HostCPUs: "2-3"},
},
},
}
nodes, _, err := q.buildNUMATopology()
assert.NoError(err)
assert.Len(nodes, 2)
assert.Equal("memory-backend-file", nodes[0].MemBackendType)
assert.Equal("/dev/hugepages", nodes[0].MemBackendPath)
assert.Equal("512M", nodes[0].MemSize)
}
func TestBuildNUMATopologyVirtioFS(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 4,
MemorySize: 1024,
SharedFS: config.VirtioFS,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-1"},
{HostNodes: "1", HostCPUs: "2-3"},
},
},
}
nodes, _, err := q.buildNUMATopology()
assert.NoError(err)
assert.Len(nodes, 2)
assert.Equal("memory-backend-file", nodes[0].MemBackendType)
assert.Equal(fallbackFileBackedMemDir, nodes[0].MemBackendPath)
}
func TestBuildNUMATopologyFileBackedMem(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
tmpDir := t.TempDir()
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 4,
MemorySize: 1024,
FileBackedMemRootDir: tmpDir,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-1"},
{HostNodes: "1", HostCPUs: "2-3"},
},
},
}
nodes, _, err := q.buildNUMATopology()
assert.NoError(err)
assert.Len(nodes, 2)
assert.Equal("memory-backend-file", nodes[0].MemBackendType)
assert.Equal(tmpDir, nodes[0].MemBackendPath)
}
func TestBuildNUMATopologyTooFewVCPUs(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 1,
MemorySize: 1024,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0"},
{HostNodes: "1", HostCPUs: "1"},
},
},
}
nodes, dists, err := q.buildNUMATopology()
assert.NoError(err)
assert.Nil(nodes)
assert.Nil(dists)
}
func TestBuildNUMATopologyUnevenVCPUs(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 5,
MemorySize: 1024,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-1"},
{HostNodes: "1", HostCPUs: "2-4"},
},
},
}
nodes, _, err := q.buildNUMATopology()
assert.NoError(err)
assert.Len(nodes, 2)
// cpuTopology() rounds MaxCPUs to ceil(5/2)*2=6, so 6 CPU slots
// are distributed proportionally: 2 host CPUs → 2 vCPUs,
// 3 host CPUs → 4 vCPUs (3 proportional + 1 remainder).
assert.Equal("0-1", nodes[0].CPUs)
assert.Equal("2-5", nodes[1].CPUs)
}
func TestBuildNUMATopologyMemMisaligned(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 4,
MemorySize: 1,
HugePages: true,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-1"},
{HostNodes: "1", HostCPUs: "2-3"},
},
},
}
_, _, err := q.buildNUMATopology()
assert.Error(err)
assert.Contains(err.Error(), "cannot be evenly distributed")
}
func TestBuildNUMATopologyMemMisalignedRemainder(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 6,
MemorySize: 1025,
HugePages: true,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-2"},
{HostNodes: "1", HostCPUs: "3-5"},
},
},
}
_, _, err := q.buildNUMATopology()
assert.Error(err)
assert.Contains(err.Error(), "cannot be evenly distributed")
}
func TestBuildNUMATopologyEvenMemory(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 6,
MemorySize: 1024,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-2"},
{HostNodes: "1", HostCPUs: "3-5"},
},
},
}
nodes, _, err := q.buildNUMATopology()
assert.NoError(err)
assert.Len(nodes, 2)
assert.Equal("0-2", nodes[0].CPUs)
assert.Equal("512M", nodes[0].MemSize)
assert.Equal("3-5", nodes[1].CPUs)
assert.Equal("512M", nodes[1].MemSize)
}
func TestBuildNUMATopologyProportionalVCPUs(t *testing.T) {
if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" {
t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH)
}
assert := assert.New(t)
q := &qemu{
config: HypervisorConfig{
DefaultMaxVCPUs: 10,
MemorySize: 1000,
GuestNUMANodes: []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-7"},
{HostNodes: "1", HostCPUs: "8-9"},
},
},
}
nodes, _, err := q.buildNUMATopology()
assert.NoError(err)
assert.Len(nodes, 2)
// 8 out of 10 host CPUs on node 0 → 8 vCPUs
assert.Equal("0-7", nodes[0].CPUs)
assert.Equal("800M", nodes[0].MemSize)
// 2 out of 10 host CPUs on node 1 → 2 vCPUs
assert.Equal("8-9", nodes[1].CPUs)
assert.Equal("200M", nodes[1].MemSize)
}
func TestBuildCoveredHostNodes(t *testing.T) {
assert := assert.New(t)
covered := buildCoveredHostNodes([]types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-3"},
{HostNodes: "1", HostCPUs: "4-7"},
})
assert.Len(covered, 2)
assert.Equal(uint32(0), covered[0])
assert.Equal(uint32(1), covered[1])
}
func TestBuildCoveredHostNodesRange(t *testing.T) {
assert := assert.New(t)
covered := buildCoveredHostNodes([]types.GuestNUMANode{
{HostNodes: "0-1", HostCPUs: "0-7"},
})
assert.Len(covered, 2)
assert.Equal(uint32(0), covered[0])
assert.Equal(uint32(0), covered[1])
}
// TestBuildCoveredHostNodesEmpty checks that a nil guest node list yields
// an empty result.
func TestBuildCoveredHostNodesEmpty(t *testing.T) {
	assert.Len(t, buildCoveredHostNodes(nil), 0)
}
func TestBuildCoveredHostNodesInvalidParse(t *testing.T) {
assert := assert.New(t)
covered := buildCoveredHostNodes([]types.GuestNUMANode{
{HostNodes: "invalid", HostCPUs: "0-3"},
{HostNodes: "1", HostCPUs: "4-7"},
})
assert.Len(covered, 1)
assert.Equal(uint32(1), covered[1])
}
// silentLogger builds a logrus.Entry whose logger writes to io.Discard, so
// unit tests exercising NUMA right-sizing decisions produce no log output.
func silentLogger() *logrus.Entry {
	quiet := logrus.New()
	quiet.Out = io.Discard

	entry := logrus.NewEntry(quiet)
	return entry
}
// fakeCapFn wraps a static capacity table in a hostNUMACapFn. Looking up a
// node that is absent from the table returns an error, which lets tests
// drive the "skip unknown" branch of sumNUMACapacity on purpose.
func fakeCapFn(caps map[int]struct {
	cpus  int
	memMB uint64
}) hostNUMACapFn {
	return func(nodeID int) (int, uint64, error) {
		entry, known := caps[nodeID]
		if !known {
			return 0, 0, fmt.Errorf("unknown host NUMA node %d", nodeID)
		}
		return entry.cpus, entry.memMB, nil
	}
}
// twoNodeHostCaps returns the capacity table of a typical 2-socket host:
// host nodes 0 and 1, each with 32 CPUs and 128 GiB of memory.
func twoNodeHostCaps() map[int]struct {
	cpus  int
	memMB uint64
} {
	// Both sockets are identical, so describe one and reuse it.
	perSocket := struct {
		cpus  int
		memMB uint64
	}{cpus: 32, memMB: 128 * 1024}

	return map[int]struct {
		cpus  int
		memMB uint64
	}{
		0: perSocket,
		1: perSocket,
	}
}
// twoNodeAutoTopology returns a two-node guest topology: guest node 0 maps
// to host node 0 (CPUs 0-31) and guest node 1 to host node 1 (CPUs 32-63).
func twoNodeAutoTopology() []types.GuestNUMANode {
	first := types.GuestNUMANode{HostNodes: "0", HostCPUs: "0-31"}
	second := types.GuestNUMANode{HostNodes: "1", HostCPUs: "32-63"}
	return []types.GuestNUMANode{first, second}
}
func TestSumNUMACapacity(t *testing.T) {
assert := assert.New(t)
cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(twoNodeHostCaps()))
assert.Equal(64, cpus)
assert.Equal(uint64(256*1024), memMB)
}
func TestSumNUMACapacityDeduplicatesHostNodes(t *testing.T) {
assert := assert.New(t)
// Two guest entries that both reference host node 0 must only count
// once. The merged "0-1" entry adds host node 1.
nodes := []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-31"},
{HostNodes: "0-1", HostCPUs: "0-63"},
}
cpus, memMB := sumNUMACapacity(nodes, fakeCapFn(twoNodeHostCaps()))
assert.Equal(64, cpus)
assert.Equal(uint64(256*1024), memMB)
}
func TestSumNUMACapacitySkipsUnknown(t *testing.T) {
assert := assert.New(t)
caps := map[int]struct {
cpus int
memMB uint64
}{
0: {cpus: 16, memMB: 32 * 1024},
// host node 1 missing on purpose
}
cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(caps))
assert.Equal(16, cpus)
assert.Equal(uint64(32*1024), memMB)
}
func TestSelectNUMANodesPassthroughForSingleNode(t *testing.T) {
assert := assert.New(t)
in := []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}}
out := selectNUMANodes(in, 4, 1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger())
assert.Equal(in, out)
}
func TestSelectNUMANodesNoVFIOFitsOneNode(t *testing.T) {
// Small sandbox (8 vCPUs / 16 GiB) fits comfortably in one host node:
// expect collapse to the first guest node.
assert := assert.New(t)
in := twoNodeAutoTopology()
out := selectNUMANodes(in, 8, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger())
assert.Len(out, 1)
assert.Equal("0", out[0].HostNodes)
}
func TestSelectNUMANodesNoVFIOExceedsOneNode(t *testing.T) {
// 64 vCPUs needs both 32-CPU nodes: expect full topology.
assert := assert.New(t)
in := twoNodeAutoTopology()
out := selectNUMANodes(in, 64, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger())
assert.Equal(in, out)
}
func TestSelectNUMANodesNoVFIOMemoryExceedsOneNode(t *testing.T) {
// CPU fits in one node but memory does not: expect full topology.
assert := assert.New(t)
in := twoNodeAutoTopology()
out := selectNUMANodes(in, 8, 200*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger())
assert.Equal(in, out)
}
func TestSelectNUMANodesVFIOSubsetFits(t *testing.T) {
// VFIO device on host node 1; sandbox fits in one node: expect
// collapse to the guest node covering host node 1.
assert := assert.New(t)
in := twoNodeAutoTopology()
vfio := map[int]struct{}{1: {}}
out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger())
assert.Len(out, 1)
assert.Equal("1", out[0].HostNodes)
}
func TestSelectNUMANodesVFIOSubsetTooSmall(t *testing.T) {
// VFIO device on host node 1, but sandbox needs more than one node's
// worth of memory: expect the full topology so the sandbox actually
// fits, even at the cost of cross-NUMA traffic.
assert := assert.New(t)
in := twoNodeAutoTopology()
vfio := map[int]struct{}{1: {}}
out := selectNUMANodes(in, 8, 200*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger())
assert.Equal(in, out)
}
func TestSelectNUMANodesVFIOSpansAllNodes(t *testing.T) {
// One VFIO device per host node: VFIO subset == full topology, no
// collapse possible. Result is the input unchanged.
assert := assert.New(t)
in := twoNodeAutoTopology()
vfio := map[int]struct{}{0: {}, 1: {}}
out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger())
assert.Equal(in, out)
}
func TestSelectNUMANodesVFIONoCoverage(t *testing.T) {
// VFIO host node not represented in the guest topology (rare, but can
// happen if numa_mapping has been customized). Keep the full topology
// rather than dropping all nodes.
assert := assert.New(t)
in := twoNodeAutoTopology()
vfio := map[int]struct{}{2: {}}
out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger())
assert.Equal(in, out)
}
// rightSizeNUMAWithFakeCaps is a test stand-in for maybeRightSizeAutoNUMA
// that takes the capacity oracle as a parameter instead of using
// realHostNUMACapFn, which keeps the decision hermetic. It leaves hc
// untouched when hc is nil, when NUMAMapping is non-empty, or when there is
// at most one guest NUMA node; otherwise it replaces hc.GuestNUMANodes with
// the selectNUMANodes result.
func rightSizeNUMAWithFakeCaps(hc *HypervisorConfig, capFn hostNUMACapFn) {
	if hc == nil {
		return
	}
	explicitMapping := len(hc.NUMAMapping) > 0
	flatTopology := len(hc.GuestNUMANodes) <= 1
	if explicitMapping || flatTopology {
		return
	}

	memMB := uint64(hc.MemorySize)
	// nil: no VFIO devices in this test.
	hc.GuestNUMANodes = selectNUMANodes(hc.GuestNUMANodes, hc.DefaultMaxVCPUs, memMB, nil, capFn, silentLogger())
}
func TestMaybeRightSizeAutoNUMACollapsesToOneNode(t *testing.T) {
// Empty NUMAMapping (auto) + sandbox fits in one host node:
// GuestNUMANodes is trimmed to a single entry.
assert := assert.New(t)
hc := &HypervisorConfig{
DefaultMaxVCPUs: 1,
MemorySize: 1,
GuestNUMANodes: twoNodeAutoTopology(),
}
rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps()))
assert.Len(hc.GuestNUMANodes, 1)
assert.Equal("0", hc.GuestNUMANodes[0].HostNodes)
}
func TestMaybeRightSizeAutoNUMAExplicitMappingHonored(t *testing.T) {
// Non-empty NUMAMapping (user-provided) is left untouched, even if
// the sandbox would fit in a single node.
assert := assert.New(t)
hc := &HypervisorConfig{
DefaultMaxVCPUs: 1,
MemorySize: 1,
NUMAMapping: []string{"0", "1"},
GuestNUMANodes: twoNodeAutoTopology(),
}
rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps()))
assert.Len(hc.GuestNUMANodes, 2)
}
func TestMaybeRightSizeAutoNUMAKeepsFullWhenSandboxSpansNodes(t *testing.T) {
// Sandbox needs more CPUs than a single host node has: full topology
// is preserved.
assert := assert.New(t)
hc := &HypervisorConfig{
DefaultMaxVCPUs: 64, // > one node's 32 CPUs
MemorySize: 1024,
GuestNUMANodes: twoNodeAutoTopology(),
}
rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps()))
assert.Len(hc.GuestNUMANodes, 2)
}
func TestMaybeRightSizeAutoNUMANoOpForFlatTopology(t *testing.T) {
// A topology with ≤ 1 node is a no-op regardless of NUMAMapping or
// budget.
assert := assert.New(t)
for _, tc := range []struct {
name string
hc *HypervisorConfig
}{
{
name: "nil config",
hc: nil,
},
{
name: "single node",
hc: &HypervisorConfig{
GuestNUMANodes: []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}},
},
},
{
name: "empty",
hc: &HypervisorConfig{},
},
} {
t.Run(tc.name, func(t *testing.T) {
before := 0
if tc.hc != nil {
before = len(tc.hc.GuestNUMANodes)
}
rightSizeNUMAWithFakeCaps(tc.hc, fakeCapFn(twoNodeHostCaps()))
after := 0
if tc.hc != nil {
after = len(tc.hc.GuestNUMANodes)
}
assert.Equal(before, after)
})
}
}

View File

@@ -13,6 +13,7 @@ import (
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
"syscall"
"time"
@@ -623,3 +624,182 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) {
return numaNodes, nil
}
// NUMADistEntry represents a single NUMA distance measurement between two nodes.
type NUMADistEntry struct {
// Src is the source node; GetHostNUMADistances sets it to the index of
// the node in the slice it was given.
Src uint32
// Dst is the destination node, expressed the same way as Src.
Dst uint32
// Val is the distance value; GetHostNUMADistances takes it from the
// host distance row of the source node.
Val uint32
}
// GetHostNUMADistances mirrors the host NUMA distance matrix for the given
// guest NUMA nodes and returns the off-diagonal entries only (pairs with
// src == dst are skipped). Src and Dst in the result are indices into nodes.
//
// Every guest node is represented by the first host node ID that its
// HostNodes value yields. The distance row of the source's representative
// host node is then indexed with the destination's representative host
// node ID.
// NOTE(review): indexing the sysfs row by host node ID presumably assumes
// host node IDs are contiguous from 0 — confirm for hosts with sparse node
// numbering.
//
// The lookup is best effort: guest nodes whose HostNodes cannot be parsed
// or is empty, rows that cannot be read, out-of-range columns and
// non-numeric values are silently skipped, so the result may be partial or
// nil.
func GetHostNUMADistances(nodes []types.GuestNUMANode) []NUMADistEntry {
	// representative[i] is the host node standing in for nodes[i], or -1
	// when none could be determined.
	representative := make([]int, 0, len(nodes))
	for _, guest := range nodes {
		hostID := -1
		if parsed, err := cpuset.Parse(guest.HostNodes); err == nil {
			if members := parsed.ToSlice(); len(members) > 0 {
				hostID = members[0]
			}
		}
		representative = append(representative, hostID)
	}

	var entries []NUMADistEntry
	for src := range nodes {
		if representative[src] < 0 {
			continue
		}
		rawRow := getHostNUMADistance(nodes[src].HostNodes)
		if rawRow == "" {
			continue
		}
		row := strings.Fields(rawRow)

		for dst := range nodes {
			column := representative[dst]
			if dst == src || column < 0 || column >= len(row) {
				continue
			}
			distance, err := strconv.ParseUint(row[column], 10, 32)
			if err != nil {
				continue
			}
			entries = append(entries, NUMADistEntry{
				Src: uint32(src),
				Dst: uint32(dst),
				Val: uint32(distance),
			})
		}
	}
	return entries
}
// HostNUMANodeCapacity describes the CPU and memory capacity of a single
// host NUMA node, as seen via sysfs.
type HostNUMANodeCapacity struct {
// NodeID is the host NUMA node ID the capacity was requested for.
NodeID int
// CPUs is the number of CPUs in the node's CPU list.
CPUs int
// MemMB is the node's memory size in MiB.
MemMB uint64
}
// GetHostNUMANodeCapacity reports the CPU count and memory size (in MiB) of
// host NUMA node nodeID.
//
// On failure the returned error is non-nil and the returned value holds
// only the fields gathered before the failing step (NodeID is always set).
func GetHostNUMANodeCapacity(nodeID int) (HostNUMANodeCapacity, error) {
	capacity := HostNUMANodeCapacity{NodeID: nodeID}

	// CPU count: size of the node's CPU list.
	rawCPUList, err := getHostNUMANodeCPUs(nodeID)
	if err != nil {
		return capacity, err
	}
	hostCPUs, err := cpuset.Parse(rawCPUList)
	if err != nil {
		return capacity, fmt.Errorf("parse host node %d cpulist %q: %w", nodeID, rawCPUList, err)
	}
	capacity.CPUs = hostCPUs.Size()

	// Memory size; only stored once it was read successfully.
	totalMB, err := getHostNUMANodeMemoryMB(nodeID)
	if err != nil {
		return capacity, err
	}
	capacity.MemMB = totalMB

	return capacity, nil
}
// GetHostNUMANodeCapacities returns the capacities of the given host NUMA
// node IDs, in input order.
//
// Processing stops at the first node whose capacity cannot be read: the
// capacities collected so far are returned together with an error naming
// that node, so on error the slice is shorter than the input.
func GetHostNUMANodeCapacities(nodeIDs []int) ([]HostNUMANodeCapacity, error) {
	capacities := make([]HostNUMANodeCapacity, 0, len(nodeIDs))
	for idx := range nodeIDs {
		nodeID := nodeIDs[idx]
		capacity, err := GetHostNUMANodeCapacity(nodeID)
		if err != nil {
			return capacities, fmt.Errorf("read host NUMA node %d capacity: %w", nodeID, err)
		}
		capacities = append(capacities, capacity)
	}
	return capacities, nil
}
// DistributeVCPUsProportionally splits totalVCPUs over the given NUMA nodes
// in proportion to the number of host CPUs listed in each node's HostCPUs.
//
// Every node first gets the floor of its proportional share, raised to 1 if
// that floor is 0. vCPUs still left over are handed out one at a time to
// the node with the most host CPUs, whose weight is lowered by one after
// each grant; ties go to the lowest index. If the 1-vCPU minimum made the
// sum exceed totalVCPUs, vCPUs are taken back one at a time from the node
// currently holding the most, never going below 1.
//
// The result has one entry per node, in input order. An error is returned
// when numaNodes is empty, when totalVCPUs is smaller than the node count,
// or when a node's HostCPUs cannot be parsed or is empty.
func DistributeVCPUsProportionally(numaNodes []types.GuestNUMANode, totalVCPUs uint32) ([]uint32, error) {
	nodeCount := len(numaNodes)
	switch {
	case nodeCount == 0:
		return nil, fmt.Errorf("no NUMA nodes")
	case totalVCPUs < uint32(nodeCount):
		return nil, fmt.Errorf("totalVCPUs (%d) must be >= NUMA node count (%d)", totalVCPUs, nodeCount)
	}

	// Host CPUs backing each guest node; these are the proportional weights.
	cpusPerHostNode := make([]int, nodeCount)
	hostCPUTotal := 0
	for idx := range numaNodes {
		set, err := cpuset.Parse(numaNodes[idx].HostCPUs)
		if err != nil {
			return nil, fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %w", idx, err)
		}
		size := set.Size()
		if size == 0 {
			return nil, fmt.Errorf("HostCPUs for NUMA node %d must not be empty", idx)
		}
		cpusPerHostNode[idx] = size
		hostCPUTotal += size
	}
	if hostCPUTotal == 0 {
		return nil, fmt.Errorf("total host CPU count is 0")
	}

	// Floor of the proportional share, with a minimum of one vCPU per node.
	shares := make([]uint32, nodeCount)
	var handedOut uint32
	for idx, hostCPUs := range cpusPerHostNode {
		share := uint32(int(totalVCPUs) * hostCPUs / hostCPUTotal)
		if share == 0 {
			share = 1
		}
		shares[idx] = share
		handedOut += share
	}

	// Hand out leftovers using a scratch copy of the weights, so the
	// original host CPU counts stay intact.
	remaining := append([]int(nil), cpusPerHostNode...)
	for ; handedOut < totalVCPUs; handedOut++ {
		heaviest := 0
		for idx, weight := range remaining {
			if weight > remaining[heaviest] {
				heaviest = idx
			}
		}
		shares[heaviest]++
		remaining[heaviest]--
	}

	// Take back any excess introduced by the one-vCPU minimum.
	for handedOut > totalVCPUs {
		largest := 0
		for idx, share := range shares {
			if share > shares[largest] {
				largest = idx
			}
		}
		if shares[largest] <= 1 {
			break
		}
		shares[largest]--
		handedOut--
	}

	return shares, nil
}

View File

@@ -22,3 +22,11 @@ func getHostNUMANodes() ([]int, error) {
func getHostNUMANodeCPUs(nodeId int) (string, error) {
return "", nil
}
// getHostNUMANodeMemoryMB is a stub implementation: it ignores nodeId and
// always reports 0 MiB with a nil error.
func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) {
return 0, nil
}
// getHostNUMADistance is a stub implementation: it ignores hostNodes and
// always returns the empty string.
func getHostNUMADistance(hostNodes string) string {
return ""
}

View File

@@ -12,6 +12,8 @@ import (
"io"
"math/big"
"os"
"regexp"
"strconv"
"strings"
"syscall"
"time"
@@ -23,6 +25,8 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
)
// nodeMemTotalRegexp matches a "Node <id> MemTotal: <n> kB" line and
// captures the kB value as its first submatch.
var nodeMemTotalRegexp = regexp.MustCompile(`Node\s+\d+\s+MemTotal:\s+(\d+)\s+kB`)
var ioctlFunc = Ioctl
// maxUInt represents the maximum valid value for the context ID.
@@ -220,3 +224,41 @@ func getHostNUMANodeCPUs(nodeId int) (string, error) {
}
return strings.TrimSuffix(string(data), "\n"), nil
}
// getHostNUMANodeMemoryMB reads /sys/devices/system/node/nodeN/meminfo for
// the given host NUMA node and returns its MemTotal value converted from kB
// to MiB (integer division by 1024). It fails when the file cannot be read,
// when no MemTotal line is found, or when the value is not a valid number.
func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) {
	meminfoPath := fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeId)

	contents, err := os.ReadFile(meminfoPath)
	if err != nil {
		return 0, err
	}

	if match := nodeMemTotalRegexp.FindSubmatch(contents); match != nil {
		totalKB, err := strconv.ParseUint(string(match[1]), 10, 64)
		if err != nil {
			return 0, err
		}
		return totalKB / 1024, nil
	}
	return 0, fmt.Errorf("MemTotal not found in %s", meminfoPath)
}
// getHostNUMADistance returns the contents of
// /sys/devices/system/node/nodeN/distance, with one trailing newline
// removed, where N is the first host NUMA node ID obtained from the
// hostNodes specifier (e.g. "0" or "0-1").
//
// The empty string is returned when hostNodes cannot be parsed, names no
// node, or the distance file cannot be read.
func getHostNUMADistance(hostNodes string) string {
	parsed, err := cpuset.Parse(hostNodes)
	if err != nil {
		return ""
	}

	members := parsed.ToSlice()
	if len(members) == 0 {
		return ""
	}

	distancePath := fmt.Sprintf("/sys/devices/system/node/node%d/distance", members[0])
	row, err := os.ReadFile(distancePath)
	if err != nil {
		return ""
	}
	return strings.TrimSuffix(string(row), "\n")
}

View File

@@ -19,6 +19,8 @@ import (
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
)
const waitLocalProcessTimeoutSecs = 3
@@ -754,3 +756,62 @@ func TestDockerNetnsPath(t *testing.T) {
}
assert.Equal("", DockerNetnsPath(spec))
}
func TestDistributeVCPUsProportionallySymmetric(t *testing.T) {
assert := assert.New(t)
nodes := []types.GuestNUMANode{
{HostCPUs: "0-3"},
{HostCPUs: "4-7"},
}
dist, err := DistributeVCPUsProportionally(nodes, 8)
assert.NoError(err)
assert.Equal([]uint32{4, 4}, dist)
}
func TestDistributeVCPUsProportionallyAsymmetric(t *testing.T) {
assert := assert.New(t)
nodes := []types.GuestNUMANode{
{HostCPUs: "0-7"},
{HostCPUs: "8-9"},
}
dist, err := DistributeVCPUsProportionally(nodes, 10)
assert.NoError(err)
assert.Equal([]uint32{8, 2}, dist)
}
func TestDistributeVCPUsProportionallyMinOnePerNode(t *testing.T) {
assert := assert.New(t)
nodes := []types.GuestNUMANode{
{HostCPUs: "0-99"},
{HostCPUs: "100"},
}
dist, err := DistributeVCPUsProportionally(nodes, 2)
assert.NoError(err)
assert.Equal(uint32(1), dist[0])
assert.Equal(uint32(1), dist[1])
}
func TestDistributeVCPUsProportionallyThreeNodes(t *testing.T) {
assert := assert.New(t)
nodes := []types.GuestNUMANode{
{HostCPUs: "0-5"},
{HostCPUs: "6-8"},
{HostCPUs: "9"},
}
// 6+3+1=10 host CPUs, 10 vCPUs: proportional = 6, 3, 1
dist, err := DistributeVCPUsProportionally(nodes, 10)
assert.NoError(err)
assert.Equal([]uint32{6, 3, 1}, dist)
}
func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) {
assert := assert.New(t)
nodes := []types.GuestNUMANode{
{HostCPUs: "0"},
{HostCPUs: "1"},
{HostCPUs: "2"},
}
_, err := DistributeVCPUsProportionally(nodes, 2)
assert.Error(err)
assert.Contains(err.Error(), "must be >= NUMA node count")
}