diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 061bf8b2ed..c5c5f70c34 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -1071,6 +1071,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { IOMMU: h.IOMMU, IOMMUPlatform: h.getIOMMUPlatform(), GuestNUMANodes: h.defaultGuestNUMANodes(), + NUMAMapping: append([]string(nil), h.NUMAMapping...), FileBackedMemRootDir: h.FileBackedMemRootDir, FileBackedMemRootList: h.FileBackedMemRootList, Debug: h.Debug, @@ -1994,6 +1995,10 @@ func checkConfig(config oci.RuntimeConfig) error { return err } + if err := checkNumaConfig(config); err != nil { + return err + } + hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO machineType := config.HypervisorConfig.HypervisorMachineType @@ -2005,6 +2010,25 @@ func checkConfig(config oci.RuntimeConfig) error { return nil } +func checkNumaConfig(config oci.RuntimeConfig) error { + if len(config.HypervisorConfig.GuestNUMANodes) <= 1 { + return nil + } + + switch goruntime.GOARCH { + case "amd64", "arm64": + default: + return fmt.Errorf("multi-NUMA support is only available on amd64 and arm64, got %q", goruntime.GOARCH) + } + + if !config.StaticSandboxResourceMgmt { + return fmt.Errorf("NUMA support requires static_sandbox_resource_mgmt to be enabled; " + + "NUMA topology is not compatible with dynamic CPU/memory hotplug") + } + + return nil +} + // checkPCIeConfig ensures the PCIe configuration is valid. 
// Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index b09a97e994..39bb029400 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -794,11 +794,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig } if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok { - guestNUMANodes, err := vcutils.GetGuestNUMANodes(strings.Fields(annotation)) + mapping := strings.Fields(annotation) + guestNUMANodes, err := vcutils.GetGuestNUMANodes(mapping) if err != nil { return err } sbConfig.HypervisorConfig.GuestNUMANodes = guestNUMANodes + // Record the raw user-provided mapping so the hypervisor + // backend honors it verbatim instead of right-sizing. + sbConfig.HypervisorConfig.NUMAMapping = mapping } return nil diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index b631960f6b..8b93b31428 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -803,6 +803,15 @@ type HypervisorConfig struct { // GuestNUMANodes defines guest NUMA topology and mapping to host NUMA nodes and CPUs. GuestNUMANodes []types.GuestNUMANode + // NUMAMapping is the raw user-provided NUMA mapping (TOML + // `numa_mapping` or the io.katacontainers.config.hypervisor.numa_mapping + // annotation). When empty, GuestNUMANodes was auto-derived from the + // host topology and may be right-sized at sandbox creation (e.g. + // collapsed to a single host node when the sandbox fits, or + // restricted to host nodes containing attached VFIO devices). When + // non-empty, the topology is honored verbatim. + NUMAMapping []string + // DisableNestingChecks is used to override customizations performed // when running on top of another VMM. 
DisableNestingChecks bool diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index e2a3e446d2..173f72b2c1 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -21,6 +21,7 @@ import ( "os/user" "path/filepath" "regexp" + goruntime "runtime" "strconv" "strings" "sync" @@ -251,6 +252,14 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso span, _ := katatrace.Trace(ctx, q.Logger(), "setup", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() + // Right-size auto-derived NUMA topology before snapshotting the config. + // We mutate the caller-owned pointer so the sandbox's shared + // HypervisorConfig (used by vCPU pinning and cpuset.mems forwarding) + // observes the same trimmed topology that QEMU is launched with. + // No-op when numa_mapping was set explicitly or when the topology + // already has one or zero nodes. + maybeRightSizeAutoNUMA(hypervisorConfig, q.Logger()) + if err := q.setConfig(hypervisorConfig); err != nil { return err } @@ -326,8 +335,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso return nil } -func (q *qemu) cpuTopology() govmmQemu.SMP { - return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, q.config.NumGuestNUMANodes()) +func (q *qemu) cpuTopology(effectiveNUMANodes uint32) govmmQemu.SMP { + return q.arch.cpuTopology(q.config.NumVCPUs(), q.config.DefaultMaxVCPUs, effectiveNUMANodes) } func (q *qemu) memoryTopology() (govmmQemu.Memory, error) { @@ -996,7 +1005,13 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi return err } - smp := q.cpuTopology() + numaNodes, numaDists, err := q.buildNUMATopology() + if err != nil { + return err + } + + effectiveNUMANodes := uint32(len(numaNodes)) + smp := q.cpuTopology(effectiveNUMANodes) memory, err := q.memoryTopology() if err != nil { @@ -1117,6 +1132,8 @@ func (q *qemu) CreateVM(ctx 
context.Context, id string, network Network, hypervi QMPSockets: qmpSockets, Knobs: knobs, Incoming: incoming, + NUMANodes: numaNodes, + NUMADists: numaDists, VGA: "none", GlobalParam: "kvm-pit.lost_tick_policy=discard", Bios: firmwarePath, diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index 5d4267f011..db494bf365 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -19,6 +19,7 @@ import ( "os" "path" "path/filepath" + "runtime" "testing" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" @@ -29,6 +30,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" "github.com/pbnjay/memory" "github.com/pkg/errors" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" ) @@ -283,7 +285,7 @@ func TestQemuCPUTopology(t *testing.T) { MaxCPUs: uint32(vcpus), } - smp := q.cpuTopology() + smp := q.cpuTopology(0) assert.Exactly(smp, expectedOut) } @@ -1200,3 +1202,570 @@ func TestResizeMemoryVirtioMemNegativeSize(t *testing.T) { // State should remain unchanged assert.Equal(100, q.state.HotpluggedMemory) } + +func TestBuildNUMATopologySingleNode(t *testing.T) { + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + }, + }, + } + nodes, dists, err := q.buildNUMATopology() + assert.NoError(err) + assert.Nil(nodes) + assert.Nil(dists) +} + +func TestBuildNUMATopologyTwoNodes(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := 
q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + + assert.Equal(uint32(0), nodes[0].NodeID) + assert.Equal("0-1", nodes[0].CPUs) + assert.Equal("512M", nodes[0].MemSize) + assert.Equal("memory-backend-ram", nodes[0].MemBackendType) + + assert.Equal(uint32(1), nodes[1].NodeID) + assert.Equal("2-3", nodes[1].CPUs) + assert.Equal("512M", nodes[1].MemSize) +} + +func TestBuildNUMATopologyHugePages(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + if _, err := os.Stat("/dev/hugepages"); err != nil { + t.Skip("skipping: /dev/hugepages not available") + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal("/dev/hugepages", nodes[0].MemBackendPath) + assert.Equal("512M", nodes[0].MemSize) +} + +func TestBuildNUMATopologyVirtioFS(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + SharedFS: config.VirtioFS, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal(fallbackFileBackedMemDir, nodes[0].MemBackendPath) +} + +func TestBuildNUMATopologyFileBackedMem(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not 
supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + tmpDir := t.TempDir() + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1024, + FileBackedMemRootDir: tmpDir, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + assert.Equal("memory-backend-file", nodes[0].MemBackendType) + assert.Equal(tmpDir, nodes[0].MemBackendPath) +} + +func TestBuildNUMATopologyTooFewVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0"}, + {HostNodes: "1", HostCPUs: "1"}, + }, + }, + } + nodes, dists, err := q.buildNUMATopology() + assert.NoError(err) + assert.Nil(nodes) + assert.Nil(dists) +} + +func TestBuildNUMATopologyUnevenVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 5, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-4"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + // cpuTopology() rounds MaxCPUs to ceil(5/2)*2=6, so 6 CPU slots + // are distributed proportionally: 2 host CPUs → 2 vCPUs, + // 3 host CPUs → 4 vCPUs (3 proportional + 1 remainder). 
+ assert.Equal("0-1", nodes[0].CPUs) + assert.Equal("2-5", nodes[1].CPUs) +} + +func TestBuildNUMATopologyMemMisaligned(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 4, + MemorySize: 1, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-1"}, + {HostNodes: "1", HostCPUs: "2-3"}, + }, + }, + } + _, _, err := q.buildNUMATopology() + assert.Error(err) + assert.Contains(err.Error(), "cannot be evenly distributed") +} + +func TestBuildNUMATopologyMemMisalignedRemainder(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 6, + MemorySize: 1025, + HugePages: true, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-2"}, + {HostNodes: "1", HostCPUs: "3-5"}, + }, + }, + } + _, _, err := q.buildNUMATopology() + assert.Error(err) + assert.Contains(err.Error(), "cannot be evenly distributed") +} + +func TestBuildNUMATopologyEvenMemory(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != "arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 6, + MemorySize: 1024, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-2"}, + {HostNodes: "1", HostCPUs: "3-5"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + + assert.Equal("0-2", nodes[0].CPUs) + assert.Equal("512M", nodes[0].MemSize) + + assert.Equal("3-5", nodes[1].CPUs) + assert.Equal("512M", nodes[1].MemSize) +} + +func TestBuildNUMATopologyProportionalVCPUs(t *testing.T) { + if runtime.GOARCH != "amd64" && runtime.GOARCH != 
"arm64" { + t.Skipf("multi-NUMA not supported on %s", runtime.GOARCH) + } + assert := assert.New(t) + q := &qemu{ + config: HypervisorConfig{ + DefaultMaxVCPUs: 10, + MemorySize: 1000, + GuestNUMANodes: []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-7"}, + {HostNodes: "1", HostCPUs: "8-9"}, + }, + }, + } + nodes, _, err := q.buildNUMATopology() + assert.NoError(err) + assert.Len(nodes, 2) + // 8 out of 10 host CPUs on node 0 → 8 vCPUs + assert.Equal("0-7", nodes[0].CPUs) + assert.Equal("800M", nodes[0].MemSize) + // 2 out of 10 host CPUs on node 1 → 2 vCPUs + assert.Equal("8-9", nodes[1].CPUs) + assert.Equal("200M", nodes[1].MemSize) +} + +func TestBuildCoveredHostNodes(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + }) + assert.Len(covered, 2) + assert.Equal(uint32(0), covered[0]) + assert.Equal(uint32(1), covered[1]) +} + +func TestBuildCoveredHostNodesRange(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-7"}, + }) + assert.Len(covered, 2) + assert.Equal(uint32(0), covered[0]) + assert.Equal(uint32(0), covered[1]) +} + +func TestBuildCoveredHostNodesEmpty(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes(nil) + assert.Len(covered, 0) +} + +func TestBuildCoveredHostNodesInvalidParse(t *testing.T) { + assert := assert.New(t) + + covered := buildCoveredHostNodes([]types.GuestNUMANode{ + {HostNodes: "invalid", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + }) + assert.Len(covered, 1) + assert.Equal(uint32(1), covered[1]) +} + +// silentLogger returns a logrus.Entry that discards all output, suitable +// for use in unit tests that exercise NUMA right-sizing decisions. 
+func silentLogger() *logrus.Entry { + l := logrus.New() + l.Out = io.Discard + return logrus.NewEntry(l) +} + +// fakeCapFn returns a hostNUMACapFn backed by a static map. Unknown nodes +// produce an error so we exercise the "skip unknown" branch in +// sumNUMACapacity when intended. +func fakeCapFn(caps map[int]struct { + cpus int + memMB uint64 +}) hostNUMACapFn { + return func(nodeID int) (int, uint64, error) { + if c, ok := caps[nodeID]; ok { + return c.cpus, c.memMB, nil + } + return 0, 0, fmt.Errorf("unknown host NUMA node %d", nodeID) + } +} + +// twoNodeHostCaps describes a typical 2-socket host: 32 CPUs and 128 GiB +// per node. +func twoNodeHostCaps() map[int]struct { + cpus int + memMB uint64 +} { + return map[int]struct { + cpus int + memMB uint64 + }{ + 0: {cpus: 32, memMB: 128 * 1024}, + 1: {cpus: 32, memMB: 128 * 1024}, + } +} + +func twoNodeAutoTopology() []types.GuestNUMANode { + return []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-31"}, + {HostNodes: "1", HostCPUs: "32-63"}, + } +} + +func TestSumNUMACapacity(t *testing.T) { + assert := assert.New(t) + + cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(twoNodeHostCaps())) + assert.Equal(64, cpus) + assert.Equal(uint64(256*1024), memMB) +} + +func TestSumNUMACapacityDeduplicatesHostNodes(t *testing.T) { + assert := assert.New(t) + + // Two guest entries that both reference host node 0 must only count + // once. The merged "0-1" entry adds host node 1. 
+ nodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-31"}, + {HostNodes: "0-1", HostCPUs: "0-63"}, + } + cpus, memMB := sumNUMACapacity(nodes, fakeCapFn(twoNodeHostCaps())) + assert.Equal(64, cpus) + assert.Equal(uint64(256*1024), memMB) +} + +func TestSumNUMACapacitySkipsUnknown(t *testing.T) { + assert := assert.New(t) + + caps := map[int]struct { + cpus int + memMB uint64 + }{ + 0: {cpus: 16, memMB: 32 * 1024}, + // host node 1 missing on purpose + } + cpus, memMB := sumNUMACapacity(twoNodeAutoTopology(), fakeCapFn(caps)) + assert.Equal(16, cpus) + assert.Equal(uint64(32*1024), memMB) +} + +func TestSelectNUMANodesPassthroughForSingleNode(t *testing.T) { + assert := assert.New(t) + + in := []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}} + out := selectNUMANodes(in, 4, 1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesNoVFIOFitsOneNode(t *testing.T) { + // Small sandbox (8 vCPUs / 16 GiB) fits comfortably in one host node: + // expect collapse to the first guest node. + assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 8, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Len(out, 1) + assert.Equal("0", out[0].HostNodes) +} + +func TestSelectNUMANodesNoVFIOExceedsOneNode(t *testing.T) { + // 64 vCPUs needs both 32-CPU nodes: expect full topology. + assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 64, 16*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesNoVFIOMemoryExceedsOneNode(t *testing.T) { + // CPU fits in one node but memory does not: expect full topology. 
+ assert := assert.New(t) + + in := twoNodeAutoTopology() + out := selectNUMANodes(in, 8, 200*1024, nil, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIOSubsetFits(t *testing.T) { + // VFIO device on host node 1; sandbox fits in one node: expect + // collapse to the guest node covering host node 1. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{1: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Len(out, 1) + assert.Equal("1", out[0].HostNodes) +} + +func TestSelectNUMANodesVFIOSubsetTooSmall(t *testing.T) { + // VFIO device on host node 1, but sandbox needs more than one node's + // worth of memory: expect the full topology so the sandbox actually + // fits, even at the cost of cross-NUMA traffic. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{1: {}} + out := selectNUMANodes(in, 8, 200*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIOSpansAllNodes(t *testing.T) { + // One VFIO device per host node: VFIO subset == full topology, no + // collapse possible. Result is the input unchanged. + assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{0: {}, 1: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +func TestSelectNUMANodesVFIONoCoverage(t *testing.T) { + // VFIO host node not represented in the guest topology (rare, but can + // happen if numa_mapping has been customized). Keep the full topology + // rather than dropping all nodes. 
+ assert := assert.New(t) + + in := twoNodeAutoTopology() + vfio := map[int]struct{}{2: {}} + out := selectNUMANodes(in, 8, 16*1024, vfio, fakeCapFn(twoNodeHostCaps()), silentLogger()) + assert.Equal(in, out) +} + +// rightSizeNUMAWithFakeCaps mirrors maybeRightSizeAutoNUMA but lets tests +// inject a synthetic capacity oracle in place of realHostNUMACapFn so the +// decision is hermetic. +func rightSizeNUMAWithFakeCaps(hc *HypervisorConfig, capFn hostNUMACapFn) { + if hc == nil || len(hc.NUMAMapping) > 0 || len(hc.GuestNUMANodes) <= 1 { + return + } + hc.GuestNUMANodes = selectNUMANodes( + hc.GuestNUMANodes, + hc.DefaultMaxVCPUs, + uint64(hc.MemorySize), + nil, // no VFIO devices in this test + capFn, + silentLogger(), + ) +} + +func TestMaybeRightSizeAutoNUMACollapsesToOneNode(t *testing.T) { + // Empty NUMAMapping (auto) + sandbox fits in one host node: + // GuestNUMANodes is trimmed to a single entry. + assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 1) + assert.Equal("0", hc.GuestNUMANodes[0].HostNodes) +} + +func TestMaybeRightSizeAutoNUMAExplicitMappingHonored(t *testing.T) { + // Non-empty NUMAMapping (user-provided) is left untouched, even if + // the sandbox would fit in a single node. + assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 1, + MemorySize: 1, + NUMAMapping: []string{"0", "1"}, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 2) +} + +func TestMaybeRightSizeAutoNUMAKeepsFullWhenSandboxSpansNodes(t *testing.T) { + // Sandbox needs more CPUs than a single host node has: full topology + // is preserved. 
+ assert := assert.New(t) + + hc := &HypervisorConfig{ + DefaultMaxVCPUs: 64, // > one node's 32 CPUs + MemorySize: 1024, + GuestNUMANodes: twoNodeAutoTopology(), + } + rightSizeNUMAWithFakeCaps(hc, fakeCapFn(twoNodeHostCaps())) + assert.Len(hc.GuestNUMANodes, 2) +} + +func TestMaybeRightSizeAutoNUMANoOpForFlatTopology(t *testing.T) { + // A topology with ≤ 1 node is a no-op regardless of NUMAMapping or + // budget. + assert := assert.New(t) + + for _, tc := range []struct { + name string + hc *HypervisorConfig + }{ + { + name: "nil config", + hc: nil, + }, + { + name: "single node", + hc: &HypervisorConfig{ + GuestNUMANodes: []types.GuestNUMANode{{HostNodes: "0", HostCPUs: "0-31"}}, + }, + }, + { + name: "empty", + hc: &HypervisorConfig{}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + before := 0 + if tc.hc != nil { + before = len(tc.hc.GuestNUMANodes) + } + rightSizeNUMAWithFakeCaps(tc.hc, fakeCapFn(twoNodeHostCaps())) + after := 0 + if tc.hc != nil { + after = len(tc.hc.GuestNUMANodes) + } + assert.Equal(before, after) + }) + } +} diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 39bcfde8f4..bc2aa98121 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -13,6 +13,7 @@ import ( "os/exec" "path/filepath" "regexp" + "strconv" "strings" "syscall" "time" @@ -623,3 +624,182 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) { return numaNodes, nil } + +// NUMADistEntry represents a single NUMA distance measurement between two nodes. +type NUMADistEntry struct { + Src uint32 + Dst uint32 + Val uint32 +} + +// GetHostNUMADistances reads the host NUMA distance matrix for the nodes +// referenced by the given GuestNUMANode list and returns off-diagonal +// pairwise entries (skipping self-distance src==dst). 
+// The sysfs distance row is indexed here by host NUMA node ID, using the +// first ID parsed from each guest node's HostNodes (NOTE(review): assumes +// dense host node numbering - confirm). Src and Dst are guest node indices. +func GetHostNUMADistances(nodes []types.GuestNUMANode) []NUMADistEntry { + hostNodeIDs := make([]int, len(nodes)) + for i, n := range nodes { + nodeSet, err := cpuset.Parse(n.HostNodes) + if err != nil { + hostNodeIDs[i] = -1 + continue + } + ids := nodeSet.ToSlice() + if len(ids) == 0 { + hostNodeIDs[i] = -1 + continue + } + hostNodeIDs[i] = ids[0] + } + + var dists []NUMADistEntry + for srcIdx, srcNode := range nodes { + if hostNodeIDs[srcIdx] < 0 { + continue + } + distStr := getHostNUMADistance(srcNode.HostNodes) + if distStr == "" { + continue + } + fields := strings.Fields(distStr) + for dstIdx := range nodes { + if srcIdx == dstIdx { + continue + } + hostID := hostNodeIDs[dstIdx] + if hostID < 0 || hostID >= len(fields) { + continue + } + val, err := strconv.ParseUint(fields[hostID], 10, 32) + if err != nil { + continue + } + dists = append(dists, NUMADistEntry{ + Src: uint32(srcIdx), + Dst: uint32(dstIdx), + Val: uint32(val), + }) + } + } + return dists +} + +// HostNUMANodeCapacity describes the CPU and memory capacity of a single +// host NUMA node, as seen via sysfs. +type HostNUMANodeCapacity struct { + NodeID int + CPUs int + MemMB uint64 +} + +// GetHostNUMANodeCapacity returns the CPU count and memory size (in MiB) +// of the given host NUMA node. 
+func GetHostNUMANodeCapacity(nodeID int) (HostNUMANodeCapacity, error) { + capacity := HostNUMANodeCapacity{NodeID: nodeID} + cpuList, err := getHostNUMANodeCPUs(nodeID) + if err != nil { + return capacity, err + } + cs, err := cpuset.Parse(cpuList) + if err != nil { + return capacity, fmt.Errorf("parse host node %d cpulist %q: %w", nodeID, cpuList, err) + } + capacity.CPUs = cs.Size() + memMB, err := getHostNUMANodeMemoryMB(nodeID) + if err != nil { + return capacity, err + } + capacity.MemMB = memMB + return capacity, nil +} + +// GetHostNUMANodeCapacities returns the capacities of the given host NUMA +// node IDs in the same order. It stops at the first node that cannot be +// read and returns the capacities gathered so far together with the wrapped +// error, so on failure the slice is shorter than the input. +func GetHostNUMANodeCapacities(nodeIDs []int) ([]HostNUMANodeCapacity, error) { + out := make([]HostNUMANodeCapacity, 0, len(nodeIDs)) + for _, id := range nodeIDs { + c, err := GetHostNUMANodeCapacity(id) + if err != nil { + return out, fmt.Errorf("read host NUMA node %d capacity: %w", id, err) + } + out = append(out, c) + } + return out, nil +} + +// DistributeVCPUsProportionally distributes totalVCPUs across NUMA nodes +// proportionally to each node's host CPU count, with at least 1 vCPU per +// node. Leftover vCPUs go to the nodes with the most host CPUs; it fails on +// no nodes, totalVCPUs < node count, or empty/unparsable HostCPUs. 
+func DistributeVCPUsProportionally(numaNodes []types.GuestNUMANode, totalVCPUs uint32) ([]uint32, error) { + numNodes := len(numaNodes) + if numNodes == 0 { + return nil, fmt.Errorf("no NUMA nodes") + } + if totalVCPUs < uint32(numNodes) { + return nil, fmt.Errorf("totalVCPUs (%d) must be >= NUMA node count (%d)", totalVCPUs, numNodes) + } + + hostCPUCounts := make([]int, numNodes) + totalHostCPUs := 0 + for i, gn := range numaNodes { + parsed, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + return nil, fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %w", i, err) + } + if parsed.Size() == 0 { + return nil, fmt.Errorf("HostCPUs for NUMA node %d must not be empty", i) + } + hostCPUCounts[i] = parsed.Size() + totalHostCPUs += hostCPUCounts[i] + } + if totalHostCPUs == 0 { + return nil, fmt.Errorf("total host CPU count is 0") + } + + vcpusPerNode := make([]uint32, numNodes) + var assigned uint32 + for i := range numaNodes { + vcpusPerNode[i] = uint32(int(totalVCPUs) * hostCPUCounts[i] / totalHostCPUs) + if vcpusPerNode[i] == 0 { + vcpusPerNode[i] = 1 + } + assigned += vcpusPerNode[i] + } + + // Use a copy for remainder distribution to avoid mutating the original counts. 
+ weights := make([]int, numNodes) + copy(weights, hostCPUCounts) + + for assigned < totalVCPUs { + bestIdx := 0 + for i := 1; i < numNodes; i++ { + if weights[i] > weights[bestIdx] { + bestIdx = i + } + } + vcpusPerNode[bestIdx]++ + assigned++ + weights[bestIdx]-- + } + + for assigned > totalVCPUs { + bestIdx := 0 + for i := 1; i < numNodes; i++ { + if vcpusPerNode[i] > vcpusPerNode[bestIdx] { + bestIdx = i + } + } + if vcpusPerNode[bestIdx] <= 1 { + break + } + vcpusPerNode[bestIdx]-- + assigned-- + } + + return vcpusPerNode, nil +} diff --git a/src/runtime/virtcontainers/utils/utils_darwin.go b/src/runtime/virtcontainers/utils/utils_darwin.go index 4a64c921b1..a29d0378a2 100644 --- a/src/runtime/virtcontainers/utils/utils_darwin.go +++ b/src/runtime/virtcontainers/utils/utils_darwin.go @@ -22,3 +22,11 @@ func getHostNUMANodes() ([]int, error) { func getHostNUMANodeCPUs(nodeId int) (string, error) { return "", nil } + +func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) { + return 0, nil +} + +func getHostNUMADistance(hostNodes string) string { + return "" +} diff --git a/src/runtime/virtcontainers/utils/utils_linux.go b/src/runtime/virtcontainers/utils/utils_linux.go index 0ddb4dd5a9..11ae66b202 100644 --- a/src/runtime/virtcontainers/utils/utils_linux.go +++ b/src/runtime/virtcontainers/utils/utils_linux.go @@ -12,6 +12,8 @@ import ( "io" "math/big" "os" + "regexp" + "strconv" "strings" "syscall" "time" @@ -23,6 +25,8 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" ) +var nodeMemTotalRegexp = regexp.MustCompile(`Node\s+\d+\s+MemTotal:\s+(\d+)\s+kB`) + var ioctlFunc = Ioctl // maxUInt represents the maximum valid value for the context ID. 
@@ -220,3 +224,41 @@ func getHostNUMANodeCPUs(nodeId int) (string, error) { } return strings.TrimSuffix(string(data), "\n"), nil } + +// getHostNUMANodeMemoryMB returns the total memory in MiB for the given +// host NUMA node, parsed from /sys/devices/system/node/nodeN/meminfo. +func getHostNUMANodeMemoryMB(nodeId int) (uint64, error) { + fileName := fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeId) + data, err := os.ReadFile(fileName) + if err != nil { + return 0, err + } + m := nodeMemTotalRegexp.FindSubmatch(data) + if m == nil { + return 0, fmt.Errorf("MemTotal not found in %s", fileName) + } + kb, err := strconv.ParseUint(string(m[1]), 10, 64) + if err != nil { + return 0, err + } + return kb / 1024, nil +} + +// getHostNUMADistance reads the distance row for the first host NUMA node +// in the given hostNodes specifier (e.g. "0" or "0-1"). +func getHostNUMADistance(hostNodes string) string { + nodeSet, err := cpuset.Parse(hostNodes) + if err != nil { + return "" + } + ids := nodeSet.ToSlice() + if len(ids) == 0 { + return "" + } + fileName := fmt.Sprintf("/sys/devices/system/node/node%d/distance", ids[0]) + data, err := os.ReadFile(fileName) + if err != nil { + return "" + } + return strings.TrimSuffix(string(data), "\n") +} diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go index 8361caa1ee..cb11770c8c 100644 --- a/src/runtime/virtcontainers/utils/utils_test.go +++ b/src/runtime/virtcontainers/utils/utils_test.go @@ -19,6 +19,8 @@ import ( "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" + + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" ) const waitLocalProcessTimeoutSecs = 3 @@ -754,3 +756,62 @@ func TestDockerNetnsPath(t *testing.T) { } assert.Equal("", DockerNetnsPath(spec)) } + +func TestDistributeVCPUsProportionallySymmetric(t *testing.T) { + assert := assert.New(t) + nodes := 
[]types.GuestNUMANode{ + {HostCPUs: "0-3"}, + {HostCPUs: "4-7"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 8) + assert.NoError(err) + assert.Equal([]uint32{4, 4}, dist) +} + +func TestDistributeVCPUsProportionallyAsymmetric(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-7"}, + {HostCPUs: "8-9"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 10) + assert.NoError(err) + assert.Equal([]uint32{8, 2}, dist) +} + +func TestDistributeVCPUsProportionallyMinOnePerNode(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-99"}, + {HostCPUs: "100"}, + } + dist, err := DistributeVCPUsProportionally(nodes, 2) + assert.NoError(err) + assert.Equal(uint32(1), dist[0]) + assert.Equal(uint32(1), dist[1]) +} + +func TestDistributeVCPUsProportionallyThreeNodes(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0-5"}, + {HostCPUs: "6-8"}, + {HostCPUs: "9"}, + } + // 6+3+1=10 host CPUs, 10 vCPUs: proportional = 6, 3, 1 + dist, err := DistributeVCPUsProportionally(nodes, 10) + assert.NoError(err) + assert.Equal([]uint32{6, 3, 1}, dist) +} + +func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) { + assert := assert.New(t) + nodes := []types.GuestNUMANode{ + {HostCPUs: "0"}, + {HostCPUs: "1"}, + {HostCPUs: "2"}, + } + _, err := DistributeVCPUsProportionally(nodes, 2) + assert.Error(err) + assert.Contains(err.Error(), "must be >= NUMA node count") +}