diff --git a/docs/how-to/how-to-set-sandbox-config-kata.md b/docs/how-to/how-to-set-sandbox-config-kata.md index 14ba8ed898..cdca9bb05a 100644 --- a/docs/how-to/how-to-set-sandbox-config-kata.md +++ b/docs/how-to/how-to-set-sandbox-config-kata.md @@ -48,6 +48,7 @@ There are several kinds of Kata configurations and they are listed below. | `io.katacontainers.config.hypervisor.block_device_driver` | string | the driver to be used for block device, valid values are `virtio-blk`, `virtio-scsi`, `nvdimm`| | `io.katacontainers.config.hypervisor.cpu_features` | `string` | Comma-separated list of CPU features to pass to the CPU (QEMU) | | `io.katacontainers.config.hypervisor.default_max_vcpus` | uint32| the maximum number of vCPUs allocated for the VM by the hypervisor | +| `io.katacontainers.config.hypervisor.default_maxmemory` | uint32| the maximum memory assigned for a VM by the hypervisor in `MiB` | | `io.katacontainers.config.hypervisor.default_memory` | uint32| the memory assigned for a VM by the hypervisor in `MiB` | | `io.katacontainers.config.hypervisor.default_vcpus` | float32| the default vCPUs assigned for a VM by the hypervisor | | `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disallow a block device from being used | diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 67beb4b249..3e853fad66 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -2657,8 +2657,13 @@ type SMP struct { Sockets uint32 // MaxCPUs is the maximum number of VCPUs that a VM can have. - // This value, if non-zero, MUST BE equal to or greater than CPUs + // This value, if non-zero, MUST BE equal to or greater than CPUs, + // and must be equal to Sockets * Cores * Threads if all are non-zero. MaxCPUs uint32 + + // NumNUMA is the number of NUMA nodes that the VM has. + // The value MUST NOT be greater than Sockets. + NumNUMA uint32 } // Memory is the guest memory configuration structure. @@ -2679,6 +2684,26 @@ type Memory struct { // Path is the file path of the memory device. It points to a local // file path used by FileBackedMem. Path string + + // MemoryModules describes memory topology and allocation policy. + MemoryModules []MemoryModule +} + +// MemoryModule represents a single module of guest memory. +type MemoryModule struct { + // Size is the size of the memory module. + // It should be suffixed with M or G for sizes in megabytes or + // gigabytes respectively. + Size string + + // NodeId is the guest NUMA node this module belongs to. + NodeId uint32 + + // HostNodes defines the host NUMA node mask for binding memory allocation. + HostNodes string + + // MemoryPolicy defines the host NUMA memory allocation policy. + MemoryPolicy string } // Kernel is the guest kernel configuration structure.
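For reviewers: a hypothetical, compile-only sketch of how the new `MemoryModule` fields are meant to be consumed by the `appendMemoryKnobs` changes below. The `main` wrapper, the module values, and the flag comments are illustrative assumptions, not part of this patch:

```go
package main

import (
	"fmt"

	govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
)

func main() {
	// A 2 GiB guest split into two NUMA-aware memory modules, each bound
	// to one host NUMA node with an interleave allocation policy.
	mem := govmmQemu.Memory{
		Size: "2048M",
		MemoryModules: []govmmQemu.MemoryModule{
			{Size: "1024M", NodeId: 0, HostNodes: "0", MemoryPolicy: "interleave"},
			{Size: "1024M", NodeId: 1, HostNodes: "1", MemoryPolicy: "interleave"},
		},
	}
	// On a DIMM-capable machine type, appendMemoryKnobs should emit roughly:
	//   -object memory-backend-ram,id=dimm0,size=1024M,policy=interleave,host-nodes=0
	//   -numa node,nodeid=0,memdev=dimm0
	//   -object memory-backend-ram,id=dimm1,size=1024M,policy=interleave,host-nodes=1
	//   -numa node,nodeid=1,memdev=dimm1
	fmt.Printf("%+v\n", mem)
}
```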
@@ -3064,11 +3089,25 @@ func (config *Config) appendCPUs() error { return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d", config.SMP.MaxCPUs, config.SMP.CPUs) } + topologyCPUs := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads + if topologyCPUs != 0 && config.SMP.MaxCPUs != topologyCPUs { + return fmt.Errorf("MaxCPUs %d must match CPU topology: sockets %d * cores %d * threads %d", + config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads) + } SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs)) } config.qemuParams = append(config.qemuParams, "-smp") config.qemuParams = append(config.qemuParams, strings.Join(SMPParams, ",")) + + if config.SMP.NumNUMA > 1 { + // Interleave CPU sockets over NUMA nodes. + for socketId := uint32(0); socketId < config.SMP.Sockets; socketId++ { + nodeId := socketId % config.SMP.NumNUMA + config.qemuParams = append(config.qemuParams, "-numa", + fmt.Sprintf("cpu,node-id=%d,socket-id=%d", nodeId, socketId)) + } + } } return nil @@ -3137,34 +3176,49 @@ func (config *Config) appendMemoryKnobs() { if config.Memory.Size == "" { return } - var objMemParam, numaMemParam string - dimmName := "dimm1" + if len(config.Memory.MemoryModules) == 0 { + config.appendMemoryModule("dimm1", MemoryModule{Size: config.Memory.Size}) + } + for i, memModule := range config.Memory.MemoryModules { + config.appendMemoryModule(fmt.Sprintf("dimm%d", i), memModule) + } +} + +func (config *Config) appendMemoryModule(memoryId string, memoryModule MemoryModule) { + var objMemParams []string + if config.Knobs.HugePages { - objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=/dev/hugepages" - numaMemParam = "node,memdev=" + dimmName + objMemParams = append(objMemParams, "memory-backend-file", "mem-path=/dev/hugepages") } else if config.Knobs.FileBackedMem && config.Memory.Path != "" { - objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=" + config.Memory.Path - numaMemParam = "node,memdev=" + dimmName + objMemParams = append(objMemParams, "memory-backend-file", "mem-path="+config.Memory.Path) } else { - objMemParam = "memory-backend-ram,id=" + dimmName + ",size=" + config.Memory.Size - numaMemParam = "node,memdev=" + dimmName + objMemParams = append(objMemParams, "memory-backend-ram") + } + + objMemParams = append(objMemParams, "id="+memoryId, "size="+memoryModule.Size) + + if memoryModule.MemoryPolicy != "" { + objMemParams = append(objMemParams, "policy="+memoryModule.MemoryPolicy) + } + + if memoryModule.HostNodes != "" { + objMemParams = append(objMemParams, "host-nodes="+memoryModule.HostNodes) } if config.Knobs.MemShared { - objMemParam += ",share=on" + objMemParams = append(objMemParams, "share=on") } if config.Knobs.MemPrealloc { - objMemParam += ",prealloc=on" + objMemParams = append(objMemParams, "prealloc=on") } - config.qemuParams = append(config.qemuParams, "-object") - config.qemuParams = append(config.qemuParams, objMemParam) + + config.qemuParams = append(config.qemuParams, "-object", strings.Join(objMemParams, ",")) if isDimmSupported(config) { - config.qemuParams = append(config.qemuParams, "-numa") - config.qemuParams = append(config.qemuParams, numaMemParam) + config.qemuParams = append(config.qemuParams, "-numa", + fmt.Sprintf("node,nodeid=%d,memdev=%s", memoryModule.NodeId, memoryId)) + } else { - config.qemuParams = append(config.qemuParams, "-machine") - config.qemuParams = append(config.qemuParams,
"memory-backend="+dimmName) + config.qemuParams = append(config.qemuParams, "-machine", "memory-backend="+memoryId) } } diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 5d4c15ed9d..8bef1aa3a9 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -687,7 +687,7 @@ func TestAppendMemory(t *testing.T) { testAppend(memory, memoryString, t) } -var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=6" +var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=4" func TestAppendCPUs(t *testing.T) { smp := SMP{ @@ -695,7 +695,7 @@ func TestAppendCPUs(t *testing.T) { Sockets: 2, Cores: 1, Threads: 2, - MaxCPUs: 6, + MaxCPUs: 4, } testAppend(smp, cpusString, t) @@ -717,6 +717,22 @@ func TestFailToAppendCPUs(t *testing.T) { } } +func TestFailToAppendCPUsWrongTopology(t *testing.T) { + config := Config{ + SMP: SMP{ + CPUs: 2, + Sockets: 2, + Cores: 1, + Threads: 2, + MaxCPUs: 6, + }, + } + + if err := config.appendCPUs(); err == nil { + t.Fatalf("Expected appendCPUs to fail") + } +} + var qmpSingleSocketServerString = "-qmp unix:path=cc-qmp,server=on,wait=off" var qmpSingleSocketString = "-qmp unix:path=cc-qmp" diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index f15d945ca9..d3f86b1f0a 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -25,6 +25,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" "github.com/pbnjay/memory" "github.com/sirupsen/logrus" @@ -63,6 +64,8 @@ const ( // the maximum valid loglevel for the hypervisor maxHypervisorLoglevel uint32 = 3 + // the maximum number of NUMA nodes in Linux kernel: 1 << CONFIG_NODES_SHIFT, which is up to 10. 
+ maxNumNUMA uint32 = 1024 errInvalidHypervisorPrefix = "configuration file contains invalid hypervisor section" ) @@ -152,6 +155,8 @@ type hypervisor struct { VirtioMem bool `toml:"enable_virtio_mem"` IOMMU bool `toml:"enable_iommu"` IOMMUPlatform bool `toml:"enable_iommu_platform"` + NUMA bool `toml:"enable_numa"` + NUMAMapping []string `toml:"numa_mapping"` Debug bool `toml:"enable_debug"` DisableNestingChecks bool `toml:"disable_nesting_checks"` EnableIOThreads bool `toml:"enable_iothreads"` @@ -701,6 +706,18 @@ func (h hypervisor) getIOMMUPlatform() bool { return h.IOMMUPlatform } +func (h hypervisor) defaultNUMANodes() []types.NUMANode { + if !h.NUMA { + return nil + } + numaNodes, err := utils.GetNUMANodes(h.NUMAMapping) + if err != nil { + kataUtilsLogger.WithError(err).Warn("Cannot construct NUMA nodes.") + return nil + } + return numaNodes +} + func (h hypervisor) getRemoteHypervisorSocket() string { if h.RemoteHypervisorSocket == "" { return defaultRemoteHypervisorSocket @@ -954,6 +971,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { HugePages: h.HugePages, IOMMU: h.IOMMU, IOMMUPlatform: h.getIOMMUPlatform(), + NUMANodes: h.defaultNUMANodes(), FileBackedMemRootDir: h.FileBackedMemRootDir, FileBackedMemRootList: h.FileBackedMemRootList, Debug: h.Debug, diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index d49aabd988..2ebb974cbb 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -673,6 +673,16 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig return err } + if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMaxMemory).setUintWithCheck(func(memorySz uint64) error { + if memorySz < vc.MinHypervisorMemory && sbConfig.HypervisorType != vc.RemoteHypervisor { + return fmt.Errorf("Memory specified in annotation %s is less than minimum required %d, please specify a larger value", vcAnnotations.DefaultMaxMemory, vc.MinHypervisorMemory) + } + sbConfig.HypervisorConfig.DefaultMaxMemorySize = memorySz + return nil + }); err != nil { + return err + } + if err := newAnnotationConfiguration(ocispec, vcAnnotations.MemSlots).setUint(func(mslots uint64) { if mslots > 0 { sbConfig.HypervisorConfig.MemSlots = uint32(mslots) @@ -744,6 +754,14 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig return err } + if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok { + numaNodes, err := vcutils.GetNUMANodes(strings.Fields(annotation)) + if err != nil { + return err + } + sbConfig.HypervisorConfig.NUMANodes = numaNodes + } + return nil } diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 22423ab122..8a9ef71a7a 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -631,6 +631,9 @@ type HypervisorConfig struct { // IOMMUPlatform is used to indicate if IOMMU_PLATFORM is enabled for supported devices IOMMUPlatform bool + // NUMANodes defines VM NUMA topology and mapping to host NUMA nodes and CPUs. + NUMANodes []types.NUMANode + // DisableNestingChecks is used to override customizations performed // when running on top of another VMM.
DisableNestingChecks bool @@ -706,7 +709,8 @@ type HypervisorConfig struct { // vcpu mapping from vcpu number to thread number type VcpuThreadIDs struct { - vcpus map[int]int + vcpus map[int]int + vcpuToNodeId map[int]uint32 } func (conf *HypervisorConfig) CheckTemplateConfig() error { @@ -902,6 +906,10 @@ func (conf HypervisorConfig) NumVCPUs() uint32 { return RoundUpNumVCPUs(conf.NumVCPUsF) } +func (conf HypervisorConfig) NumNUMA() uint32 { + return uint32(len(conf.NUMANodes)) +} + func appendParam(params []Param, parameter string, value string) []Param { return append(params, Param{parameter, value}) } diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go index 1bcd47218c..12c89ae467 100644 --- a/src/runtime/virtcontainers/hypervisor_config_linux.go +++ b/src/runtime/virtcontainers/hypervisor_config_linux.go @@ -44,6 +44,10 @@ func validateHypervisorConfig(conf *HypervisorConfig) error { conf.MemorySize = defaultMemSzMiB } + if conf.DefaultMaxMemorySize > 0 && uint64(conf.MemorySize) > conf.DefaultMaxMemorySize { + conf.MemorySize = uint32(conf.DefaultMaxMemorySize) + } + if conf.DefaultBridges == 0 { conf.DefaultBridges = defaultBridges } @@ -58,6 +62,15 @@ func validateHypervisorConfig(conf *HypervisorConfig) error { conf.DefaultMaxVCPUs = defaultMaxVCPUs } + if numNUMA := conf.NumNUMA(); numNUMA > 1 { + conf.DefaultMaxVCPUs -= conf.DefaultMaxVCPUs % numNUMA + } + + if conf.ConfidentialGuest && conf.NumVCPUs() != conf.DefaultMaxVCPUs { + hvLogger.Warnf("Confidential guests do not support hotplugging of vCPUs. Setting DefaultMaxVCPUs to NumVCPUs (%d)", conf.NumVCPUs()) + conf.DefaultMaxVCPUs = conf.NumVCPUs() + } + if conf.Msize9p == 0 && conf.SharedFS != config.VirtioFS { conf.Msize9p = defaultMsize9p } diff --git a/src/runtime/virtcontainers/mock_hypervisor.go b/src/runtime/virtcontainers/mock_hypervisor.go index 7d6da561fa..c969a33273 100644 --- a/src/runtime/virtcontainers/mock_hypervisor.go +++ b/src/runtime/virtcontainers/mock_hypervisor.go @@ -113,7 +113,7 @@ func (m *mockHypervisor) Disconnect(ctx context.Context) { func (m *mockHypervisor) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) { vcpus := map[int]int{0: os.Getpid()} - return VcpuThreadIDs{vcpus}, nil + return VcpuThreadIDs{vcpus, nil}, nil } func (m *mockHypervisor) Cleanup(ctx context.Context) error { diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 03b9e9b70c..3a9f26a5ca 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -155,6 +155,9 @@ const ( // DefaultMemory is a sandbox annotation for the memory assigned for a VM by the hypervisor. DefaultMemory = kataAnnotHypervisorPrefix + "default_memory" + // DefaultMaxMemory is a sandbox annotation for the maximum memory assigned for a VM by the hypervisor. + DefaultMaxMemory = kataAnnotHypervisorPrefix + "default_maxmemory" + // MemSlots is a sandbox annotation to specify the memory slots assigned to the VM by the hypervisor. MemSlots = kataAnnotHypervisorPrefix + "memory_slots" @@ -182,6 +185,9 @@ const ( // FileBackedMemRootDir is a sandbox annotation to soecify file based memory backend root directory FileBackedMemRootDir = kataAnnotHypervisorPrefix + "file_mem_backend" + // NUMAMapping is a sandbox annotation that specifies the mapping of VM NUMA nodes to host NUMA nodes.
+ NUMAMapping = kataAnnotHypervisorPrefix + "numa_mapping" + // // Shared File System related annotations // diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 0de579c870..9e8dd1ecfd 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -2669,6 +2669,36 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff return memory } +func genericNUMAMemoryModules(memoryMb, memoryAlign uint64, numaNodes []types.NUMANode) []govmmQemu.MemoryModule { + if len(numaNodes) == 0 { + return nil + } + + memoryModules := make([]govmmQemu.MemoryModule, 0, len(numaNodes)) + + // Divide memory among NUMA nodes. + memoryPerNode := memoryMb / uint64(len(numaNodes)) + memoryPerNode -= memoryPerNode % memoryAlign + + // First NUMA node gets more if memory is not divided evenly. + moduleSize := memoryMb - memoryPerNode*uint64(len(numaNodes)-1) + + for nodeId, numaNode := range numaNodes { + memoryModules = append(memoryModules, govmmQemu.MemoryModule{ + Size: fmt.Sprintf("%dM", moduleSize), + NodeId: uint32(nodeId), + HostNodes: numaNode.HostNodes, + MemoryPolicy: "interleave", + }) + moduleSize = memoryPerNode + if moduleSize == 0 { + break + } + } + + return memoryModules +} + // genericAppendPCIeRootPort appends to devices the given pcie-root-port func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device { var ( @@ -2793,9 +2823,11 @@ func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) { } tid.vcpus = make(map[int]int, len(cpuInfos)) + tid.vcpuToNodeId = make(map[int]uint32, len(cpuInfos)) for _, i := range cpuInfos { if i.ThreadID > 0 { tid.vcpus[i.CPUIndex] = i.ThreadID + tid.vcpuToNodeId[i.CPUIndex] = uint32(i.Props.Node) } } return tid, nil diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index 78e7a5fc9f..6d54f3bd8a 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -117,6 +117,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { qemuArchBase: qemuArchBase{ qemuMachine: *mp, qemuExePath: defaultQemuPath, + numaNodes: config.NUMANodes, memoryOffset: config.MemOffset, kernelParamsNonDebug: kernelParamsNonDebug, kernelParamsDebug: kernelParamsDebug, @@ -198,7 +199,9 @@ func (q *qemuAmd64) cpuModel() string { } func (q *qemuAmd64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory { - return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset) + memory := genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset) + memory.MemoryModules = genericNUMAMemoryModules(memoryMb, 4, q.numaNodes) + return memory } // Is Memory Hotplug supported by this architecture/machine type combination?
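A worked example of the split performed by `genericNUMAMemoryModules` above, using assumed inputs (2050 MiB over three nodes with 4 MiB alignment); only the first module may be unaligned, and the sizes always sum to `memoryMb`:

```go
package main

import "fmt"

func main() {
	memoryMb, memoryAlign, nodes := uint64(2050), uint64(4), uint64(3)

	// Per-node share, rounded down to the alignment (683 -> 680).
	perNode := memoryMb / nodes
	perNode -= perNode % memoryAlign

	// First node takes whatever is left over: 2050 - 680*2 = 690.
	first := memoryMb - perNode*(nodes-1)

	fmt.Println(first, perNode, perNode) // 690 680 680, summing to 2050
}
```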
diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index aa41445916..666715ee35 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -186,6 +186,7 @@ type qemuArchBase struct { kernelParamsDebug []Param kernelParams []Param Bridges []types.Bridge + numaNodes []types.NUMANode memoryOffset uint64 networkIndex int // Exclude from lint checking for it is ultimately only used in architecture-specific code @@ -324,12 +325,20 @@ func (q *qemuArchBase) bridges(number uint32) { } func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP { + numNUMA := uint32(len(q.numaNodes)) + + numSockets := numNUMA + if numSockets == 0 { + numSockets = maxvcpus + } + smp := govmmQemu.SMP{ CPUs: vcpus, - Sockets: maxvcpus, - Cores: defaultCores, + Sockets: numSockets, + Cores: maxvcpus / numSockets / defaultThreads, Threads: defaultThreads, MaxCPUs: maxvcpus, + NumNUMA: numNUMA, } return smp diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 01a466bb47..7ce9cfc74e 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -2793,11 +2793,12 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error { // is set to true. Then it fetches sandbox's number of vCPU threads // and number of CPUs in CPUSet. If the two are equal, each vCPU thread // is then pinned to one fixed CPU in CPUSet. +// To enforce NUMA topology, vCPU threads are pinned to the host CPUs of their NUMA node. func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { if s.config == nil { return fmt.Errorf("no sandbox config found") } - if !s.config.EnableVCPUsPinning { + if !s.config.EnableVCPUsPinning && s.config.HypervisorConfig.NumNUMA() == 0 { return nil } @@ -2816,23 +2817,59 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { } cpuSetSlice := cpuSet.ToSlice() - // check if vCPU thread numbers and CPU numbers are equal - numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice) - // if not equal, we should reset threads scheduling to random pattern - if numVCPUs != numCPUs { - if s.isVCPUsPinningOn { - s.isVCPUsPinningOn = false - return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice) + // build the NUMA topology mapping, or fake a single node if NUMA is not enabled. + numNodes := max(s.config.HypervisorConfig.NumNUMA(), 1) + + numaNodeVCPUs := make([][]int, numNodes) + for vcpuId := range vCPUThreadsMap.vcpus { + nodeId, ok := vCPUThreadsMap.vcpuToNodeId[vcpuId] + if !ok || nodeId >= numNodes { + nodeId = 0 } - return nil + numaNodeVCPUs[nodeId] = append(numaNodeVCPUs[nodeId], vcpuId) } - // if equal, we can use vCPU thread pinning - for i, tid := range vCPUThreadsMap.vcpus { - if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil { - if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { - return err + + numaNodeCPUs := make([][]int, numNodes) + numaNodeCPUs[0] = cpuSetSlice + for i, numaNode := range s.config.HypervisorConfig.NUMANodes { + nodeHostCPUs, err := cpuset.Parse(numaNode.HostCPUs) + if err != nil { + return fmt.Errorf("failed to parse NUMA CPUSet string: %v", err) + } + if !cpuSet.IsEmpty() { + nodeHostCPUs = cpuSet.Intersection(nodeHostCPUs) + } + numaNodeCPUs[i] = nodeHostCPUs.ToSlice() + } + + // check if vCPU threads have enough host CPUs in each NUMA node; + // if not enough, we should reset thread affinity.
+ for nodeId := range numaNodeVCPUs { + numVCPUs, numCPUs := len(numaNodeVCPUs[nodeId]), len(numaNodeCPUs[nodeId]) + if (s.config.EnableVCPUsPinning && numVCPUs != numCPUs) || numVCPUs > numCPUs { + if s.isVCPUsPinningOn { + s.isVCPUsPinningOn = false + return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice) + } + virtLog.Warningf("cannot pin vcpus in vm numa node %d", nodeId) + return nil + } + } + + for nodeId := range numaNodeVCPUs { + nodeCpuSetSlice := numaNodeCPUs[nodeId] + for i, vcpuId := range numaNodeVCPUs[nodeId] { + tid := vCPUThreadsMap.vcpus[vcpuId] + affinity := nodeCpuSetSlice + if s.config.EnableVCPUsPinning { + affinity = affinity[i : i+1] + } + if err := resCtrl.SetThreadAffinity(tid, affinity); err != nil { + if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { + return err + } + return fmt.Errorf("failed to set vcpu thread %d cpu affinity to %v: %v", tid, affinity, err) } - return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d: %v", tid, cpuSetSlice[i], err) } } s.isVCPUsPinningOn = true diff --git a/src/runtime/virtcontainers/types/sandbox.go b/src/runtime/virtcontainers/types/sandbox.go index 29c909c977..40db6a25c4 100644 --- a/src/runtime/virtcontainers/types/sandbox.go +++ b/src/runtime/virtcontainers/types/sandbox.go @@ -342,3 +342,9 @@ type Resources struct { Memory uint MemorySlots uint8 } + +// NUMANode defines a VM NUMA node and its mapping to host NUMA nodes and CPUs. +type NUMANode struct { + HostNodes string + HostCPUs string +} diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 2ab4afadae..d8922e19e4 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -21,6 +21,9 @@ import ( "golang.org/x/sys/unix" pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols" + + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" ) const cpBinaryName = "cp" @@ -507,3 +510,49 @@ func IsDockerContainer(spec *specs.Spec) bool { return false } +// GetNUMANodes constructs the VM NUMA nodes and their mapping to host NUMA nodes and host CPUs. +func GetNUMANodes(numaMapping []string) ([]types.NUMANode, error) { + // Add a VM NUMA node for each specified subset of host NUMA nodes. + if numNUMA := len(numaMapping); numNUMA > 0 { + numaNodes := make([]types.NUMANode, numNUMA) + for i, hostNodes := range numaMapping { + hostNodeIds, err := cpuset.Parse(hostNodes) + if err != nil { + return nil, err + } + numaNodes[i].HostNodes = hostNodes + for _, nodeId := range hostNodeIds.ToSlice() { + cpus, err := getHostNUMANodeCPUs(nodeId) + if err != nil { + return nil, err + } + if numaNodes[i].HostCPUs != "" { + numaNodes[i].HostCPUs += "," + } + numaNodes[i].HostCPUs += cpus + } + } + return numaNodes, nil + } + + // Add a VM NUMA node for each host NUMA node.
+ nodeIds, err := getHostNUMANodes() + if err != nil { + return nil, err + } + if len(nodeIds) == 0 { + return nil, nil + } + + numaNodes := make([]types.NUMANode, len(nodeIds)) + for i, nodeId := range nodeIds { + cpus, err := getHostNUMANodeCPUs(nodeId) + if err != nil { + return nil, err + } + numaNodes[i].HostNodes = fmt.Sprintf("%d", nodeId) + numaNodes[i].HostCPUs = cpus + } + + return numaNodes, nil +} diff --git a/src/runtime/virtcontainers/utils/utils_linux.go b/src/runtime/virtcontainers/utils/utils_linux.go index a31b8d3511..0ddb4dd5a9 100644 --- a/src/runtime/virtcontainers/utils/utils_linux.go +++ b/src/runtime/virtcontainers/utils/utils_linux.go @@ -19,6 +19,8 @@ import ( "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" ) var ioctlFunc = Ioctl @@ -197,3 +199,24 @@ func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) b return pidRunning } + +func getHostNUMANodes() ([]int, error) { + data, err := os.ReadFile("/sys/devices/system/node/online") + if err != nil { + return nil, err + } + nodes, err := cpuset.Parse(strings.TrimSuffix(string(data), "\n")) + if err != nil { + return nil, err + } + return nodes.ToSlice(), nil +} + +func getHostNUMANodeCPUs(nodeId int) (string, error) { + fileName := fmt.Sprintf("/sys/devices/system/node/node%v/cpulist", nodeId) + data, err := os.ReadFile(fileName) + if err != nil { + return "", err + } + return strings.TrimSuffix(string(data), "\n"), nil +} diff --git a/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf b/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf index 6c260c0bae..d41ff0fee6 100644 --- a/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf +++ b/tools/packaging/kernel/configs/fragments/x86_64/acpi.conf @@ -4,6 +4,8 @@ CONFIG_X86_INTEL_PSTATE=y # Firecracker needs this to support `vcpu_count` CONFIG_X86_MPPARSE=y +CONFIG_X86_64_ACPI_NUMA=y + CONFIG_ACPI_CPU_FREQ_PSS=y CONFIG_ACPI_HOTPLUG_IOAPIC=y CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y diff --git a/tools/packaging/scripts/configure-hypervisor.sh b/tools/packaging/scripts/configure-hypervisor.sh index c6d762906d..5fe6e9512a 100755 --- a/tools/packaging/scripts/configure-hypervisor.sh +++ b/tools/packaging/scripts/configure-hypervisor.sh @@ -445,6 +445,9 @@ generate_qemu_options() { qemu_options+=(functionality:--enable-cap-ng) qemu_options+=(functionality:--enable-seccomp) + # Support NUMA topology + qemu_options+=(functionality:--enable-numa) + # AVX2 is enabled by default by x86_64, make sure it's enabled only # for that architecture if [ "$arch" == x86_64 ]; then diff --git a/tools/packaging/static-build/qemu/Dockerfile b/tools/packaging/static-build/qemu/Dockerfile index a6d79f77b1..e0549e3672 100644 --- a/tools/packaging/static-build/qemu/Dockerfile +++ b/tools/packaging/static-build/qemu/Dockerfile @@ -50,6 +50,7 @@ RUN apt-get update && apt-get upgrade -y && \ libglib2.0-dev${DPKG_ARCH} git \ libltdl-dev${DPKG_ARCH} \ libmount-dev${DPKG_ARCH} \ + libnuma-dev${DPKG_ARCH} \ libpixman-1-dev${DPKG_ARCH} \ libselinux1-dev${DPKG_ARCH} \ libtool${DPKG_ARCH} \
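For completeness, a hypothetical sketch of driving the new `utils.GetNUMANodes` helper directly, as the `numa_mapping` annotation path does. The node ranges `0-1` and `2-3` are assumptions and must exist on the host, since `HostCPUs` is read from sysfs:

```go
package main

import (
	"fmt"

	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
)

func main() {
	// Two guest NUMA nodes, each backed by a pair of host NUMA nodes;
	// HostCPUs is populated from /sys/devices/system/node/node*/cpulist.
	nodes, err := utils.GetNUMANodes([]string{"0-1", "2-3"})
	if err != nil {
		fmt.Println("failed to construct NUMA nodes:", err)
		return
	}
	for i, n := range nodes {
		fmt.Printf("guest node %d -> host nodes %s (cpus %s)\n", i, n.HostNodes, n.HostCPUs)
	}
}
```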