This commit is contained in:
Zvonko Kaiser 2025-08-12 01:50:03 +08:00 committed by GitHub
commit aa17b2b8d1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 338 additions and 39 deletions

View File

@ -48,6 +48,7 @@ There are several kinds of Kata configurations and they are listed below.
| `io.katacontainers.config.hypervisor.block_device_driver` | string | the driver to be used for block device, valid values are `virtio-blk`, `virtio-scsi`, `nvdimm`|
| `io.katacontainers.config.hypervisor.cpu_features` | `string` | Comma-separated list of CPU features to pass to the CPU (QEMU) |
| `io.katacontainers.config.hypervisor.default_max_vcpus` | uint32| the maximum number of vCPUs allocated for the VM by the hypervisor |
| `io.katacontainers.config.hypervisor.default_maxmemory` | uint32| the maximum memory assigned for a VM by the hypervisor in `MiB` |
| `io.katacontainers.config.hypervisor.default_memory` | uint32| the memory assigned for a VM by the hypervisor in `MiB` |
| `io.katacontainers.config.hypervisor.default_vcpus` | float32| the default vCPUs assigned for a VM by the hypervisor |
| `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disallow a block device from being used |

View File

@ -2657,8 +2657,13 @@ type SMP struct {
Sockets uint32
// MaxCPUs is the maximum number of VCPUs that a VM can have.
// This value, if non-zero, MUST BE equal to or greater than CPUs
// This value, if non-zero, MUST BE equal to or greater than CPUs,
// and must be equal to Sockets * Cores * Threads if all are non-zero.
MaxCPUs uint32
// NumNUMA is the number of NUMA nodes that the VM has.
// The value MUST NOT be greater than Sockets.
NumNUMA uint32
}
// Memory is the guest memory configuration structure.
@ -2679,6 +2684,26 @@ type Memory struct {
// Path is the file path of the memory device. It points to a local
// file path used by FileBackedMem.
Path string
// MemoryModules describes memory topology and allocation policy.
MemoryModules []MemoryModule
}
// MemoryModule represents a single module of guest memory.
// appendMemoryModule turns each module into one QEMU memory backend
// "-object" argument.
type MemoryModule struct {
// Size of memory module, emitted as the backend's "size=" property.
// It should be suffixed with M or G for sizes in megabytes or
// gigabytes respectively.
Size string
// NodeId is the guest NUMA node this module belongs to. It is emitted as
// "nodeid=" in the "-numa node,..." argument when that argument is generated.
NodeId uint32
// HostNodes defines host NUMA nodes mask for binding memory allocation.
// When non-empty it is emitted as the backend's "host-nodes=" property.
HostNodes string
// MemoryPolicy defines host NUMA memory allocation policy.
// When non-empty it is emitted as the backend's "policy=" property.
MemoryPolicy string
}
// Kernel is the guest kernel configuration structure.
@ -3064,11 +3089,25 @@ func (config *Config) appendCPUs() error {
return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d",
config.SMP.MaxCPUs, config.SMP.CPUs)
}
topologyCPUs := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads
if topologyCPUs != 0 && config.SMP.MaxCPUs != topologyCPUs {
return fmt.Errorf("MaxCPUs %d must match CPU topology: sockets %d * cores %d * thread %d",
config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads)
}
SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs))
}
config.qemuParams = append(config.qemuParams, "-smp")
config.qemuParams = append(config.qemuParams, strings.Join(SMPParams, ","))
if config.SMP.NumNUMA > 1 {
// Interleave CPU sockets over NUMA nodes.
for socketId := uint32(0); socketId < config.SMP.Sockets; socketId++ {
nodeId := socketId % config.SMP.NumNUMA
config.qemuParams = append(config.qemuParams, "-numa",
fmt.Sprintf("cpu,node-id=%d,socket-id=%d", nodeId, socketId))
}
}
}
return nil
@ -3137,34 +3176,49 @@ func (config *Config) appendMemoryKnobs() {
if config.Memory.Size == "" {
return
}
var objMemParam, numaMemParam string
dimmName := "dimm1"
if len(config.Memory.MemoryModules) == 0 {
config.appendMemoryModule("dimm1", MemoryModule{Size: config.Memory.Size})
}
for i, memModule := range config.Memory.MemoryModules {
config.appendMemoryModule(fmt.Sprintf("dimm%d", i), memModule)
}
}
func (config *Config) appendMemoryModule(memoryId string, memoryModule MemoryModule) {
var objMemParams []string
if config.Knobs.HugePages {
objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=/dev/hugepages"
numaMemParam = "node,memdev=" + dimmName
objMemParams = append(objMemParams, "memory-backend-file", "mem-path=/dev/hugepages")
} else if config.Knobs.FileBackedMem && config.Memory.Path != "" {
objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=" + config.Memory.Path
numaMemParam = "node,memdev=" + dimmName
objMemParams = append(objMemParams, "memory-backend-file", "mem-path="+config.Memory.Path)
} else {
objMemParam = "memory-backend-ram,id=" + dimmName + ",size=" + config.Memory.Size
numaMemParam = "node,memdev=" + dimmName
objMemParams = append(objMemParams, "memory-backend-ram")
}
objMemParams = append(objMemParams, "id="+memoryId, "size="+memoryModule.Size)
if memoryModule.MemoryPolicy != "" {
objMemParams = append(objMemParams, "policy="+memoryModule.MemoryPolicy)
}
if memoryModule.HostNodes != "" {
objMemParams = append(objMemParams, "host-nodes="+memoryModule.HostNodes)
}
if config.Knobs.MemShared {
objMemParam += ",share=on"
objMemParams = append(objMemParams, "share=on")
}
if config.Knobs.MemPrealloc {
objMemParam += ",prealloc=on"
objMemParams = append(objMemParams, "prealloc=on")
}
config.qemuParams = append(config.qemuParams, "-object")
config.qemuParams = append(config.qemuParams, objMemParam)
config.qemuParams = append(config.qemuParams, "-object", strings.Join(objMemParams, ","))
if isDimmSupported(config) {
config.qemuParams = append(config.qemuParams, "-numa")
config.qemuParams = append(config.qemuParams, numaMemParam)
config.qemuParams = append(config.qemuParams, "-numa",
fmt.Sprintf("node,nodeid=%d,memdev=%s", memoryModule.NodeId, memoryId))
} else {
config.qemuParams = append(config.qemuParams, "-machine")
config.qemuParams = append(config.qemuParams, "memory-backend="+dimmName)
config.qemuParams = append(config.qemuParams, "-machine", "memory-backend="+memoryId)
}
}

View File

@ -687,7 +687,7 @@ func TestAppendMemory(t *testing.T) {
testAppend(memory, memoryString, t)
}
var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=6"
var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=4"
func TestAppendCPUs(t *testing.T) {
smp := SMP{
@ -695,7 +695,7 @@ func TestAppendCPUs(t *testing.T) {
Sockets: 2,
Cores: 1,
Threads: 2,
MaxCPUs: 6,
MaxCPUs: 4,
}
testAppend(smp, cpusString, t)
@ -717,6 +717,22 @@ func TestFailToAppendCPUs(t *testing.T) {
}
}
func TestFailToAppendCPUsWrongTopology(t *testing.T) {
config := Config{
SMP: SMP{
CPUs: 2,
Sockets: 2,
Cores: 1,
Threads: 2,
MaxCPUs: 6,
},
}
if err := config.appendCPUs(); err == nil {
t.Fatalf("Expected appendCPUs to fail")
}
}
var qmpSingleSocketServerString = "-qmp unix:path=cc-qmp,server=on,wait=off"
var qmpSingleSocketString = "-qmp unix:path=cc-qmp"

View File

@ -25,6 +25,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
"github.com/pbnjay/memory"
"github.com/sirupsen/logrus"
@ -63,6 +64,8 @@ const (
// the maximum valid loglevel for the hypervisor
maxHypervisorLoglevel uint32 = 3
// the maximum number of NUMA nodes in Linux kernel: 1 << CONFIG_NODES_SHIFT, which is up to 10.
maxNumNUMA uint32 = 1024
errInvalidHypervisorPrefix = "configuration file contains invalid hypervisor section"
)
@ -152,6 +155,8 @@ type hypervisor struct {
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
NUMA bool `toml:"enable_numa"`
NUMAMapping []string `toml:"numa_mapping"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
@ -701,6 +706,18 @@ func (h hypervisor) getIOMMUPlatform() bool {
return h.IOMMUPlatform
}
// defaultNUMANodes returns the VM NUMA nodes built from the configured
// numa_mapping when enable_numa is set.
//
// It returns nil when NUMA is disabled. If the nodes cannot be constructed the
// error is logged as a warning and nil is returned, so the VM is configured
// without a NUMA topology instead of failing.
func (h hypervisor) defaultNUMANodes() []types.NUMANode {
	if h.NUMA {
		nodes, err := utils.GetNUMANodes(h.NUMAMapping)
		if err == nil {
			return nodes
		}
		kataUtilsLogger.WithError(err).Warn("Cannot construct NUMA nodes.")
	}
	return nil
}
func (h hypervisor) getRemoteHypervisorSocket() string {
if h.RemoteHypervisorSocket == "" {
return defaultRemoteHypervisorSocket
@ -954,6 +971,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
HugePages: h.HugePages,
IOMMU: h.IOMMU,
IOMMUPlatform: h.getIOMMUPlatform(),
NUMANodes: h.defaultNUMANodes(),
FileBackedMemRootDir: h.FileBackedMemRootDir,
FileBackedMemRootList: h.FileBackedMemRootList,
Debug: h.Debug,

View File

@ -673,6 +673,16 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
return err
}
if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMaxMemory).setUintWithCheck(func(memorySz uint64) error {
if memorySz < vc.MinHypervisorMemory && sbConfig.HypervisorType != vc.RemoteHypervisor {
return fmt.Errorf("Memory specified in annotation %s is less than minimum required %d, please specify a larger value", vcAnnotations.DefaultMemory, vc.MinHypervisorMemory)
}
sbConfig.HypervisorConfig.DefaultMaxMemorySize = memorySz
return nil
}); err != nil {
return err
}
if err := newAnnotationConfiguration(ocispec, vcAnnotations.MemSlots).setUint(func(mslots uint64) {
if mslots > 0 {
sbConfig.HypervisorConfig.MemSlots = uint32(mslots)
@ -744,6 +754,14 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
return err
}
if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
numaNodes, err := vcutils.GetNUMANodes(strings.Fields(annotation))
if err != nil {
return err
}
sbConfig.HypervisorConfig.NUMANodes = numaNodes
}
return nil
}

View File

@ -631,6 +631,9 @@ type HypervisorConfig struct {
// IOMMUPlatform is used to indicate if IOMMU_PLATFORM is enabled for supported devices
IOMMUPlatform bool
// NUMANodes defines VM NUMA topology and mapping to host NUMA nodes and CPUs.
NUMANodes []types.NUMANode
// DisableNestingChecks is used to override customizations performed
// when running on top of another VMM.
DisableNestingChecks bool
@ -706,7 +709,8 @@ type HypervisorConfig struct {
// vcpu mapping from vcpu number to thread number
type VcpuThreadIDs struct {
vcpus map[int]int
vcpus map[int]int
vcpuToNodeId map[int]uint32
}
func (conf *HypervisorConfig) CheckTemplateConfig() error {
@ -902,6 +906,10 @@ func (conf HypervisorConfig) NumVCPUs() uint32 {
return RoundUpNumVCPUs(conf.NumVCPUsF)
}
// NumNUMA reports the number of VM NUMA nodes in the configuration, i.e. the
// length of NUMANodes. It is zero when no NUMA topology is configured.
func (conf HypervisorConfig) NumNUMA() uint32 {
	nodeCount := len(conf.NUMANodes)
	return uint32(nodeCount)
}
func appendParam(params []Param, parameter string, value string) []Param {
return append(params, Param{parameter, value})
}

View File

@ -44,6 +44,10 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
conf.MemorySize = defaultMemSzMiB
}
if uint64(conf.MemorySize) > conf.DefaultMaxMemorySize {
conf.MemorySize = uint32(conf.DefaultMaxMemorySize)
}
if conf.DefaultBridges == 0 {
conf.DefaultBridges = defaultBridges
}
@ -58,6 +62,15 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
conf.DefaultMaxVCPUs = defaultMaxVCPUs
}
if numNUMA := conf.NumNUMA(); numNUMA > 1 {
conf.DefaultMaxVCPUs -= conf.DefaultMaxVCPUs % numNUMA
}
if conf.ConfidentialGuest && conf.NumVCPUs() != conf.DefaultMaxVCPUs {
hvLogger.Warnf("Confidential guests do not support hotplugging of vCPUs. Setting DefaultMaxVCPUs to NumVCPUs (%d)", conf.NumVCPUs())
conf.DefaultMaxVCPUs = conf.NumVCPUs()
}
if conf.Msize9p == 0 && conf.SharedFS != config.VirtioFS {
conf.Msize9p = defaultMsize9p
}

View File

@ -113,7 +113,7 @@ func (m *mockHypervisor) Disconnect(ctx context.Context) {
func (m *mockHypervisor) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
vcpus := map[int]int{0: os.Getpid()}
return VcpuThreadIDs{vcpus}, nil
return VcpuThreadIDs{vcpus, nil}, nil
}
func (m *mockHypervisor) Cleanup(ctx context.Context) error {

View File

@ -155,6 +155,9 @@ const (
// DefaultMemory is a sandbox annotation for the memory assigned for a VM by the hypervisor.
DefaultMemory = kataAnnotHypervisorPrefix + "default_memory"
// DefaultMaxMemory is a sandbox annotation for the maximum memory assigned for a VM by the hypervisor.
DefaultMaxMemory = kataAnnotHypervisorPrefix + "default_maxmemory"
// MemSlots is a sandbox annotation to specify the memory slots assigned to the VM by the hypervisor.
MemSlots = kataAnnotHypervisorPrefix + "memory_slots"
@ -182,6 +185,9 @@ const (
// FileBackedMemRootDir is a sandbox annotation to specify file based memory backend root directory
FileBackedMemRootDir = kataAnnotHypervisorPrefix + "file_mem_backend"
// NUMAMapping is a sandbox annotation that specifies mapping VM NUMA nodes to host NUMA nodes.
NUMAMapping = kataAnnotHypervisorPrefix + "numa_mapping"
//
// Shared File System related annotations
//

View File

@ -2669,6 +2669,36 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
return memory
}
// genericNUMAMemoryModles splits memoryMb megabytes of guest memory into
// memory modules, one per entry of numaNodes.
//
// Every node except the first is given memoryMb/len(numaNodes) rounded down to
// a multiple of memoryAlign. The first node is given whatever is left, so the
// module sizes always add up to memoryMb. When the aligned per-node share is
// zero, only the first node gets a module. A memoryAlign of zero is treated as
// "no alignment" rather than causing a division by zero.
//
// Each module is bound to the HostNodes of its NUMA node with the
// "interleave" policy. The function returns nil when numaNodes is empty.
func genericNUMAMemoryModles(memoryMb, memoryAlign uint64, numaNodes []types.NUMANode) []govmmQemu.MemoryModule {
	if len(numaNodes) == 0 {
		return nil
	}

	// The modulo below would panic on a zero alignment.
	if memoryAlign == 0 {
		memoryAlign = 1
	}

	numNodes := uint64(len(numaNodes))
	memoryModules := make([]govmmQemu.MemoryModule, 0, len(numaNodes))

	// Divide memory among NUMA nodes.
	memoryPerNode := memoryMb / numNodes
	memoryPerNode -= memoryPerNode % memoryAlign

	// First NUMA node gets more if memory is not divided evenly.
	moduleSize := memoryMb - memoryPerNode*(numNodes-1)

	for nodeId, numaNode := range numaNodes {
		memoryModules = append(memoryModules, govmmQemu.MemoryModule{
			Size:         fmt.Sprintf("%dM", moduleSize),
			NodeId:       uint32(nodeId),
			HostNodes:    numaNode.HostNodes,
			MemoryPolicy: "interleave",
		})

		// All remaining nodes get the aligned share; stop when there is
		// nothing left to hand out.
		moduleSize = memoryPerNode
		if moduleSize == 0 {
			break
		}
	}

	return memoryModules
}
// genericAppendPCIeRootPort appends to devices the given pcie-root-port
func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
var (
@ -2793,9 +2823,11 @@ func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
}
tid.vcpus = make(map[int]int, len(cpuInfos))
tid.vcpuToNodeId = make(map[int]uint32, len(cpuInfos))
for _, i := range cpuInfos {
if i.ThreadID > 0 {
tid.vcpus[i.CPUIndex] = i.ThreadID
tid.vcpuToNodeId[i.CPUIndex] = uint32(i.Props.Node)
}
}
return tid, nil

View File

@ -117,6 +117,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
qemuArchBase: qemuArchBase{
qemuMachine: *mp,
qemuExePath: defaultQemuPath,
numaNodes: config.NUMANodes,
memoryOffset: config.MemOffset,
kernelParamsNonDebug: kernelParamsNonDebug,
kernelParamsDebug: kernelParamsDebug,
@ -198,7 +199,9 @@ func (q *qemuAmd64) cpuModel() string {
}
func (q *qemuAmd64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory {
return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
memory := genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
memory.MemoryModules = genericNUMAMemoryModles(memoryMb, 4, q.numaNodes)
return memory
}
// Is Memory Hotplug supported by this architecture/machine type combination?

View File

@ -186,6 +186,7 @@ type qemuArchBase struct {
kernelParamsDebug []Param
kernelParams []Param
Bridges []types.Bridge
numaNodes []types.NUMANode
memoryOffset uint64
networkIndex int
// Exclude from lint checking for it is ultimately only used in architecture-specific code
@ -324,12 +325,20 @@ func (q *qemuArchBase) bridges(number uint32) {
}
func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP {
numNUMA := uint32(len(q.numaNodes))
numSockets := numNUMA
if numSockets == 0 {
numSockets = maxvcpus
}
smp := govmmQemu.SMP{
CPUs: vcpus,
Sockets: maxvcpus,
Cores: defaultCores,
Sockets: numSockets,
Cores: maxvcpus / numSockets / defaultThreads,
Threads: defaultThreads,
MaxCPUs: maxvcpus,
NumNUMA: numNUMA,
}
return smp

View File

@ -2793,11 +2793,12 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
// is set to true. Then it fetches sandbox's number of vCPU threads
// and number of CPUs in CPUSet. If the two are equal, each vCPU thread
// is then pinned to one fixed CPU in CPUSet.
// For enforcing NUMA topology vCPU threads are pinned to related host CPUs.
func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
if s.config == nil {
return fmt.Errorf("no sandbox config found")
}
if !s.config.EnableVCPUsPinning {
if !s.config.EnableVCPUsPinning && s.config.HypervisorConfig.NumNUMA() == 0 {
return nil
}
@ -2816,23 +2817,59 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
}
cpuSetSlice := cpuSet.ToSlice()
// check if vCPU thread numbers and CPU numbers are equal
numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
// if not equal, we should reset threads scheduling to random pattern
if numVCPUs != numCPUs {
if s.isVCPUsPinningOn {
s.isVCPUsPinningOn = false
return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
// build NUMA topology mapping, or fake single node if NUMA is not enabled.
numNodes := max(s.config.HypervisorConfig.NumNUMA(), 1)
numaNodeVCPUs := make([][]int, numNodes)
for vcpuId := range vCPUThreadsMap.vcpus {
nodeId, ok := vCPUThreadsMap.vcpuToNodeId[vcpuId]
if !ok || nodeId > numNodes {
nodeId = 0
}
return nil
numaNodeVCPUs[nodeId] = append(numaNodeVCPUs[nodeId], vcpuId)
}
// if equal, we can use vCPU thread pinning
for i, tid := range vCPUThreadsMap.vcpus {
if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil {
if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
return err
numaNodeCPUs := make([][]int, numNodes)
numaNodeCPUs[0] = cpuSetSlice
for i, numaNode := range s.config.HypervisorConfig.NUMANodes {
nodeHostCPUs, err := cpuset.Parse(numaNode.HostCPUs)
if err != nil {
return fmt.Errorf("failed to parse NUMA CPUSet string: %v", err)
}
if !cpuSet.IsEmpty() {
nodeHostCPUs = cpuSet.Intersection(nodeHostCPUs)
}
numaNodeCPUs[i] = nodeHostCPUs.ToSlice()
}
// check if vCPU threads have enough host CPUs in each NUMA node
// if not enough, we should reset threads affinity.
for nodeId := range numaNodeVCPUs {
numVCPUs, numCPUs := len(numaNodeVCPUs[nodeId]), len(numaNodeCPUs[nodeId])
if s.config.EnableVCPUsPinning && numVCPUs != numCPUs || numVCPUs > numCPUs {
if s.isVCPUsPinningOn {
s.isVCPUsPinningOn = false
return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
}
virtLog.Warningf("cannot pin vcpus in vm numa node %d", nodeId)
return nil
}
}
for nodeId := range numaNodeVCPUs {
nodeCpuSetSlice := numaNodeCPUs[nodeId]
for i, vcpuId := range numaNodeVCPUs[nodeId] {
tid := vCPUThreadsMap.vcpus[vcpuId]
affinity := nodeCpuSetSlice
if s.config.EnableVCPUsPinning {
affinity = affinity[i : i+1]
}
if err := resCtrl.SetThreadAffinity(tid, affinity); err != nil {
if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
return err
}
return fmt.Errorf("failed to set vcpu thread %d cpu affinity to %v: %v", tid, affinity, err)
}
return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d: %v", tid, cpuSetSlice[i], err)
}
}
s.isVCPUsPinningOn = true

View File

@ -342,3 +342,9 @@ type Resources struct {
Memory uint
MemorySlots uint8
}
// NUMANode defines VM NUMA node mapping to host NUMA nodes and CPUs.
type NUMANode struct {
// HostNodes lists the host NUMA node IDs backing this VM node, as a string
// that GetNUMANodes parses with cpuset.Parse.
HostNodes string
// HostCPUs lists the host CPUs of those host NUMA nodes: the per-node
// cpulist strings read from sysfs, joined with commas.
HostCPUs string
}

View File

@ -21,6 +21,9 @@ import (
"golang.org/x/sys/unix"
pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
)
const cpBinaryName = "cp"
@ -507,3 +510,49 @@ func IsDockerContainer(spec *specs.Spec) bool {
return false
}
// GetNUMANodes constructs VM NUMA nodes mapping to host NUMA nodes and host CPUs.
//
// With a non-empty numaMapping, one VM NUMA node is created per entry; each
// entry is a set of host NUMA node IDs and the VM node's HostCPUs is the
// comma-joined CPU lists of those host nodes. With an empty numaMapping, one
// VM NUMA node is created per online host NUMA node. It returns (nil, nil)
// when the host reports no NUMA nodes, and the first error hit while parsing
// the mapping or reading the host topology.
func GetNUMANodes(numaMapping []string) ([]types.NUMANode, error) {
	if len(numaMapping) == 0 {
		// No explicit mapping: add a VM NUMA node for each host NUMA node.
		hostIDs, err := getHostNUMANodes()
		if err != nil {
			return nil, err
		}
		if len(hostIDs) == 0 {
			return nil, nil
		}

		nodes := make([]types.NUMANode, len(hostIDs))
		for idx := range hostIDs {
			cpuList, err := getHostNUMANodeCPUs(hostIDs[idx])
			if err != nil {
				return nil, err
			}
			nodes[idx] = types.NUMANode{
				HostNodes: fmt.Sprintf("%d", hostIDs[idx]),
				HostCPUs:  cpuList,
			}
		}
		return nodes, nil
	}

	// Explicit mapping: add a VM NUMA node for each specified subset of host
	// NUMA nodes.
	nodes := make([]types.NUMANode, len(numaMapping))
	for idx, mapping := range numaMapping {
		parsed, err := cpuset.Parse(mapping)
		if err != nil {
			return nil, err
		}

		hostCPUs := ""
		for _, hostID := range parsed.ToSlice() {
			cpuList, err := getHostNUMANodeCPUs(hostID)
			if err != nil {
				return nil, err
			}
			// Separate the CPU lists of consecutive host nodes with a comma.
			if hostCPUs != "" {
				hostCPUs += ","
			}
			hostCPUs += cpuList
		}

		nodes[idx] = types.NUMANode{
			HostNodes: mapping,
			HostCPUs:  hostCPUs,
		}
	}
	return nodes, nil
}

View File

@ -19,6 +19,8 @@ import (
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
)
var ioctlFunc = Ioctl
@ -197,3 +199,24 @@ func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) b
return pidRunning
}
// getHostNUMANodes returns the IDs of the host NUMA nodes listed in
// /sys/devices/system/node/online.
//
// It returns an error when the file cannot be read or its content cannot be
// parsed by cpuset.Parse.
func getHostNUMANodes() ([]int, error) {
	const onlineNodesPath = "/sys/devices/system/node/online"

	raw, err := os.ReadFile(onlineNodesPath)
	if err != nil {
		return nil, err
	}

	// Drop the trailing newline before parsing the node list.
	nodeList := strings.TrimSuffix(string(raw), "\n")
	online, err := cpuset.Parse(nodeList)
	if err != nil {
		return nil, err
	}

	return online.ToSlice(), nil
}
// getHostNUMANodeCPUs returns the content of
// /sys/devices/system/node/node<nodeId>/cpulist with the trailing newline
// removed.
//
// It returns an empty string and the read error when the file cannot be read.
func getHostNUMANodeCPUs(nodeId int) (string, error) {
	cpuListPath := "/sys/devices/system/node/node" + fmt.Sprint(nodeId) + "/cpulist"

	raw, err := os.ReadFile(cpuListPath)
	if err != nil {
		return "", err
	}

	cpuList := strings.TrimSuffix(string(raw), "\n")
	return cpuList, nil
}

View File

@ -4,6 +4,8 @@ CONFIG_X86_INTEL_PSTATE=y
# Firecracker needs this to support `vcpu_count`
CONFIG_X86_MPPARSE=y
CONFIG_X86_64_ACPI_NUMA=y
CONFIG_ACPI_CPU_FREQ_PSS=y
CONFIG_ACPI_HOTPLUG_IOAPIC=y
CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y

View File

@ -445,6 +445,9 @@ generate_qemu_options() {
qemu_options+=(functionality:--enable-cap-ng)
qemu_options+=(functionality:--enable-seccomp)
# Support NUMA topology
qemu_options+=(functionality:--enable-numa)
# AVX2 is enabled by default by x86_64, make sure it's enabled only
# for that architecture
if [ "$arch" == x86_64 ]; then

View File

@ -50,6 +50,7 @@ RUN apt-get update && apt-get upgrade -y && \
libglib2.0-dev${DPKG_ARCH} git \
libltdl-dev${DPKG_ARCH} \
libmount-dev${DPKG_ARCH} \
libnuma-dev${DPKG_ARCH} \
libpixman-1-dev${DPKG_ARCH} \
libselinux1-dev${DPKG_ARCH} \
libtool${DPKG_ARCH} \