mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-03-06 12:52:07 +00:00
Compare commits
14 Commits
dependabot
...
numa-topol
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
86ef395595 | ||
|
|
01fd3cd8bc | ||
|
|
7c8de25398 | ||
|
|
a92cc4ede1 | ||
|
|
08a0b71a57 | ||
|
|
2e9bd5441d | ||
|
|
4983bd9e1e | ||
|
|
3e4c875add | ||
|
|
a5c39aae12 | ||
|
|
4c7f1ac1e5 | ||
|
|
e7fd1518ee | ||
|
|
8381ee44a1 | ||
|
|
5fab5f9e5e | ||
|
|
2ab5be14c1 |
@@ -48,6 +48,7 @@ There are several kinds of Kata configurations and they are listed below.
|
||||
| `io.katacontainers.config.hypervisor.block_device_driver` | string | the driver to be used for block device, valid values are `virtio-blk`, `virtio-scsi`, `nvdimm`|
|
||||
| `io.katacontainers.config.hypervisor.cpu_features` | `string` | Comma-separated list of CPU features to pass to the CPU (QEMU) |
|
||||
| `io.katacontainers.config.hypervisor.default_max_vcpus` | uint32| the maximum number of vCPUs allocated for the VM by the hypervisor |
|
||||
| `io.katacontainers.config.hypervisor.default_maxmemory` | uint32| the maximum memory assigned for a VM by the hypervisor in `MiB` |
|
||||
| `io.katacontainers.config.hypervisor.default_memory` | uint32| the memory assigned for a VM by the hypervisor in `MiB` |
|
||||
| `io.katacontainers.config.hypervisor.default_vcpus` | float32| the default vCPUs assigned for a VM by the hypervisor |
|
||||
| `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disallow a block device from being used |
|
||||
|
||||
16
src/agent/Cargo.lock
generated
16
src/agent/Cargo.lock
generated
@@ -780,9 +780,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "container-device-interface"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "653849f0c250f73d9afab4b2a9a6b07adaee1f34c44ffa6f2d2c3f9392002c1a"
|
||||
checksum = "62aabe8ef7f15f505201aa88a97f4856fd572cb869b73232db95ade2366090cd"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
@@ -1207,9 +1207,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "fancy-regex"
|
||||
version = "0.14.0"
|
||||
version = "0.16.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
|
||||
checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f"
|
||||
dependencies = [
|
||||
"bit-set",
|
||||
"regex-automata 0.4.9",
|
||||
@@ -2007,9 +2007,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "jsonschema"
|
||||
version = "0.30.0"
|
||||
version = "0.33.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1b46a0365a611fbf1d2143104dcf910aada96fafd295bab16c60b802bf6fa1d"
|
||||
checksum = "d46662859bc5f60a145b75f4632fbadc84e829e45df6c5de74cfc8e05acb96b5"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"base64 0.22.1",
|
||||
@@ -3405,9 +3405,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "referencing"
|
||||
version = "0.30.0"
|
||||
version = "0.33.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8eff4fa778b5c2a57e85c5f2fe3a709c52f0e60d23146e2151cbef5893f420e"
|
||||
checksum = "9e9c261f7ce75418b3beadfb3f0eb1299fe8eb9640deba45ffa2cb783098697d"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"fluent-uri 0.3.2",
|
||||
|
||||
@@ -2689,8 +2689,13 @@ type SMP struct {
|
||||
Sockets uint32
|
||||
|
||||
// MaxCPUs is the maximum number of VCPUs that a VM can have.
|
||||
// This value, if non-zero, MUST BE equal to or greater than CPUs
|
||||
// This value, if non-zero, MUST BE equal to or greater than CPUs,
|
||||
// and must be equal to Sockets * Cores * Threads if all are non-zero.
|
||||
MaxCPUs uint32
|
||||
|
||||
// NumNUMA is the number of NUMA nodes that the VM has.
|
||||
// The value MUST NOT be greater than Sockets.
|
||||
NumNUMA uint32
|
||||
}
|
||||
|
||||
// Memory is the guest memory configuration structure.
|
||||
@@ -2711,6 +2716,26 @@ type Memory struct {
|
||||
// Path is the file path of the memory device. It points to a local
|
||||
// file path used by FileBackedMem.
|
||||
Path string
|
||||
|
||||
// MemoryModules describes memory topology and allocation policy.
|
||||
MemoryModules []MemoryModule
|
||||
}
|
||||
|
||||
// MemoryModule describes one module of guest memory: how large it is, which
// guest NUMA node it is attached to, and how its allocation is bound on the
// host.
type MemoryModule struct {
	// Size is the amount of memory provided by this module. It should carry
	// an M (megabytes) or G (gigabytes) suffix.
	Size string

	// NodeId is the guest NUMA node the module is assigned to.
	NodeId uint32

	// HostNodes is the mask of host NUMA nodes the allocation is bound to.
	HostNodes string

	// MemoryPolicy is the host NUMA memory allocation policy.
	MemoryPolicy string
}
|
||||
|
||||
// Kernel is the guest kernel configuration structure.
|
||||
@@ -3096,11 +3121,25 @@ func (config *Config) appendCPUs() error {
|
||||
return fmt.Errorf("MaxCPUs %d must be equal to or greater than CPUs %d",
|
||||
config.SMP.MaxCPUs, config.SMP.CPUs)
|
||||
}
|
||||
topologyCPUs := config.SMP.Sockets * config.SMP.Cores * config.SMP.Threads
|
||||
if topologyCPUs != 0 && config.SMP.MaxCPUs != topologyCPUs {
|
||||
return fmt.Errorf("MaxCPUs %d must match CPU topology: sockets %d * cores %d * thread %d",
|
||||
config.SMP.MaxCPUs, config.SMP.Sockets, config.SMP.Cores, config.SMP.Threads)
|
||||
}
|
||||
SMPParams = append(SMPParams, fmt.Sprintf("maxcpus=%d", config.SMP.MaxCPUs))
|
||||
}
|
||||
|
||||
config.qemuParams = append(config.qemuParams, "-smp")
|
||||
config.qemuParams = append(config.qemuParams, strings.Join(SMPParams, ","))
|
||||
|
||||
if config.SMP.NumNUMA > 1 {
|
||||
// Interleave CPU sockets over NUMA nodes.
|
||||
for socketId := uint32(0); socketId < config.SMP.Sockets; socketId++ {
|
||||
nodeId := socketId % config.SMP.NumNUMA
|
||||
config.qemuParams = append(config.qemuParams, "-numa",
|
||||
fmt.Sprintf("cpu,node-id=%d,socket-id=%d", nodeId, socketId))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -3169,34 +3208,49 @@ func (config *Config) appendMemoryKnobs() {
|
||||
if config.Memory.Size == "" {
|
||||
return
|
||||
}
|
||||
var objMemParam, numaMemParam string
|
||||
dimmName := "dimm1"
|
||||
if len(config.Memory.MemoryModules) == 0 {
|
||||
config.appendMemoryModule("dimm1", MemoryModule{Size: config.Memory.Size})
|
||||
}
|
||||
for i, memModule := range config.Memory.MemoryModules {
|
||||
config.appendMemoryModule(fmt.Sprintf("dimm%d", i), memModule)
|
||||
}
|
||||
}
|
||||
|
||||
func (config *Config) appendMemoryModule(memoryId string, memoryModule MemoryModule) {
|
||||
var objMemParams []string
|
||||
|
||||
if config.Knobs.HugePages {
|
||||
objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=/dev/hugepages"
|
||||
numaMemParam = "node,memdev=" + dimmName
|
||||
objMemParams = append(objMemParams, "memory-backend-file", "mem-path=/dev/hugepages")
|
||||
} else if config.Knobs.FileBackedMem && config.Memory.Path != "" {
|
||||
objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=" + config.Memory.Path
|
||||
numaMemParam = "node,memdev=" + dimmName
|
||||
objMemParams = append(objMemParams, "memory-backend-file", "mem-path="+config.Memory.Path)
|
||||
} else {
|
||||
objMemParam = "memory-backend-ram,id=" + dimmName + ",size=" + config.Memory.Size
|
||||
numaMemParam = "node,memdev=" + dimmName
|
||||
objMemParams = append(objMemParams, "memory-backend-ram")
|
||||
}
|
||||
|
||||
objMemParams = append(objMemParams, "id="+memoryId, "size="+memoryModule.Size)
|
||||
|
||||
if memoryModule.MemoryPolicy != "" {
|
||||
objMemParams = append(objMemParams, "policy="+memoryModule.MemoryPolicy)
|
||||
}
|
||||
|
||||
if memoryModule.HostNodes != "" {
|
||||
objMemParams = append(objMemParams, "host-nodes="+memoryModule.HostNodes)
|
||||
}
|
||||
|
||||
if config.Knobs.MemShared {
|
||||
objMemParam += ",share=on"
|
||||
objMemParams = append(objMemParams, "share=on")
|
||||
}
|
||||
if config.Knobs.MemPrealloc {
|
||||
objMemParam += ",prealloc=on"
|
||||
objMemParams = append(objMemParams, "prealloc=on")
|
||||
}
|
||||
config.qemuParams = append(config.qemuParams, "-object")
|
||||
config.qemuParams = append(config.qemuParams, objMemParam)
|
||||
|
||||
config.qemuParams = append(config.qemuParams, "-object", strings.Join(objMemParams, ","))
|
||||
|
||||
if isDimmSupported(config) {
|
||||
config.qemuParams = append(config.qemuParams, "-numa")
|
||||
config.qemuParams = append(config.qemuParams, numaMemParam)
|
||||
config.qemuParams = append(config.qemuParams, "-numa",
|
||||
fmt.Sprintf("node,nodeid=%d,memdev=%s", memoryModule.NodeId, memoryId))
|
||||
} else {
|
||||
config.qemuParams = append(config.qemuParams, "-machine")
|
||||
config.qemuParams = append(config.qemuParams, "memory-backend="+dimmName)
|
||||
config.qemuParams = append(config.qemuParams, "-machine", "memory-backend="+memoryId)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -516,8 +516,8 @@ func TestAppendMemoryHugePages(t *testing.T) {
|
||||
FileBackedMem: true,
|
||||
MemShared: true,
|
||||
}
|
||||
objMemString := "-object memory-backend-file,id=dimm1,size=1G,mem-path=/dev/hugepages,share=on,prealloc=on"
|
||||
numaMemString := "-numa node,memdev=dimm1"
|
||||
objMemString := "-object memory-backend-file,mem-path=/dev/hugepages,id=dimm1,size=1G,share=on,prealloc=on"
|
||||
numaMemString := "-numa node,nodeid=0,memdev=dimm1"
|
||||
memBackendString := "-machine memory-backend=dimm1"
|
||||
|
||||
knobsString := objMemString + " "
|
||||
@@ -547,7 +547,7 @@ func TestAppendMemoryMemPrealloc(t *testing.T) {
|
||||
MemShared: true,
|
||||
}
|
||||
objMemString := "-object memory-backend-ram,id=dimm1,size=1G,share=on,prealloc=on"
|
||||
numaMemString := "-numa node,memdev=dimm1"
|
||||
numaMemString := "-numa node,nodeid=0,memdev=dimm1"
|
||||
memBackendString := "-machine memory-backend=dimm1"
|
||||
|
||||
knobsString := objMemString + " "
|
||||
@@ -576,8 +576,8 @@ func TestAppendMemoryMemShared(t *testing.T) {
|
||||
FileBackedMem: true,
|
||||
MemShared: true,
|
||||
}
|
||||
objMemString := "-object memory-backend-file,id=dimm1,size=1G,mem-path=foobar,share=on"
|
||||
numaMemString := "-numa node,memdev=dimm1"
|
||||
objMemString := "-object memory-backend-file,mem-path=foobar,id=dimm1,size=1G,share=on"
|
||||
numaMemString := "-numa node,nodeid=0,memdev=dimm1"
|
||||
memBackendString := "-machine memory-backend=dimm1"
|
||||
|
||||
knobsString := objMemString + " "
|
||||
@@ -606,8 +606,8 @@ func TestAppendMemoryFileBackedMem(t *testing.T) {
|
||||
FileBackedMem: true,
|
||||
MemShared: false,
|
||||
}
|
||||
objMemString := "-object memory-backend-file,id=dimm1,size=1G,mem-path=foobar"
|
||||
numaMemString := "-numa node,memdev=dimm1"
|
||||
objMemString := "-object memory-backend-file,mem-path=foobar,id=dimm1,size=1G"
|
||||
numaMemString := "-numa node,nodeid=0,memdev=dimm1"
|
||||
memBackendString := "-machine memory-backend=dimm1"
|
||||
|
||||
knobsString := objMemString + " "
|
||||
@@ -637,8 +637,8 @@ func TestAppendMemoryFileBackedMemPrealloc(t *testing.T) {
|
||||
MemShared: true,
|
||||
MemPrealloc: true,
|
||||
}
|
||||
objMemString := "-object memory-backend-file,id=dimm1,size=1G,mem-path=foobar,share=on,prealloc=on"
|
||||
numaMemString := "-numa node,memdev=dimm1"
|
||||
objMemString := "-object memory-backend-file,mem-path=foobar,id=dimm1,size=1G,share=on,prealloc=on"
|
||||
numaMemString := "-numa node,nodeid=0,memdev=dimm1"
|
||||
memBackendString := "-machine memory-backend=dimm1"
|
||||
|
||||
knobsString := objMemString + " "
|
||||
@@ -687,7 +687,7 @@ func TestAppendMemory(t *testing.T) {
|
||||
testAppend(memory, memoryString, t)
|
||||
}
|
||||
|
||||
var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=6"
|
||||
var cpusString = "-smp 2,cores=1,threads=2,sockets=2,maxcpus=4"
|
||||
|
||||
func TestAppendCPUs(t *testing.T) {
|
||||
smp := SMP{
|
||||
@@ -695,7 +695,7 @@ func TestAppendCPUs(t *testing.T) {
|
||||
Sockets: 2,
|
||||
Cores: 1,
|
||||
Threads: 2,
|
||||
MaxCPUs: 6,
|
||||
MaxCPUs: 4,
|
||||
}
|
||||
|
||||
testAppend(smp, cpusString, t)
|
||||
@@ -717,6 +717,22 @@ func TestFailToAppendCPUs(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFailToAppendCPUsWrongTopology(t *testing.T) {
|
||||
config := Config{
|
||||
SMP: SMP{
|
||||
CPUs: 2,
|
||||
Sockets: 2,
|
||||
Cores: 1,
|
||||
Threads: 2,
|
||||
MaxCPUs: 6,
|
||||
},
|
||||
}
|
||||
|
||||
if err := config.appendCPUs(); err == nil {
|
||||
t.Fatalf("Expected appendCPUs to fail")
|
||||
}
|
||||
}
|
||||
|
||||
var qmpSingleSocketServerString = "-qmp unix:path=cc-qmp,server=on,wait=off"
|
||||
var qmpSingleSocketString = "-qmp unix:path=cc-qmp"
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ import (
|
||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
|
||||
"github.com/pbnjay/memory"
|
||||
"github.com/sirupsen/logrus"
|
||||
@@ -154,6 +155,8 @@ type hypervisor struct {
|
||||
VirtioMem bool `toml:"enable_virtio_mem"`
|
||||
IOMMU bool `toml:"enable_iommu"`
|
||||
IOMMUPlatform bool `toml:"enable_iommu_platform"`
|
||||
NUMA bool `toml:"enable_numa"`
|
||||
NUMAMapping []string `toml:"numa_mapping"`
|
||||
Debug bool `toml:"enable_debug"`
|
||||
DisableNestingChecks bool `toml:"disable_nesting_checks"`
|
||||
EnableIOThreads bool `toml:"enable_iothreads"`
|
||||
@@ -719,6 +722,18 @@ func (h hypervisor) getIOMMUPlatform() bool {
|
||||
return h.IOMMUPlatform
|
||||
}
|
||||
|
||||
func (h hypervisor) defaultNUMANodes() []types.NUMANode {
|
||||
if !h.NUMA {
|
||||
return nil
|
||||
}
|
||||
numaNodes, err := utils.GetNUMANodes(h.NUMAMapping)
|
||||
if err != nil {
|
||||
kataUtilsLogger.WithError(err).Warn("Cannot construct NUMA nodes.")
|
||||
return nil
|
||||
}
|
||||
return numaNodes
|
||||
}
|
||||
|
||||
func (h hypervisor) getRemoteHypervisorSocket() string {
|
||||
if h.RemoteHypervisorSocket == "" {
|
||||
return defaultRemoteHypervisorSocket
|
||||
@@ -974,6 +989,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
HugePages: h.HugePages,
|
||||
IOMMU: h.IOMMU,
|
||||
IOMMUPlatform: h.getIOMMUPlatform(),
|
||||
NUMA: h.NUMA,
|
||||
NUMANodes: h.defaultNUMANodes(),
|
||||
FileBackedMemRootDir: h.FileBackedMemRootDir,
|
||||
FileBackedMemRootList: h.FileBackedMemRootList,
|
||||
Debug: h.Debug,
|
||||
@@ -1868,6 +1885,20 @@ func checkConfig(config oci.RuntimeConfig) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := checkNumaConfig(config); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// checkNumaConfig ensures that we have static resource management set since
|
||||
// NUMA does not support hot-plug of memory or CPU, VFIO devices can be
|
||||
// hot-plugged.
|
||||
func checkNumaConfig(config oci.RuntimeConfig) error {
|
||||
if !config.StaticSandboxResourceMgmt && config.HypervisorConfig.NUMA {
|
||||
return errors.New("NUMA is enabled but static sandbox resource management is false, NUMA cannot hot-plug CPUs or memory, VFIO hot-plugging works, set static_sandbox_resource_mgmt=true")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -705,6 +705,16 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
|
||||
return err
|
||||
}
|
||||
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMaxMemory).setUintWithCheck(func(memorySz uint64) error {
|
||||
if memorySz < vc.MinHypervisorMemory && sbConfig.HypervisorType != vc.RemoteHypervisor {
|
||||
return fmt.Errorf("Memory specified in annotation %s is less than minimum required %d, please specify a larger value", vcAnnotations.DefaultMemory, vc.MinHypervisorMemory)
|
||||
}
|
||||
sbConfig.HypervisorConfig.DefaultMaxMemorySize = memorySz
|
||||
return nil
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.MemSlots).setUint(func(mslots uint64) {
|
||||
if mslots > 0 {
|
||||
sbConfig.HypervisorConfig.MemSlots = uint32(mslots)
|
||||
@@ -776,6 +786,14 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig
|
||||
return err
|
||||
}
|
||||
|
||||
if annotation, ok := ocispec.Annotations[vcAnnotations.NUMAMapping]; ok {
|
||||
numaNodes, err := vcutils.GetNUMANodes(strings.Fields(annotation))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
sbConfig.HypervisorConfig.NUMANodes = numaNodes
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -641,6 +641,12 @@ type HypervisorConfig struct {
|
||||
// IOMMUPlatform is used to indicate if IOMMU_PLATFORM is enabled for supported devices
|
||||
IOMMUPlatform bool
|
||||
|
||||
// NUMA states if we have NUMA enabled or not
|
||||
NUMA bool
|
||||
|
||||
// NUMANodes defines VM NUMA topology and mapping to host NUMA nodes and CPUs.
|
||||
NUMANodes []types.NUMANode
|
||||
|
||||
// DisableNestingChecks is used to override customizations performed
|
||||
// when running on top of another VMM.
|
||||
DisableNestingChecks bool
|
||||
@@ -720,7 +726,8 @@ type HypervisorConfig struct {
|
||||
|
||||
// vcpu mapping from vcpu number to thread number
|
||||
type VcpuThreadIDs struct {
|
||||
vcpus map[int]int
|
||||
vcpus map[int]int
|
||||
vcpuToNodeId map[int]uint32
|
||||
}
|
||||
|
||||
func (conf *HypervisorConfig) CheckTemplateConfig() error {
|
||||
@@ -916,6 +923,10 @@ func (conf HypervisorConfig) NumVCPUs() uint32 {
|
||||
return RoundUpNumVCPUs(conf.NumVCPUsF)
|
||||
}
|
||||
|
||||
func (conf HypervisorConfig) NumNUMA() uint32 {
|
||||
return uint32(len(conf.NUMANodes))
|
||||
}
|
||||
|
||||
func appendParam(params []Param, parameter string, value string) []Param {
|
||||
return append(params, Param{parameter, value})
|
||||
}
|
||||
|
||||
@@ -44,6 +44,10 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
|
||||
conf.MemorySize = defaultMemSzMiB
|
||||
}
|
||||
|
||||
if conf.DefaultMaxMemorySize != 0 && uint64(conf.MemorySize) > conf.DefaultMaxMemorySize {
|
||||
conf.MemorySize = uint32(conf.DefaultMaxMemorySize)
|
||||
}
|
||||
|
||||
if conf.DefaultBridges == 0 {
|
||||
conf.DefaultBridges = defaultBridges
|
||||
}
|
||||
@@ -58,6 +62,10 @@ func validateHypervisorConfig(conf *HypervisorConfig) error {
|
||||
conf.DefaultMaxVCPUs = defaultMaxVCPUs
|
||||
}
|
||||
|
||||
if numNUMA := conf.NumNUMA(); numNUMA > 1 {
|
||||
conf.DefaultMaxVCPUs -= conf.DefaultMaxVCPUs % numNUMA
|
||||
}
|
||||
|
||||
if conf.Msize9p == 0 && conf.SharedFS != config.VirtioFS {
|
||||
conf.Msize9p = defaultMsize9p
|
||||
}
|
||||
|
||||
@@ -113,7 +113,7 @@ func (m *mockHypervisor) Disconnect(ctx context.Context) {
|
||||
|
||||
func (m *mockHypervisor) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
|
||||
vcpus := map[int]int{0: os.Getpid()}
|
||||
return VcpuThreadIDs{vcpus}, nil
|
||||
return VcpuThreadIDs{vcpus, nil}, nil
|
||||
}
|
||||
|
||||
func (m *mockHypervisor) Cleanup(ctx context.Context) error {
|
||||
|
||||
@@ -155,6 +155,9 @@ const (
|
||||
// DefaultMemory is a sandbox annotation for the memory assigned for a VM by the hypervisor.
|
||||
DefaultMemory = kataAnnotHypervisorPrefix + "default_memory"
|
||||
|
||||
// DefaultMaxMemory is a sandbox annotation for the maximum memory assigned for a VM by the hypervisor.
|
||||
DefaultMaxMemory = kataAnnotHypervisorPrefix + "default_maxmemory"
|
||||
|
||||
// MemSlots is a sandbox annotation to specify the memory slots assigned to the VM by the hypervisor.
|
||||
MemSlots = kataAnnotHypervisorPrefix + "memory_slots"
|
||||
|
||||
@@ -182,6 +185,9 @@ const (
|
||||
// FileBackedMemRootDir is a sandbox annotation to specify file based memory backend root directory
|
||||
FileBackedMemRootDir = kataAnnotHypervisorPrefix + "file_mem_backend"
|
||||
|
||||
// NUMAMapping is a sandbox annotation that specifies mapping VM NUMA nodes to host NUMA nodes.
|
||||
NUMAMapping = kataAnnotHypervisorPrefix + "numa_mapping"
|
||||
|
||||
//
|
||||
// Shared File System related annotations
|
||||
//
|
||||
|
||||
@@ -2612,6 +2612,36 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOff
|
||||
return memory
|
||||
}
|
||||
|
||||
func genericNUMAMemoryModules(memoryMb, memoryAlign uint64, numaNodes []types.NUMANode) []govmmQemu.MemoryModule {
|
||||
if len(numaNodes) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
memoryModules := make([]govmmQemu.MemoryModule, 0, len(numaNodes))
|
||||
|
||||
// Divide memory among NUMA nodes.
|
||||
memoryPerNode := memoryMb / uint64(len(numaNodes))
|
||||
memoryPerNode -= memoryPerNode % memoryAlign
|
||||
|
||||
// First NUMA node gets more if memory is not divide evenly.
|
||||
moduleSize := memoryMb - memoryPerNode*uint64(len(numaNodes)-1)
|
||||
|
||||
for nodeId, numaNode := range numaNodes {
|
||||
memoryModules = append(memoryModules, govmmQemu.MemoryModule{
|
||||
Size: fmt.Sprintf("%dM", moduleSize),
|
||||
NodeId: uint32(nodeId),
|
||||
HostNodes: numaNode.HostNodes,
|
||||
MemoryPolicy: "interleave",
|
||||
})
|
||||
moduleSize = memoryPerNode
|
||||
if moduleSize == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return memoryModules
|
||||
}
|
||||
|
||||
// genericAppendPCIeRootPort appends to devices the given pcie-root-port
|
||||
func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
|
||||
var (
|
||||
@@ -2736,9 +2766,11 @@ func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
|
||||
}
|
||||
|
||||
tid.vcpus = make(map[int]int, len(cpuInfos))
|
||||
tid.vcpuToNodeId = make(map[int]uint32, len(cpuInfos))
|
||||
for _, i := range cpuInfos {
|
||||
if i.ThreadID > 0 {
|
||||
tid.vcpus[i.CPUIndex] = i.ThreadID
|
||||
tid.vcpuToNodeId[i.CPUIndex] = uint32(i.Props.Node)
|
||||
}
|
||||
}
|
||||
return tid, nil
|
||||
|
||||
@@ -119,6 +119,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
|
||||
qemuArchBase: qemuArchBase{
|
||||
qemuMachine: *mp,
|
||||
qemuExePath: defaultQemuPath,
|
||||
numaNodes: config.NUMANodes,
|
||||
memoryOffset: config.MemOffset,
|
||||
kernelParamsNonDebug: kernelParamsNonDebug,
|
||||
kernelParamsDebug: kernelParamsDebug,
|
||||
@@ -201,7 +202,9 @@ func (q *qemuAmd64) cpuModel() string {
|
||||
}
|
||||
|
||||
func (q *qemuAmd64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory {
|
||||
return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
|
||||
memory := genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
|
||||
memory.MemoryModules = genericNUMAMemoryModules(memoryMb, 4, q.numaNodes)
|
||||
return memory
|
||||
}
|
||||
|
||||
// Is Memory Hotplug supported by this architecture/machine type combination?
|
||||
|
||||
@@ -192,6 +192,7 @@ type qemuArchBase struct {
|
||||
kernelParamsDebug []Param
|
||||
kernelParams []Param
|
||||
Bridges []types.Bridge
|
||||
numaNodes []types.NUMANode
|
||||
memoryOffset uint64
|
||||
networkIndex int
|
||||
// Exclude from lint checking for it is ultimately only used in architecture-specific code
|
||||
@@ -330,12 +331,20 @@ func (q *qemuArchBase) bridges(number uint32) {
|
||||
}
|
||||
|
||||
func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP {
|
||||
numNUMA := uint32(len(q.numaNodes))
|
||||
|
||||
numSockets := numNUMA
|
||||
if numSockets == 0 {
|
||||
numSockets = maxvcpus
|
||||
}
|
||||
|
||||
smp := govmmQemu.SMP{
|
||||
CPUs: vcpus,
|
||||
Sockets: maxvcpus,
|
||||
Cores: defaultCores,
|
||||
Sockets: numSockets,
|
||||
Cores: maxvcpus / numSockets / defaultThreads,
|
||||
Threads: defaultThreads,
|
||||
MaxCPUs: maxvcpus,
|
||||
NumNUMA: numNUMA,
|
||||
}
|
||||
|
||||
return smp
|
||||
|
||||
@@ -220,5 +220,7 @@ func (q *qemuArm64) appendProtectionDevice(devices []govmmQemu.Device, firmware,
|
||||
}
|
||||
|
||||
func (q *qemuArm64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory {
|
||||
return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
|
||||
memory := genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
|
||||
memory.MemoryModules = genericNUMAMemoryModules(memoryMb, 4, q.numaNodes)
|
||||
return memory
|
||||
}
|
||||
|
||||
@@ -2791,11 +2791,12 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
|
||||
// is set to true. Then it fetches sandbox's number of vCPU threads
|
||||
// and number of CPUs in CPUSet. If the two are equal, each vCPU thread
|
||||
// is then pinned to one fixed CPU in CPUSet.
|
||||
// For enforcing NUMA topology vCPU threads are pinned to related host CPUs.
|
||||
func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
|
||||
if s.config == nil {
|
||||
return fmt.Errorf("no sandbox config found")
|
||||
}
|
||||
if !s.config.EnableVCPUsPinning {
|
||||
if !s.config.EnableVCPUsPinning && s.config.HypervisorConfig.NumNUMA() == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -2814,23 +2815,67 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
|
||||
}
|
||||
cpuSetSlice := cpuSet.ToSlice()
|
||||
|
||||
// check if vCPU thread numbers and CPU numbers are equal
|
||||
numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
|
||||
// if not equal, we should reset threads scheduling to random pattern
|
||||
if numVCPUs != numCPUs {
|
||||
if s.isVCPUsPinningOn {
|
||||
s.isVCPUsPinningOn = false
|
||||
return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
|
||||
// build NUMA topology mapping, or fake single node if NUMA is not enabled.
|
||||
numNodes := max(s.config.HypervisorConfig.NumNUMA(), 1)
|
||||
|
||||
numaNodeVCPUs := make([][]int, numNodes)
|
||||
for vcpuId := range vCPUThreadsMap.vcpus {
|
||||
nodeId, ok := vCPUThreadsMap.vcpuToNodeId[vcpuId]
|
||||
if !ok || nodeId > numNodes {
|
||||
nodeId = 0
|
||||
}
|
||||
return nil
|
||||
numaNodeVCPUs[nodeId] = append(numaNodeVCPUs[nodeId], vcpuId)
|
||||
}
|
||||
// if equal, we can use vCPU thread pinning
|
||||
for i, tid := range vCPUThreadsMap.vcpus {
|
||||
if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil {
|
||||
if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
|
||||
return err
|
||||
|
||||
numaNodeCPUs := make([][]int, numNodes)
|
||||
numaNodeCPUs[0] = cpuSetSlice
|
||||
for i, numaNode := range s.config.HypervisorConfig.NUMANodes {
|
||||
nodeHostCPUs, err := cpuset.Parse(numaNode.HostCPUs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse NUMA CPUSet string: %v", err)
|
||||
}
|
||||
if !cpuSet.IsEmpty() {
|
||||
nodeHostCPUs = cpuSet.Intersection(nodeHostCPUs)
|
||||
}
|
||||
numaNodeCPUs[i] = nodeHostCPUs.ToSlice()
|
||||
}
|
||||
|
||||
// check if vCPU threads have enough host CPUs in each NUMA node
|
||||
// if not enough, we should reset threads affinity.
|
||||
for nodeId := range numaNodeVCPUs {
|
||||
numVCPUs := len(numaNodeVCPUs[nodeId])
|
||||
numCPUs := len(numaNodeCPUs[nodeId])
|
||||
|
||||
// Not enough host CPUs for the number of vCPUs in this NUMA node.
|
||||
// Two cases trigger a reset:
|
||||
// 1) Pinning is enabled in config but the counts differ.
|
||||
// 2) Regardless of config, vCPUs exceed available CPUs.
|
||||
insufficientCPUs := numVCPUs > numCPUs
|
||||
pinningMismatch := s.config.EnableVCPUsPinning && (numVCPUs != numCPUs)
|
||||
if pinningMismatch || insufficientCPUs {
|
||||
if s.isVCPUsPinningOn {
|
||||
s.isVCPUsPinningOn = false
|
||||
return s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice)
|
||||
}
|
||||
virtLog.Warningf("cannot pin vcpus in vm numa node %d", nodeId)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
for nodeId := range numaNodeVCPUs {
|
||||
nodeCpuSetSlice := numaNodeCPUs[nodeId]
|
||||
for i, vcpuId := range numaNodeVCPUs[nodeId] {
|
||||
tid := vCPUThreadsMap.vcpus[vcpuId]
|
||||
affinity := nodeCpuSetSlice
|
||||
if s.config.EnableVCPUsPinning {
|
||||
affinity = affinity[i : i+1]
|
||||
}
|
||||
if err := resCtrl.SetThreadAffinity(tid, affinity); err != nil {
|
||||
if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("failed to set vcpu thread %d cpu affinity to %v: %v", tid, affinity, err)
|
||||
}
|
||||
return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d: %v", tid, cpuSetSlice[i], err)
|
||||
}
|
||||
}
|
||||
s.isVCPUsPinningOn = true
|
||||
|
||||
@@ -342,3 +342,9 @@ type Resources struct {
|
||||
Memory uint
|
||||
MemorySlots uint8
|
||||
}
|
||||
|
||||
// NUMANode describes one VM NUMA node in terms of the host NUMA nodes and
// host CPUs it is mapped to.
type NUMANode struct {
	// HostNodes is the list of host NUMA nodes backing this VM node.
	HostNodes string

	// HostCPUs is the list of host CPUs backing this VM node.
	HostCPUs string
}
|
||||
|
||||
@@ -19,6 +19,9 @@ import (
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
||||
)
|
||||
|
||||
var ioctlFunc = Ioctl
|
||||
@@ -197,3 +200,71 @@ func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) b
|
||||
|
||||
return pidRunning
|
||||
}
|
||||
|
||||
func getHostNUMANodes() ([]int, error) {
|
||||
data, err := os.ReadFile("/sys/devices/system/node/online")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nodes, err := cpuset.Parse(strings.TrimSuffix(string(data), "\n"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return nodes.ToSlice(), nil
|
||||
}
|
||||
|
||||
// getHostNUMANodeCPUs returns the contents of the cpulist file of host NUMA
// node nodeId under /sys/devices/system/node, without the trailing newline.
func getHostNUMANodeCPUs(nodeId int) (string, error) {
	raw, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%v/cpulist", nodeId))
	if err != nil {
		return "", err
	}

	cpuList := string(raw)
	return strings.TrimSuffix(cpuList, "\n"), nil
}
|
||||
|
||||
// GetNUMANodes constructs VM NUMA nodes mapping to host NUMA nodes and host CPUs.
|
||||
func GetNUMANodes(numaMapping []string) ([]types.NUMANode, error) {
|
||||
// Add VM NUMA node for each specified subsets of host NUMA nodes.
|
||||
if numNUMA := len(numaMapping); numNUMA > 0 {
|
||||
numaNodes := make([]types.NUMANode, numNUMA)
|
||||
for i, hostNodes := range numaMapping {
|
||||
hostNodeIds, err := cpuset.Parse(hostNodes)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
numaNodes[i].HostNodes = hostNodes
|
||||
for _, nodeId := range hostNodeIds.ToSlice() {
|
||||
cpus, err := getHostNUMANodeCPUs(nodeId)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if numaNodes[i].HostCPUs != "" {
|
||||
numaNodes[i].HostCPUs += ","
|
||||
}
|
||||
numaNodes[i].HostCPUs += cpus
|
||||
}
|
||||
}
|
||||
return numaNodes, nil
|
||||
}
|
||||
|
||||
// Add VM NUMA node for each host NUMA node.
|
||||
nodeIds, err := getHostNUMANodes()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(nodeIds) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
numaNodes := make([]types.NUMANode, len(nodeIds))
|
||||
for i, nodeId := range nodeIds {
|
||||
cpus, err := getHostNUMANodeCPUs(nodeId)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
numaNodes[i].HostNodes = fmt.Sprintf("%d", nodeId)
|
||||
numaNodes[i].HostCPUs = cpus
|
||||
}
|
||||
|
||||
return numaNodes, nil
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ CONFIG_X86_INTEL_PSTATE=y
|
||||
# Firecracker needs this to support `vcpu_count`
|
||||
CONFIG_X86_MPPARSE=y
|
||||
|
||||
CONFIG_X86_64_ACPI_NUMA=y
|
||||
|
||||
CONFIG_ACPI_CPU_FREQ_PSS=y
|
||||
CONFIG_ACPI_HOTPLUG_IOAPIC=y
|
||||
CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
173
|
||||
|
||||
|
||||
@@ -450,6 +450,9 @@ generate_qemu_options() {
|
||||
qemu_options+=(functionality:--enable-cap-ng)
|
||||
qemu_options+=(functionality:--enable-seccomp)
|
||||
|
||||
# Support NUMA topology
|
||||
qemu_options+=(functionality:--enable-numa)
|
||||
|
||||
# AVX2 is enabled by default by x86_64, make sure it's enabled only
|
||||
# for that architecture
|
||||
if ! gt_eq "${qemu_version}" "10.1.0" ; then
|
||||
|
||||
Reference in New Issue
Block a user