runtime: Add NUMA-aware vCPU pinning and cpuset.mems forwarding

Make checkVCPUsPinning() NUMA-aware: when GuestNUMANodes are configured,
vCPU threads are pinned to host CPUs belonging to the same NUMA node as
the vCPU's guest NUMA node assignment via checkVCPUsPinningNUMA(),
preserving memory locality. vCPUs are distributed proportionally across
NUMA nodes, matching the distribution in buildNUMATopology().

Stop unconditionally stripping cpuset.mems in constrainGRPCSpec() and
container update(). When multi-NUMA is configured, translate host NUMA
node IDs to guest NUMA node IDs using translateHostMemsToGuest() before
forwarding to the agent. This allows the agent to enforce NUMA-aware
memory placement for containers.

Filter guest NUMA nodes at VM creation time: before calling CreateVM(),
prune GuestNUMANodes to only those whose HostCPUs intersect the sandbox
cpuset. This avoids exposing fake NUMA topology to the guest when
Kubernetes allocates CPUs from fewer nodes than the host has (e.g. all
CPUs from node 0 on a 2-node host), improving memory locality and
avoiding unnecessary cross-node memory traffic.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
Fabiano Fidêncio
2026-04-14 15:04:47 +02:00
committed by Fabiano Fidêncio
parent d0d7deb262
commit 12e5985dbd
7 changed files with 303 additions and 16 deletions

View File

@@ -1742,12 +1742,17 @@ func (c *Container) update(ctx context.Context, resources specs.LinuxResources)
return err
}
// There currently isn't a notion of cpusets.cpus or mems being tracked
// inside of the guest. Make sure we clear these before asking agent to update
// the container's cgroups.
// Cpus/Mems in cgroup cpuset are host-relative; clear Cpus since vCPU
// numbering differs inside the guest. For Mems, translate host NUMA node
// IDs to guest node IDs when multi-NUMA is configured, otherwise clear.
if resources.CPU != nil {
resources.CPU.Mems = ""
resources.CPU.Cpus = ""
numaNodes := c.sandbox.config.HypervisorConfig.GuestNUMANodes
if len(numaNodes) > 1 && resources.CPU.Mems != "" {
resources.CPU.Mems = translateHostMemsToGuest(resources.CPU.Mems, numaNodes)
} else {
resources.CPU.Mems = ""
}
}
return c.sandbox.agent.updateContainer(ctx, c.sandbox, *c, resources)

View File

@@ -34,6 +34,7 @@ import (
kataclient "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/client"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
@@ -1018,7 +1019,36 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st
return nil
}
func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error {
// translateHostMemsToGuest converts a host cpuset.mems string (e.g. "0,2")
// into guest NUMA node IDs. Each guest NUMA node index maps to a set of host
// nodes via GuestNUMANode.HostNodes. If a host node from `mems` appears in
// a GuestNUMANode's HostNodes, the corresponding guest node index is included.
func translateHostMemsToGuest(hostMems string, numaNodes []types.GuestNUMANode) string {
hostSet, err := cpuset.Parse(hostMems)
if err != nil {
return ""
}
hostSlice := hostSet.ToSlice()
var guestNodes []int
for guestIdx, gn := range numaNodes {
nodeSet, err := cpuset.Parse(gn.HostNodes)
if err != nil {
continue
}
for _, hostNode := range hostSlice {
if nodeSet.Contains(hostNode) {
guestNodes = append(guestNodes, guestIdx)
break
}
}
}
if len(guestNodes) == 0 {
return ""
}
return cpuset.NewCPUSet(guestNodes...).String()
}
func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool, numaNodes []types.GuestNUMANode) error {
// Disable Hooks since they have been handled on the host and there is
// no reason to send them to the agent. It would make no sense to try
// to apply them on the guest.
@@ -1060,7 +1090,6 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis
}
}
// By now only CPU constraints are supported
// Issue: https://github.com/kata-containers/runtime/issues/158
// Issue: https://github.com/kata-containers/runtime/issues/204
grpcSpec.Linux.Resources.Devices = nil
@@ -1069,7 +1098,12 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis
grpcSpec.Linux.Resources.Network = nil
if grpcSpec.Linux.Resources.CPU != nil {
grpcSpec.Linux.Resources.CPU.Cpus = ""
grpcSpec.Linux.Resources.CPU.Mems = ""
if len(numaNodes) > 1 && grpcSpec.Linux.Resources.CPU.Mems != "" {
guestMems := translateHostMemsToGuest(grpcSpec.Linux.Resources.CPU.Mems, numaNodes)
grpcSpec.Linux.Resources.CPU.Mems = guestMems
} else {
grpcSpec.Linux.Resources.CPU.Mems = ""
}
}
// Disable network and time namespaces since they are handled on the host
@@ -1495,7 +1529,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
// We need to constrain the spec to make sure we're not
// passing irrelevant information to the agent.
err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel)
err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel, sandbox.config.HypervisorConfig.GuestNUMANodes)
if err != nil {
return nil, err
}

View File

@@ -638,7 +638,7 @@ func TestConstrainGRPCSpec(t *testing.T) {
}
k := kataAgent{}
k.constrainGRPCSpec(g, true, true, "", true)
k.constrainGRPCSpec(g, true, true, "", true, nil)
// Check nil fields
assert.Nil(g.Hooks)
@@ -1370,3 +1370,51 @@ func TestKataAgentCreateContainerVFIODevices(t *testing.T) {
})
}
}
func TestTranslateHostMemsToGuest(t *testing.T) {
assert := assert.New(t)
numaNodes := []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-3"},
{HostNodes: "1", HostCPUs: "4-7"},
}
result := translateHostMemsToGuest("0", numaNodes)
assert.Equal("0", result)
result = translateHostMemsToGuest("1", numaNodes)
assert.Equal("1", result)
result = translateHostMemsToGuest("0-1", numaNodes)
assert.Equal("0-1", result)
result = translateHostMemsToGuest("0,1", numaNodes)
assert.Equal("0-1", result)
result = translateHostMemsToGuest("42", numaNodes)
assert.Equal("", result)
result = translateHostMemsToGuest("invalid", numaNodes)
assert.Equal("", result)
result = translateHostMemsToGuest("", numaNodes)
assert.Equal("", result)
}
func TestTranslateHostMemsToGuestRangeNodes(t *testing.T) {
assert := assert.New(t)
numaNodes := []types.GuestNUMANode{
{HostNodes: "0-1", HostCPUs: "0-7"},
{HostNodes: "2-3", HostCPUs: "8-15"},
}
result := translateHostMemsToGuest("1", numaNodes)
assert.Equal("0", result)
result = translateHostMemsToGuest("2", numaNodes)
assert.Equal("1", result)
result = translateHostMemsToGuest("0,3", numaNodes)
assert.Equal("0-1", result)
}

View File

@@ -2961,9 +2961,26 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
// checkVCPUsPinning is used to support CPUSet mode of kata container.
// CPUSet mode is on when Sandbox.HypervisorConfig.EnableVCPUsPinning
// is set to true. Then it fetches sandbox's number of vCPU threads
// and number of CPUs in CPUSet. If the two are equal, each vCPU thread
// is then pinned to one fixed CPU in CPUSet.
// is set to true.
//
// When NUMA topology is configured (GuestNUMANodes is non-empty), vCPU
// threads are pinned to host CPUs belonging to the same host NUMA node
// as the vCPU's assigned guest NUMA node, preserving memory locality.
// vCPUs are distributed proportionally across nodes and each vCPU is
// pinned round-robin to the host CPUs within its NUMA node; the 1:1
// count equality check does not apply.
//
// This is true for both multi-node sandboxes and right-sized
// single-node sandboxes: when buildNUMATopology()/maybeRightSizeAutoNUMA
// collapses the topology to one node, that single node still carries a
// meaningful HostCPUs subset (the CPUs of the chosen host NUMA node),
// and pinning to that subset is what makes right-sizing actually deliver
// host-thread locality, not just guest-topology locality.
//
// In the non-NUMA path (GuestNUMANodes is empty, e.g. enable_numa=false),
// it fetches the sandbox's number of vCPU threads and number of CPUs in
// CPUSet. If the two are equal, each vCPU thread is pinned 1:1 to the
// CPUs in CPUSet; otherwise pinning is skipped.
func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
if s.config == nil {
return fmt.Errorf("no sandbox config found")
@@ -2972,7 +2989,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
return nil
}
// fetch vCPU thread ids and CPUSet
vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx)
if err != nil {
return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err)
@@ -2987,9 +3003,42 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
}
cpuSetSlice := cpuSet.ToSlice()
// check if vCPU thread numbers and CPU numbers are equal
numaNodes := s.config.HypervisorConfig.GuestNUMANodes
if len(cpuSetSlice) == 0 {
if len(numaNodes) >= 1 {
// No cpuset constraint (e.g. ctr without k8s, or a Burstable
// pod with cpuManagerPolicy=none). Build an effective cpuset
// from the NUMA nodes' HostCPUs so pinning works using the
// (possibly right-sized) host NUMA topology. Even a single
// NUMA node here meaningfully constrains pinning to that
// node's host CPUs.
for _, gn := range numaNodes {
hostCPUs, err := cpuset.Parse(gn.HostCPUs)
if err != nil {
continue
}
cpuSet = cpuSet.Union(hostCPUs)
}
cpuSetSlice = cpuSet.ToSlice()
if len(cpuSetSlice) == 0 {
s.Logger().Warn("sandbox CPUSet is empty and cannot derive from NUMA HostCPUs; skipping vCPU pinning")
s.isVCPUsPinningOn = false
return nil
}
s.Logger().WithField("effective-cpuset", cpuSet.String()).Debug("derived cpuset from NUMA HostCPUs for pinning")
} else {
s.Logger().Warn("sandbox CPUSet is empty; skipping vCPU pinning")
s.isVCPUsPinningOn = false
return nil
}
}
if len(numaNodes) >= 1 {
return s.checkVCPUsPinningNUMA(ctx, vCPUThreadsMap, numaNodes, cpuSetSlice)
}
numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
// if not equal, we should reset threads scheduling to random pattern
if numVCPUs != numCPUs {
if s.isVCPUsPinningOn {
s.isVCPUsPinningOn = false
@@ -2997,7 +3046,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
}
return nil
}
// if equal, we can use vCPU thread pinning
for i, tid := range vCPUThreadsMap.vcpus {
if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil {
if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
@@ -3010,6 +3058,68 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
return nil
}
// checkVCPUsPinningNUMA pins vCPU threads to host CPUs that belong to the
// same NUMA node as the vCPU's guest NUMA node assignment. vCPUs are
// distributed proportionally to the host CPU count per NUMA node
// (matching buildNUMATopology). It handles any non-empty numaNodes
// slice — including the right-sized single-node case, where every vCPU
// is pinned within the single chosen host NUMA node's CPU set.
func (s *Sandbox) checkVCPUsPinningNUMA(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, numaNodes []types.GuestNUMANode, cpuSetSlice []int) error {
numVCPUs := uint32(len(vCPUThreadsMap.vcpus))
numNodes := uint32(len(numaNodes))
if numVCPUs < numNodes {
return fmt.Errorf("number of vCPUs (%d) must be >= NUMA node count (%d) for NUMA pinning", numVCPUs, numNodes)
}
vcpusPerNode, err := utils.DistributeVCPUsProportionally(numaNodes, numVCPUs)
if err != nil {
return fmt.Errorf("failed to compute NUMA vCPU distribution for pinning: %v", err)
}
cpuSetAll := cpuset.NewCPUSet(cpuSetSlice...)
var cpuOffset uint32
for i, gn := range numaNodes {
hostCPUs, err := cpuset.Parse(gn.HostCPUs)
if err != nil {
return fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %v", i, err)
}
allowedCPUs := hostCPUs.Intersection(cpuSetAll).ToSlice()
if len(allowedCPUs) == 0 {
s.Logger().WithFields(logrus.Fields{
"numa-node": i,
"host-cpus": gn.HostCPUs,
"sandbox-cpus": cpuSetSlice,
}).Warn("NUMA node HostCPUs do not intersect sandbox CPUSet; pinning vCPUs to full cpuset for this node")
allowedCPUs = cpuSetSlice
}
startVCPU := cpuOffset
endVCPU := startVCPU + vcpusPerNode[i]
cpuOffset = endVCPU
for vcpuIdx := startVCPU; vcpuIdx < endVCPU; vcpuIdx++ {
tid, ok := vCPUThreadsMap.vcpus[int(vcpuIdx)]
if !ok {
if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
return err
}
return fmt.Errorf("missing vcpu thread id for vcpu index %d", vcpuIdx)
}
pinIdx := int(vcpuIdx-startVCPU) % len(allowedCPUs)
if err := resCtrl.SetThreadAffinity(tid, allowedCPUs[pinIdx:pinIdx+1]); err != nil {
if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
return err
}
return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d (NUMA node %d): %v", tid, allowedCPUs[pinIdx], i, err)
}
}
}
s.isVCPUsPinningOn = true
return nil
}
// resetVCPUsPinning cancels current pinning and restores default random vCPU threads scheduling
func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error {
for _, tid := range vCPUThreadsMap.vcpus {

View File

@@ -1679,3 +1679,29 @@ func TestSandboxHugepageLimit(t *testing.T) {
err = s.updateResources(context.Background())
assert.NoError(t, err)
}
func TestCheckVCPUsPinningNUMATooFewVCPUs(t *testing.T) {
assert := assert.New(t)
s := &Sandbox{}
vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100}}
numaNodes := []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-3"},
{HostNodes: "1", HostCPUs: "4-7"},
}
err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7})
assert.Error(err)
assert.Contains(err.Error(), "must be >= NUMA node count")
}
func TestCheckVCPUsPinningNUMABadHostCPUs(t *testing.T) {
assert := assert.New(t)
s := &Sandbox{}
vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100, 1: 101, 2: 102, 3: 103}}
numaNodes := []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "not-valid"},
{HostNodes: "1", HostCPUs: "4-7"},
}
err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7})
assert.Error(err)
assert.Contains(err.Error(), "failed to parse HostCPUs")
}

View File

@@ -625,6 +625,29 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) {
return numaNodes, nil
}
// FilterNUMANodesByCPUSet returns only those guest NUMA nodes whose HostCPUs
// intersect with the given sandbox cpuset. If sandboxCPUs is empty (size 0),
// no filtering is applied and the original slice is returned unchanged.
func FilterNUMANodesByCPUSet(nodes []types.GuestNUMANode, sandboxCPUs cpuset.CPUSet) []types.GuestNUMANode {
if sandboxCPUs.Size() == 0 {
return nodes
}
var filtered []types.GuestNUMANode
for _, n := range nodes {
hostCPUs, err := cpuset.Parse(n.HostCPUs)
if err != nil {
continue
}
if hostCPUs.Intersection(sandboxCPUs).Size() > 0 {
filtered = append(filtered, n)
}
}
if len(filtered) == 0 {
return nodes
}
return filtered
}
// NUMADistEntry represents a single NUMA distance measurement between two nodes.
type NUMADistEntry struct {
Src uint32

View File

@@ -20,6 +20,7 @@ import (
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
)
@@ -815,3 +816,43 @@ func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) {
assert.Error(err)
assert.Contains(err.Error(), "must be >= NUMA node count")
}
func TestFilterNUMANodesByCPUSet(t *testing.T) {
assert := assert.New(t)
nodes := []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-55,112-167"},
{HostNodes: "1", HostCPUs: "56-111,168-223"},
}
// Sandbox cpuset only from node 0 -> should return 1 node
sandboxCPUs, _ := cpuset.Parse("1-40,113-152")
filtered := FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
assert.Len(filtered, 1)
assert.Equal("0", filtered[0].HostNodes)
// Sandbox cpuset from both nodes -> should return 2 nodes
sandboxCPUs, _ = cpuset.Parse("1-40,56-80")
filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
assert.Len(filtered, 2)
// Sandbox cpuset only from node 1 -> should return 1 node
sandboxCPUs, _ = cpuset.Parse("60-70,170-180")
filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
assert.Len(filtered, 1)
assert.Equal("1", filtered[0].HostNodes)
// Empty cpuset -> no filtering, return all
emptyCPUs := cpuset.NewCPUSet()
filtered = FilterNUMANodesByCPUSet(nodes, emptyCPUs)
assert.Len(filtered, 2)
// Single-node host (1 NUMA node) -> returns 1 regardless
singleNode := []types.GuestNUMANode{
{HostNodes: "0", HostCPUs: "0-7"},
}
sandboxCPUs, _ = cpuset.Parse("0-3")
filtered = FilterNUMANodesByCPUSet(singleNode, sandboxCPUs)
assert.Len(filtered, 1)
assert.Equal("0", filtered[0].HostNodes)
}