runtime: Add NUMA-aware vCPU pinning and cpuset.mems forwarding

Make checkVCPUsPinning() NUMA-aware: when GuestNUMANodes are configured, vCPU threads are pinned to host CPUs belonging to the same NUMA node as the vCPU's guest NUMA node assignment via checkVCPUsPinningNUMA(), preserving memory locality. vCPUs are distributed proportionally across NUMA nodes, matching the distribution in buildNUMATopology(). Stop unconditionally stripping cpuset.mems in constrainGRPCSpec() and container update(). When multi-NUMA is configured, translate host NUMA node IDs to guest NUMA node IDs using translateHostMemsToGuest() before forwarding to the agent. This allows the agent to enforce NUMA-aware memory placement for containers. Filter guest NUMA nodes at VM creation time: before calling CreateVM(), prune GuestNUMANodes to only those whose HostCPUs intersect the sandbox cpuset. This avoids exposing fake NUMA topology to the guest when Kubernetes allocates CPUs from fewer nodes than the host has (e.g. all CPUs from node 0 on a 2-node host), improving memory locality and avoiding unnecessary cross-node memory traffic. Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com> Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-06-30 22:21:05 +00:00 · 2026-04-14 15:04:47 +02:00
parent d0d7deb262
commit 12e5985dbd
7 changed files with 303 additions and 16 deletions
--- a/src/runtime/virtcontainers/container.go
+++ b/src/runtime/virtcontainers/container.go
@@ -1742,12 +1742,17 @@ func (c *Container) update(ctx context.Context, resources specs.LinuxResources)
 		return err
 	}

-	// There currently isn't a notion of cpusets.cpus or mems being tracked
-	// inside of the guest. Make sure we clear these before asking agent to update
-	// the container's cgroups.
+	// Cpus/Mems in cgroup cpuset are host-relative; clear Cpus since vCPU
+	// numbering differs inside the guest. For Mems, translate host NUMA node
+	// IDs to guest node IDs when multi-NUMA is configured, otherwise clear.
 	if resources.CPU != nil {
-		resources.CPU.Mems = ""
 		resources.CPU.Cpus = ""
+		numaNodes := c.sandbox.config.HypervisorConfig.GuestNUMANodes
+		if len(numaNodes) > 1 && resources.CPU.Mems != "" {
+			resources.CPU.Mems = translateHostMemsToGuest(resources.CPU.Mems, numaNodes)
+		} else {
+			resources.CPU.Mems = ""
+		}
 	}

 	return c.sandbox.agent.updateContainer(ctx, c.sandbox, *c, resources)
--- a/src/runtime/virtcontainers/kata_agent.go
+++ b/src/runtime/virtcontainers/kata_agent.go
@@ -34,6 +34,7 @@ import (
 	kataclient "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/client"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
 	vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
@@ -1018,7 +1019,36 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st
 	return nil
 }

-func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error {
+// translateHostMemsToGuest converts a host cpuset.mems string (e.g. "0,2")
+// into guest NUMA node IDs. Each guest NUMA node index maps to a set of host
+// nodes via GuestNUMANode.HostNodes. If a host node from `mems` appears in
+// a GuestNUMANode's HostNodes, the corresponding guest node index is included.
+func translateHostMemsToGuest(hostMems string, numaNodes []types.GuestNUMANode) string {
+	hostSet, err := cpuset.Parse(hostMems)
+	if err != nil {
+		return ""
+	}
+	hostSlice := hostSet.ToSlice()
+	var guestNodes []int
+	for guestIdx, gn := range numaNodes {
+		nodeSet, err := cpuset.Parse(gn.HostNodes)
+		if err != nil {
+			continue
+		}
+		for _, hostNode := range hostSlice {
+			if nodeSet.Contains(hostNode) {
+				guestNodes = append(guestNodes, guestIdx)
+				break
+			}
+		}
+	}
+	if len(guestNodes) == 0 {
+		return ""
+	}
+	return cpuset.NewCPUSet(guestNodes...).String()
+}
+
+func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool, numaNodes []types.GuestNUMANode) error {
 	// Disable Hooks since they have been handled on the host and there is
 	// no reason to send them to the agent. It would make no sense to try
 	// to apply them on the guest.
@@ -1060,7 +1090,6 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis
 		}
 	}

-	// By now only CPU constraints are supported
 	// Issue: https://github.com/kata-containers/runtime/issues/158
 	// Issue: https://github.com/kata-containers/runtime/issues/204
 	grpcSpec.Linux.Resources.Devices = nil
@@ -1069,7 +1098,12 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis
 	grpcSpec.Linux.Resources.Network = nil
 	if grpcSpec.Linux.Resources.CPU != nil {
 		grpcSpec.Linux.Resources.CPU.Cpus = ""
-		grpcSpec.Linux.Resources.CPU.Mems = ""
+		if len(numaNodes) > 1 && grpcSpec.Linux.Resources.CPU.Mems != "" {
+			guestMems := translateHostMemsToGuest(grpcSpec.Linux.Resources.CPU.Mems, numaNodes)
+			grpcSpec.Linux.Resources.CPU.Mems = guestMems
+		} else {
+			grpcSpec.Linux.Resources.CPU.Mems = ""
+		}
 	}

 	// Disable network and time namespaces since they are handled on the host
@@ -1495,7 +1529,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co

 	// We need to constrain the spec to make sure we're not
 	// passing irrelevant information to the agent.
-	err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel)
+	err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel, sandbox.config.HypervisorConfig.GuestNUMANodes)
 	if err != nil {
 		return nil, err
 	}
--- a/src/runtime/virtcontainers/kata_agent_test.go
+++ b/src/runtime/virtcontainers/kata_agent_test.go
@@ -638,7 +638,7 @@ func TestConstrainGRPCSpec(t *testing.T) {
 	}

 	k := kataAgent{}
-	k.constrainGRPCSpec(g, true, true, "", true)
+	k.constrainGRPCSpec(g, true, true, "", true, nil)

 	// Check nil fields
 	assert.Nil(g.Hooks)
@@ -1370,3 +1370,51 @@ func TestKataAgentCreateContainerVFIODevices(t *testing.T) {
 		})
 	}
 }
+
+func TestTranslateHostMemsToGuest(t *testing.T) {
+	assert := assert.New(t)
+
+	numaNodes := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-3"},
+		{HostNodes: "1", HostCPUs: "4-7"},
+	}
+
+	result := translateHostMemsToGuest("0", numaNodes)
+	assert.Equal("0", result)
+
+	result = translateHostMemsToGuest("1", numaNodes)
+	assert.Equal("1", result)
+
+	result = translateHostMemsToGuest("0-1", numaNodes)
+	assert.Equal("0-1", result)
+
+	result = translateHostMemsToGuest("0,1", numaNodes)
+	assert.Equal("0-1", result)
+
+	result = translateHostMemsToGuest("42", numaNodes)
+	assert.Equal("", result)
+
+	result = translateHostMemsToGuest("invalid", numaNodes)
+	assert.Equal("", result)
+
+	result = translateHostMemsToGuest("", numaNodes)
+	assert.Equal("", result)
+}
+
+func TestTranslateHostMemsToGuestRangeNodes(t *testing.T) {
+	assert := assert.New(t)
+
+	numaNodes := []types.GuestNUMANode{
+		{HostNodes: "0-1", HostCPUs: "0-7"},
+		{HostNodes: "2-3", HostCPUs: "8-15"},
+	}
+
+	result := translateHostMemsToGuest("1", numaNodes)
+	assert.Equal("0", result)
+
+	result = translateHostMemsToGuest("2", numaNodes)
+	assert.Equal("1", result)
+
+	result = translateHostMemsToGuest("0,3", numaNodes)
+	assert.Equal("0-1", result)
+}
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -2961,9 +2961,26 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {

 // checkVCPUsPinning is used to support CPUSet mode of kata container.
 // CPUSet mode is on when Sandbox.HypervisorConfig.EnableVCPUsPinning
-// is set to true. Then it fetches sandbox's number of vCPU threads
-// and number of CPUs in CPUSet. If the two are equal, each vCPU thread
-// is then pinned to one fixed CPU in CPUSet.
+// is set to true.
+//
+// When NUMA topology is configured (GuestNUMANodes is non-empty), vCPU
+// threads are pinned to host CPUs belonging to the same host NUMA node
+// as the vCPU's assigned guest NUMA node, preserving memory locality.
+// vCPUs are distributed proportionally across nodes and each vCPU is
+// pinned round-robin to the host CPUs within its NUMA node; the 1:1
+// count equality check does not apply.
+//
+// This is true for both multi-node sandboxes and right-sized
+// single-node sandboxes: when buildNUMATopology()/maybeRightSizeAutoNUMA
+// collapses the topology to one node, that single node still carries a
+// meaningful HostCPUs subset (the CPUs of the chosen host NUMA node),
+// and pinning to that subset is what makes right-sizing actually deliver
+// host-thread locality, not just guest-topology locality.
+//
+// In the non-NUMA path (GuestNUMANodes is empty, e.g. enable_numa=false),
+// it fetches the sandbox's number of vCPU threads and number of CPUs in
+// CPUSet. If the two are equal, each vCPU thread is pinned 1:1 to the
+// CPUs in CPUSet; otherwise pinning is skipped.
 func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	if s.config == nil {
 		return fmt.Errorf("no sandbox config found")
@@ -2972,7 +2989,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 		return nil
 	}

-	// fetch vCPU thread ids and CPUSet
 	vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx)
 	if err != nil {
 		return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err)
@@ -2987,9 +3003,42 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	}
 	cpuSetSlice := cpuSet.ToSlice()

-	// check if vCPU thread numbers and CPU numbers are equal
+	numaNodes := s.config.HypervisorConfig.GuestNUMANodes
+
+	if len(cpuSetSlice) == 0 {
+		if len(numaNodes) >= 1 {
+			// No cpuset constraint (e.g. ctr without k8s, or a Burstable
+			// pod with cpuManagerPolicy=none). Build an effective cpuset
+			// from the NUMA nodes' HostCPUs so pinning works using the
+			// (possibly right-sized) host NUMA topology. Even a single
+			// NUMA node here meaningfully constrains pinning to that
+			// node's host CPUs.
+			for _, gn := range numaNodes {
+				hostCPUs, err := cpuset.Parse(gn.HostCPUs)
+				if err != nil {
+					continue
+				}
+				cpuSet = cpuSet.Union(hostCPUs)
+			}
+			cpuSetSlice = cpuSet.ToSlice()
+			if len(cpuSetSlice) == 0 {
+				s.Logger().Warn("sandbox CPUSet is empty and cannot derive from NUMA HostCPUs; skipping vCPU pinning")
+				s.isVCPUsPinningOn = false
+				return nil
+			}
+			s.Logger().WithField("effective-cpuset", cpuSet.String()).Debug("derived cpuset from NUMA HostCPUs for pinning")
+		} else {
+			s.Logger().Warn("sandbox CPUSet is empty; skipping vCPU pinning")
+			s.isVCPUsPinningOn = false
+			return nil
+		}
+	}
+
+	if len(numaNodes) >= 1 {
+		return s.checkVCPUsPinningNUMA(ctx, vCPUThreadsMap, numaNodes, cpuSetSlice)
+	}
+
 	numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice)
-	// if not equal, we should reset threads scheduling to random pattern
 	if numVCPUs != numCPUs {
 		if s.isVCPUsPinningOn {
 			s.isVCPUsPinningOn = false
@@ -2997,7 +3046,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 		}
 		return nil
 	}
-	// if equal, we can use vCPU thread pinning
 	for i, tid := range vCPUThreadsMap.vcpus {
 		if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil {
 			if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
@@ -3010,6 +3058,68 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
 	return nil
 }

+// checkVCPUsPinningNUMA pins vCPU threads to host CPUs that belong to the
+// same NUMA node as the vCPU's guest NUMA node assignment. vCPUs are
+// distributed proportionally to the host CPU count per NUMA node
+// (matching buildNUMATopology). It handles any non-empty numaNodes
+// slice — including the right-sized single-node case, where every vCPU
+// is pinned within the single chosen host NUMA node's CPU set.
+func (s *Sandbox) checkVCPUsPinningNUMA(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, numaNodes []types.GuestNUMANode, cpuSetSlice []int) error {
+	numVCPUs := uint32(len(vCPUThreadsMap.vcpus))
+	numNodes := uint32(len(numaNodes))
+	if numVCPUs < numNodes {
+		return fmt.Errorf("number of vCPUs (%d) must be >= NUMA node count (%d) for NUMA pinning", numVCPUs, numNodes)
+	}
+
+	vcpusPerNode, err := utils.DistributeVCPUsProportionally(numaNodes, numVCPUs)
+	if err != nil {
+		return fmt.Errorf("failed to compute NUMA vCPU distribution for pinning: %v", err)
+	}
+
+	cpuSetAll := cpuset.NewCPUSet(cpuSetSlice...)
+
+	var cpuOffset uint32
+	for i, gn := range numaNodes {
+		hostCPUs, err := cpuset.Parse(gn.HostCPUs)
+		if err != nil {
+			return fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %v", i, err)
+		}
+		allowedCPUs := hostCPUs.Intersection(cpuSetAll).ToSlice()
+		if len(allowedCPUs) == 0 {
+			s.Logger().WithFields(logrus.Fields{
+				"numa-node":    i,
+				"host-cpus":    gn.HostCPUs,
+				"sandbox-cpus": cpuSetSlice,
+			}).Warn("NUMA node HostCPUs do not intersect sandbox CPUSet; pinning vCPUs to full cpuset for this node")
+			allowedCPUs = cpuSetSlice
+		}
+
+		startVCPU := cpuOffset
+		endVCPU := startVCPU + vcpusPerNode[i]
+		cpuOffset = endVCPU
+
+		for vcpuIdx := startVCPU; vcpuIdx < endVCPU; vcpuIdx++ {
+			tid, ok := vCPUThreadsMap.vcpus[int(vcpuIdx)]
+			if !ok {
+				if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
+					return err
+				}
+				return fmt.Errorf("missing vcpu thread id for vcpu index %d", vcpuIdx)
+			}
+			pinIdx := int(vcpuIdx-startVCPU) % len(allowedCPUs)
+			if err := resCtrl.SetThreadAffinity(tid, allowedCPUs[pinIdx:pinIdx+1]); err != nil {
+				if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
+					return err
+				}
+				return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d (NUMA node %d): %v", tid, allowedCPUs[pinIdx], i, err)
+			}
+		}
+	}
+
+	s.isVCPUsPinningOn = true
+	return nil
+}
+
 // resetVCPUsPinning cancels current pinning and restores default random vCPU threads scheduling
 func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error {
 	for _, tid := range vCPUThreadsMap.vcpus {
--- a/src/runtime/virtcontainers/sandbox_test.go
+++ b/src/runtime/virtcontainers/sandbox_test.go
@@ -1679,3 +1679,29 @@ func TestSandboxHugepageLimit(t *testing.T) {
 	err = s.updateResources(context.Background())
 	assert.NoError(t, err)
 }
+
+func TestCheckVCPUsPinningNUMATooFewVCPUs(t *testing.T) {
+	assert := assert.New(t)
+	s := &Sandbox{}
+	vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100}}
+	numaNodes := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-3"},
+		{HostNodes: "1", HostCPUs: "4-7"},
+	}
+	err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7})
+	assert.Error(err)
+	assert.Contains(err.Error(), "must be >= NUMA node count")
+}
+
+func TestCheckVCPUsPinningNUMABadHostCPUs(t *testing.T) {
+	assert := assert.New(t)
+	s := &Sandbox{}
+	vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100, 1: 101, 2: 102, 3: 103}}
+	numaNodes := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "not-valid"},
+		{HostNodes: "1", HostCPUs: "4-7"},
+	}
+	err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7})
+	assert.Error(err)
+	assert.Contains(err.Error(), "failed to parse HostCPUs")
+}
--- a/src/runtime/virtcontainers/utils/utils.go
+++ b/src/runtime/virtcontainers/utils/utils.go
@@ -625,6 +625,29 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) {
 	return numaNodes, nil
 }

+// FilterNUMANodesByCPUSet returns only those guest NUMA nodes whose HostCPUs
+// intersect with the given sandbox cpuset. If sandboxCPUs is empty (size 0),
+// no filtering is applied and the original slice is returned unchanged.
+func FilterNUMANodesByCPUSet(nodes []types.GuestNUMANode, sandboxCPUs cpuset.CPUSet) []types.GuestNUMANode {
+	if sandboxCPUs.Size() == 0 {
+		return nodes
+	}
+	var filtered []types.GuestNUMANode
+	for _, n := range nodes {
+		hostCPUs, err := cpuset.Parse(n.HostCPUs)
+		if err != nil {
+			continue
+		}
+		if hostCPUs.Intersection(sandboxCPUs).Size() > 0 {
+			filtered = append(filtered, n)
+		}
+	}
+	if len(filtered) == 0 {
+		return nodes
+	}
+	return filtered
+}
+
 // NUMADistEntry represents a single NUMA distance measurement between two nodes.
 type NUMADistEntry struct {
 	Src uint32
--- a/src/runtime/virtcontainers/utils/utils_test.go
+++ b/src/runtime/virtcontainers/utils/utils_test.go
@@ -20,6 +20,7 @@ import (
 	"github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"

+	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 )

@@ -815,3 +816,43 @@ func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) {
 	assert.Error(err)
 	assert.Contains(err.Error(), "must be >= NUMA node count")
 }
+
+func TestFilterNUMANodesByCPUSet(t *testing.T) {
+	assert := assert.New(t)
+
+	nodes := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-55,112-167"},
+		{HostNodes: "1", HostCPUs: "56-111,168-223"},
+	}
+
+	// Sandbox cpuset only from node 0 -> should return 1 node
+	sandboxCPUs, _ := cpuset.Parse("1-40,113-152")
+	filtered := FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
+	assert.Len(filtered, 1)
+	assert.Equal("0", filtered[0].HostNodes)
+
+	// Sandbox cpuset from both nodes -> should return 2 nodes
+	sandboxCPUs, _ = cpuset.Parse("1-40,56-80")
+	filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
+	assert.Len(filtered, 2)
+
+	// Sandbox cpuset only from node 1 -> should return 1 node
+	sandboxCPUs, _ = cpuset.Parse("60-70,170-180")
+	filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs)
+	assert.Len(filtered, 1)
+	assert.Equal("1", filtered[0].HostNodes)
+
+	// Empty cpuset -> no filtering, return all
+	emptyCPUs := cpuset.NewCPUSet()
+	filtered = FilterNUMANodesByCPUSet(nodes, emptyCPUs)
+	assert.Len(filtered, 2)
+
+	// Single-node host (1 NUMA node) -> returns 1 regardless
+	singleNode := []types.GuestNUMANode{
+		{HostNodes: "0", HostCPUs: "0-7"},
+	}
+	sandboxCPUs, _ = cpuset.Parse("0-3")
+	filtered = FilterNUMANodesByCPUSet(singleNode, sandboxCPUs)
+	assert.Len(filtered, 1)
+	assert.Equal("0", filtered[0].HostNodes)
+}