diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index dc96e3cf39..c3784712f3 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -1742,12 +1742,17 @@ func (c *Container) update(ctx context.Context, resources specs.LinuxResources) return err } - // There currently isn't a notion of cpusets.cpus or mems being tracked - // inside of the guest. Make sure we clear these before asking agent to update - // the container's cgroups. + // Cpus/Mems in cgroup cpuset are host-relative; clear Cpus since vCPU + // numbering differs inside the guest. For Mems, translate host NUMA node + // IDs to guest node IDs when multi-NUMA is configured, otherwise clear. if resources.CPU != nil { - resources.CPU.Mems = "" resources.CPU.Cpus = "" + numaNodes := c.sandbox.config.HypervisorConfig.GuestNUMANodes + if len(numaNodes) > 1 && resources.CPU.Mems != "" { + resources.CPU.Mems = translateHostMemsToGuest(resources.CPU.Mems, numaNodes) + } else { + resources.CPU.Mems = "" + } } return c.sandbox.agent.updateContainer(ctx, c.sandbox, *c, resources) diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 452c64f9ce..8b34cb246a 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -34,6 +34,7 @@ import ( kataclient "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/client" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" @@ -1018,7 +1019,36 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st return nil } -func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error { +// translateHostMemsToGuest converts a host cpuset.mems string (e.g. "0,2") +// into guest NUMA node IDs. Each guest NUMA node index maps to a set of host +// nodes via GuestNUMANode.HostNodes. If a host node from `mems` appears in +// a GuestNUMANode's HostNodes, the corresponding guest node index is included. +func translateHostMemsToGuest(hostMems string, numaNodes []types.GuestNUMANode) string { + hostSet, err := cpuset.Parse(hostMems) + if err != nil { + return "" + } + hostSlice := hostSet.ToSlice() + var guestNodes []int + for guestIdx, gn := range numaNodes { + nodeSet, err := cpuset.Parse(gn.HostNodes) + if err != nil { + continue + } + for _, hostNode := range hostSlice { + if nodeSet.Contains(hostNode) { + guestNodes = append(guestNodes, guestIdx) + break + } + } + } + if len(guestNodes) == 0 { + return "" + } + return cpuset.NewCPUSet(guestNodes...).String() +} + +func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool, numaNodes []types.GuestNUMANode) error { // Disable Hooks since they have been handled on the host and there is // no reason to send them to the agent. It would make no sense to try // to apply them on the guest. @@ -1060,7 +1090,6 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis } } - // By now only CPU constraints are supported // Issue: https://github.com/kata-containers/runtime/issues/158 // Issue: https://github.com/kata-containers/runtime/issues/204 grpcSpec.Linux.Resources.Devices = nil @@ -1069,7 +1098,12 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis grpcSpec.Linux.Resources.Network = nil if grpcSpec.Linux.Resources.CPU != nil { grpcSpec.Linux.Resources.CPU.Cpus = "" - grpcSpec.Linux.Resources.CPU.Mems = "" + if len(numaNodes) > 1 && grpcSpec.Linux.Resources.CPU.Mems != "" { + guestMems := translateHostMemsToGuest(grpcSpec.Linux.Resources.CPU.Mems, numaNodes) + grpcSpec.Linux.Resources.CPU.Mems = guestMems + } else { + grpcSpec.Linux.Resources.CPU.Mems = "" + } } // Disable network and time namespaces since they are handled on the host @@ -1495,7 +1529,7 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co // We need to constrain the spec to make sure we're not // passing irrelevant information to the agent. - err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel) + err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel, sandbox.config.HypervisorConfig.GuestNUMANodes) if err != nil { return nil, err } diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index 62bdd76eac..4b27f0c07e 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -638,7 +638,7 @@ func TestConstrainGRPCSpec(t *testing.T) { } k := kataAgent{} - k.constrainGRPCSpec(g, true, true, "", true) + k.constrainGRPCSpec(g, true, true, "", true, nil) // Check nil fields assert.Nil(g.Hooks) @@ -1370,3 +1370,51 @@ func TestKataAgentCreateContainerVFIODevices(t *testing.T) { }) } } + +func TestTranslateHostMemsToGuest(t *testing.T) { + assert := assert.New(t) + + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + + result := translateHostMemsToGuest("0", numaNodes) + assert.Equal("0", result) + + result = translateHostMemsToGuest("1", numaNodes) + assert.Equal("1", result) + + result = translateHostMemsToGuest("0-1", numaNodes) + assert.Equal("0-1", result) + + result = translateHostMemsToGuest("0,1", numaNodes) + assert.Equal("0-1", result) + + result = translateHostMemsToGuest("42", numaNodes) + assert.Equal("", result) + + result = translateHostMemsToGuest("invalid", numaNodes) + assert.Equal("", result) + + result = translateHostMemsToGuest("", numaNodes) + assert.Equal("", result) +} + +func TestTranslateHostMemsToGuestRangeNodes(t *testing.T) { + assert := assert.New(t) + + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0-1", HostCPUs: "0-7"}, + {HostNodes: "2-3", HostCPUs: "8-15"}, + } + + result := translateHostMemsToGuest("1", numaNodes) + assert.Equal("0", result) + + result = translateHostMemsToGuest("2", numaNodes) + assert.Equal("1", result) + + result = translateHostMemsToGuest("0,3", numaNodes) + assert.Equal("0-1", result) +} diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 6ceae42de9..0a236e32b8 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -2961,9 +2961,26 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error { // checkVCPUsPinning is used to support CPUSet mode of kata container. // CPUSet mode is on when Sandbox.HypervisorConfig.EnableVCPUsPinning -// is set to true. Then it fetches sandbox's number of vCPU threads -// and number of CPUs in CPUSet. If the two are equal, each vCPU thread -// is then pinned to one fixed CPU in CPUSet. +// is set to true. +// +// When NUMA topology is configured (GuestNUMANodes is non-empty), vCPU +// threads are pinned to host CPUs belonging to the same host NUMA node +// as the vCPU's assigned guest NUMA node, preserving memory locality. +// vCPUs are distributed proportionally across nodes and each vCPU is +// pinned round-robin to the host CPUs within its NUMA node; the 1:1 +// count equality check does not apply. +// +// This is true for both multi-node sandboxes and right-sized +// single-node sandboxes: when buildNUMATopology()/maybeRightSizeAutoNUMA +// collapses the topology to one node, that single node still carries a +// meaningful HostCPUs subset (the CPUs of the chosen host NUMA node), +// and pinning to that subset is what makes right-sizing actually deliver +// host-thread locality, not just guest-topology locality. +// +// In the non-NUMA path (GuestNUMANodes is empty, e.g. enable_numa=false), +// it fetches the sandbox's number of vCPU threads and number of CPUs in +// CPUSet. If the two are equal, each vCPU thread is pinned 1:1 to the +// CPUs in CPUSet; otherwise pinning is skipped. func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { if s.config == nil { return fmt.Errorf("no sandbox config found") @@ -2972,7 +2989,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { return nil } - // fetch vCPU thread ids and CPUSet vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx) if err != nil { return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %v", err) @@ -2987,9 +3003,42 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { } cpuSetSlice := cpuSet.ToSlice() - // check if vCPU thread numbers and CPU numbers are equal + numaNodes := s.config.HypervisorConfig.GuestNUMANodes + + if len(cpuSetSlice) == 0 { + if len(numaNodes) >= 1 { + // No cpuset constraint (e.g. ctr without k8s, or a Burstable + // pod with cpuManagerPolicy=none). Build an effective cpuset + // from the NUMA nodes' HostCPUs so pinning works using the + // (possibly right-sized) host NUMA topology. Even a single + // NUMA node here meaningfully constrains pinning to that + // node's host CPUs. + for _, gn := range numaNodes { + hostCPUs, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + continue + } + cpuSet = cpuSet.Union(hostCPUs) + } + cpuSetSlice = cpuSet.ToSlice() + if len(cpuSetSlice) == 0 { + s.Logger().Warn("sandbox CPUSet is empty and cannot derive from NUMA HostCPUs; skipping vCPU pinning") + s.isVCPUsPinningOn = false + return nil + } + s.Logger().WithField("effective-cpuset", cpuSet.String()).Debug("derived cpuset from NUMA HostCPUs for pinning") + } else { + s.Logger().Warn("sandbox CPUSet is empty; skipping vCPU pinning") + s.isVCPUsPinningOn = false + return nil + } + } + + if len(numaNodes) >= 1 { + return s.checkVCPUsPinningNUMA(ctx, vCPUThreadsMap, numaNodes, cpuSetSlice) + } + numVCPUs, numCPUs := len(vCPUThreadsMap.vcpus), len(cpuSetSlice) - // if not equal, we should reset threads scheduling to random pattern if numVCPUs != numCPUs { if s.isVCPUsPinningOn { s.isVCPUsPinningOn = false @@ -2997,7 +3046,6 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { } return nil } - // if equal, we can use vCPU thread pinning for i, tid := range vCPUThreadsMap.vcpus { if err := resCtrl.SetThreadAffinity(tid, cpuSetSlice[i:i+1]); err != nil { if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { @@ -3010,6 +3058,68 @@ func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error { return nil } +// checkVCPUsPinningNUMA pins vCPU threads to host CPUs that belong to the +// same NUMA node as the vCPU's guest NUMA node assignment. vCPUs are +// distributed proportionally to the host CPU count per NUMA node +// (matching buildNUMATopology). It handles any non-empty numaNodes +// slice — including the right-sized single-node case, where every vCPU +// is pinned within the single chosen host NUMA node's CPU set. +func (s *Sandbox) checkVCPUsPinningNUMA(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, numaNodes []types.GuestNUMANode, cpuSetSlice []int) error { + numVCPUs := uint32(len(vCPUThreadsMap.vcpus)) + numNodes := uint32(len(numaNodes)) + if numVCPUs < numNodes { + return fmt.Errorf("number of vCPUs (%d) must be >= NUMA node count (%d) for NUMA pinning", numVCPUs, numNodes) + } + + vcpusPerNode, err := utils.DistributeVCPUsProportionally(numaNodes, numVCPUs) + if err != nil { + return fmt.Errorf("failed to compute NUMA vCPU distribution for pinning: %v", err) + } + + cpuSetAll := cpuset.NewCPUSet(cpuSetSlice...) + + var cpuOffset uint32 + for i, gn := range numaNodes { + hostCPUs, err := cpuset.Parse(gn.HostCPUs) + if err != nil { + return fmt.Errorf("failed to parse HostCPUs for NUMA node %d: %v", i, err) + } + allowedCPUs := hostCPUs.Intersection(cpuSetAll).ToSlice() + if len(allowedCPUs) == 0 { + s.Logger().WithFields(logrus.Fields{ + "numa-node": i, + "host-cpus": gn.HostCPUs, + "sandbox-cpus": cpuSetSlice, + }).Warn("NUMA node HostCPUs do not intersect sandbox CPUSet; pinning vCPUs to full cpuset for this node") + allowedCPUs = cpuSetSlice + } + + startVCPU := cpuOffset + endVCPU := startVCPU + vcpusPerNode[i] + cpuOffset = endVCPU + + for vcpuIdx := startVCPU; vcpuIdx < endVCPU; vcpuIdx++ { + tid, ok := vCPUThreadsMap.vcpus[int(vcpuIdx)] + if !ok { + if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { + return err + } + return fmt.Errorf("missing vcpu thread id for vcpu index %d", vcpuIdx) + } + pinIdx := int(vcpuIdx-startVCPU) % len(allowedCPUs) + if err := resCtrl.SetThreadAffinity(tid, allowedCPUs[pinIdx:pinIdx+1]); err != nil { + if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil { + return err + } + return fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d (NUMA node %d): %v", tid, allowedCPUs[pinIdx], i, err) + } + } + } + + s.isVCPUsPinningOn = true + return nil +} + // resetVCPUsPinning cancels current pinning and restores default random vCPU threads scheduling func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error { for _, tid := range vCPUThreadsMap.vcpus { diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index 7e521f3842..50115c7a5b 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -1679,3 +1679,29 @@ func TestSandboxHugepageLimit(t *testing.T) { err = s.updateResources(context.Background()) assert.NoError(t, err) } + +func TestCheckVCPUsPinningNUMATooFewVCPUs(t *testing.T) { + assert := assert.New(t) + s := &Sandbox{} + vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100}} + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-3"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7}) + assert.Error(err) + assert.Contains(err.Error(), "must be >= NUMA node count") +} + +func TestCheckVCPUsPinningNUMABadHostCPUs(t *testing.T) { + assert := assert.New(t) + s := &Sandbox{} + vCPUThreadsMap := VcpuThreadIDs{vcpus: map[int]int{0: 100, 1: 101, 2: 102, 3: 103}} + numaNodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "not-valid"}, + {HostNodes: "1", HostCPUs: "4-7"}, + } + err := s.checkVCPUsPinningNUMA(context.Background(), vCPUThreadsMap, numaNodes, []int{0, 1, 2, 3, 4, 5, 6, 7}) + assert.Error(err) + assert.Contains(err.Error(), "failed to parse HostCPUs") +} diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index bc2aa98121..5e1ff51ae3 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -625,6 +625,29 @@ func GetGuestNUMANodes(numaMapping []string) ([]types.GuestNUMANode, error) { return numaNodes, nil } +// FilterNUMANodesByCPUSet returns only those guest NUMA nodes whose HostCPUs +// intersect with the given sandbox cpuset. If sandboxCPUs is empty (size 0), +// no filtering is applied and the original slice is returned unchanged. +func FilterNUMANodesByCPUSet(nodes []types.GuestNUMANode, sandboxCPUs cpuset.CPUSet) []types.GuestNUMANode { + if sandboxCPUs.Size() == 0 { + return nodes + } + var filtered []types.GuestNUMANode + for _, n := range nodes { + hostCPUs, err := cpuset.Parse(n.HostCPUs) + if err != nil { + continue + } + if hostCPUs.Intersection(sandboxCPUs).Size() > 0 { + filtered = append(filtered, n) + } + } + if len(filtered) == 0 { + return nodes + } + return filtered +} + // NUMADistEntry represents a single NUMA distance measurement between two nodes. type NUMADistEntry struct { Src uint32 diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go index cb11770c8c..90663e64b5 100644 --- a/src/runtime/virtcontainers/utils/utils_test.go +++ b/src/runtime/virtcontainers/utils/utils_test.go @@ -20,6 +20,7 @@ import ( "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" ) @@ -815,3 +816,43 @@ func TestDistributeVCPUsProportionallyTooFewVCPUs(t *testing.T) { assert.Error(err) assert.Contains(err.Error(), "must be >= NUMA node count") } + +func TestFilterNUMANodesByCPUSet(t *testing.T) { + assert := assert.New(t) + + nodes := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-55,112-167"}, + {HostNodes: "1", HostCPUs: "56-111,168-223"}, + } + + // Sandbox cpuset only from node 0 -> should return 1 node + sandboxCPUs, _ := cpuset.Parse("1-40,113-152") + filtered := FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + assert.Len(filtered, 1) + assert.Equal("0", filtered[0].HostNodes) + + // Sandbox cpuset from both nodes -> should return 2 nodes + sandboxCPUs, _ = cpuset.Parse("1-40,56-80") + filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + assert.Len(filtered, 2) + + // Sandbox cpuset only from node 1 -> should return 1 node + sandboxCPUs, _ = cpuset.Parse("60-70,170-180") + filtered = FilterNUMANodesByCPUSet(nodes, sandboxCPUs) + assert.Len(filtered, 1) + assert.Equal("1", filtered[0].HostNodes) + + // Empty cpuset -> no filtering, return all + emptyCPUs := cpuset.NewCPUSet() + filtered = FilterNUMANodesByCPUSet(nodes, emptyCPUs) + assert.Len(filtered, 2) + + // Single-node host (1 NUMA node) -> returns 1 regardless + singleNode := []types.GuestNUMANode{ + {HostNodes: "0", HostCPUs: "0-7"}, + } + sandboxCPUs, _ = cpuset.Parse("0-3") + filtered = FilterNUMANodesByCPUSet(singleNode, sandboxCPUs) + assert.Len(filtered, 1) + assert.Equal("0", filtered[0].HostNodes) +}