diff --git a/pkg/kubelet/cm/internal_container_lifecycle.go b/pkg/kubelet/cm/internal_container_lifecycle.go index 0d3a3357b06..278f13eb08e 100644 --- a/pkg/kubelet/cm/internal_container_lifecycle.go +++ b/pkg/kubelet/cm/internal_container_lifecycle.go @@ -46,10 +46,7 @@ func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, containe } if i.memoryManager != nil { - err := i.memoryManager.AddContainer(pod, container, containerID) - if err != nil { - return err - } + i.memoryManager.AddContainer(pod, container, containerID) } if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManager) { diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go index dd0a37f086a..9cf41620b8c 100644 --- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go +++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go @@ -19,6 +19,9 @@ limitations under the License. package cm import ( + "strconv" + "strings" + "k8s.io/api/core/v1" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" ) @@ -31,5 +34,16 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain } } + if i.memoryManager != nil { + numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container) + if numaNodes.Len() > 0 { + var affinity []string + for _, numaNode := range numaNodes.List() { + affinity = append(affinity, strconv.Itoa(numaNode)) + } + containerConfig.Linux.Resources.CpusetMems = strings.Join(affinity, ",") + } + } + return nil } diff --git a/pkg/kubelet/cm/memorymanager/fake_memory_manager.go b/pkg/kubelet/cm/memorymanager/fake_memory_manager.go index 1cbe0171013..11328567926 100644 --- a/pkg/kubelet/cm/memorymanager/fake_memory_manager.go +++ b/pkg/kubelet/cm/memorymanager/fake_memory_manager.go @@ -18,6 +18,7 @@ package memorymanager import ( v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/kubelet/cm/containermap" 
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state" @@ -45,8 +46,12 @@ func (m *fakeManager) Allocate(pod *v1.Pod, container *v1.Container) error { return nil } -func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) error { +func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) { klog.Infof("[fake memorymanager] AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID) +} + +func (m *fakeManager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Int { + klog.Infof("[fake memorymanager] GetMemoryNUMANodes (pod: %s, container: %s)", pod.Name, container.Name) return nil } diff --git a/pkg/kubelet/cm/memorymanager/memory_manager.go b/pkg/kubelet/cm/memorymanager/memory_manager.go index 1b00008bfa4..57a689d83af 100644 --- a/pkg/kubelet/cm/memorymanager/memory_manager.go +++ b/pkg/kubelet/cm/memorymanager/memory_manager.go @@ -18,14 +18,13 @@ package memorymanager import ( "fmt" - "strconv" - "strings" "sync" cadvisorapi "github.com/google/cadvisor/info/v1" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/util/sets" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" "k8s.io/klog/v2" corev1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" @@ -57,10 +56,9 @@ type Manager interface { // Start is called during Kubelet initialization. Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error - // AddContainer is called between container create and container start - // so that initial memory affinity settings can be written through to the - // container runtime before the first process begins to execute. 
- AddContainer(p *v1.Pod, c *v1.Container, containerID string) error + // AddContainer adds the mapping between container ID to pod UID and the container name + // The mapping is used to remove the memory allocation during the container removal + AddContainer(p *v1.Pod, c *v1.Container, containerID string) // Allocate is called to pre-allocate memory resources during Pod admission. // This must be called at some point prior to the AddContainer() call for a container, e.g. at pod admission time. @@ -82,6 +80,9 @@ type Manager interface { // and is consulted to achieve NUMA aware resource alignment among this // and other resource controllers. GetPodTopologyHints(*v1.Pod) map[string][]topologymanager.TopologyHint + + // GetMemoryNUMANodes provides NUMA nodes that are used to allocate the container memory + GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Int } type manager struct { @@ -176,38 +177,31 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe } // AddContainer saves the value of requested memory for the guaranteed pod under the state and set memory affinity according to the topolgy manager -func (m *manager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) error { +func (m *manager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) { m.Lock() - m.containerMap.Add(string(pod.UID), container.Name, containerID) - m.Unlock() + defer m.Unlock() + m.containerMap.Add(string(pod.UID), container.Name, containerID) +} + +// GetMemoryNUMANodes provides NUMA nodes that are used to allocate the container memory +func (m *manager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Int { // Get NUMA node affinity of blocks assigned to the container during Allocate() - var nodes []string + numaNodes := sets.NewInt() for _, block := range m.state.GetMemoryBlocks(string(pod.UID), container.Name) { for _, nodeID := range block.NUMAAffinity { - // 
avoid nodes duplication when hugepages and memory blocks pinned to the same NUMA node + numaNodes.Insert(nodeID) } } - if len(nodes) < 1 { - klog.V(5).Infof("[memorymanager] update container resources is skipped due to memory blocks are empty") + if numaNodes.Len() == 0 { + klog.V(5).Infof("No allocation is available for (Pod: %s, Container: %s)", pod.Name, container.Name) return nil } - affinity := strings.Join(nodes, ",") - klog.Infof("[memorymanager] Set container %q cpuset.mems to %q", containerID, affinity) - err := m.containerRuntime.UpdateContainerResources(containerID, &runtimeapi.LinuxContainerResources{CpusetMems: affinity}) - if err != nil { - klog.Errorf("[memorymanager] AddContainer error: error updating cpuset.mems for container (pod: %s, container: %s, container id: %s, err: %v)", pod.Name, container.Name, containerID, err) - - m.Lock() - err = m.policyRemoveContainerByRef(string(pod.UID), container.Name) - if err != nil { - klog.Errorf("[memorymanager] AddContainer rollback state error: %v", err) - } - m.Unlock() - } - return err + klog.Infof("(Pod: %s, Container: %s) memory affinity is %v", pod.Name, container.Name, numaNodes) + return numaNodes } // Allocate is called to pre-allocate memory resources during Pod admission. 
diff --git a/pkg/kubelet/cm/memorymanager/memory_manager_test.go b/pkg/kubelet/cm/memorymanager/memory_manager_test.go index ff2b3f811c9..fec6d3303d7 100644 --- a/pkg/kubelet/cm/memorymanager/memory_manager_test.go +++ b/pkg/kubelet/cm/memorymanager/memory_manager_test.go @@ -1122,102 +1122,6 @@ func TestAddContainer(t *testing.T) { assignments: state.ContainerMemoryAssignments{}, activePods: nil, }, - { - description: "Adding container should fail (CRI error) but without an error", - updateError: fmt.Errorf("fake reg error"), - policyName: policyTypeStatic, - machineInfo: machineInfo, - reserved: reserved, - machineState: state.NUMANodeMap{ - 0: &state.NUMANodeState{ - Cells: []int{0}, - NumberOfAssignments: 0, - MemoryMap: map[v1.ResourceName]*state.MemoryTable{ - v1.ResourceMemory: { - Allocatable: 9 * gb, - Free: 9 * gb, - Reserved: 0 * gb, - SystemReserved: 1 * gb, - TotalMemSize: 10 * gb, - }, - hugepages1Gi: { - Allocatable: 5 * gb, - Free: 5 * gb, - Reserved: 0, - SystemReserved: 0, - TotalMemSize: 5 * gb, - }, - }, - }, - 1: &state.NUMANodeState{ - Cells: []int{1}, - NumberOfAssignments: 0, - MemoryMap: map[v1.ResourceName]*state.MemoryTable{ - v1.ResourceMemory: { - Allocatable: 9 * gb, - Free: 9 * gb, - Reserved: 0 * gb, - SystemReserved: 1 * gb, - TotalMemSize: 10 * gb, - }, - hugepages1Gi: { - Allocatable: 5 * gb, - Free: 5 * gb, - Reserved: 0, - SystemReserved: 0, - TotalMemSize: 5 * gb, - }, - }, - }, - }, - expectedMachineState: state.NUMANodeMap{ - 0: &state.NUMANodeState{ - Cells: []int{0}, - NumberOfAssignments: 0, - MemoryMap: map[v1.ResourceName]*state.MemoryTable{ - v1.ResourceMemory: { - Allocatable: 9 * gb, - Free: 9 * gb, - Reserved: 0 * gb, - SystemReserved: 1 * gb, - TotalMemSize: 10 * gb, - }, - hugepages1Gi: { - Allocatable: 5 * gb, - Free: 5 * gb, - Reserved: 0, - SystemReserved: 0, - TotalMemSize: 5 * gb, - }, - }, - }, - 1: &state.NUMANodeState{ - Cells: []int{1}, - NumberOfAssignments: 0, - MemoryMap: 
map[v1.ResourceName]*state.MemoryTable{ - v1.ResourceMemory: { - Allocatable: 9 * gb, - Free: 9 * gb, - Reserved: 0 * gb, - SystemReserved: 1 * gb, - TotalMemSize: 10 * gb, - }, - hugepages1Gi: { - Allocatable: 5 * gb, - Free: 5 * gb, - Reserved: 0, - SystemReserved: 0, - TotalMemSize: 5 * gb, - }, - }, - }, - }, - expectedAllocateError: nil, - expectedAddContainerError: nil, - podAllocate: pod, - assignments: state.ContainerMemoryAssignments{}, - activePods: nil, - }, { description: "Correct allocation of container requiring amount of memory higher than capacity of one NUMA node", policyName: policyTypeStatic, @@ -1487,7 +1391,8 @@ func TestAddContainer(t *testing.T) { t.Errorf("Memory Manager Allocate() error (%v), expected error: %v, but got: %v", testCase.description, testCase.expectedAllocateError, err) } - err = mgr.AddContainer(pod, container, "fakeID") + mgr.AddContainer(pod, container, "fakeID") + _, _, err = mgr.containerMap.GetContainerRef("fakeID") if !reflect.DeepEqual(err, testCase.expectedAddContainerError) { t.Errorf("Memory Manager AddContainer() error (%v), expected error: %v, but got: %v", testCase.description, testCase.expectedAddContainerError, err)