From 39726b119fa98492f09de824a3636ca210308dcd Mon Sep 17 00:00:00 2001 From: holder Date: Thu, 15 Dec 2022 10:32:41 +0800 Subject: [PATCH 1/5] fix: fix state validate error after memorymanager with static policy start (cherry picked from commit b91951f847d0b159c9d8ef32688cc96489ac1884) --- pkg/kubelet/cm/memorymanager/policy_static.go | 30 ++++- .../cm/memorymanager/policy_static_test.go | 119 ++++++++++++++++++ 2 files changed, 147 insertions(+), 2 deletions(-) diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go index 130b62ab637..ecfbe0c206c 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static.go +++ b/pkg/kubelet/cm/memorymanager/policy_static.go @@ -18,7 +18,6 @@ package memorymanager import ( "fmt" - "reflect" "sort" cadvisorapi "github.com/google/cadvisor/info/v1" @@ -682,10 +681,37 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool { return false } - if !reflect.DeepEqual(*memoryState1, *memoryState2) { + if memoryState1.TotalMemSize != memoryState2.TotalMemSize { klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) return false } + + if memoryState1.SystemReserved != memoryState2.SystemReserved { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + + if memoryState1.Allocatable != memoryState2.Allocatable { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + + totalFree1 := uint64(0) + totalReserved1 := uint64(0) + totalFree2 := uint64(0) + totalReserved2 := uint64(0) + for _, nodeId := range nodeState1.Cells { + totalFree1 += ms1[nodeId].MemoryMap[resourceName].Free + totalReserved1 += ms1[nodeId].MemoryMap[resourceName].Reserved + totalFree2 += ms2[nodeId].MemoryMap[resourceName].Free + totalReserved2 += ms2[nodeId].MemoryMap[resourceName].Reserved + } + + if totalFree1 != totalFree2 || totalReserved1 != totalReserved2 { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + } } return true diff --git a/pkg/kubelet/cm/memorymanager/policy_static_test.go b/pkg/kubelet/cm/memorymanager/policy_static_test.go index 7b105591580..9a6c3f5575d 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static_test.go +++ b/pkg/kubelet/cm/memorymanager/policy_static_test.go @@ -1024,6 +1024,125 @@ func TestStaticPolicyStart(t *testing.T) { }, }, }, + { + description: "shoud equal", + expectedAssignments: state.ContainerMemoryAssignments{ + "pod1": map[string][]state.Block{ + "container1": { + { + NUMAAffinity: []int{0, 1}, + Type: v1.ResourceMemory, + Size: 240 * mb, + }, + }, + }, + "pod2": map[string][]state.Block{ + "container2": { + { + NUMAAffinity: []int{0, 1}, + Type: v1.ResourceMemory, + Size: 10 * mb, + }, + }, + }, + }, + assignments: state.ContainerMemoryAssignments{ + "pod1": map[string][]state.Block{ + "container1": { + { + NUMAAffinity: []int{0, 1}, + Type: v1.ResourceMemory, + Size: 240 * mb, + }, + }, + }, + "pod2": map[string][]state.Block{ + "container2": { + { + NUMAAffinity: []int{0, 1}, + Type: v1.ResourceMemory, + Size: 10 * mb, + }, + }, + }, + }, + expectedMachineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 220 * mb, + Free: 30 * mb, + Reserved: 190 * mb, + SystemReserved: 20 * mb, + TotalMemSize: 240 * mb, + }, + }, + Cells: []int{0, 1}, + NumberOfAssignments: 2, + }, + 1: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 220 * mb, + Free: 160 * mb, + Reserved: 60 * mb, + SystemReserved: 20 * mb, + TotalMemSize: 240 * mb, + }, + }, + Cells: []int{0, 1}, + NumberOfAssignments: 2, + }, + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 220 * mb, + Free: 10 * mb, + Reserved: 210 * mb, + SystemReserved: 20 * mb, + TotalMemSize: 240 * mb, + }, + }, + Cells: []int{0, 1}, + NumberOfAssignments: 2, + }, + 1: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 220 * mb, + Free: 180 * mb, + Reserved: 40 * mb, + SystemReserved: 20 * mb, + TotalMemSize: 240 * mb, + }, + }, + Cells: []int{0, 1}, + NumberOfAssignments: 2, + }, + }, + systemReserved: systemReservedMemory{ + 0: map[v1.ResourceName]uint64{ + v1.ResourceMemory: 20 * mb, + }, + 1: map[v1.ResourceName]uint64{ + v1.ResourceMemory: 20 * mb, + }, + }, + machineInfo: &cadvisorapi.MachineInfo{ + Topology: []cadvisorapi.Node{ + { + Id: 0, + Memory: 240 * mb, + }, + { + Id: 1, + Memory: 240 * mb, + }, + }, + }, + }, } for _, testCase := range testCases { From 6d7a1226d5abf89caf2010da905fe0f7ab879bb3 Mon Sep 17 00:00:00 2001 From: holder Date: Thu, 15 Dec 2022 10:57:20 +0800 Subject: [PATCH 2/5] update the test case name (cherry picked from commit de033352079c7d87417f88f073d6b7891e51e590) --- pkg/kubelet/cm/memorymanager/policy_static_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/kubelet/cm/memorymanager/policy_static_test.go b/pkg/kubelet/cm/memorymanager/policy_static_test.go index 9a6c3f5575d..a5cff7060a5 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static_test.go +++ b/pkg/kubelet/cm/memorymanager/policy_static_test.go @@ -1025,7 +1025,7 @@ func TestStaticPolicyStart(t *testing.T) { }, }, { - description: "shoud equal", + description: "should validate the totalFree and totalReserved size for a resource within a group", expectedAssignments: state.ContainerMemoryAssignments{ "pod1": map[string][]state.Block{ "container1": { From 6709317ae21bc1184732894b762a7b3b3edffe9d Mon Sep 17 00:00:00 2001 From: holder Date: Wed, 14 Jun 2023 18:59:23 +0800 Subject: [PATCH 3/5] chore: optimize code logic (cherry picked from commit 91a9a195ac0fe0e31301dc60af0ea868fc4756ff) --- pkg/kubelet/cm/memorymanager/policy_static.go | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go index ecfbe0c206c..454bd25aa36 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static.go +++ b/pkg/kubelet/cm/memorymanager/policy_static.go @@ -681,17 +681,7 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool { return false } - if memoryState1.TotalMemSize != memoryState2.TotalMemSize { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) - return false - } - - if memoryState1.SystemReserved != memoryState2.SystemReserved { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) - return false - } - - if memoryState1.Allocatable != memoryState2.Allocatable { + if memoryState1.TotalMemSize != memoryState2.TotalMemSize || memoryState1.SystemReserved != memoryState2.SystemReserved || memoryState1.Allocatable != memoryState2.Allocatable { klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) return false } From 7476f46d71860db52cf01eee5a45b14bbda4ecd9 Mon Sep 17 00:00:00 2001 From: Talor Itzhak Date: Mon, 2 Sep 2024 17:02:51 +0300 Subject: [PATCH 4/5] memorymanager: fix checkpoint file comparison For a resource within a group, such as memory, we should validate the total `Free` and total `Reserved` size of the expected `machineState` and state restored from checkpoint file after kubelet start. If total `Free` and total `Reserved` are equal, the restored state is valid. The old comparison however was done by reflection. There're times when the memory accounting is equals but the allocations across the NUMA nodes are varies. In such cases we still need to consider the states as equals. Signed-off-by: Talor Itzhak --- pkg/kubelet/cm/memorymanager/policy_static.go | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go index 454bd25aa36..01abf6b7b64 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static.go +++ b/pkg/kubelet/cm/memorymanager/policy_static.go @@ -681,27 +681,38 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool { return false } - if memoryState1.TotalMemSize != memoryState2.TotalMemSize || memoryState1.SystemReserved != memoryState2.SystemReserved || memoryState1.Allocatable != memoryState2.Allocatable { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + if memoryState1.TotalMemSize != memoryState2.TotalMemSize { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2) return false } - totalFree1 := uint64(0) - totalReserved1 := uint64(0) - totalFree2 := uint64(0) - totalReserved2 := uint64(0) - for _, nodeId := range nodeState1.Cells { - totalFree1 += ms1[nodeId].MemoryMap[resourceName].Free - totalReserved1 += ms1[nodeId].MemoryMap[resourceName].Reserved - totalFree2 += ms2[nodeId].MemoryMap[resourceName].Free - totalReserved2 += ms2[nodeId].MemoryMap[resourceName].Reserved - } - - if totalFree1 != totalFree2 || totalReserved1 != totalReserved2 { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + if memoryState1.SystemReserved != memoryState2.SystemReserved { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2) return false } + if memoryState1.Allocatable != memoryState2.Allocatable { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + + tmpState1 := state.MemoryTable{} + tmpState2 := state.MemoryTable{} + for _, nodeID := range nodeState1.Cells { + tmpState1.Free += ms1[nodeID].MemoryMap[resourceName].Free + tmpState1.Reserved += ms1[nodeID].MemoryMap[resourceName].Reserved + tmpState2.Free += ms2[nodeID].MemoryMap[resourceName].Free + tmpState2.Reserved += ms2[nodeID].MemoryMap[resourceName].Reserved + } + + if tmpState1.Free != tmpState2.Free { + klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + if tmpState1.Reserved != tmpState2.Reserved { + klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } } } return true From d64f34eb2cf57ca017368bd94dc577952c2647d9 Mon Sep 17 00:00:00 2001 From: Talor Itzhak Date: Tue, 29 Oct 2024 12:23:34 +0200 Subject: [PATCH 5/5] memorymanager: `areMemoryStatesEqual` helper perform the memoryStates comparison in helper function Signed-off-by: Talor Itzhak --- pkg/kubelet/cm/memorymanager/policy_static.go | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go index 01abf6b7b64..bca456c3fd7 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static.go +++ b/pkg/kubelet/cm/memorymanager/policy_static.go @@ -681,18 +681,7 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool { return false } - if memoryState1.TotalMemSize != memoryState2.TotalMemSize { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2) - return false - } - - if memoryState1.SystemReserved != memoryState2.SystemReserved { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2) - return false - } - - if memoryState1.Allocatable != memoryState2.Allocatable { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + if !areMemoryStatesEqual(memoryState1, memoryState2, nodeID, resourceName) { return false } @@ -718,6 +707,24 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool { return true } +func areMemoryStatesEqual(memoryState1, memoryState2 *state.MemoryTable, nodeID int, resourceName v1.ResourceName) bool { + if memoryState1.TotalMemSize != memoryState2.TotalMemSize { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + + if memoryState1.SystemReserved != memoryState2.SystemReserved { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + + if memoryState1.Allocatable != memoryState2.Allocatable { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + return true +} + func (p *staticPolicy) getDefaultMachineState() state.NUMANodeMap { defaultMachineState := state.NUMANodeMap{} nodeHugepages := map[int]uint64{}