From 7476f46d71860db52cf01eee5a45b14bbda4ecd9 Mon Sep 17 00:00:00 2001 From: Talor Itzhak Date: Mon, 2 Sep 2024 17:02:51 +0300 Subject: [PATCH] memorymanager: fix checkpoint file comparison For a resource within a group, such as memory, we should validate the total `Free` and total `Reserved` size of the expected `machineState` and of the state restored from the checkpoint file after kubelet starts. If total `Free` and total `Reserved` are equal, the restored state is valid. The old comparison, however, was done by reflection. There are times when the memory accounting is equal but the allocations across the NUMA nodes vary. In such cases we still need to consider the states equal. Signed-off-by: Talor Itzhak --- pkg/kubelet/cm/memorymanager/policy_static.go | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go index 454bd25aa36..01abf6b7b64 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static.go +++ b/pkg/kubelet/cm/memorymanager/policy_static.go @@ -681,27 +681,38 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool { return false } - if memoryState1.TotalMemSize != memoryState2.TotalMemSize || memoryState1.SystemReserved != memoryState2.SystemReserved || memoryState1.Allocatable != memoryState2.Allocatable { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + if memoryState1.TotalMemSize != memoryState2.TotalMemSize { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2) return false } - totalFree1 := uint64(0) - totalReserved1 := uint64(0) - totalFree2 := 
uint64(0) - totalReserved2 := uint64(0) - for _, nodeId := range nodeState1.Cells { - totalFree1 += ms1[nodeId].MemoryMap[resourceName].Free - totalReserved1 += ms1[nodeId].MemoryMap[resourceName].Reserved - totalFree2 += ms2[nodeId].MemoryMap[resourceName].Free - totalReserved2 += ms2[nodeId].MemoryMap[resourceName].Reserved - } - - if totalFree1 != totalFree2 || totalReserved1 != totalReserved2 { - klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + if memoryState1.SystemReserved != memoryState2.SystemReserved { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2) return false } + if memoryState1.Allocatable != memoryState2.Allocatable { + klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } + + tmpState1 := state.MemoryTable{} + tmpState2 := state.MemoryTable{} + for _, nodeID := range nodeState1.Cells { + tmpState1.Free += ms1[nodeID].MemoryMap[resourceName].Free + tmpState1.Reserved += ms1[nodeID].MemoryMap[resourceName].Reserved + tmpState2.Free += ms2[nodeID].MemoryMap[resourceName].Free + tmpState2.Reserved += ms2[nodeID].MemoryMap[resourceName].Reserved + } + + if tmpState1.Free != tmpState2.Free { + klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, 
"memoryState2", *memoryState2) + return false + } + if tmpState1.Reserved != tmpState2.Reserved { + klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2) + return false + } } } return true