Merge pull request #129684 from swatisehgal/mm-mgr-logs-improvements

Memory Manager logging improvements
This commit is contained in:
Kubernetes Prow Robot 2025-02-07 03:49:55 -08:00 committed by GitHub
commit e094e5e89c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 31 additions and 26 deletions

View File

@@ -205,6 +205,7 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
m.allocatableMemory = m.policy.GetAllocatableMemory(m.state)
klog.V(4).InfoS("memorymanager started", "policy", m.policy.Name())
return nil
}
@@ -248,7 +249,7 @@ func (m *manager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.
}
if numaNodes.Len() == 0 {
klog.V(5).InfoS("No allocation is available", "pod", klog.KObj(pod), "containerName", container.Name)
klog.V(5).InfoS("NUMA nodes not available for allocation", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
@@ -266,7 +267,7 @@ func (m *manager) Allocate(pod *v1.Pod, container *v1.Container) error {
// Call down into the policy to assign this container memory if required.
if err := m.policy.Allocate(m.state, pod, container); err != nil {
klog.ErrorS(err, "Allocate error")
klog.ErrorS(err, "Allocate error", "pod", klog.KObj(pod), "containerName", container.Name)
return err
}
return nil
@@ -280,7 +281,7 @@ func (m *manager) RemoveContainer(containerID string) error {
// if error appears it means container entry already does not exist under the container map
podUID, containerName, err := m.containerMap.GetContainerRef(containerID)
if err != nil {
klog.InfoS("Failed to get container from container map", "containerID", containerID, "err", err)
klog.ErrorS(err, "Failed to get container from container map", "containerID", containerID)
return nil
}
@@ -344,7 +345,7 @@ func (m *manager) removeStaleState() {
for podUID := range assignments {
for containerName := range assignments[podUID] {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
klog.V(2).InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
}
@@ -352,7 +353,7 @@ func (m *manager) removeStaleState() {
m.containerMap.Visit(func(podUID, containerName, containerID string) {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
klog.V(2).InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
})

View File

@@ -96,7 +96,9 @@ func (p *staticPolicy) Start(s state.State) error {
// Allocate call is idempotent
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
// allocate the memory only for guaranteed pods
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
qos := v1qos.GetPodQOS(pod)
if qos != v1.PodQOSGuaranteed {
klog.V(5).InfoS("Exclusive memory allocation skipped, pod QoS is not guaranteed", "pod", klog.KObj(pod), "containerName", container.Name, "qos", qos)
return nil
}
@@ -196,6 +198,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
// TODO: we should refactor our state structs to reflect the amount of the re-used memory
p.updateInitContainersMemoryBlocks(s, pod, container, containerBlocks)
klog.V(4).InfoS("Allocated exclusive memory", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
@@ -304,24 +307,24 @@ func regenerateHints(pod *v1.Pod, ctn *v1.Container, ctnBlocks []state.Block, re
}
if len(ctnBlocks) != len(reqRsrc) {
klog.ErrorS(nil, "The number of requested resources by the container differs from the number of memory blocks", "containerName", ctn.Name)
klog.InfoS("The number of requested resources by the container differs from the number of memory blocks", "pod", klog.KObj(pod), "containerName", ctn.Name)
return nil
}
for _, b := range ctnBlocks {
if _, ok := reqRsrc[b.Type]; !ok {
klog.ErrorS(nil, "Container requested resources do not have resource of this type", "containerName", ctn.Name, "type", b.Type)
klog.InfoS("Container requested resources but none available of this type", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type)
return nil
}
if b.Size != reqRsrc[b.Type] {
klog.ErrorS(nil, "Memory already allocated with different numbers than requested", "podUID", pod.UID, "type", b.Type, "containerName", ctn.Name, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
klog.InfoS("Memory already allocated with different numbers than requested", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
return nil
}
containerNUMAAffinity, err := bitmask.NewBitMask(b.NUMAAffinity...)
if err != nil {
klog.ErrorS(err, "Failed to generate NUMA bitmask")
klog.ErrorS(err, "Failed to generate NUMA bitmask", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type)
return nil
}
@@ -660,36 +663,36 @@ func (p *staticPolicy) validateState(s state.State) error {
func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
if len(ms1) != len(ms2) {
klog.ErrorS(nil, "Node states are different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
klog.InfoS("Node states were different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
return false
}
for nodeID, nodeState1 := range ms1 {
nodeState2, ok := ms2[nodeID]
if !ok {
klog.ErrorS(nil, "Node state does not have node ID", "nodeID", nodeID)
klog.InfoS("Node state didn't have node ID", "nodeID", nodeID)
return false
}
if nodeState1.NumberOfAssignments != nodeState2.NumberOfAssignments {
klog.ErrorS(nil, "Node states number of assignments are different", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
klog.InfoS("Node state had a different number of memory assignments.", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
return false
}
if !areGroupsEqual(nodeState1.Cells, nodeState2.Cells) {
klog.ErrorS(nil, "Node states groups are different", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
klog.InfoS("Node states had different groups", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
return false
}
if len(nodeState1.MemoryMap) != len(nodeState2.MemoryMap) {
klog.ErrorS(nil, "Node states memory map have different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
klog.InfoS("Node state had memory maps of different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
return false
}
for resourceName, memoryState1 := range nodeState1.MemoryMap {
memoryState2, ok := nodeState2.MemoryMap[resourceName]
if !ok {
klog.ErrorS(nil, "Memory state does not have resource", "resource", resourceName)
klog.InfoS("Memory state didn't have resource", "resource", resourceName)
return false
}
@@ -707,11 +710,11 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
}
if tmpState1.Free != tmpState2.Free {
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("NUMA node and resource had different memory states", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if tmpState1.Reserved != tmpState2.Reserved {
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("NUMA node and resource had different memory states", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
}
@@ -721,17 +724,17 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
func areMemoryStatesEqual(memoryState1, memoryState2 *state.MemoryTable, nodeID int, resourceName v1.ResourceName) bool {
if memoryState1.TotalMemSize != memoryState2.TotalMemSize {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if memoryState1.SystemReserved != memoryState2.SystemReserved {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if memoryState1.Allocatable != memoryState2.Allocatable {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
return true

View File

@@ -131,7 +131,7 @@ func (sc *stateCheckpoint) SetMachineState(memoryMap NUMANodeMap) {
sc.cache.SetMachineState(memoryMap)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@@ -143,7 +143,7 @@ func (sc *stateCheckpoint) SetMemoryBlocks(podUID string, containerName string,
sc.cache.SetMemoryBlocks(podUID, containerName, blocks)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@@ -155,7 +155,7 @@ func (sc *stateCheckpoint) SetMemoryAssignments(assignments ContainerMemoryAssig
sc.cache.SetMemoryAssignments(assignments)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@@ -167,7 +167,7 @@ func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@@ -179,6 +179,6 @@ func (sc *stateCheckpoint) ClearState() {
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}

View File

@@ -94,6 +94,7 @@ func (s *stateMemory) SetMemoryAssignments(assignments ContainerMemoryAssignment
defer s.Unlock()
s.assignments = assignments.Clone()
klog.V(5).InfoS("Updated Memory assignments", "assignments", assignments)
}
// Delete deletes corresponding Blocks from ContainerMemoryAssignments