From a89c843edd8a5558838573c5d00d53e5b9b3439c Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Mon, 11 Sep 2023 16:05:18 +0200 Subject: [PATCH 1/4] node: cpumgr: ErrorS -> InfoS Convert uncommon use of ErrorS(nil, ...) into more regular use of InfoS. Set the verbosiness level to make sure the message is still emitted in regular expected configuration. Signed-off-by: Francesco Romani --- pkg/kubelet/cm/cpumanager/cpu_manager.go | 4 ++-- pkg/kubelet/cm/cpumanager/policy_static.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index 8b5049d7d74..9eb20bdbf4e 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -381,7 +381,7 @@ func (m *manager) removeStaleState() { for podUID := range assignments { for containerName := range assignments[podUID] { if _, ok := activeContainers[podUID][containerName]; !ok { - klog.ErrorS(nil, "RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName) + klog.V(2).InfoS("RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName) err := m.policyRemoveContainerByRef(podUID, containerName) if err != nil { klog.ErrorS(err, "RemoveStaleState: failed to remove container", "podUID", podUID, "containerName", containerName) @@ -392,7 +392,7 @@ func (m *manager) removeStaleState() { m.containerMap.Visit(func(podUID, containerName, containerID string) { if _, ok := activeContainers[podUID][containerName]; !ok { - klog.ErrorS(nil, "RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName) + klog.V(2).InfoS("RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName) err := m.policyRemoveContainerByRef(podUID, containerName) if err != nil { klog.ErrorS(err, "RemoveStaleState: failed to remove container", "podUID", podUID, "containerName", containerName) diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go index 049e35f9883..19c98af703e 100644 --- a/pkg/kubelet/cm/cpumanager/policy_static.go +++ b/pkg/kubelet/cm/cpumanager/policy_static.go @@ -515,7 +515,7 @@ func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v // kubelet restart, for example. if allocated, exists := s.GetCPUSet(string(pod.UID), container.Name); exists { if allocated.Size() != requested { - klog.ErrorS(nil, "CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "requestedSize", requested, "allocatedSize", allocated.Size()) + klog.InfoS("CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "requestedSize", requested, "allocatedSize", allocated.Size()) // An empty list of hints will be treated as a preference that cannot be satisfied. // In definition of hints this is equal to: TopologyHint[NUMANodeAffinity: nil, Preferred: false]. // For all but the best-effort policy, the Topology Manager will throw a pod-admission error. @@ -565,7 +565,7 @@ func (p *staticPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[strin // kubelet restart, for example. if allocated, exists := s.GetCPUSet(string(pod.UID), container.Name); exists { if allocated.Size() != requestedByContainer { - klog.ErrorS(nil, "CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "allocatedSize", requested, "requestedByContainer", requestedByContainer, "allocatedSize", allocated.Size()) + klog.InfoS("CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "allocatedSize", requested, "requestedByContainer", requestedByContainer, "allocatedSize", allocated.Size()) // An empty list of hints will be treated as a preference that cannot be satisfied. // In definition of hints this is equal to: TopologyHint[NUMANodeAffinity: nil, Preferred: false]. // For all but the best-effort policy, the Topology Manager will throw a pod-admission error. From 5a0bc1020bf7b2a9779ca9a13ad74afdfe1e1e97 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Mon, 11 Sep 2023 15:10:51 +0200 Subject: [PATCH 2/4] node: cpumgr: move flow to left and add logs Refactor the code to align to the left bailing out earlier if the code must do nothing. Add log to trace this occurrence. Besides extra log, no intended change in behavior. Signed-off-by: Francesco Romani --- pkg/kubelet/cm/cpumanager/cpu_manager.go | 26 ++++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index 9eb20bdbf4e..60924487b4f 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -380,24 +380,28 @@ func (m *manager) removeStaleState() { assignments := m.state.GetCPUAssignments() for podUID := range assignments { for containerName := range assignments[podUID] { - if _, ok := activeContainers[podUID][containerName]; !ok { - klog.V(2).InfoS("RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName) - err := m.policyRemoveContainerByRef(podUID, containerName) - if err != nil { - klog.ErrorS(err, "RemoveStaleState: failed to remove container", "podUID", podUID, "containerName", containerName) - } + if _, ok := activeContainers[podUID][containerName]; ok { + klog.V(4).InfoS("RemoveStaleState: container still active", "podUID", podUID, "containerName", containerName) + continue } - } - } - - m.containerMap.Visit(func(podUID, containerName, containerID string) { - if _, ok := activeContainers[podUID][containerName]; !ok { klog.V(2).InfoS("RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName) err := m.policyRemoveContainerByRef(podUID, containerName) if err != nil { klog.ErrorS(err, "RemoveStaleState: failed to remove container", "podUID", podUID, "containerName", containerName) } } + } + + m.containerMap.Visit(func(podUID, containerName, containerID string) { + if _, ok := activeContainers[podUID][containerName]; ok { + klog.V(4).InfoS("RemoveStaleState: containerMap: container still active", "podUID", podUID, "containerName", containerName) + return + } + klog.V(2).InfoS("RemoveStaleState: containerMap: removing container", "podUID", podUID, "containerName", containerName) + err := m.policyRemoveContainerByRef(podUID, containerName) + if err != nil { + klog.ErrorS(err, "RemoveStaleState: containerMap: failed to remove container", "podUID", podUID, "containerName", containerName) + } }) } From 2dc5ddd08a41a171134fd0922825ed03faa379ca Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 13 Sep 2023 11:52:50 +0200 Subject: [PATCH 3/4] node: cpumgr: logs: bump log verbosiness for expected skips In the reconciliation flow, there are expected skipping conditions (e.g. for active logs). To reduce noise in the logs, bump up the verbosiness of these messages, using odd levels. Signed-off-by: Francesco Romani --- pkg/kubelet/cm/cpumanager/cpu_manager.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index 60924487b4f..2af1e731a7f 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -381,7 +381,7 @@ func (m *manager) removeStaleState() { for podUID := range assignments { for containerName := range assignments[podUID] { if _, ok := activeContainers[podUID][containerName]; ok { - klog.V(4).InfoS("RemoveStaleState: container still active", "podUID", podUID, "containerName", containerName) + klog.V(5).InfoS("RemoveStaleState: container still active", "podUID", podUID, "containerName", containerName) continue } klog.V(2).InfoS("RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName) @@ -394,7 +394,7 @@ func (m *manager) removeStaleState() { m.containerMap.Visit(func(podUID, containerName, containerID string) { if _, ok := activeContainers[podUID][containerName]; ok { - klog.V(4).InfoS("RemoveStaleState: containerMap: container still active", "podUID", podUID, "containerName", containerName) + klog.V(5).InfoS("RemoveStaleState: containerMap: container still active", "podUID", podUID, "containerName", containerName) return } klog.V(2).InfoS("RemoveStaleState: containerMap: removing container", "podUID", podUID, "containerName", containerName) @@ -414,7 +414,7 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec for _, pod := range m.activePods() { pstatus, ok := m.podStatusProvider.GetPodStatus(pod.UID) if !ok { - klog.V(4).InfoS("ReconcileState: skipping pod; status not found", "pod", klog.KObj(pod)) + klog.V(5).InfoS("ReconcileState: skipping pod; status not found", "pod", klog.KObj(pod)) failure = append(failure, reconciledContainer{pod.Name, "", ""}) continue } @@ -424,14 +424,14 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec for _, container := range allContainers { containerID, err := findContainerIDByName(&pstatus, container.Name) if err != nil { - klog.V(4).InfoS("ReconcileState: skipping container; ID not found in pod status", "pod", klog.KObj(pod), "containerName", container.Name, "err", err) + klog.V(5).InfoS("ReconcileState: skipping container; ID not found in pod status", "pod", klog.KObj(pod), "containerName", container.Name, "err", err) failure = append(failure, reconciledContainer{pod.Name, container.Name, ""}) continue } cstatus, err := findContainerStatusByName(&pstatus, container.Name) if err != nil { - klog.V(4).InfoS("ReconcileState: skipping container; container status not found in pod status", "pod", klog.KObj(pod), "containerName", container.Name, "err", err) + klog.V(5).InfoS("ReconcileState: skipping container; container status not found in pod status", "pod", klog.KObj(pod), "containerName", container.Name, "err", err) failure = append(failure, reconciledContainer{pod.Name, container.Name, ""}) continue } @@ -474,7 +474,7 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec lcset := m.lastUpdateState.GetCPUSetOrDefault(string(pod.UID), container.Name) if !cset.Equals(lcset) { - klog.V(4).InfoS("ReconcileState: updating container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID, "cpuSet", cset) + klog.V(5).InfoS("ReconcileState: updating container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID, "cpuSet", cset) err = m.updateContainerCPUSet(ctx, containerID, cset) if err != nil { klog.ErrorS(err, "ReconcileState: failed to update container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID, "cpuSet", cset) From 0a9b17771da7093d12bbb9c56b9ed940d3afdcee Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 13 Sep 2023 11:54:08 +0200 Subject: [PATCH 4/4] node: cpumgr: log: make errors louder We have a special case which is not supposed to happen. Make it louder with default log settings to make sure this is visible. Signed-off-by: Francesco Romani --- pkg/kubelet/cm/cpumanager/cpu_manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index 2af1e731a7f..9f9cd48eb5f 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -467,7 +467,7 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec cset := m.state.GetCPUSetOrDefault(string(pod.UID), container.Name) if cset.IsEmpty() { // NOTE: This should not happen outside of tests. - klog.V(4).InfoS("ReconcileState: skipping container; assigned cpuset is empty", "pod", klog.KObj(pod), "containerName", container.Name) + klog.V(2).InfoS("ReconcileState: skipping container; assigned cpuset is empty", "pod", klog.KObj(pod), "containerName", container.Name) failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID}) continue }