From 110868691b271a7448e613f10032b150d23f429e Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Fri, 17 Jan 2025 10:06:19 +0000 Subject: [PATCH 1/4] node: cpu-mgr: Update klog.Infof(..., err) to klog.ErrorS(err,...) We are trying to ensure consistency across resource managers when it comes to logging. While working on logging improvements for memory manager, it was identified that some parts of code base is still using klog.InfoS(..., err) instead of klog.ErrorS(err,...). This change is addressing this in CPU Manager. Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/cpumanager/state/state_checkpoint.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/state/state_checkpoint.go b/pkg/kubelet/cm/cpumanager/state/state_checkpoint.go index f6acc7c42ce..64b7aa8f532 100644 --- a/pkg/kubelet/cm/cpumanager/state/state_checkpoint.go +++ b/pkg/kubelet/cm/cpumanager/state/state_checkpoint.go @@ -201,7 +201,7 @@ func (sc *stateCheckpoint) SetCPUSet(podUID string, containerName string, cset c sc.cache.SetCPUSet(podUID, containerName, cset) err := sc.storeState() if err != nil { - klog.InfoS("Store state to checkpoint error", "err", err) + klog.ErrorS(err, "Store state to checkpoint error", "podUID", podUID, "containerName", containerName) } } @@ -212,7 +212,7 @@ func (sc *stateCheckpoint) SetDefaultCPUSet(cset cpuset.CPUSet) { sc.cache.SetDefaultCPUSet(cset) err := sc.storeState() if err != nil { - klog.InfoS("Store state to checkpoint error", "err", err) + klog.ErrorS(err, "Store state to checkpoint error") } } @@ -223,7 +223,7 @@ func (sc *stateCheckpoint) SetCPUAssignments(a ContainerCPUAssignments) { sc.cache.SetCPUAssignments(a) err := sc.storeState() if err != nil { - klog.InfoS("Store state to checkpoint error", "err", err) + klog.ErrorS(err, "Store state to checkpoint error") } } @@ -234,7 +234,7 @@ func (sc *stateCheckpoint) Delete(podUID string, containerName string) { sc.cache.Delete(podUID, containerName) err := sc.storeState() if err != nil { - klog.InfoS("Store state to checkpoint error", "err", err) + klog.ErrorS(err, "Store state to checkpoint error", "podUID", podUID, "containerName", containerName) } } @@ -245,6 +245,6 @@ func (sc *stateCheckpoint) ClearState() { sc.cache.ClearState() err := sc.storeState() if err != nil { - klog.InfoS("Store state to checkpoint error", "err", err) + klog.ErrorS(err, "Store state to checkpoint error") } } From 01a546fe53aec6a035e037236ae44c7ef98f06a0 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Fri, 17 Jan 2025 12:36:36 +0000 Subject: [PATCH 2/4] node: cpu-mgr: Add logs on the happy path While debugging, it can be useful to have logs to indicate that things have gone as expected especially when it comes to important events like successful startup of CPU manager and successful allocation of resources. Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/cpumanager/cpu_manager.go | 2 ++ pkg/kubelet/cm/cpumanager/policy_static.go | 1 + 2 files changed, 3 insertions(+) diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index 9e72a7cc97a..cd2c4c05a4f 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -239,6 +239,8 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe return err } + klog.V(4).InfoS("CPU manager started", "policy", m.policy.Name()) + m.allocatableCPUs = m.policy.GetAllocatableCPUs(m.state) if m.policy.Name() == string(PolicyNone) { diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go index 26d1fc6d91b..7abb868b8fc 100644 --- a/pkg/kubelet/cm/cpumanager/policy_static.go +++ b/pkg/kubelet/cm/cpumanager/policy_static.go @@ -388,6 +388,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai p.updateCPUsToReuse(pod, container, cpuset) p.updateMetricsOnAllocate(cpuset) + klog.V(4).InfoS("Allocated exclusive CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpuset", cpuset) return nil } From ca2c46a273033440fe142232fbb2120327ee314c Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Fri, 17 Jan 2025 13:02:15 +0000 Subject: [PATCH 3/4] node: cpu-mgr: Add logs when CPU allocation is skipped CPU Allocation is skipped in CPU Manager with static policy in case the pod doesn't belong to Guaranteed QoS or the CPUs requested are not integral. We add logs to capture these skips. Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/cpumanager/policy_static.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go index 7abb868b8fc..d517aa698b5 100644 --- a/pkg/kubelet/cm/cpumanager/policy_static.go +++ b/pkg/kubelet/cm/cpumanager/policy_static.go @@ -456,7 +456,9 @@ func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bit } func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int { - if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed { + qos := v1qos.GetPodQOS(pod) + if qos != v1.PodQOSGuaranteed { + klog.V(5).InfoS("Exclusive CPU allocation skipped, pod QoS is not guaranteed", "pod", klog.KObj(pod), "containerName", container.Name, "qos", qos) return 0 } cpuQuantity := container.Resources.Requests[v1.ResourceCPU] @@ -469,7 +471,9 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int cpuQuantity = cs.AllocatedResources[v1.ResourceCPU] } } - if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() { + cpuValue := cpuQuantity.Value() + if cpuValue*1000 != cpuQuantity.MilliValue() { + klog.V(5).InfoS("Exclusive CPU allocation skipped, pod is not requesting integral CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpu", cpuValue) return 0 } // Safe downcast to do for all systems with < 2.1 billion CPUs. From 7997c93cfdf7e7e00711d6a3731ccf0964eddbf8 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Thu, 6 Feb 2025 13:34:11 +0000 Subject: [PATCH 4/4] node: cpu-mgr: Adhere to the message style guidelines Ensure that the log messages adhere to the message style guildelines as captured [here](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md#message-style-guidelines). Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/cpumanager/cpu_manager.go | 2 +- pkg/kubelet/cm/cpumanager/policy_static.go | 2 +- pkg/kubelet/cm/cpumanager/state/state_checkpoint.go | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index cd2c4c05a4f..8b59ba67121 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -467,7 +467,7 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec cset := m.state.GetCPUSetOrDefault(string(pod.UID), container.Name) if cset.IsEmpty() { // NOTE: This should not happen outside of tests. - klog.V(2).InfoS("ReconcileState: skipping container; assigned cpuset is empty", "pod", klog.KObj(pod), "containerName", container.Name) + klog.V(2).InfoS("ReconcileState: skipping container; empty cpuset assigned", "pod", klog.KObj(pod), "containerName", container.Name) failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID}) continue } diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go index d517aa698b5..16a44c5d582 100644 --- a/pkg/kubelet/cm/cpumanager/policy_static.go +++ b/pkg/kubelet/cm/cpumanager/policy_static.go @@ -473,7 +473,7 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int } cpuValue := cpuQuantity.Value() if cpuValue*1000 != cpuQuantity.MilliValue() { - klog.V(5).InfoS("Exclusive CPU allocation skipped, pod is not requesting integral CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpu", cpuValue) + klog.V(5).InfoS("Exclusive CPU allocation skipped, pod requested non-integral CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpu", cpuValue) return 0 } // Safe downcast to do for all systems with < 2.1 billion CPUs. diff --git a/pkg/kubelet/cm/cpumanager/state/state_checkpoint.go b/pkg/kubelet/cm/cpumanager/state/state_checkpoint.go index 64b7aa8f532..bda90ba1f4c 100644 --- a/pkg/kubelet/cm/cpumanager/state/state_checkpoint.go +++ b/pkg/kubelet/cm/cpumanager/state/state_checkpoint.go @@ -201,7 +201,7 @@ func (sc *stateCheckpoint) SetCPUSet(podUID string, containerName string, cset c sc.cache.SetCPUSet(podUID, containerName, cset) err := sc.storeState() if err != nil { - klog.ErrorS(err, "Store state to checkpoint error", "podUID", podUID, "containerName", containerName) + klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName) } } @@ -212,7 +212,7 @@ func (sc *stateCheckpoint) SetDefaultCPUSet(cset cpuset.CPUSet) { sc.cache.SetDefaultCPUSet(cset) err := sc.storeState() if err != nil { - klog.ErrorS(err, "Store state to checkpoint error") + klog.ErrorS(err, "Failed to store state to checkpoint") } } @@ -223,7 +223,7 @@ func (sc *stateCheckpoint) SetCPUAssignments(a ContainerCPUAssignments) { sc.cache.SetCPUAssignments(a) err := sc.storeState() if err != nil { - klog.ErrorS(err, "Store state to checkpoint error") + klog.ErrorS(err, "Failed to store state to checkpoint") } } @@ -234,7 +234,7 @@ func (sc *stateCheckpoint) Delete(podUID string, containerName string) { sc.cache.Delete(podUID, containerName) err := sc.storeState() if err != nil { - klog.ErrorS(err, "Store state to checkpoint error", "podUID", podUID, "containerName", containerName) + klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName) } } @@ -245,6 +245,6 @@ func (sc *stateCheckpoint) ClearState() { sc.cache.ClearState() err := sc.storeState() if err != nil { - klog.ErrorS(err, "Store state to checkpoint error") + klog.ErrorS(err, "Failed to store state to checkpoint") } }