From c635a7e7d8362ac7c706680e77f7680895b1d517 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Tue, 11 Jul 2023 18:54:51 +0200 Subject: [PATCH] node: devicemgr: topomgr: add logs One of the factors that makes issues #118559 and #109595 hard to debug and fix is that the devicemanager has very few logs in important flows, so it's unnecessarily hard to reconstruct the state from logs. We add minimal logs to improve troubleshooting. We keep the additions minimal to be backport-friendly, deferring a more comprehensive review of logging to later PRs. Signed-off-by: Francesco Romani --- pkg/kubelet/cm/devicemanager/manager.go | 4 ++++ pkg/kubelet/cm/topologymanager/topology_manager.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index fed9ba4bb89..d780ee801bd 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -226,6 +226,7 @@ func (m *ManagerImpl) PluginConnected(resourceName string, p plugin.DevicePlugin defer m.mutex.Unlock() m.endpoints[resourceName] = endpointInfo{e, options} + klog.V(2).InfoS("Device plugin connected", "resourceName", resourceName) return nil } @@ -256,6 +257,7 @@ func (m *ManagerImpl) PluginListAndWatchReceiver(resourceName string, resp *plug } func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) { + healthyCount := 0 m.mutex.Lock() m.healthyDevices[resourceName] = sets.NewString() m.unhealthyDevices[resourceName] = sets.NewString() @@ -264,6 +266,7 @@ func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices [ m.allDevices[resourceName][dev.ID] = dev if dev.Health == pluginapi.Healthy { m.healthyDevices[resourceName].Insert(dev.ID) + healthyCount++ } else { m.unhealthyDevices[resourceName].Insert(dev.ID) } @@ -272,6 +275,7 @@ func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices [ if err := 
m.writeCheckpoint(); err != nil { klog.ErrorS(err, "Writing checkpoint encountered") } + klog.V(2).InfoS("Processed device updates for resource", "resourceName", resourceName, "totalCount", len(devices), "healthyCount", healthyCount) } // GetWatcherHandler returns the plugin handler diff --git a/pkg/kubelet/cm/topologymanager/topology_manager.go b/pkg/kubelet/cm/topologymanager/topology_manager.go index 567736e82d3..35f3e3e4715 100644 --- a/pkg/kubelet/cm/topologymanager/topology_manager.go +++ b/pkg/kubelet/cm/topologymanager/topology_manager.go @@ -209,7 +209,7 @@ func (m *manager) RemoveContainer(containerID string) error { } func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult { - klog.InfoS("Topology Admit Handler") + klog.InfoS("Topology Admit Handler", "podUID", attrs.Pod.UID, "podNamespace", attrs.Pod.Namespace, "podName", attrs.Pod.Name) metrics.TopologyManagerAdmissionRequestsTotal.Inc() startTime := time.Now()