node: devicemgr: topomgr: add logs

One of the contributing factors of issues #118559 and #109595 hard to
debug and fix is that the devicemanager has very few logs in important
flow, so it's unnecessarily hard to reconstruct the state from logs.

We add minimal logs to be able to improve troubleshooting.
We add minimal logs to be backport-friendly, deferring a more
comprehensive review of logging to later PRs.

Signed-off-by: Francesco Romani <fromani@redhat.com>
This commit is contained in:
Francesco Romani 2023-07-11 18:54:51 +02:00
parent d78671447f
commit c635a7e7d8
2 changed files with 5 additions and 1 deletions

View File

@ -226,6 +226,7 @@ func (m *ManagerImpl) PluginConnected(resourceName string, p plugin.DevicePlugin
defer m.mutex.Unlock()
m.endpoints[resourceName] = endpointInfo{e, options}
klog.V(2).InfoS("Device plugin connected", "resourceName", resourceName)
return nil
}
@ -256,6 +257,7 @@ func (m *ManagerImpl) PluginListAndWatchReceiver(resourceName string, resp *plug
}
func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
healthyCount := 0
m.mutex.Lock()
m.healthyDevices[resourceName] = sets.NewString()
m.unhealthyDevices[resourceName] = sets.NewString()
@ -264,6 +266,7 @@ func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices [
m.allDevices[resourceName][dev.ID] = dev
if dev.Health == pluginapi.Healthy {
m.healthyDevices[resourceName].Insert(dev.ID)
healthyCount++
} else {
m.unhealthyDevices[resourceName].Insert(dev.ID)
}
@ -272,6 +275,7 @@ func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices [
if err := m.writeCheckpoint(); err != nil {
klog.ErrorS(err, "Writing checkpoint encountered")
}
klog.V(2).InfoS("Processed device updates for resource", "resourceName", resourceName, "totalCount", len(devices), "healthyCount", healthyCount)
}
// GetWatcherHandler returns the plugin handler

View File

@ -209,7 +209,7 @@ func (m *manager) RemoveContainer(containerID string) error {
}
func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
klog.InfoS("Topology Admit Handler")
klog.InfoS("Topology Admit Handler", "podUID", attrs.Pod.UID, "podNamespace", attrs.Pod.Namespace, "podName", attrs.Pod.Name)
metrics.TopologyManagerAdmissionRequestsTotal.Inc()
startTime := time.Now()