From f8596d6d289bf292d5147791b688b9e76c2750d9 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Tue, 21 Jan 2025 14:41:39 +0000 Subject: [PATCH 1/9] node: device-mgr: Change ErrorS(nil, ...) to InfoS Ensure consistency across resource managers and update ErrorS(nil, ...) to InfoS. Similar changes have been proposed in CPU Manager and Memory Manager. Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/manager.go | 6 +++--- pkg/kubelet/cm/devicemanager/topology_hints.go | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index b6a8104624a..bb1dc13c0e6 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -453,7 +453,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) // should always be consistent. Otherwise, we run with the risk // of failing to garbage collect non-existing resources or devices. if !ok { - klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync") + klog.InfoS("Unexpected: healthyDevices and endpoints are out of sync") } delete(m.endpoints, resourceName) delete(m.healthyDevices, resourceName) @@ -468,7 +468,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) eI, ok := m.endpoints[resourceName] if (ok && eI.e.stopGracePeriodExpired()) || !ok { if !ok { - klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync") + klog.InfoS("Unexpected: unhealthyDevices and endpoints are out of sync") } delete(m.endpoints, resourceName) delete(m.unhealthyDevices, resourceName) @@ -952,7 +952,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co } if !m.checkPodActive(pod) { - klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID) + klog.InfoS("pod deleted from activePods, skip to reAllocate", "podUID", podUID) continue } diff --git a/pkg/kubelet/cm/devicemanager/topology_hints.go b/pkg/kubelet/cm/devicemanager/topology_hints.go index 7b5c9f4c9dd..0693eb68b72 100644 --- a/pkg/kubelet/cm/devicemanager/topology_hints.go +++ b/pkg/kubelet/cm/devicemanager/topology_hints.go @@ -54,7 +54,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource) if allocated.Len() > 0 { if allocated.Len() != requested { - klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len()) + klog.InfoS("Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } @@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map available := m.getAvailableDevices(resource) reusable := m.devicesToReuse[string(pod.UID)][resource] if available.Union(reusable).Len() < requested { - klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len()) + klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } @@ -105,7 +105,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana allocated := m.podDevices.podDevices(string(pod.UID), resource) if allocated.Len() > 0 { if allocated.Len() != requested { - klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len()) + klog.InfoS("Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } @@ -117,7 +117,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana // Get the list of available devices, for which TopologyHints should be generated. available := m.getAvailableDevices(resource) if available.Len() < requested { - klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len()) + klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } From c56426bd9f69d1e251bb3b9df97a3e5ba2082c40 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Tue, 21 Jan 2025 14:45:40 +0000 Subject: [PATCH 2/9] node: device-mgr: Update klog.Infof(..., err) to klog.ErrorS(err,...) Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/manager.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index bb1dc13c0e6..d76c7798bb4 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -351,7 +351,7 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc // Loads in allocatedDevices information from disk. err := m.readCheckpoint() if err != nil { - klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err) + klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date") } return m.server.Start() @@ -503,7 +503,7 @@ func (m *ManagerImpl) writeCheckpoint() error { err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data) if err != nil { err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err) - klog.InfoS("Failed to write checkpoint file", "err", err) + klog.ErrorS(err, "Failed to write checkpoint file") return err2 } return nil @@ -516,7 +516,7 @@ func (m *ManagerImpl) readCheckpoint() error { if err != nil { if err == errors.ErrCheckpointNotFound { // no point in trying anything else - klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err) + klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint) return nil } return err @@ -1167,7 +1167,7 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool { func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool { cntID, err := m.containerMap.GetContainerID(podUID, cntName) if err != nil { - klog.V(4).InfoS("container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName, "err", err) + klog.ErrorS(err, "container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName) return false } From 584e224b4318336dc15ccf3886f7df9abd89fd04 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Tue, 21 Jan 2025 15:07:53 +0000 Subject: [PATCH 3/9] node: device-mgr: Add metadata to logs Ensure that if possible, we provide sufficient metadata inclusing pod name and UID to allow filtering by pod name or its UID. Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/manager.go | 12 ++++++------ pkg/kubelet/cm/devicemanager/topology_hints.go | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index d76c7798bb4..579ec884230 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -836,7 +836,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont for k, v := range container.Resources.Limits { resource := string(k) needed := int(v.Value()) - klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource) + klog.V(3).InfoS("Looking for needed resources", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "needed", needed) if !m.isDevicePluginResource(resource) { continue } @@ -882,7 +882,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont devs := allocDevices.UnsortedList() // TODO: refactor this part of code to just append a ContainerAllocationRequest // in a passed in AllocateRequest pointer, and issues a single Allocate call per pod. - klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource) + klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name) resp, err := eI.e.allocate(devs) metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime)) if err != nil { @@ -952,7 +952,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co } if !m.checkPodActive(pod) { - klog.InfoS("pod deleted from activePods, skip to reAllocate", "podUID", podUID) + klog.InfoS("pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name) continue } @@ -984,7 +984,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s if eI.opts == nil || !eI.opts.PreStartRequired { m.mutex.Unlock() - klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource) + klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName) return nil } @@ -1014,12 +1014,12 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re } if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable { - klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource) + klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName) return nil, nil } m.mutex.Unlock() - klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", podUID) + klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "resourceName", resource, "containerName", contName, "podUID", podUID) resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size) m.mutex.Lock() if err != nil { diff --git a/pkg/kubelet/cm/devicemanager/topology_hints.go b/pkg/kubelet/cm/devicemanager/topology_hints.go index 0693eb68b72..4f3d6ab581f 100644 --- a/pkg/kubelet/cm/devicemanager/topology_hints.go +++ b/pkg/kubelet/cm/devicemanager/topology_hints.go @@ -43,7 +43,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map for resource, requested := range accumulatedResourceRequests { // Only consider devices that actually contain topology information. if aligned := m.deviceHasTopologyAlignment(resource); !aligned { - klog.InfoS("Resource does not have a topology preference", "resource", resource) + klog.InfoS("Resource does not have a topology preference", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested) deviceHints[resource] = nil continue } @@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map available := m.getAvailableDevices(resource) reusable := m.devicesToReuse[string(pod.UID)][resource] if available.Union(reusable).Len() < requested { - klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len()) + klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } @@ -94,7 +94,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana for resource, requested := range accumulatedResourceRequests { // Only consider devices that actually contain topology information. if aligned := m.deviceHasTopologyAlignment(resource); !aligned { - klog.InfoS("Resource does not have a topology preference", "resource", resource) + klog.InfoS("Resource does not have a topology preference", "resource", resource, "pod", klog.KObj(pod), "request", requested) deviceHints[resource] = nil continue } @@ -109,7 +109,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana deviceHints[resource] = []topologymanager.TopologyHint{} continue } - klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod)) + klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "allocated", allocated.Len()) deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested) continue } @@ -117,7 +117,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana // Get the list of available devices, for which TopologyHints should be generated. available := m.getAvailableDevices(resource) if available.Len() < requested { - klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len()) + klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } From 4b613ae96fec5290a072aa9cb622aca8836198c4 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Tue, 21 Jan 2025 18:03:26 +0000 Subject: [PATCH 4/9] node: device-mgr: Bump log level for expected skips Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/manager.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index 579ec884230..ed4703cfb36 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -952,7 +952,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co } if !m.checkPodActive(pod) { - klog.InfoS("pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name) + klog.V(5).InfoS("pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name) continue } @@ -984,7 +984,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s if eI.opts == nil || !eI.opts.PreStartRequired { m.mutex.Unlock() - klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName) + klog.V(5).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName) return nil } @@ -1014,7 +1014,7 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re } if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable { - klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName) + klog.V(5).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName) return nil, nil } From a585bd74c481192e6861d06dbe810cf04b72faae Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Tue, 21 Jan 2025 18:11:21 +0000 Subject: [PATCH 5/9] node: device-mgr: Keep log level consistent across all gRPC calls Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index ed4703cfb36..547c7b8c958 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -882,7 +882,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont devs := allocDevices.UnsortedList() // TODO: refactor this part of code to just append a ContainerAllocationRequest // in a passed in AllocateRequest pointer, and issues a single Allocate call per pod. - klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name) + klog.V(4).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name) resp, err := eI.e.allocate(devs) metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime)) if err != nil { From 9cd041156fb220f39ce39086f596068c5bec54ba Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Wed, 22 Jan 2025 15:58:31 +0000 Subject: [PATCH 6/9] node: device-mgr: Add logs in the happy path We have reasonable amount of logs when things go wrong. While debugging, it can be useful to have logs to indicate that things have gone as expected. Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/manager.go | 3 +++ pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go | 2 ++ pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go | 2 ++ pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go | 1 + 4 files changed, 8 insertions(+) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index 547c7b8c958..8a50af44123 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -506,6 +506,7 @@ func (m *ManagerImpl) writeCheckpoint() error { klog.ErrorS(err, "Failed to write checkpoint file") return err2 } + klog.V(4).InfoS("checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint) return nil } @@ -534,6 +535,8 @@ func (m *ManagerImpl) readCheckpoint() error { m.unhealthyDevices[resource] = sets.New[string]() m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil} } + + klog.V(4).InfoS("read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint) return nil } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go index bf66875a493..47946de0748 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go @@ -106,6 +106,8 @@ func (c *client) Disconnect() error { } c.mutex.Unlock() c.handler.PluginDisconnected(c.resource) + + klog.V(2).InfoS("device plugin disconnected", "resource", c.resource) return nil } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go index 9c3af0f9094..8f34793bb99 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go @@ -62,6 +62,7 @@ func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []s return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName)) } + klog.V(2).InfoS("plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions) return nil } @@ -75,6 +76,7 @@ func (s *server) connectClient(name string, socketPath string) error { return err } + klog.V(2).InfoS("Connected to new client", "resource", name) go func() { s.runClient(name, c) }() diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go index 312aa930a0f..c3998142957 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go @@ -145,6 +145,7 @@ func (s *server) Stop() error { // During kubelet termination, we do not need the registration server, // and we consider the kubelet to be healthy even when it is down. s.setHealthy() + klog.V(2).InfoS("Stopping device plugin registration server") return nil } From 19e406a3574f6f1b659f04b87e995c924e4b33d8 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Wed, 22 Jan 2025 16:03:54 +0000 Subject: [PATCH 7/9] node: device-mgr: Update klog.Infof(..., err) to klog.ErrorS(err,...) Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go | 2 +- pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go index 8f34793bb99..18c381bc6b0 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go @@ -114,7 +114,7 @@ func (s *server) runClient(name string, c Client) { } if err := s.disconnectClient(name, c); err != nil { - klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err) + klog.ErrorS(err, "Unable to disconnect client", "resource", name, "client", c) } } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go index c3998142957..130d143c9e8 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go @@ -91,7 +91,7 @@ func (s *server) Start() error { if selinux.GetEnabled() { if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil { - klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err) + klog.ErrorS(err, "Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir) } } @@ -128,7 +128,7 @@ func (s *server) Start() error { func (s *server) Stop() error { s.visitClients(func(r string, c Client) { if err := s.disconnectClient(r, c); err != nil { - klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err) + klog.ErrorS(err, "Error disconnecting device plugin client", "resourceName", r) } }) @@ -160,18 +160,18 @@ func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Emp if !s.isVersionCompatibleWithPlugin(r.Version) { err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions) - klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err) + klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", r.ResourceName) return &api.Empty{}, err } if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) { err := fmt.Errorf(errInvalidResourceName, r.ResourceName) - klog.InfoS("Bad registration request from device plugin", "err", err) + klog.ErrorS(err, "Bad registration request from device plugin") return &api.Empty{}, err } if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil { - klog.InfoS("Error connecting to device plugin client", "err", err) + klog.ErrorS(err, "Error connecting to device plugin client") return &api.Empty{}, err } From 40c86d8e064a25556e278374a8bedffe7759c8b1 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Wed, 29 Jan 2025 17:40:16 +0000 Subject: [PATCH 8/9] node: device-mgr: Ensure consistent use of named arguments Throughout the devicemanager codebase the named argument to represent resource for logging pupose is `resourceName` as opposed to `resource`. The latter can only be seen in topology_hints.go files. To ensure consistency with the rest of the codebase and also because we want to adhere to the recommendations in the Kubernetes documentation about named arguments: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/migration-to-structured-logging.md#name-arguments we update the key from `resource` to `resourceName`. Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/topology_hints.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/topology_hints.go b/pkg/kubelet/cm/devicemanager/topology_hints.go index 4f3d6ab581f..f4d7fa73a8a 100644 --- a/pkg/kubelet/cm/devicemanager/topology_hints.go +++ b/pkg/kubelet/cm/devicemanager/topology_hints.go @@ -43,7 +43,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map for resource, requested := range accumulatedResourceRequests { // Only consider devices that actually contain topology information. if aligned := m.deviceHasTopologyAlignment(resource); !aligned { - klog.InfoS("Resource does not have a topology preference", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested) + klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested) deviceHints[resource] = nil continue } @@ -54,11 +54,11 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource) if allocated.Len() > 0 { if allocated.Len() != requested { - klog.InfoS("Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len()) + klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } - klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name) + klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name) deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested) continue } @@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map available := m.getAvailableDevices(resource) reusable := m.devicesToReuse[string(pod.UID)][resource] if available.Union(reusable).Len() < requested { - klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len()) + klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } @@ -94,7 +94,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana for resource, requested := range accumulatedResourceRequests { // Only consider devices that actually contain topology information. if aligned := m.deviceHasTopologyAlignment(resource); !aligned { - klog.InfoS("Resource does not have a topology preference", "resource", resource, "pod", klog.KObj(pod), "request", requested) + klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "request", requested) deviceHints[resource] = nil continue } @@ -105,11 +105,11 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana allocated := m.podDevices.podDevices(string(pod.UID), resource) if allocated.Len() > 0 { if allocated.Len() != requested { - klog.InfoS("Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len()) + klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } - klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "allocated", allocated.Len()) + klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "allocated", allocated.Len()) deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested) continue } @@ -117,7 +117,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana // Get the list of available devices, for which TopologyHints should be generated. available := m.getAvailableDevices(resource) if available.Len() < requested { - klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len()) + klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len()) deviceHints[resource] = []topologymanager.TopologyHint{} continue } From f449697457334218e8285afe52ea74c4d67f8726 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Thu, 6 Feb 2025 17:42:11 +0000 Subject: [PATCH 9/9] node: device-mgr: Adhere to the message style guidelines Ensure that the log messages adhere to the message style guildelines as captured [here](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md#message-style-guidelines). Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/manager.go | 20 +++++++++---------- .../cm/devicemanager/plugin/v1beta1/client.go | 2 +- .../devicemanager/plugin/v1beta1/handler.go | 2 +- .../cm/devicemanager/plugin/v1beta1/server.go | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index 8a50af44123..5f4a72a43e3 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -468,7 +468,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) eI, ok := m.endpoints[resourceName] if (ok && eI.e.stopGracePeriodExpired()) || !ok { if !ok { - klog.InfoS("Unexpected: unhealthyDevices and endpoints are out of sync") + klog.InfoS("Unexpected: unhealthyDevices and endpoints became out of sync") } delete(m.endpoints, resourceName) delete(m.unhealthyDevices, resourceName) @@ -484,7 +484,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) m.mutex.Unlock() if needsUpdateCheckpoint { if err := m.writeCheckpoint(); err != nil { - klog.ErrorS(err, "Error on writing checkpoint") + klog.ErrorS(err, "Failed to write checkpoint file") } } return capacity, allocatable, deletedResources.UnsortedList() @@ -506,7 +506,7 @@ func (m *ManagerImpl) writeCheckpoint() error { klog.ErrorS(err, "Failed to write checkpoint file") return err2 } - klog.V(4).InfoS("checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint) + klog.V(4).InfoS("Checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint) return nil } @@ -536,7 +536,7 @@ func (m *ManagerImpl) readCheckpoint() error { m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil} } - klog.V(4).InfoS("read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint) + klog.V(4).InfoS("Read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint) return nil } @@ -599,7 +599,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi // running, then it can only be a kubelet restart. On node reboot the runtime and the containers were also shut down. Then, if the container was running, it can only be // because it already has access to all the required devices, so we got nothing to do and we can bail out. if !m.sourcesReady.AllReady() && m.isContainerAlreadyRunning(podUID, contName) { - klog.V(3).InfoS("container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName) + klog.V(3).InfoS("Container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName) return nil, nil } @@ -630,7 +630,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi // We handled the known error paths in scenario 3 (node reboot), so from now on we can fall back in a common path. // We cover container restart on kubelet steady state with the same flow. if needed == 0 { - klog.V(3).InfoS("no devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName) + klog.V(3).InfoS("No devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName) // No change, no work. return nil, nil } @@ -955,7 +955,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co } if !m.checkPodActive(pod) { - klog.V(5).InfoS("pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name) + klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name) continue } @@ -1170,7 +1170,7 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool { func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool { cntID, err := m.containerMap.GetContainerID(podUID, cntName) if err != nil { - klog.ErrorS(err, "container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName) + klog.ErrorS(err, "Container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName) return false } @@ -1178,11 +1178,11 @@ func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool { // so on kubelet restart containers will again fail admission, hitting https://github.com/kubernetes/kubernetes/issues/118559 again. // This scenario should however be rare enough. if !m.containerRunningSet.Has(cntID) { - klog.V(4).InfoS("container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID) + klog.V(4).InfoS("Container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID) return false } // Once we make it here we know we have a running container. - klog.V(4).InfoS("container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID) + klog.V(4).InfoS("Container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID) return true } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go index 47946de0748..98359ab32c0 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go @@ -107,7 +107,7 @@ func (c *client) Disconnect() error { c.mutex.Unlock() c.handler.PluginDisconnected(c.resource) - klog.V(2).InfoS("device plugin disconnected", "resource", c.resource) + klog.V(2).InfoS("Device plugin disconnected", "resource", c.resource) return nil } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go index 18c381bc6b0..d1bca0cccae 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go @@ -62,7 +62,7 @@ func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []s return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName)) } - klog.V(2).InfoS("plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions) + klog.V(2).InfoS("Device plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions) return nil } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go index 130d143c9e8..fd37315810e 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go @@ -128,7 +128,7 @@ func (s *server) Start() error { func (s *server) Stop() error { s.visitClients(func(r string, c Client) { if err := s.disconnectClient(r, c); err != nil { - klog.ErrorS(err, "Error disconnecting device plugin client", "resourceName", r) + klog.ErrorS(err, "Failed to disconnect device plugin client", "resourceName", r) } })