diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index b6a8104624a..5f4a72a43e3 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -351,7 +351,7 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
 	// Loads in allocatedDevices information from disk.
 	err := m.readCheckpoint()
 	if err != nil {
-		klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)
+		klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date")
 	}
 
 	return m.server.Start()
@@ -453,7 +453,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 			// should always be consistent. Otherwise, we run with the risk
 			// of failing to garbage collect non-existing resources or devices.
 			if !ok {
-				klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync")
+				klog.InfoS("Unexpected: healthyDevices and endpoints are out of sync")
 			}
 			delete(m.endpoints, resourceName)
 			delete(m.healthyDevices, resourceName)
@@ -468,7 +468,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 		eI, ok := m.endpoints[resourceName]
 		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
 			if !ok {
-				klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync")
+				klog.InfoS("Unexpected: unhealthyDevices and endpoints became out of sync")
 			}
 			delete(m.endpoints, resourceName)
 			delete(m.unhealthyDevices, resourceName)
@@ -484,7 +484,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 	m.mutex.Unlock()
 	if needsUpdateCheckpoint {
 		if err := m.writeCheckpoint(); err != nil {
-			klog.ErrorS(err, "Error on writing checkpoint")
+			klog.ErrorS(err, "Failed to write checkpoint file")
 		}
 	}
 	return capacity, allocatable, deletedResources.UnsortedList()
@@ -503,9 +503,10 @@ func (m *ManagerImpl) writeCheckpoint() error {
 	err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
 	if err != nil {
 		err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
-		klog.InfoS("Failed to write checkpoint file", "err", err)
+		klog.ErrorS(err, "Failed to write checkpoint file")
 		return err2
 	}
+	klog.V(4).InfoS("Checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint)
 	return nil
 }
 
@@ -516,7 +517,7 @@ func (m *ManagerImpl) readCheckpoint() error {
 	if err != nil {
 		if err == errors.ErrCheckpointNotFound {
 			// no point in trying anything else
-			klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
+			klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint)
 			return nil
 		}
 		return err
@@ -534,6 +535,8 @@ func (m *ManagerImpl) readCheckpoint() error {
 		m.unhealthyDevices[resource] = sets.New[string]()
 		m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
 	}
+
+	klog.V(4).InfoS("Read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint)
 	return nil
 }
 
@@ -596,7 +599,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 	// running, then it can only be a kubelet restart. On node reboot the runtime and the containers were also shut down. Then, if the container was running, it can only be
 	// because it already has access to all the required devices, so we got nothing to do and we can bail out.
 	if !m.sourcesReady.AllReady() && m.isContainerAlreadyRunning(podUID, contName) {
-		klog.V(3).InfoS("container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+		klog.V(3).InfoS("Container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
 		return nil, nil
 	}
 
@@ -627,7 +630,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 	// We handled the known error paths in scenario 3 (node reboot), so from now on we can fall back in a common path.
 	// We cover container restart on kubelet steady state with the same flow.
 	if needed == 0 {
-		klog.V(3).InfoS("no devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+		klog.V(3).InfoS("No devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
 		// No change, no work.
 		return nil, nil
 	}
@@ -836,7 +839,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 	for k, v := range container.Resources.Limits {
 		resource := string(k)
 		needed := int(v.Value())
-		klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource)
+		klog.V(3).InfoS("Looking for needed resources", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "needed", needed)
 		if !m.isDevicePluginResource(resource) {
 			continue
 		}
@@ -882,7 +885,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		devs := allocDevices.UnsortedList()
 		// TODO: refactor this part of code to just append a ContainerAllocationRequest
 		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
- klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource) + klog.V(4).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name) resp, err := eI.e.allocate(devs) metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime)) if err != nil { @@ -952,7 +955,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co } if !m.checkPodActive(pod) { - klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID) + klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name) continue } @@ -984,7 +987,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s if eI.opts == nil || !eI.opts.PreStartRequired { m.mutex.Unlock() - klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource) + klog.V(5).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName) return nil } @@ -1014,12 +1017,12 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re } if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable { - klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource) + klog.V(5).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName) return nil, nil } m.mutex.Unlock() - klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", podUID) + klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "resourceName", resource, "containerName", contName, "podUID", podUID) resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size) m.mutex.Lock() if err != nil { @@ -1167,7 +1170,7 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool { func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool { cntID, err := m.containerMap.GetContainerID(podUID, cntName) if err != nil { - klog.V(4).InfoS("container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName, "err", err) + klog.ErrorS(err, "Container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName) return false } @@ -1175,11 +1178,11 @@ func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool { // so on kubelet restart containers will again fail admission, hitting https://github.com/kubernetes/kubernetes/issues/118559 again. // This scenario should however be rare enough. if !m.containerRunningSet.Has(cntID) { - klog.V(4).InfoS("container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID) + klog.V(4).InfoS("Container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID) return false } // Once we make it here we know we have a running container. 
- klog.V(4).InfoS("container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID) + klog.V(4).InfoS("Container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID) return true } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go index bf66875a493..98359ab32c0 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go @@ -106,6 +106,8 @@ func (c *client) Disconnect() error { } c.mutex.Unlock() c.handler.PluginDisconnected(c.resource) + + klog.V(2).InfoS("Device plugin disconnected", "resource", c.resource) return nil } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go index 9c3af0f9094..d1bca0cccae 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go @@ -62,6 +62,7 @@ func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []s return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName)) } + klog.V(2).InfoS("Device plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions) return nil } @@ -75,6 +76,7 @@ func (s *server) connectClient(name string, socketPath string) error { return err } + klog.V(2).InfoS("Connected to new client", "resource", name) go func() { s.runClient(name, c) }() @@ -112,7 +114,7 @@ func (s *server) runClient(name string, c Client) { } if err := s.disconnectClient(name, c); err != nil { - klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err) + klog.ErrorS(err, "Unable to disconnect client", "resource", name, "client", c) } } diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go index 312aa930a0f..fd37315810e 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go @@ -91,7 +91,7 @@ func (s *server) Start() error { if selinux.GetEnabled() { if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil { - klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err) + klog.ErrorS(err, "Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir) } } @@ -128,7 +128,7 @@ func (s *server) Start() error { func (s *server) Stop() error { s.visitClients(func(r string, c Client) { if err := s.disconnectClient(r, c); err != nil { - klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err) + klog.ErrorS(err, "Failed to disconnect device plugin client", "resourceName", r) } }) @@ -145,6 +145,7 @@ func (s *server) Stop() error { // During kubelet termination, we do not need the registration server, // and we consider the kubelet to be healthy even when it is down. 
 	s.setHealthy()
+	klog.V(2).InfoS("Stopping device plugin registration server")
 	return nil
 }
 
@@ -159,18 +160,18 @@ func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Emp
 
 	if !s.isVersionCompatibleWithPlugin(r.Version) {
 		err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
-		klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
+		klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", r.ResourceName)
 		return &api.Empty{}, err
 	}
 
 	if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
 		err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
-		klog.InfoS("Bad registration request from device plugin", "err", err)
+		klog.ErrorS(err, "Bad registration request from device plugin")
 		return &api.Empty{}, err
 	}
 
 	if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
-		klog.InfoS("Error connecting to device plugin client", "err", err)
+		klog.ErrorS(err, "Error connecting to device plugin client")
 		return &api.Empty{}, err
 	}
 
diff --git a/pkg/kubelet/cm/devicemanager/topology_hints.go b/pkg/kubelet/cm/devicemanager/topology_hints.go
index 7b5c9f4c9dd..f4d7fa73a8a 100644
--- a/pkg/kubelet/cm/devicemanager/topology_hints.go
+++ b/pkg/kubelet/cm/devicemanager/topology_hints.go
@@ -43,7 +43,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 	for resource, requested := range accumulatedResourceRequests {
 		// Only consider devices that actually contain topology information.
 		if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
-			klog.InfoS("Resource does not have a topology preference", "resource", resource)
+			klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested)
 			deviceHints[resource] = nil
 			continue
 		}
@@ -54,11 +54,11 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 		allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
 		if allocated.Len() > 0 {
 			if allocated.Len() != requested {
-				klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
+				klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
 				deviceHints[resource] = []topologymanager.TopologyHint{}
 				continue
 			}
-			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
+			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
 			deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
 			continue
 		}
@@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 		available := m.getAvailableDevices(resource)
 		reusable := m.devicesToReuse[string(pod.UID)][resource]
 		if available.Union(reusable).Len() < requested {
-			klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
+			klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len())
 			deviceHints[resource] = []topologymanager.TopologyHint{}
 			continue
 		}
@@ -94,7 +94,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
 	for resource, requested := range accumulatedResourceRequests {
 		// Only consider devices that actually contain topology information.
 		if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
-			klog.InfoS("Resource does not have a topology preference", "resource", resource)
+			klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "request", requested)
 			deviceHints[resource] = nil
 			continue
 		}
@@ -105,11 +105,11 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
 		allocated := m.podDevices.podDevices(string(pod.UID), resource)
 		if allocated.Len() > 0 {
 			if allocated.Len() != requested {
-				klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
+				klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
 				deviceHints[resource] = []topologymanager.TopologyHint{}
 				continue
 			}
-			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
+			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "allocated", allocated.Len())
 			deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
 			continue
 		}
@@ -117,7 +117,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
 		// Get the list of available devices, for which TopologyHints should be generated.
 		available := m.getAvailableDevices(resource)
 		if available.Len() < requested {
-			klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len())
+			klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len())
 			deviceHints[resource] = []topologymanager.TopologyHint{}
 			continue
 		}