Merge pull request #129735 from swatisehgal/device-mgr-logs-improvements

Device Manager logging improvements
Kubernetes Prow Robot 2025-02-07 07:11:56 -08:00, committed by GitHub
commit 39f1c90ac0
5 changed files with 39 additions and 31 deletions
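
The hunks below consistently apply klog's structured logging conventions: klog.ErrorS with the error as its first argument for failure paths (instead of an "err" key/value pair), klog.InfoS for expected events, and klog.V(n).InfoS with higher thresholds for chattier detail, with context passed as key/value pairs rather than formatted into the message. A minimal, self-contained sketch of the pattern; the resource and checkpoint names here are illustrative placeholders:

package main

import (
	"errors"
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	flag.Set("v", "4") // enable verbose logs up to V(4)
	defer klog.Flush()

	// Routine state change: info severity plus structured key/value context.
	klog.InfoS("Device plugin disconnected", "resource", "example.com/gpu")

	// Failure path: the error is the first argument of ErrorS, not an "err" key/value pair.
	err := errors.New("checkpoint not found")
	klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", "kubelet_internal_checkpoint")

	// Verbose detail: emitted only when running with -v=4 or higher.
	klog.V(4).InfoS("Checkpoint file written", "checkpoint", "kubelet_internal_checkpoint")
}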

@@ -351,7 +351,7 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
 	// Loads in allocatedDevices information from disk.
 	err := m.readCheckpoint()
 	if err != nil {
-		klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)
+		klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date")
 	}
 	return m.server.Start()
@@ -453,7 +453,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 			// should always be consistent. Otherwise, we run with the risk
 			// of failing to garbage collect non-existing resources or devices.
 			if !ok {
-				klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync")
+				klog.InfoS("Unexpected: healthyDevices and endpoints are out of sync")
 			}
 			delete(m.endpoints, resourceName)
 			delete(m.healthyDevices, resourceName)
@@ -468,7 +468,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 		eI, ok := m.endpoints[resourceName]
 		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
 			if !ok {
-				klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync")
+				klog.InfoS("Unexpected: unhealthyDevices and endpoints became out of sync")
 			}
 			delete(m.endpoints, resourceName)
 			delete(m.unhealthyDevices, resourceName)
@@ -484,7 +484,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 	m.mutex.Unlock()
 	if needsUpdateCheckpoint {
 		if err := m.writeCheckpoint(); err != nil {
-			klog.ErrorS(err, "Error on writing checkpoint")
+			klog.ErrorS(err, "Failed to write checkpoint file")
 		}
 	}
 	return capacity, allocatable, deletedResources.UnsortedList()
@@ -503,9 +503,10 @@ func (m *ManagerImpl) writeCheckpoint() error {
 	err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
 	if err != nil {
 		err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
-		klog.InfoS("Failed to write checkpoint file", "err", err)
+		klog.ErrorS(err, "Failed to write checkpoint file")
 		return err2
 	}
+	klog.V(4).InfoS("Checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint)
 	return nil
 }
@@ -516,7 +517,7 @@ func (m *ManagerImpl) readCheckpoint() error {
 	if err != nil {
 		if err == errors.ErrCheckpointNotFound {
 			// no point in trying anything else
-			klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
+			klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint)
 			return nil
 		}
 		return err
@@ -534,6 +535,8 @@ func (m *ManagerImpl) readCheckpoint() error {
 		m.unhealthyDevices[resource] = sets.New[string]()
 		m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
 	}
+	klog.V(4).InfoS("Read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint)
 	return nil
 }
@@ -596,7 +599,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 	// running, then it can only be a kubelet restart. On node reboot the runtime and the containers were also shut down. Then, if the container was running, it can only be
 	// because it already has access to all the required devices, so we got nothing to do and we can bail out.
 	if !m.sourcesReady.AllReady() && m.isContainerAlreadyRunning(podUID, contName) {
-		klog.V(3).InfoS("container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+		klog.V(3).InfoS("Container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
 		return nil, nil
 	}
@@ -627,7 +630,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 	// We handled the known error paths in scenario 3 (node reboot), so from now on we can fall back in a common path.
 	// We cover container restart on kubelet steady state with the same flow.
 	if needed == 0 {
-		klog.V(3).InfoS("no devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+		klog.V(3).InfoS("No devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
 		// No change, no work.
 		return nil, nil
 	}
@@ -836,7 +839,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 	for k, v := range container.Resources.Limits {
 		resource := string(k)
 		needed := int(v.Value())
-		klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource)
+		klog.V(3).InfoS("Looking for needed resources", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "needed", needed)
 		if !m.isDevicePluginResource(resource) {
 			continue
 		}
@@ -882,7 +885,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		devs := allocDevices.UnsortedList()
 		// TODO: refactor this part of code to just append a ContainerAllocationRequest
 		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
-		klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource)
+		klog.V(4).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
 		resp, err := eI.e.allocate(devs)
 		metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime))
 		if err != nil {
@@ -952,7 +955,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
 		}
 		if !m.checkPodActive(pod) {
-			klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)
+			klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name)
 			continue
 		}
@@ -984,7 +987,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s
 	if eI.opts == nil || !eI.opts.PreStartRequired {
 		m.mutex.Unlock()
-		klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource)
+		klog.V(5).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName)
 		return nil
 	}
@@ -1014,12 +1017,12 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re
 	}
 	if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable {
-		klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource)
+		klog.V(5).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName)
 		return nil, nil
 	}
 	m.mutex.Unlock()
-	klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", podUID)
+	klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "resourceName", resource, "containerName", contName, "podUID", podUID)
 	resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size)
 	m.mutex.Lock()
 	if err != nil {
@@ -1167,7 +1170,7 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool {
 func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
 	cntID, err := m.containerMap.GetContainerID(podUID, cntName)
 	if err != nil {
-		klog.V(4).InfoS("container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName, "err", err)
+		klog.ErrorS(err, "Container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName)
 		return false
 	}
@@ -1175,11 +1178,11 @@ func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
 	// so on kubelet restart containers will again fail admission, hitting https://github.com/kubernetes/kubernetes/issues/118559 again.
 	// This scenario should however be rare enough.
 	if !m.containerRunningSet.Has(cntID) {
-		klog.V(4).InfoS("container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
+		klog.V(4).InfoS("Container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
 		return false
 	}
 	// Once we make it here we know we have a running container.
-	klog.V(4).InfoS("container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
+	klog.V(4).InfoS("Container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
 	return true
 }
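
Many of the rewritten call sites above attach the pod with klog.KObj, in addition to (or instead of) the raw UID; KObj renders an API object as namespace/name in the structured output. A small sketch with a hypothetical pod:

package main

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

func main() {
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "gpu-pod", Namespace: "default"}}

	// klog.KObj(pod) renders as pod="default/gpu-pod" in text output,
	// keeping log lines correlated with the object without manual formatting.
	klog.InfoS("Looking for needed resources", "pod", klog.KObj(pod), "containerName", "cuda", "needed", 2)
}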

@@ -106,6 +106,8 @@ func (c *client) Disconnect() error {
 	}
 	c.mutex.Unlock()
 	c.handler.PluginDisconnected(c.resource)
+	klog.V(2).InfoS("Device plugin disconnected", "resource", c.resource)
 	return nil
 }

@@ -62,6 +62,7 @@ func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []s
 		return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
 	}
+	klog.V(2).InfoS("Device plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions)
 	return nil
 }
@@ -75,6 +76,7 @@ func (s *server) connectClient(name string, socketPath string) error {
 		return err
 	}
+	klog.V(2).InfoS("Connected to new client", "resource", name)
 	go func() {
 		s.runClient(name, c)
 	}()
@@ -112,7 +114,7 @@ func (s *server) runClient(name string, c Client) {
 	}
 	if err := s.disconnectClient(name, c); err != nil {
-		klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err)
+		klog.ErrorS(err, "Unable to disconnect client", "resource", name, "client", c)
 	}
 }

@@ -91,7 +91,7 @@ func (s *server) Start() error {
 	if selinux.GetEnabled() {
 		if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil {
-			klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err)
+			klog.ErrorS(err, "Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir)
 		}
 	}
@@ -128,7 +128,7 @@ func (s *server) Start() error {
 func (s *server) Stop() error {
 	s.visitClients(func(r string, c Client) {
 		if err := s.disconnectClient(r, c); err != nil {
-			klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err)
+			klog.ErrorS(err, "Failed to disconnect device plugin client", "resourceName", r)
 		}
 	})
@@ -145,6 +145,7 @@ func (s *server) Stop() error {
 	// During kubelet termination, we do not need the registration server,
 	// and we consider the kubelet to be healthy even when it is down.
 	s.setHealthy()
+	klog.V(2).InfoS("Stopping device plugin registration server")
 	return nil
 }
@@ -159,18 +160,18 @@ func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Emp
 	if !s.isVersionCompatibleWithPlugin(r.Version) {
 		err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
-		klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
+		klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", r.ResourceName)
 		return &api.Empty{}, err
 	}
 	if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
 		err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
-		klog.InfoS("Bad registration request from device plugin", "err", err)
+		klog.ErrorS(err, "Bad registration request from device plugin")
 		return &api.Empty{}, err
 	}
 	if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
-		klog.InfoS("Error connecting to device plugin client", "err", err)
+		klog.ErrorS(err, "Error connecting to device plugin client")
 		return &api.Empty{}, err
 	}
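
The Register hunk above keeps a build-once, log-once shape for rejections: the reason is constructed as an error, logged once at error severity with its context, and the same error is returned to the caller. A condensed sketch of that shape, with hypothetical names and a hard-coded supported version:

package main

import (
	"fmt"

	"k8s.io/klog/v2"
)

// validateVersion is a hypothetical stand-in for the version check in Register.
func validateVersion(version, resourceName string) error {
	const supported = "v1beta1"
	if version != supported {
		err := fmt.Errorf("unsupported API version %q, supported versions: %v", version, []string{supported})
		// Log once, at error severity, then hand the same error back to the caller.
		klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", resourceName)
		return err
	}
	return nil
}

func main() {
	if err := validateVersion("v1alpha1", "example.com/gpu"); err != nil {
		fmt.Println("registration rejected:", err)
	}
}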

@@ -43,7 +43,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 	for resource, requested := range accumulatedResourceRequests {
 		// Only consider devices that actually contain topology information.
 		if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
-			klog.InfoS("Resource does not have a topology preference", "resource", resource)
+			klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested)
 			deviceHints[resource] = nil
 			continue
 		}
@@ -54,11 +54,11 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 		allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
 		if allocated.Len() > 0 {
 			if allocated.Len() != requested {
-				klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
+				klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
 				deviceHints[resource] = []topologymanager.TopologyHint{}
 				continue
 			}
-			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
+			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
 			deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
 			continue
 		}
@@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 		available := m.getAvailableDevices(resource)
 		reusable := m.devicesToReuse[string(pod.UID)][resource]
 		if available.Union(reusable).Len() < requested {
-			klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
+			klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len())
 			deviceHints[resource] = []topologymanager.TopologyHint{}
 			continue
 		}
@@ -94,7 +94,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
 	for resource, requested := range accumulatedResourceRequests {
 		// Only consider devices that actually contain topology information.
 		if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
-			klog.InfoS("Resource does not have a topology preference", "resource", resource)
+			klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "request", requested)
 			deviceHints[resource] = nil
 			continue
 		}
@@ -105,11 +105,11 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
 		allocated := m.podDevices.podDevices(string(pod.UID), resource)
 		if allocated.Len() > 0 {
 			if allocated.Len() != requested {
-				klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
+				klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
 				deviceHints[resource] = []topologymanager.TopologyHint{}
 				continue
 			}
-			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
+			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "allocated", allocated.Len())
 			deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
 			continue
 		}
@@ -117,7 +117,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
 		// Get the list of available devices, for which TopologyHints should be generated.
 		available := m.getAvailableDevices(resource)
 		if available.Len() < requested {
-			klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "request", requested, "available", available.Len())
+			klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len())
 			deviceHints[resource] = []topologymanager.TopologyHint{}
 			continue
 		}
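
A recurring change in the topology-hint hunks above is demoting klog.ErrorS(nil, …) to klog.InfoS. ErrorS with a nil error still logs at error severity, which overstates conditions the manager handles gracefully, such as a temporary shortage of devices. A minimal sketch contrasting the two severities:

package main

import "k8s.io/klog/v2"

func main() {
	defer klog.Flush()

	// Old style: a nil error still produces an error-severity line ("E..." in text output).
	klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable",
		"resourceName", "example.com/gpu", "request", 2, "available", 1)

	// New style: the same handled condition at info severity ("I..." in text output).
	klog.InfoS("Unable to generate topology hints: requested number of devices unavailable",
		"resourceName", "example.com/gpu", "request", 2, "available", 1)
}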