Merge pull request #129735 from swatisehgal/device-mgr-logs-improvements

Device Manager logging improvements

Commit: 39f1c90ac0
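The diff below applies one consistent set of klog conventions across the device manager: genuine failures move from klog.InfoS with an "err" key to klog.ErrorS; "unexpected state" warnings drop the klog.ErrorS(nil, ...) pattern in favor of klog.InfoS; message strings start with a capital letter; chatty per-pod lines move to higher verbosity (V(4)/V(5)); and allocation logs gain pod and container context via klog.KObj. As a minimal runnable sketch of the first of these swaps, using the message from the first hunk and an invented error value:

package main

import (
    "errors"

    "k8s.io/klog/v2"
)

func main() {
    err := errors.New("checkpoint file corrupted") // hypothetical error, for illustration only

    // Old style: info-level line, with the error attached as an ordinary key/value pair.
    klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)

    // New style: error-severity line; ErrorS renders err in the standard "err" field.
    klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date")

    klog.Flush()
}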
@@ -351,7 +351,7 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
     // Loads in allocatedDevices information from disk.
     err := m.readCheckpoint()
     if err != nil {
-        klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)
+        klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date")
     }

     return m.server.Start()
@@ -453,7 +453,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
         // should always be consistent. Otherwise, we run with the risk
         // of failing to garbage collect non-existing resources or devices.
         if !ok {
-            klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync")
+            klog.InfoS("Unexpected: healthyDevices and endpoints are out of sync")
         }
         delete(m.endpoints, resourceName)
         delete(m.healthyDevices, resourceName)
@@ -468,7 +468,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
         eI, ok := m.endpoints[resourceName]
         if (ok && eI.e.stopGracePeriodExpired()) || !ok {
             if !ok {
-                klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync")
+                klog.InfoS("Unexpected: unhealthyDevices and endpoints became out of sync")
             }
             delete(m.endpoints, resourceName)
             delete(m.unhealthyDevices, resourceName)
@@ -484,7 +484,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
     m.mutex.Unlock()
     if needsUpdateCheckpoint {
         if err := m.writeCheckpoint(); err != nil {
-            klog.ErrorS(err, "Error on writing checkpoint")
+            klog.ErrorS(err, "Failed to write checkpoint file")
         }
     }
     return capacity, allocatable, deletedResources.UnsortedList()
@@ -503,9 +503,10 @@ func (m *ManagerImpl) writeCheckpoint() error {
     err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
     if err != nil {
         err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
-        klog.InfoS("Failed to write checkpoint file", "err", err)
+        klog.ErrorS(err, "Failed to write checkpoint file")
         return err2
     }
+    klog.V(4).InfoS("Checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint)
     return nil
 }

@@ -516,7 +517,7 @@ func (m *ManagerImpl) readCheckpoint() error {
     if err != nil {
         if err == errors.ErrCheckpointNotFound {
             // no point in trying anything else
-            klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
+            klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint)
             return nil
         }
         return err
@@ -534,6 +535,8 @@ func (m *ManagerImpl) readCheckpoint() error {
         m.unhealthyDevices[resource] = sets.New[string]()
         m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
     }
+
+    klog.V(4).InfoS("Read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint)
     return nil
 }

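The two hunks above also add V(4) success logs for checkpoint writes and reads. A small sketch of how such lines are gated by verbosity (the checkpoint file name mirrors what the kubeletDeviceManagerCheckpoint constant is commonly set to; treat it as illustrative):

package main

import (
    "flag"

    "k8s.io/klog/v2"
)

func main() {
    klog.InitFlags(nil)
    flag.Set("v", "4") // as if the kubelet were started with --v=4 or higher
    flag.Parse()

    // Emitted only at verbosity >= 4, so routine checkpoint traffic
    // stays out of default-level logs.
    klog.V(4).InfoS("Checkpoint file written", "checkpoint", "kubelet_internal_checkpoint")
    klog.Flush()
}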
@@ -596,7 +599,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
     // running, then it can only be a kubelet restart. On node reboot the runtime and the containers were also shut down. Then, if the container was running, it can only be
     // because it already has access to all the required devices, so we got nothing to do and we can bail out.
     if !m.sourcesReady.AllReady() && m.isContainerAlreadyRunning(podUID, contName) {
-        klog.V(3).InfoS("container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+        klog.V(3).InfoS("Container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
         return nil, nil
     }

@@ -627,7 +630,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
     // We handled the known error paths in scenario 3 (node reboot), so from now on we can fall back in a common path.
     // We cover container restart on kubelet steady state with the same flow.
     if needed == 0 {
-        klog.V(3).InfoS("no devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+        klog.V(3).InfoS("No devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
         // No change, no work.
         return nil, nil
     }
@@ -836,7 +839,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
     for k, v := range container.Resources.Limits {
         resource := string(k)
         needed := int(v.Value())
-        klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource)
+        klog.V(3).InfoS("Looking for needed resources", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "needed", needed)
         if !m.isDevicePluginResource(resource) {
             continue
         }
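This hunk is the first of several that attach the pod via klog.KObj. A sketch of what that helper contributes, with a hypothetical pod, resource name, and device count:

package main

import (
    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/klog/v2"
)

func main() {
    // Hypothetical pod; the resource name and count are invented too.
    pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "gpu-pod", Namespace: "default"}}

    // klog.KObj renders the object as namespace/name ("default/gpu-pod" here),
    // giving each allocation log line enough context to trace a single pod.
    klog.V(3).InfoS("Looking for needed resources", "resourceName", "example.com/gpu", "pod", klog.KObj(pod), "containerName", "cuda", "needed", 2)
    klog.Flush()
}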
@@ -882,7 +885,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
         devs := allocDevices.UnsortedList()
         // TODO: refactor this part of code to just append a ContainerAllocationRequest
         // in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
-        klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource)
+        klog.V(4).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
         resp, err := eI.e.allocate(devs)
         metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime))
         if err != nil {
@@ -952,7 +955,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
         }

         if !m.checkPodActive(pod) {
-            klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)
+            klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name)
             continue
         }

@@ -984,7 +987,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s

     if eI.opts == nil || !eI.opts.PreStartRequired {
         m.mutex.Unlock()
-        klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource)
+        klog.V(5).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName)
         return nil
     }

@@ -1014,12 +1017,12 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re
     }

     if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable {
-        klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource)
+        klog.V(5).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName)
         return nil, nil
     }

     m.mutex.Unlock()
-    klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", podUID)
+    klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "resourceName", resource, "containerName", contName, "podUID", podUID)
     resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size)
     m.mutex.Lock()
     if err != nil {
@@ -1167,7 +1170,7 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool {
 func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
     cntID, err := m.containerMap.GetContainerID(podUID, cntName)
     if err != nil {
-        klog.V(4).InfoS("container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName, "err", err)
+        klog.ErrorS(err, "Container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName)
         return false
     }

@@ -1175,11 +1178,11 @@ func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
     // so on kubelet restart containers will again fail admission, hitting https://github.com/kubernetes/kubernetes/issues/118559 again.
     // This scenario should however be rare enough.
     if !m.containerRunningSet.Has(cntID) {
-        klog.V(4).InfoS("container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
+        klog.V(4).InfoS("Container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
         return false
     }

     // Once we make it here we know we have a running container.
-    klog.V(4).InfoS("container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
+    klog.V(4).InfoS("Container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
     return true
 }
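The manager hunks end here. The following hunks cover the device plugin client and registration server, where the commit adds V(2) lifecycle logs for connect, validate, and disconnect events. A sketch of that convention, with a hypothetical resource name:

package main

import (
    "flag"

    "k8s.io/klog/v2"
)

func main() {
    klog.InitFlags(nil)
    flag.Set("v", "2") // as if the kubelet ran with --v=2
    flag.Parse()

    resource := "example.com/gpu" // hypothetical resource name

    // V(2) is klog's customary level for useful steady-state information about
    // changes in the system; plugin connect/disconnect events fit that bucket.
    klog.V(2).InfoS("Connected to new client", "resource", resource)
    klog.V(2).InfoS("Device plugin disconnected", "resource", resource)
    klog.Flush()
}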
@@ -106,6 +106,8 @@ func (c *client) Disconnect() error {
     }
     c.mutex.Unlock()
     c.handler.PluginDisconnected(c.resource)
+
+    klog.V(2).InfoS("Device plugin disconnected", "resource", c.resource)
     return nil
 }

@@ -62,6 +62,7 @@ func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []s
         return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
     }

+    klog.V(2).InfoS("Device plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions)
     return nil
 }

@@ -75,6 +76,7 @@ func (s *server) connectClient(name string, socketPath string) error {
         return err
     }

+    klog.V(2).InfoS("Connected to new client", "resource", name)
     go func() {
         s.runClient(name, c)
     }()
@@ -112,7 +114,7 @@ func (s *server) runClient(name string, c Client) {
     }

     if err := s.disconnectClient(name, c); err != nil {
-        klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err)
+        klog.ErrorS(err, "Unable to disconnect client", "resource", name, "client", c)
     }
 }

@@ -91,7 +91,7 @@ func (s *server) Start() error {

     if selinux.GetEnabled() {
         if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil {
-            klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err)
+            klog.ErrorS(err, "Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir)
         }
     }

@@ -128,7 +128,7 @@ func (s *server) Start() error {
 func (s *server) Stop() error {
     s.visitClients(func(r string, c Client) {
         if err := s.disconnectClient(r, c); err != nil {
-            klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err)
+            klog.ErrorS(err, "Failed to disconnect device plugin client", "resourceName", r)
         }
     })

@@ -145,6 +145,7 @@ func (s *server) Stop() error {
     // During kubelet termination, we do not need the registration server,
     // and we consider the kubelet to be healthy even when it is down.
     s.setHealthy()
+    klog.V(2).InfoS("Stopping device plugin registration server")

     return nil
 }
@@ -159,18 +160,18 @@ func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Emp

     if !s.isVersionCompatibleWithPlugin(r.Version) {
         err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
-        klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
+        klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", r.ResourceName)
         return &api.Empty{}, err
     }

     if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
         err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
-        klog.InfoS("Bad registration request from device plugin", "err", err)
+        klog.ErrorS(err, "Bad registration request from device plugin")
         return &api.Empty{}, err
     }

     if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
-        klog.InfoS("Error connecting to device plugin client", "err", err)
+        klog.ErrorS(err, "Error connecting to device plugin client")
         return &api.Empty{}, err
     }

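In Register, the pattern is to build the error once with fmt.Errorf, log it through ErrorS, and return it, rather than logging an info line with an "err" key. A self-contained sketch (the format string and values are stand-ins, not the real errUnsupportedVersion constant):

package main

import (
    "fmt"

    "k8s.io/klog/v2"
)

func main() {
    // Hypothetical registration request values.
    version, supported := "v1alpha1", []string{"v1beta1"}

    // Build the error once, log it at error severity, then hand it back to the caller.
    err := fmt.Errorf("requested API version %q is not supported, supported versions are %v", version, supported)
    klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", "example.com/gpu")
    klog.Flush()
}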
@@ -43,7 +43,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
     for resource, requested := range accumulatedResourceRequests {
         // Only consider devices that actually contain topology information.
         if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
-            klog.InfoS("Resource does not have a topology preference", "resource", resource)
+            klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested)
             deviceHints[resource] = nil
             continue
         }
@@ -54,11 +54,11 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
         allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
         if allocated.Len() > 0 {
             if allocated.Len() != requested {
-                klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
+                klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
                 deviceHints[resource] = []topologymanager.TopologyHint{}
                 continue
             }
-            klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
+            klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
             deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
             continue
         }
@@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
         available := m.getAvailableDevices(resource)
         reusable := m.devicesToReuse[string(pod.UID)][resource]
         if available.Union(reusable).Len() < requested {
-            klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
+            klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len())
             deviceHints[resource] = []topologymanager.TopologyHint{}
             continue
         }
@@ -94,7 +94,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
     for resource, requested := range accumulatedResourceRequests {
         // Only consider devices that actually contain topology information.
         if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
-            klog.InfoS("Resource does not have a topology preference", "resource", resource)
+            klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "request", requested)
             deviceHints[resource] = nil
             continue
         }
@@ -105,11 +105,11 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
         allocated := m.podDevices.podDevices(string(pod.UID), resource)
         if allocated.Len() > 0 {
             if allocated.Len() != requested {
-                klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
+                klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
                 deviceHints[resource] = []topologymanager.TopologyHint{}
                 continue
             }
-            klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
+            klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "allocated", allocated.Len())
             deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
             continue
         }
@@ -117,7 +117,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
         // Get the list of available devices, for which TopologyHints should be generated.
         available := m.getAvailableDevices(resource)
         if available.Len() < requested {
-            klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len())
+            klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len())
             deviceHints[resource] = []topologymanager.TopologyHint{}
             continue
         }
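A final recurring change in the topology hint hunks is the key rename from "resource" to "resourceName", matching the key the rest of the device manager already uses; stable key names are what make structured log filtering reliable. A before/after sketch with invented values:

package main

import "k8s.io/klog/v2"

func main() {
    resource, requested, available := "example.com/gpu", 2, 1 // hypothetical values

    // Before: key "resource", inconsistent with the rest of the device manager.
    klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available)

    // After: key "resourceName", consistent across all device manager log lines.
    klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "request", requested, "available", available)
    klog.Flush()
}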