diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go index 9407a03c63a..4126d7ea9c3 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go @@ -394,7 +394,8 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n for i, c := range containers { status, err := m.runtimeService.ContainerStatus(c.Id) if err != nil { - klog.Errorf("ContainerStatus for %s error: %v", c.Id, err) + // Merely log this here; GetPodStatus will actually report the error out. + klog.V(4).Infof("ContainerStatus for %s error: %v", c.Id, err) return nil, err } cStatus := toKubeContainerStatus(status, m.runtimeName) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 90039beda09..05d2aea3a62 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -59,6 +59,8 @@ const ( // The expiration time of version cache. versionCacheTTL = 60 * time.Second + // How frequently to report identical errors + identicalErrorDelay = 1 * time.Minute ) var ( @@ -123,6 +125,12 @@ type kubeGenericRuntimeManager struct { // Manage RuntimeClass resources. runtimeClassManager *runtimeclass.Manager + + // Cache last per-container error message to reduce log spam + lastError map[string]string + + // Time last per-container error message was printed + errorPrinted map[string]time.Time } // KubeGenericRuntime is a interface contains interfaces for container runtime and command. @@ -177,6 +185,8 @@ func NewKubeGenericRuntimeManager( internalLifecycle: internalLifecycle, legacyLogProvider: legacyLogProvider, runtimeClassManager: runtimeClassManager, + lastError: make(map[string]string), + errorPrinted: make(map[string]time.Time), } typedVersion, err := kubeRuntimeManager.runtimeService.Version(kubeRuntimeAPIVersion) @@ -819,6 +829,15 @@ func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPo return } +func (m *kubeGenericRuntimeManager) cleanupErrorTimeouts() { + for name, timeout := range m.errorPrinted { + if time.Now().Sub(timeout) >= identicalErrorDelay { + delete(m.errorPrinted, name) + delete(m.lastError, name) + } + } +} + // GetPodStatus retrieves the status of the pod, including the // information of all containers in the pod that are visible in Runtime. func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) { @@ -868,9 +887,16 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namesp // Get statuses of all containers visible in the pod. containerStatuses, err := m.getPodContainerStatuses(uid, name, namespace) if err != nil { - klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err) + lastMsg, ok := m.lastError[podFullName] + if !ok || err.Error() != lastMsg || time.Now().Sub(m.errorPrinted[podFullName]) >= identicalErrorDelay { + klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err) + m.errorPrinted[podFullName] = time.Now() + m.lastError[podFullName] = err.Error() + } return nil, err } + delete(m.errorPrinted, podFullName) + delete(m.lastError, podFullName) return &kubecontainer.PodStatus{ ID: uid, diff --git a/pkg/kubelet/pleg/generic.go b/pkg/kubelet/pleg/generic.go index 4de9c721130..f69e0d49552 100644 --- a/pkg/kubelet/pleg/generic.go +++ b/pkg/kubelet/pleg/generic.go @@ -244,7 +244,8 @@ func (g *GenericPLEG) relist() { // serially may take a while. We should be aware of this and // parallelize if needed. if err := g.updateCache(pod, pid); err != nil { - klog.Errorf("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err) + // Rely on updateCache calling GetPodStatus to log the actual error. + klog.V(4).Infof("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err) // make sure we try to reinspect the pod during the next relisting needsReinspection[pid] = pod @@ -274,7 +275,8 @@ func (g *GenericPLEG) relist() { klog.V(5).Infof("GenericPLEG: Reinspecting pods that previously failed inspection") for pid, pod := range g.podsToReinspect { if err := g.updateCache(pod, pid); err != nil { - klog.Errorf("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err) + // Rely on updateCache calling GetPodStatus to log the actual error. + klog.V(5).Infof("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err) needsReinspection[pid] = pod } } diff --git a/pkg/kubelet/remote/remote_runtime.go b/pkg/kubelet/remote/remote_runtime.go index 16e16daff84..c1cc69085b1 100644 --- a/pkg/kubelet/remote/remote_runtime.go +++ b/pkg/kubelet/remote/remote_runtime.go @@ -36,8 +36,17 @@ import ( type RemoteRuntimeService struct { timeout time.Duration runtimeClient runtimeapi.RuntimeServiceClient + // Cache last per-container error message to reduce log spam + lastError map[string]string + // Time last per-container error message was printed + errorPrinted map[string]time.Time } +const ( + // How frequently to report identical errors + identicalErrorDelay = 1 * time.Minute +) + // NewRemoteRuntimeService creates a new internalapi.RuntimeService. func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (internalapi.RuntimeService, error) { klog.V(3).Infof("Connecting to runtime service %s", endpoint) @@ -57,6 +66,8 @@ func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) ( return &RemoteRuntimeService{ timeout: connectionTimeout, runtimeClient: runtimeapi.NewRuntimeServiceClient(conn), + lastError: make(map[string]string), + errorPrinted: make(map[string]time.Time), }, nil } @@ -225,6 +236,8 @@ func (r *RemoteRuntimeService) StopContainer(containerID string, timeout int64) ctx, cancel := getContextWithTimeout(t) defer cancel() + delete(r.lastError, containerID) + delete(r.errorPrinted, containerID) _, err := r.runtimeClient.StopContainer(ctx, &runtimeapi.StopContainerRequest{ ContainerId: containerID, Timeout: timeout, @@ -243,6 +256,8 @@ func (r *RemoteRuntimeService) RemoveContainer(containerID string) error { ctx, cancel := getContextWithTimeout(r.timeout) defer cancel() + delete(r.lastError, containerID) + delete(r.errorPrinted, containerID) _, err := r.runtimeClient.RemoveContainer(ctx, &runtimeapi.RemoveContainerRequest{ ContainerId: containerID, }) @@ -270,6 +285,16 @@ func (r *RemoteRuntimeService) ListContainers(filter *runtimeapi.ContainerFilter return resp.Containers, nil } +// Clean up any expired last-error timers +func (r *RemoteRuntimeService) cleanupErrorTimeouts() { + for ID, timeout := range r.errorPrinted { + if time.Now().Sub(timeout) >= identicalErrorDelay { + delete(r.lastError, ID) + delete(r.errorPrinted, ID) + } + } +} + // ContainerStatus returns the container status. func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.ContainerStatus, error) { ctx, cancel := getContextWithTimeout(r.timeout) @@ -278,10 +303,19 @@ func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi. resp, err := r.runtimeClient.ContainerStatus(ctx, &runtimeapi.ContainerStatusRequest{ ContainerId: containerID, }) + r.cleanupErrorTimeouts() if err != nil { - klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err) + // Don't spam the log with endless messages about the same failure. + lastMsg, ok := r.lastError[containerID] + if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay { + klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err) + r.errorPrinted[containerID] = time.Now() + r.lastError[containerID] = err.Error() + } return nil, err } + delete(r.lastError, containerID) + delete(r.errorPrinted, containerID) if resp.Status != nil { if err := verifyContainerStatus(resp.Status); err != nil { @@ -456,10 +490,18 @@ func (r *RemoteRuntimeService) ContainerStats(containerID string) (*runtimeapi.C resp, err := r.runtimeClient.ContainerStats(ctx, &runtimeapi.ContainerStatsRequest{ ContainerId: containerID, }) + r.cleanupErrorTimeouts() if err != nil { - klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err) + lastMsg, ok := r.lastError[containerID] + if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay { + klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err) + r.errorPrinted[containerID] = time.Now() + r.lastError[containerID] = err.Error() + } return nil, err } + delete(r.lastError, containerID) + delete(r.errorPrinted, containerID) return resp.GetStats(), nil }