From a02f10fa3a956a2fc1fe0328f58b7b1cea548264 Mon Sep 17 00:00:00 2001 From: Derek Carr Date: Tue, 13 Jun 2017 18:04:19 -0400 Subject: [PATCH 1/3] Strip containerID from events to reduce spam --- .../kuberuntime/kuberuntime_container.go | 54 ++++++++++++------- pkg/kubelet/pod_workers.go | 8 ++- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go index 73fbed91099..0770b977406 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go @@ -24,6 +24,7 @@ import ( "os" "path/filepath" "sort" + "strings" "sync" "time" @@ -45,6 +46,30 @@ import ( "k8s.io/kubernetes/pkg/util/tail" ) +// recordContainerEvent should be used by the runtime manager for all container related events. +// it has sanity checks to ensure that we do not write events that can abuse our masters. +// in particular, it ensures that a containerID never appears in an event message as that +// is prone to causing a lot of distinct events that do not count well. +// it replaces any reference to a containerID with the containerName which is stable, and is what users know. +func (m *kubeGenericRuntimeManager) recordContainerEvent(pod *v1.Pod, container *v1.Container, containerID, eventType, reason, message string, args ...interface{}) { + ref, err := kubecontainer.GenerateContainerRef(pod, container) + if err != nil { + glog.Errorf("Can't make a ref to pod %q, container %v: %v", format.Pod(pod), container.Name, err) + return + } + eventMessage := message + if len(args) > 0 { + eventMessage = fmt.Sprintf(message, args...) + } + // this is a hack, but often the error from the runtime includes the containerID + // which kills our ability to deduplicate events. this protection makes a huge + // difference in the number of unique events + if containerID != "" { + eventMessage = strings.Replace(eventMessage, containerID, container.Name, -1) + } + m.recorder.Event(events.ToObjectReference(ref), eventType, reason, eventMessage) +} + // startContainer starts a container and returns a message indicates why it is failed on error. // It starts the container through the following steps: // * pull the image @@ -74,15 +99,16 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb containerConfig, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef) if err != nil { - m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToCreateContainer, "Failed to create container with error: %v", err) + m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", err) return "Generate Container Config Failed", err } containerID, err := m.runtimeService.CreateContainer(podSandboxID, containerConfig, podSandboxConfig) if err != nil { - m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToCreateContainer, "Failed to create container with error: %v", err) + m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", err) return "Create Container Failed", err } - m.recorder.Eventf(ref, v1.EventTypeNormal, events.CreatedContainer, "Created container with id %v", containerID) + m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, "Created container") + if ref != nil { m.containerRefManager.SetRef(kubecontainer.ContainerID{ Type: m.runtimeName, @@ -93,11 +119,10 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb // Step 3: start the container. err = m.runtimeService.StartContainer(containerID) if err != nil { - m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToStartContainer, - "Failed to start container with id %v with error: %v", containerID, err) + m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", err) return "Start Container Failed", err } - m.recorder.Eventf(ref, v1.EventTypeNormal, events.StartedContainer, "Started container with id %v", containerID) + m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, "Started container") // Symlink container logs to the legacy container log location for cluster logging // support. @@ -120,8 +145,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb } msg, handlerErr := m.runner.Run(kubeContainerID, pod, container, container.Lifecycle.PostStart) if handlerErr != nil { - err := fmt.Errorf("PostStart handler: %v", handlerErr) - m.generateContainerEvent(kubeContainerID, v1.EventTypeWarning, events.FailedPostStartHook, msg) + m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, msg) m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", nil) return "PostStart Hook Failed", err } @@ -439,16 +463,6 @@ func toKubeContainerStatus(status *runtimeapi.ContainerStatus, runtimeName strin return cStatus } -// generateContainerEvent generates an event for the container. -func (m *kubeGenericRuntimeManager) generateContainerEvent(containerID kubecontainer.ContainerID, eventType, reason, message string) { - ref, ok := m.containerRefManager.GetRef(containerID) - if !ok { - glog.Warningf("No ref for container %q", containerID) - return - } - m.recorder.Event(events.ToObjectReference(ref), eventType, reason, message) -} - // executePreStopHook runs the pre-stop lifecycle hooks if applicable and returns the duration it takes. func (m *kubeGenericRuntimeManager) executePreStopHook(pod *v1.Pod, containerID kubecontainer.ContainerID, containerSpec *v1.Container, gracePeriod int64) int64 { glog.V(3).Infof("Running preStop hook for container %q", containerID.String()) @@ -460,7 +474,7 @@ func (m *kubeGenericRuntimeManager) executePreStopHook(pod *v1.Pod, containerID defer utilruntime.HandleCrash() if msg, err := m.runner.Run(containerID, pod, containerSpec, containerSpec.Lifecycle.PreStop); err != nil { glog.Errorf("preStop hook for container %q failed: %v", containerSpec.Name, err) - m.generateContainerEvent(containerID, v1.EventTypeWarning, events.FailedPreStopHook, msg) + m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeWarning, events.FailedPreStopHook, msg) } }() @@ -568,7 +582,7 @@ func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubec if reason != "" { message = fmt.Sprint(message, ":", reason) } - m.generateContainerEvent(containerID, v1.EventTypeNormal, events.KillingContainer, message) + m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeNormal, events.KillingContainer, message) m.containerRefManager.ClearRef(containerID) return err diff --git a/pkg/kubelet/pod_workers.go b/pkg/kubelet/pod_workers.go index 8efe6da7980..ea2ace249cf 100644 --- a/pkg/kubelet/pod_workers.go +++ b/pkg/kubelet/pod_workers.go @@ -180,7 +180,13 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) { } if err != nil { glog.Errorf("Error syncing pod %s (%q), skipping: %v", update.Pod.UID, format.Pod(update.Pod), err) - p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod, skipping: %v", err) + // if we failed sync, we throw more specific events for why it happened. + // as a result, i question the value of this event. + // TODO: determine if we can remove this in a future release. + // do not include descriptive text that can vary on why it failed so in a pathological + // scenario, kubelet does not create enough discrete events that miss default aggregation + // window. + p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod") } p.wrapUp(update.Pod.UID, err) } From 4a5a221d8f8c2d74d78e5bf798e5757b45e2cc4f Mon Sep 17 00:00:00 2001 From: Derek Carr Date: Tue, 13 Jun 2017 18:05:23 -0400 Subject: [PATCH 2/3] parse executable not found error --- pkg/kubelet/dockershim/docker_container.go | 1 + pkg/kubelet/dockershim/helpers.go | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/pkg/kubelet/dockershim/docker_container.go b/pkg/kubelet/dockershim/docker_container.go index 283dc644dbd..ac79f3fe64a 100644 --- a/pkg/kubelet/dockershim/docker_container.go +++ b/pkg/kubelet/dockershim/docker_container.go @@ -233,6 +233,7 @@ func (ds *dockerService) removeContainerLogSymlink(containerID string) error { func (ds *dockerService) StartContainer(containerID string) error { err := ds.client.StartContainer(containerID) if err != nil { + err = transformStartContainerError(err) return fmt.Errorf("failed to start container %q: %v", containerID, err) } // Create container log symlink. diff --git a/pkg/kubelet/dockershim/helpers.go b/pkg/kubelet/dockershim/helpers.go index 22ab777e6af..c5b99a32d57 100644 --- a/pkg/kubelet/dockershim/helpers.go +++ b/pkg/kubelet/dockershim/helpers.go @@ -55,6 +55,10 @@ const ( var ( conflictRE = regexp.MustCompile(`Conflict. (?:.)+ is already in use by container ([0-9a-z]+)`) + // this is hacky, but extremely common. + // if a container starts but the executable file is not found, runc gives a message that matches + startRE = regexp.MustCompile(`\\\\\\\"(.*)\\\\\\\": executable file not found`) + // Docker changes the security option separator from ':' to '=' in the 1.23 // API version. optsSeparatorChangeVersion = semver.MustParse(securityOptSeparatorChangeVersion) @@ -359,6 +363,19 @@ func recoverFromCreationConflictIfNeeded(client libdocker.Interface, createConfi return client.CreateContainer(createConfig) } +// transformStartContainerError does regex parsing on returned error +// for where container runtimes are giving less than ideal error messages. +func transformStartContainerError(err error) error { + if err == nil { + return nil + } + matches := startRE.FindStringSubmatch(err.Error()) + if len(matches) > 0 { + return fmt.Errorf("executable not found in $PATH") + } + return err +} + // getSecurityOptSeparator returns the security option separator based on the // docker API version. // TODO: Remove this function along with the relevant code when we no longer From 36619fa2178a8196942df27c79a4ef360e50f300 Mon Sep 17 00:00:00 2001 From: Derek Carr Date: Tue, 13 Jun 2017 23:42:15 -0400 Subject: [PATCH 3/3] surface rpc error desc only in events --- pkg/kubelet/kuberuntime/BUILD | 1 + pkg/kubelet/kuberuntime/kuberuntime_container.go | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/kuberuntime/BUILD b/pkg/kubelet/kuberuntime/BUILD index 620d13aa247..110b40c6c83 100644 --- a/pkg/kubelet/kuberuntime/BUILD +++ b/pkg/kubelet/kuberuntime/BUILD @@ -55,6 +55,7 @@ go_library( "//vendor/github.com/fsnotify/fsnotify:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/google/cadvisor/info/v1:go_default_library", + "//vendor/google.golang.org/grpc:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library", diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go index 0770b977406..bd7a5ba6c92 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go @@ -28,6 +28,8 @@ import ( "sync" "time" + "google.golang.org/grpc" + "github.com/armon/circbuf" "github.com/golang/glog" @@ -99,12 +101,12 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb containerConfig, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef) if err != nil { - m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", err) + m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err)) return "Generate Container Config Failed", err } containerID, err := m.runtimeService.CreateContainer(podSandboxID, containerConfig, podSandboxConfig) if err != nil { - m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", err) + m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err)) return "Create Container Failed", err } m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, "Created container") @@ -119,7 +121,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb // Step 3: start the container. err = m.runtimeService.StartContainer(containerID) if err != nil { - m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", err) + m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", grpc.ErrorDesc(err)) return "Start Container Failed", err } m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, "Started container")