e2e: Flag questionable durations in container status on termination
The e2e test that creates/deletes pods rapidly and verifies their status was reporting a very long timing in a few scenarios: timings total=12.211347385s t=491ms run=2s execute=450402h8m25s. Add error checks that clarify when this happens and why. Report p50/75/90/99 latencies observed by the test on teardown, as a baseline for future changes.
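
For context, the bogus execute value of 450402h8m25s is roughly 51 years, which is consistent with subtracting a start time stuck at the Unix epoch (an unset timestamp) from a 2021 finish time. A minimal standalone sketch of that arithmetic, using an assumed finish date rather than one taken from the failing run:

package main

import (
	"fmt"
	"time"
)

func main() {
	// If a terminated container status has a FinishedAt but its StartedAt was
	// never populated (decoding as the Unix epoch), the computed "execute"
	// duration becomes the age of the epoch rather than the real run time.
	started := time.Unix(0, 0)                              // unset start time
	finished := time.Date(2021, 5, 1, 0, 0, 0, 0, time.UTC) // assumed example finish time
	fmt.Println(finished.Sub(started))                      // 449952h0m0s, ~51 years, clearly bogus
	// This is why the test now flags any container duration over an hour and
	// any status that has a finish time but no start time.
}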
This commit is contained in:
parent 3464112cf9
commit 2d361d4306
go.mod (1 line added)
@@ -74,6 +74,7 @@ require (
	github.com/opencontainers/selinux v1.8.0
	github.com/pkg/errors v0.9.1
	github.com/pmezard/go-difflib v1.0.0
	github.com/prometheus/client_golang v1.7.1
	github.com/prometheus/client_model v0.2.0
	github.com/prometheus/common v0.10.0
	github.com/quobyte/api v0.1.8
@@ -63,6 +63,7 @@ allowed_prometheus_importers=(
  ./staging/src/k8s.io/component-base/metrics/value.go
  ./staging/src/k8s.io/component-base/metrics/wrappers.go
  ./test/e2e/apimachinery/flowcontrol.go
  ./test/e2e/node/pods.go
  ./test/e2e_node/dynamic_kubelet_config_test.go
  ./test/e2e_node/resource_metrics_test.go
  ./test/instrumentation/main_test.go
@@ -17,6 +17,7 @@ limitations under the License.
package node

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
@@ -43,6 +44,8 @@ import (
	imageutils "k8s.io/kubernetes/test/utils/image"

	"github.com/onsi/ginkgo"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/expfmt"
)

var _ = SIGDescribe("Pods Extended", func() {
@@ -215,6 +218,18 @@ var _ = SIGDescribe("Pods Extended", func() {
				wg sync.WaitGroup
			)

			r := prometheus.NewRegistry()
			h := prometheus.NewSummaryVec(prometheus.SummaryOpts{
				Name: "start_latency",
				Objectives: map[float64]float64{
					0.5:  0.05,
					0.75: 0.025,
					0.9:  0.01,
					0.99: 0.001,
				},
			}, []string{"node"})
			r.MustRegister(h)

			const delay = 2000
			const workers = 3
			const pods = 15
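
The Objectives map pairs each tracked quantile with its allowed absolute error in rank, so the reported p99 is accurate to within 0.001. A self-contained sketch of the same client_golang flow the test uses, register, observe, gather, and render in the Prometheus text exposition format, with made-up node names and latencies:

package main

import (
	"bytes"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/expfmt"
)

func main() {
	// Same shape as the test: a per-node summary tracking p50/p75/p90/p99.
	r := prometheus.NewRegistry()
	h := prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Name: "start_latency",
		Objectives: map[float64]float64{
			0.5:  0.05,
			0.75: 0.025,
			0.9:  0.01,
			0.99: 0.001,
		},
	}, []string{"node"})
	r.MustRegister(h)

	// Record a few hypothetical pod start latencies (seconds) per node.
	for i, s := range []float64{0.8, 1.1, 1.6, 2.4, 0.9} {
		h.WithLabelValues(fmt.Sprintf("node-%d", i%2)).Observe(s)
	}

	// Gather and render in the Prometheus text exposition format, as the
	// test does in its teardown logging.
	values, _ := r.Gather()
	var buf bytes.Buffer
	for _, m := range values {
		expfmt.MetricFamilyToText(&buf, m)
	}
	fmt.Println(buf.String())
}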
@@ -352,10 +367,15 @@ var _ = SIGDescribe("Pods Extended", func() {
					return fmt.Errorf("pod %s on node %s was terminated and then had termination cleared: %#v", pod.Name, pod.Spec.NodeName, status)
				}
			}
			var hasNoStartTime bool
			hasRunningContainers = status.State.Waiting == nil && status.State.Terminated == nil
			if t != nil {
				if !t.FinishedAt.Time.IsZero() {
					if t.StartedAt.IsZero() {
						hasNoStartTime = true
					} else {
						duration = t.FinishedAt.Sub(t.StartedAt.Time)
					}
					completeDuration = t.FinishedAt.Sub(pod.CreationTimestamp.Time)
				}
			}
@@ -369,6 +389,14 @@ var _ = SIGDescribe("Pods Extended", func() {
				default:
					return fmt.Errorf("pod %s on node %s container unexpected exit code %d: start=%s end=%s reason=%s message=%s", pod.Name, pod.Spec.NodeName, t.ExitCode, t.StartedAt, t.FinishedAt, t.Reason, t.Message)
				}
				switch {
				case duration > time.Hour:
					// problem with status reporting
					return fmt.Errorf("pod %s container %s on node %s had very long duration %s: start=%s end=%s", pod.Name, status.Name, pod.Spec.NodeName, duration, t.StartedAt, t.FinishedAt)
				case hasNoStartTime:
					// should never happen
					return fmt.Errorf("pod %s container %s on node %s had finish time but not start time: end=%s", pod.Name, status.Name, pod.Spec.NodeName, t.FinishedAt)
				}
			}
			if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
				hasTerminalPhase = true
@@ -422,6 +450,7 @@ var _ = SIGDescribe("Pods Extended", func() {
				if duration > max || max == 0 {
					max = duration
				}
				h.WithLabelValues(pod.Spec.NodeName).Observe(end.Sub(start).Seconds())
				framework.Logf("Pod %s on node %s timings total=%s t=%s run=%s execute=%s", pod.Name, pod.Spec.NodeName, end.Sub(start), t, completeDuration, duration)
			}
@@ -437,8 +466,13 @@ var _ = SIGDescribe("Pods Extended", func() {
				}
				framework.Failf("%d errors:\n%v", len(errs), strings.Join(messages, "\n"))
			}
			values, _ := r.Gather()
			var buf bytes.Buffer
			for _, m := range values {
				expfmt.MetricFamilyToText(&buf, m)
			}
			framework.Logf("Summary of latencies:\n%s", buf.String())
		})
	})

	ginkgo.Describe("Pod Container lifecycle", func() {
vendor/modules.txt (vendored, 1 line added)
@@ -687,6 +687,7 @@ github.com/pmezard/go-difflib/difflib
github.com/pquerna/cachecontrol
github.com/pquerna/cachecontrol/cacheobject
# github.com/prometheus/client_golang v1.7.1 => github.com/prometheus/client_golang v1.7.1
## explicit
github.com/prometheus/client_golang/prometheus
github.com/prometheus/client_golang/prometheus/internal
github.com/prometheus/client_golang/prometheus/promhttp