e2e: Flag questionable durations in container status on termination

The e2e that create/deletes pods rapidly and verifies their status
was reporting a very long timing:

timings total=12.211347385s t=491ms run=2s execute=450402h8m25s

in a few scenarios. Add error checks that clarify when this happens
and why. Report p50/75/90/99 latencies on teardown as observed from
the test for baseline for future changes.
This commit is contained in:
Clayton Coleman 2021-05-19 14:52:42 -04:00
parent 3464112cf9
commit 2d361d4306
No known key found for this signature in database
GPG Key ID: 3D16906B4F1C5CB3
4 changed files with 39 additions and 2 deletions

1
go.mod
View File

@ -74,6 +74,7 @@ require (
github.com/opencontainers/selinux v1.8.0
github.com/pkg/errors v0.9.1
github.com/pmezard/go-difflib v1.0.0
github.com/prometheus/client_golang v1.7.1
github.com/prometheus/client_model v0.2.0
github.com/prometheus/common v0.10.0
github.com/quobyte/api v0.1.8

View File

@ -63,6 +63,7 @@ allowed_prometheus_importers=(
./staging/src/k8s.io/component-base/metrics/value.go
./staging/src/k8s.io/component-base/metrics/wrappers.go
./test/e2e/apimachinery/flowcontrol.go
./test/e2e/node/pods.go
./test/e2e_node/dynamic_kubelet_config_test.go
./test/e2e_node/resource_metrics_test.go
./test/instrumentation/main_test.go

View File

@ -17,6 +17,7 @@ limitations under the License.
package node
import (
"bytes"
"context"
"encoding/json"
"fmt"
@ -43,6 +44,8 @@ import (
imageutils "k8s.io/kubernetes/test/utils/image"
"github.com/onsi/ginkgo"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/expfmt"
)
var _ = SIGDescribe("Pods Extended", func() {
@ -215,6 +218,18 @@ var _ = SIGDescribe("Pods Extended", func() {
wg sync.WaitGroup
)
r := prometheus.NewRegistry()
h := prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "start_latency",
Objectives: map[float64]float64{
0.5: 0.05,
0.75: 0.025,
0.9: 0.01,
0.99: 0.001,
},
}, []string{"node"})
r.MustRegister(h)
const delay = 2000
const workers = 3
const pods = 15
@ -352,10 +367,15 @@ var _ = SIGDescribe("Pods Extended", func() {
return fmt.Errorf("pod %s on node %s was terminated and then had termination cleared: %#v", pod.Name, pod.Spec.NodeName, status)
}
}
var hasNoStartTime bool
hasRunningContainers = status.State.Waiting == nil && status.State.Terminated == nil
if t != nil {
if !t.FinishedAt.Time.IsZero() {
duration = t.FinishedAt.Sub(t.StartedAt.Time)
if t.StartedAt.IsZero() {
hasNoStartTime = true
} else {
duration = t.FinishedAt.Sub(t.StartedAt.Time)
}
completeDuration = t.FinishedAt.Sub(pod.CreationTimestamp.Time)
}
@ -369,6 +389,14 @@ var _ = SIGDescribe("Pods Extended", func() {
default:
return fmt.Errorf("pod %s on node %s container unexpected exit code %d: start=%s end=%s reason=%s message=%s", pod.Name, pod.Spec.NodeName, t.ExitCode, t.StartedAt, t.FinishedAt, t.Reason, t.Message)
}
switch {
case duration > time.Hour:
// problem with status reporting
return fmt.Errorf("pod %s container %s on node %s had very long duration %s: start=%s end=%s", pod.Name, status.Name, pod.Spec.NodeName, duration, t.StartedAt, t.FinishedAt)
case hasNoStartTime:
// should never happen
return fmt.Errorf("pod %s container %s on node %s had finish time but not start time: end=%s", pod.Name, status.Name, pod.Spec.NodeName, t.FinishedAt)
}
}
if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
hasTerminalPhase = true
@ -422,6 +450,7 @@ var _ = SIGDescribe("Pods Extended", func() {
if duration > max || max == 0 {
max = duration
}
h.WithLabelValues(pod.Spec.NodeName).Observe(end.Sub(start).Seconds())
framework.Logf("Pod %s on node %s timings total=%s t=%s run=%s execute=%s", pod.Name, pod.Spec.NodeName, end.Sub(start), t, completeDuration, duration)
}
@ -437,8 +466,13 @@ var _ = SIGDescribe("Pods Extended", func() {
}
framework.Failf("%d errors:\n%v", len(errs), strings.Join(messages, "\n"))
}
values, _ := r.Gather()
var buf bytes.Buffer
for _, m := range values {
expfmt.MetricFamilyToText(&buf, m)
}
framework.Logf("Summary of latencies:\n%s", buf.String())
})
})
ginkgo.Describe("Pod Container lifecycle", func() {

1
vendor/modules.txt vendored
View File

@ -687,6 +687,7 @@ github.com/pmezard/go-difflib/difflib
github.com/pquerna/cachecontrol
github.com/pquerna/cachecontrol/cacheobject
# github.com/prometheus/client_golang v1.7.1 => github.com/prometheus/client_golang v1.7.1
## explicit
github.com/prometheus/client_golang/prometheus
github.com/prometheus/client_golang/prometheus/internal
github.com/prometheus/client_golang/prometheus/promhttp