e2e: Flag questionable durations in container status on termination
The e2e test that creates/deletes pods rapidly and verifies their status was reporting a very long timing in a few scenarios: timings total=12.211347385s t=491ms run=2s execute=450402h8m25s. Add error checks that clarify when this happens and why. Report p50/75/90/99 latencies observed by the test on teardown, as a baseline for future changes.
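
For context, the bogus execute value of 450402h8m25s is roughly 51 years, which is consistent with subtracting a start time stuck at the Unix epoch (an unset timestamp) from a 2021 finish time. A minimal standalone sketch of that arithmetic, using an assumed finish date rather than one taken from the failing run:

package main

import (
	"fmt"
	"time"
)

func main() {
	// If a terminated container status has a FinishedAt but its StartedAt was
	// never populated (decoding as the Unix epoch), the computed "execute"
	// duration becomes the age of the epoch rather than the real run time.
	started := time.Unix(0, 0)                              // unset start time
	finished := time.Date(2021, 5, 1, 0, 0, 0, 0, time.UTC) // assumed example finish time
	fmt.Println(finished.Sub(started))                      // 449952h0m0s, ~51 years, clearly bogus
	// This is why the test now flags any container duration over an hour and
	// any status that has a finish time but no start time.
}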
This commit is contained in:
parent 3464112cf9
commit 2d361d4306
go.mod (1 line added)
@@ -74,6 +74,7 @@ require (
	github.com/opencontainers/selinux v1.8.0
	github.com/pkg/errors v0.9.1
	github.com/pmezard/go-difflib v1.0.0
	github.com/prometheus/client_golang v1.7.1
	github.com/prometheus/client_model v0.2.0
	github.com/prometheus/common v0.10.0
	github.com/quobyte/api v0.1.8
@@ -63,6 +63,7 @@ allowed_prometheus_importers=(
  ./staging/src/k8s.io/component-base/metrics/value.go
  ./staging/src/k8s.io/component-base/metrics/wrappers.go
  ./test/e2e/apimachinery/flowcontrol.go
  ./test/e2e/node/pods.go
  ./test/e2e_node/dynamic_kubelet_config_test.go
  ./test/e2e_node/resource_metrics_test.go
  ./test/instrumentation/main_test.go
@@ -17,6 +17,7 @@ limitations under the License.
package node

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
@@ -43,6 +44,8 @@ import (
	imageutils "k8s.io/kubernetes/test/utils/image"

	"github.com/onsi/ginkgo"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/expfmt"
)

var _ = SIGDescribe("Pods Extended", func() {
@@ -215,6 +218,18 @@ var _ = SIGDescribe("Pods Extended", func() {
				wg sync.WaitGroup
			)

			r := prometheus.NewRegistry()
			h := prometheus.NewSummaryVec(prometheus.SummaryOpts{
				Name: "start_latency",
				Objectives: map[float64]float64{
					0.5:  0.05,
					0.75: 0.025,
					0.9:  0.01,
					0.99: 0.001,
				},
			}, []string{"node"})
			r.MustRegister(h)

			const delay = 2000
			const workers = 3
			const pods = 15
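
The Objectives map pairs each tracked quantile with its allowed absolute error in rank, so the reported p99 is accurate to within 0.001. A self-contained sketch of the same client_golang flow the test uses, register, observe, gather, and render in the Prometheus text exposition format, with made-up node names and latencies:

package main

import (
	"bytes"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/expfmt"
)

func main() {
	// Same shape as the test: a per-node summary tracking p50/p75/p90/p99.
	r := prometheus.NewRegistry()
	h := prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Name: "start_latency",
		Objectives: map[float64]float64{
			0.5:  0.05,
			0.75: 0.025,
			0.9:  0.01,
			0.99: 0.001,
		},
	}, []string{"node"})
	r.MustRegister(h)

	// Record a few hypothetical pod start latencies (seconds) per node.
	for i, s := range []float64{0.8, 1.1, 1.6, 2.4, 0.9} {
		h.WithLabelValues(fmt.Sprintf("node-%d", i%2)).Observe(s)
	}

	// Gather and render in the Prometheus text exposition format, as the
	// test does in its teardown logging.
	values, _ := r.Gather()
	var buf bytes.Buffer
	for _, m := range values {
		expfmt.MetricFamilyToText(&buf, m)
	}
	fmt.Println(buf.String())
}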
@@ -352,10 +367,15 @@ var _ = SIGDescribe("Pods Extended", func() {
					return fmt.Errorf("pod %s on node %s was terminated and then had termination cleared: %#v", pod.Name, pod.Spec.NodeName, status)
				}
			}
			var hasNoStartTime bool
			hasRunningContainers = status.State.Waiting == nil && status.State.Terminated == nil
			if t != nil {
				if !t.FinishedAt.Time.IsZero() {
					if t.StartedAt.IsZero() {
						hasNoStartTime = true
					} else {
						duration = t.FinishedAt.Sub(t.StartedAt.Time)
					}
					completeDuration = t.FinishedAt.Sub(pod.CreationTimestamp.Time)
				}
			}
@@ -369,6 +389,14 @@ var _ = SIGDescribe("Pods Extended", func() {
				default:
					return fmt.Errorf("pod %s on node %s container unexpected exit code %d: start=%s end=%s reason=%s message=%s", pod.Name, pod.Spec.NodeName, t.ExitCode, t.StartedAt, t.FinishedAt, t.Reason, t.Message)
				}
				switch {
				case duration > time.Hour:
					// problem with status reporting
					return fmt.Errorf("pod %s container %s on node %s had very long duration %s: start=%s end=%s", pod.Name, status.Name, pod.Spec.NodeName, duration, t.StartedAt, t.FinishedAt)
				case hasNoStartTime:
					// should never happen
					return fmt.Errorf("pod %s container %s on node %s had finish time but not start time: end=%s", pod.Name, status.Name, pod.Spec.NodeName, t.FinishedAt)
				}
			}
			if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
				hasTerminalPhase = true
@@ -422,6 +450,7 @@ var _ = SIGDescribe("Pods Extended", func() {
				if duration > max || max == 0 {
					max = duration
				}
				h.WithLabelValues(pod.Spec.NodeName).Observe(end.Sub(start).Seconds())
				framework.Logf("Pod %s on node %s timings total=%s t=%s run=%s execute=%s", pod.Name, pod.Spec.NodeName, end.Sub(start), t, completeDuration, duration)
			}
@@ -437,8 +466,13 @@ var _ = SIGDescribe("Pods Extended", func() {
				}
				framework.Failf("%d errors:\n%v", len(errs), strings.Join(messages, "\n"))
			}
			values, _ := r.Gather()
			var buf bytes.Buffer
			for _, m := range values {
				expfmt.MetricFamilyToText(&buf, m)
			}
			framework.Logf("Summary of latencies:\n%s", buf.String())
		})
	})

	ginkgo.Describe("Pod Container lifecycle", func() {
vendor/modules.txt (vendored, 1 line added)
@@ -687,6 +687,7 @@ github.com/pmezard/go-difflib/difflib
github.com/pquerna/cachecontrol
github.com/pquerna/cachecontrol/cacheobject
# github.com/prometheus/client_golang v1.7.1 => github.com/prometheus/client_golang v1.7.1
## explicit
github.com/prometheus/client_golang/prometheus
github.com/prometheus/client_golang/prometheus/internal
github.com/prometheus/client_golang/prometheus/promhttp