e2e: Flag questionable durations in container status on termination
The e2e test that creates/deletes pods rapidly and verifies their status was, in a few scenarios, reporting an impossibly long timing: timings total=12.211347385s t=491ms run=2s execute=450402h8m25s. Add error checks that clarify when this happens and why. Report p50/75/90/99 latencies on teardown, as observed from the test, as a baseline for future changes.
parent 3464112cf9
commit 2d361d4306
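At its core, the latency reporting added below registers a private Prometheus registry holding a SummaryVec keyed by node with fixed quantile objectives (p50/75/90/99), observes one value per pod, and renders the gathered metrics in the text exposition format at teardown. The following is a minimal, self-contained sketch of that mechanism only, outside the e2e framework; the node label value and the observed durations are made up for illustration.

package main

import (
    "bytes"
    "fmt"
    "time"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/common/expfmt"
)

func main() {
    // Private registry so the summary is not mixed into any global metrics.
    r := prometheus.NewRegistry()
    h := prometheus.NewSummaryVec(prometheus.SummaryOpts{
        Name: "start_latency",
        // Same quantile objectives the test uses: target quantile -> allowed error.
        Objectives: map[float64]float64{0.5: 0.05, 0.75: 0.025, 0.9: 0.01, 0.99: 0.001},
    }, []string{"node"})
    r.MustRegister(h)

    // Hypothetical observations; in the e2e test this is end.Sub(start) per pod.
    for _, d := range []time.Duration{900 * time.Millisecond, 2 * time.Second, 3 * time.Second} {
        h.WithLabelValues("node-1").Observe(d.Seconds())
    }

    // Render the gathered metric families in the Prometheus text format,
    // the same way the test logs "Summary of latencies" on teardown.
    families, err := r.Gather()
    if err != nil {
        panic(err)
    }
    var buf bytes.Buffer
    for _, mf := range families {
        expfmt.MetricFamilyToText(&buf, mf)
    }
    fmt.Print(buf.String())
}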
go.mod
@@ -74,6 +74,7 @@ require (
 	github.com/opencontainers/selinux v1.8.0
 	github.com/pkg/errors v0.9.1
 	github.com/pmezard/go-difflib v1.0.0
+	github.com/prometheus/client_golang v1.7.1
 	github.com/prometheus/client_model v0.2.0
 	github.com/prometheus/common v0.10.0
 	github.com/quobyte/api v0.1.8
@@ -63,6 +63,7 @@ allowed_prometheus_importers=(
   ./staging/src/k8s.io/component-base/metrics/value.go
   ./staging/src/k8s.io/component-base/metrics/wrappers.go
   ./test/e2e/apimachinery/flowcontrol.go
+  ./test/e2e/node/pods.go
   ./test/e2e_node/dynamic_kubelet_config_test.go
   ./test/e2e_node/resource_metrics_test.go
   ./test/instrumentation/main_test.go
test/e2e/node/pods.go
@@ -17,6 +17,7 @@ limitations under the License.
 package node

 import (
+	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
@@ -43,6 +44,8 @@ import (
 	imageutils "k8s.io/kubernetes/test/utils/image"

 	"github.com/onsi/ginkgo"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/common/expfmt"
 )

 var _ = SIGDescribe("Pods Extended", func() {
@@ -215,6 +218,18 @@ var _ = SIGDescribe("Pods Extended", func() {
 				wg sync.WaitGroup
 			)

+			r := prometheus.NewRegistry()
+			h := prometheus.NewSummaryVec(prometheus.SummaryOpts{
+				Name: "start_latency",
+				Objectives: map[float64]float64{
+					0.5:  0.05,
+					0.75: 0.025,
+					0.9:  0.01,
+					0.99: 0.001,
+				},
+			}, []string{"node"})
+			r.MustRegister(h)
+
 			const delay = 2000
 			const workers = 3
 			const pods = 15
@@ -352,10 +367,15 @@ var _ = SIGDescribe("Pods Extended", func() {
 							return fmt.Errorf("pod %s on node %s was terminated and then had termination cleared: %#v", pod.Name, pod.Spec.NodeName, status)
 						}
 					}
+					var hasNoStartTime bool
 					hasRunningContainers = status.State.Waiting == nil && status.State.Terminated == nil
 					if t != nil {
 						if !t.FinishedAt.Time.IsZero() {
-							duration = t.FinishedAt.Sub(t.StartedAt.Time)
+							if t.StartedAt.IsZero() {
+								hasNoStartTime = true
+							} else {
+								duration = t.FinishedAt.Sub(t.StartedAt.Time)
+							}
 							completeDuration = t.FinishedAt.Sub(pod.CreationTimestamp.Time)
 						}

@@ -369,6 +389,14 @@ var _ = SIGDescribe("Pods Extended", func() {
 							default:
 								return fmt.Errorf("pod %s on node %s container unexpected exit code %d: start=%s end=%s reason=%s message=%s", pod.Name, pod.Spec.NodeName, t.ExitCode, t.StartedAt, t.FinishedAt, t.Reason, t.Message)
 							}
+							switch {
+							case duration > time.Hour:
+								// problem with status reporting
+								return fmt.Errorf("pod %s container %s on node %s had very long duration %s: start=%s end=%s", pod.Name, status.Name, pod.Spec.NodeName, duration, t.StartedAt, t.FinishedAt)
+							case hasNoStartTime:
+								// should never happen
+								return fmt.Errorf("pod %s container %s on node %s had finish time but not start time: end=%s", pod.Name, status.Name, pod.Spec.NodeName, t.FinishedAt)
+							}
 						}
 					}
 					if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
 						hasTerminalPhase = true
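The sanity checks in the hunk above reduce to two guards over a terminated container state: an execution duration of more than an hour for a container that should run for seconds (a status-reporting problem), and a finish time without a start time (which should never happen). The sketch below distills just that logic into a hypothetical questionableDuration helper; the helper name and the main function are not part of the change, only the conditions and comments mirror the diff.

package main

import (
    "fmt"
    "time"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// questionableDuration flags termination timings that point at a
// status-reporting bug rather than a slow pod. t is the container's
// status.State.Terminated.
func questionableDuration(t *v1.ContainerStateTerminated) error {
    if t == nil || t.FinishedAt.Time.IsZero() {
        // Not terminated, or no finish time recorded yet: nothing to check.
        return nil
    }
    if t.StartedAt.IsZero() {
        // should never happen: a finish time implies a start time
        return fmt.Errorf("container had finish time but not start time: end=%s", t.FinishedAt)
    }
    if duration := t.FinishedAt.Sub(t.StartedAt.Time); duration > time.Hour {
        // problem with status reporting
        return fmt.Errorf("container had very long duration %s: start=%s end=%s", duration, t.StartedAt, t.FinishedAt)
    }
    return nil
}

func main() {
    // Hypothetical status resembling the bad report from the commit message:
    // a finish time hundreds of thousands of hours after the start time.
    bad := &v1.ContainerStateTerminated{
        StartedAt:  metav1.NewTime(time.Unix(0, 0)),
        FinishedAt: metav1.NewTime(time.Now()),
    }
    fmt.Println(questionableDuration(bad))
}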
@@ -422,6 +450,7 @@ var _ = SIGDescribe("Pods Extended", func() {
 				if duration > max || max == 0 {
 					max = duration
 				}
+				h.WithLabelValues(pod.Spec.NodeName).Observe(end.Sub(start).Seconds())
 				framework.Logf("Pod %s on node %s timings total=%s t=%s run=%s execute=%s", pod.Name, pod.Spec.NodeName, end.Sub(start), t, completeDuration, duration)
 			}

@@ -437,8 +466,13 @@ var _ = SIGDescribe("Pods Extended", func() {
 				}
 				framework.Failf("%d errors:\n%v", len(errs), strings.Join(messages, "\n"))
 			}
+			values, _ := r.Gather()
+			var buf bytes.Buffer
+			for _, m := range values {
+				expfmt.MetricFamilyToText(&buf, m)
+			}
+			framework.Logf("Summary of latencies:\n%s", buf.String())
 		})

 	})

 	ginkgo.Describe("Pod Container lifecycle", func() {
vendor/modules.txt (vendored)
@@ -687,6 +687,7 @@ github.com/pmezard/go-difflib/difflib
 github.com/pquerna/cachecontrol
 github.com/pquerna/cachecontrol/cacheobject
 # github.com/prometheus/client_golang v1.7.1 => github.com/prometheus/client_golang v1.7.1
+## explicit
 github.com/prometheus/client_golang/prometheus
 github.com/prometheus/client_golang/prometheus/internal
 github.com/prometheus/client_golang/prometheus/promhttp