Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #10802 from bprashanth/max_containers_fail
Don't ignore containers restarting during tests
Commit: 34dd9c7880
@@ -43,6 +43,9 @@ import (
 // NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
 const NodeStartupThreshold = 4 * time.Second
 
+// Maximum container failures this test tolerates before failing.
+var MaxContainerFailures = 0
+
 // podLatencyData encapsulates pod startup latency information.
 type podLatencyData struct {
 	// Name of the pod
@@ -190,14 +193,14 @@ var _ = Describe("Density", func() {
 			fileHndl, err := os.Create(fmt.Sprintf(testContext.OutputDir+"/%s/pod_states.csv", uuid))
 			expectNoError(err)
 			defer fileHndl.Close()
 
 			config := RCConfig{Client: c,
 				Image:         "gcr.io/google_containers/pause:go",
 				Name:          RCName,
 				Namespace:     ns,
 				PollInterval:  itArg.interval,
 				PodStatusFile: fileHndl,
 				Replicas:      totalPods,
+				MaxContainerFailures: &MaxContainerFailures,
 			}
 
 			// Create a listener for events.
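The package-level MaxContainerFailures declared earlier defaults to 0, so wiring its address into the config means a single container restart now fails the density test; leaving the field nil keeps RunRC's computed default instead. A minimal sketch of that wiring outside the It block (not part of this commit; the helper name and RC name are illustrative, and other RCConfig fields are omitted):

// Sketch only: a strict config that tolerates zero container restarts.
// MaxContainerFailures is the package variable added in this commit.
func strictDensityConfig(c *client.Client, ns string, totalPods int) RCConfig {
	return RCConfig{
		Client:               c,
		Image:                "gcr.io/google_containers/pause:go",
		Name:                 "strict-density-rc", // illustrative name
		Namespace:            ns,
		Replicas:             totalPods,
		MaxContainerFailures: &MaxContainerFailures, // 0 => any restart fails the test
	}
}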
@@ -162,7 +162,7 @@ func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nodeName
 	}
 	sort.Sort(KubeletMetricByLatency(metric))
 	var badMetrics []KubeletMetric
-	Logf("Latency metrics for node %v", nodeName)
+	Logf("\nLatency metrics for node %v", nodeName)
 	for _, m := range metric {
 		if m.Latency > threshold {
 			badMetrics = append(badMetrics, m)
@@ -178,6 +178,10 @@ type RCConfig struct {
 	// Pointer to a list of pods; if non-nil, will be set to a list of pods
 	// created by this RC by RunRC.
 	CreatedPods *[]*api.Pod
+
+	// Maximum allowable container failures. If exceeded, RunRC returns an error.
+	// Defaults to replicas*0.1 if unspecified.
+	MaxContainerFailures *int
 }
 
 func Logf(format string, a ...interface{}) {
@@ -984,7 +988,15 @@ func Diff(oldPods []*api.Pod, curPods []*api.Pod) PodDiff {
 // It's the caller's responsibility to clean up externally (i.e. use the
 // namespace lifecycle for handling cleanup).
 func RunRC(config RCConfig) error {
-	maxContainerFailures := int(math.Max(1.0, float64(config.Replicas)*.01))
+	// Don't force tests to fail if they don't care about containers restarting.
+	var maxContainerFailures int
+	if config.MaxContainerFailures == nil {
+		maxContainerFailures = int(math.Max(1.0, float64(config.Replicas)*.01))
+	} else {
+		maxContainerFailures = *config.MaxContainerFailures
+	}
+
 	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": config.Name}))
 
 	By(fmt.Sprintf("%v Creating replication controller %s", time.Now(), config.Name))
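When the field is left nil, RunRC keeps the old tolerance of roughly 1% of the replica count, but never less than one. A self-contained sketch of just that calculation (illustrative function name and values, not part of this commit) to make the numbers concrete:

package main

import (
	"fmt"
	"math"
)

// tolerance mirrors the defaulting logic above: nil means "use ~1% of
// replicas, at least 1"; a non-nil pointer, even to zero, is taken as-is.
func tolerance(replicas int, maxContainerFailures *int) int {
	if maxContainerFailures == nil {
		return int(math.Max(1.0, float64(replicas)*.01))
	}
	return *maxContainerFailures
}

func main() {
	zero := 0
	fmt.Println(tolerance(30, nil))     // max(1, 0.3) -> 1
	fmt.Println(tolerance(3000, nil))   // max(1, 30)  -> 30
	fmt.Println(tolerance(3000, &zero)) // explicit 0  -> 0, any restart is fatal
}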
@@ -1058,6 +1070,8 @@ func RunRC(config RCConfig) error {
 		unknown := 0
 		inactive := 0
 		failedContainers := 0
+		containerRestartNodes := util.NewStringSet()
+
 		pods := podStore.List()
 		if config.CreatedPods != nil {
 			*config.CreatedPods = pods
@@ -1067,6 +1081,7 @@ func RunRC(config RCConfig) error {
 				running++
 				for _, v := range FailedContainers(p) {
 					failedContainers = failedContainers + v.restarts
+					containerRestartNodes.Insert(p.Spec.NodeName)
 				}
 			} else if p.Status.Phase == api.PodPending {
 				if p.Spec.NodeName == "" {
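Because containerRestartNodes is a util.StringSet, inserting the node name once per restarted container still leaves each node in the set only once, so the failure path below dumps debug info for each affected node a single time. A small sketch of that dedup behaviour, assuming only the StringSet API already used in this diff (NewStringSet, Insert, List); the helper name and node names are illustrative:

// Sketch only: repeated inserts of the same node collapse to one entry,
// so the debug dump visits each affected node exactly once.
func restartedNodes() []string {
	nodes := util.NewStringSet()
	for _, nodeName := range []string{"node-a", "node-a", "node-b"} { // illustrative names
		nodes.Insert(nodeName)
	}
	return nodes.List() // ["node-a", "node-b"]: duplicates dropped
}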
@@ -1088,6 +1103,7 @@ func RunRC(config RCConfig) error {
 		}
 
 		if failedContainers > maxContainerFailures {
+			dumpNodeDebugInfo(config.Client, containerRestartNodes.List())
 			return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
 		}
 		if len(pods) < len(oldPods) || len(pods) > config.Replicas {
@@ -1137,6 +1153,11 @@ func dumpPodDebugInfo(c *client.Client, pods []*api.Pod) {
 
 func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
 	for _, n := range nodeNames {
+		Logf("\nLogging kubelet events for node %v", n)
+		for _, e := range getNodeEvents(c, n) {
+			Logf("source %v message %v reason %v first ts %v last ts %v, involved obj %+v",
+				e.Source, e.Message, e.Reason, e.FirstTimestamp, e.LastTimestamp, e.InvolvedObject)
+		}
 		Logf("\nLogging pods the kubelet thinks is on node %v", n)
 		podList, err := GetKubeletPods(c, n)
 		if err != nil {
@@ -1155,6 +1176,25 @@ func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
 	}
 }
 
+// logNodeEvents logs kubelet events from the given node. This includes kubelet
+// restart and node unhealthy events. Note that listing events like this will mess
+// with latency metrics, beware of calling it during a test.
+func getNodeEvents(c *client.Client, nodeName string) []api.Event {
+	events, err := c.Events(api.NamespaceDefault).List(
+		labels.Everything(),
+		fields.Set{
+			"involvedObject.kind":      "Node",
+			"involvedObject.name":      nodeName,
+			"involvedObject.namespace": api.NamespaceAll,
+			"source":                   "kubelet",
+		}.AsSelector())
+	if err != nil {
+		Logf("Unexpected error retrieving node events %v", err)
+		return []api.Event{}
+	}
+	return events.Items
+}
+
 func ScaleRC(c *client.Client, ns, name string, size uint) error {
 	By(fmt.Sprintf("%v Scaling replication controller %s in namespace %s to %d", time.Now(), name, ns, size))
 	scaler, err := kubectl.ScalerFor("ReplicationController", kubectl.NewScalerClient(c))