From 78979c77ce3487e9beea4f7f60683cb8c036827e Mon Sep 17 00:00:00 2001 From: Sam Abed Date: Thu, 15 Oct 2015 15:58:20 +1100 Subject: [PATCH] backoff e2e tests Signed-off-by: Sam Abed --- hack/jenkins/e2e.sh | 1 + pkg/kubelet/kubelet.go | 8 +- test/e2e/pods.go | 256 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 261 insertions(+), 4 deletions(-) diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh index bcc8b3c482b..00e6619b72b 100755 --- a/hack/jenkins/e2e.sh +++ b/hack/jenkins/e2e.sh @@ -149,6 +149,7 @@ GCE_SLOW_TESTS=( "resource\susage\stracking" # 1 hour, file: kubelet_perf.go, slow by design "monotonically\sincreasing\srestart\scount" # 1.5 to 5 min, file: pods.go, slow by design "KubeProxy\sshould\stest\skube-proxy" # 9 min 30 sec, file: kubeproxy.go, issue: #14204 + "cap\sback-off\sat\sMaxContainerBackOff" # 20 mins file: manager.go, PR: #12648 ) # Tests which are not able to be run in parallel. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 8d917fd7ce3..0c3229d31d3 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -88,8 +88,8 @@ const ( // Location of container logs. containerLogsDir = "/var/log/containers" - // max backoff period - maxContainerBackOff = 300 * time.Second + // max backoff period, exported for the e2e test + MaxContainerBackOff = 300 * time.Second // Capacity of the channel for storing pods to kill. A small number should // suffice because a goroutine is dedicated to check the channel and does @@ -308,7 +308,7 @@ func NewMainKubelet( } procFs := procfs.NewProcFs() - imageBackOff := util.NewBackOff(resyncInterval, maxContainerBackOff) + imageBackOff := util.NewBackOff(resyncInterval, MaxContainerBackOff) // Initialize the runtime. switch containerRuntime { case "docker": @@ -425,7 +425,7 @@ func NewMainKubelet( } } - klet.backOff = util.NewBackOff(resyncInterval, maxContainerBackOff) + klet.backOff = util.NewBackOff(resyncInterval, MaxContainerBackOff) klet.podKillingCh = make(chan *kubecontainer.Pod, podKillingChannelCapacity) klet.sourcesSeen = sets.NewString() diff --git a/test/e2e/pods.go b/test/e2e/pods.go index 9a1be3724f9..131485d9ca5 100644 --- a/test/e2e/pods.go +++ b/test/e2e/pods.go @@ -20,6 +20,7 @@ import ( "bytes" "fmt" "io" + "math" "strconv" "strings" "time" @@ -31,6 +32,7 @@ import ( "k8s.io/kubernetes/pkg/api/resource" client "k8s.io/kubernetes/pkg/client/unversioned" "k8s.io/kubernetes/pkg/fields" + "k8s.io/kubernetes/pkg/kubelet" "k8s.io/kubernetes/pkg/labels" "k8s.io/kubernetes/pkg/util" "k8s.io/kubernetes/pkg/util/wait" @@ -44,6 +46,12 @@ const ( defaultObservationTimeout = time.Minute * 2 ) +var ( + buildBackOffDuration = time.Minute + syncLoopFrequency = 10 * time.Second + maxBackOffTolerance = time.Duration(1.3 * float64(kubelet.MaxContainerBackOff)) +) + func runLivenessTest(c *client.Client, ns string, podDescr *api.Pod, expectNumRestarts int, timeout time.Duration) { By(fmt.Sprintf("Creating pod %s in namespace %s", podDescr.Name, ns)) _, err := c.Pods(ns).Create(podDescr) @@ -134,6 +142,70 @@ func testHostIP(c *client.Client, ns string, pod *api.Pod) { } } +func runPodFromStruct(framework *Framework, pod *api.Pod) { + By("submitting the pod to kubernetes") + + podClient := framework.Client.Pods(framework.Namespace.Name) + pod, err := podClient.Create(pod) + if err != nil { + Failf("Failed to create pod: %v", err) + } + + expectNoError(framework.WaitForPodRunning(pod.Name)) + + By("verifying the pod is in kubernetes") + pod, err = podClient.Get(pod.Name) + if err != nil { + Failf("failed 
to get pod: %v", err)
+	}
+}
+
+func startPodAndGetBackOffs(framework *Framework, pod *api.Pod, podName string, containerName string, sleepAmount time.Duration) (time.Duration, time.Duration) {
+	runPodFromStruct(framework, pod)
+	time.Sleep(sleepAmount)
+
+	By("getting restart delay-0")
+	_, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+	if err != nil {
+		Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+	}
+
+	By("getting restart delay-1")
+	delay1, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+	if err != nil {
+		Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+	}
+
+	By("getting restart delay-2")
+	delay2, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+	if err != nil {
+		Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+	}
+	return delay1, delay2
+}
+
+func getRestartDelay(c *client.Client, pod *api.Pod, ns string, name string, containerName string) (time.Duration, error) {
+	beginTime := time.Now()
+	for time.Since(beginTime) < (2 * maxBackOffTolerance) { // may just miss the 1st MaxContainerBackOff delay
+		time.Sleep(time.Second)
+		pod, err := c.Pods(ns).Get(name)
+		expectNoError(err, fmt.Sprintf("getting pod %s", name))
+		status, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, containerName)
+		if !ok {
+			Logf("getRestartDelay: status missing")
+			continue
+		}
+
+		if status.State.Waiting == nil && status.State.Running != nil && status.LastTerminationState.Terminated != nil && status.State.Running.StartedAt.Time.After(beginTime) {
+			startedAt := status.State.Running.StartedAt.Time
+			finishedAt := status.LastTerminationState.Terminated.FinishedAt.Time
+			Logf("getRestartDelay: finishedAt=%s restartedAt=%s (%s)", finishedAt, startedAt, startedAt.Sub(finishedAt))
+			return startedAt.Sub(finishedAt), nil
+		}
+	}
+	return 0, fmt.Errorf("timeout getting pod restart delay")
+}
+
 var _ = Describe("Pods", func() {
 	framework := NewFramework("pods")
 
@@ -755,6 +827,190 @@ var _ = Describe("Pods", func() {
 		}
 	})
 
+	It("should have their container restart back-off timer increase exponentially", func() {
+		podName := "pod-back-off-exponentially"
+		containerName := "back-off"
+		podClient := framework.Client.Pods(framework.Namespace.Name)
+		pod := &api.Pod{
+			ObjectMeta: api.ObjectMeta{
+				Name: podName,
+				Labels: map[string]string{"test": "back-off-image"},
+			},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: containerName,
+						Image: "gcr.io/google_containers/busybox",
+						Command: []string{"/bin/sh", "-c", "sleep 5", "/crash/missing"},
+					},
+				},
+			},
+		}
+
+		defer func() {
+			By("deleting the pod")
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		}()
+
+		delay1, delay2 := startPodAndGetBackOffs(framework, pod, podName, containerName, buildBackOffDuration)
+		delay1 += 1 // add 1ns to avoid dividing by zero in the ratio below
+		ratio := float64(delay2) / float64(delay1)
+		if math.Floor(ratio) != 2 && math.Ceil(ratio) != 2 {
+			Failf("back-off gap is not increasing exponentially pod=%s/%s delay1=%s delay2=%s", podName, containerName, delay1, delay2)
+		}
+	})
+
+	It("should have their auto-restart back-off timer reset on image update", func() {
+		podName := "pod-back-off-image"
+		containerName := "back-off"
+		podClient := framework.Client.Pods(framework.Namespace.Name)
+		pod := &api.Pod{
+			ObjectMeta: api.ObjectMeta{
+				Name: podName,
+				Labels: map[string]string{"test": "back-off-image"},
+			},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: containerName,
+						Image: "gcr.io/google_containers/busybox",
+						Command: []string{"/bin/sh", "-c", "sleep 5", "/crash/missing"},
+					},
+				},
+			},
+		}
+
+		defer func() {
+			By("deleting the pod")
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		}()
+
+		delay1, delay2 := startPodAndGetBackOffs(framework, pod, podName, containerName, buildBackOffDuration)
+
+		By("updating the image")
+		pod, err := podClient.Get(pod.Name)
+		if err != nil {
+			Failf("failed to get pod: %v", err)
+		}
+		pod.Spec.Containers[0].Image = "nginx"
+		pod, err = podClient.Update(pod)
+		if err != nil {
+			Failf("error updating pod=%s/%s %v", podName, containerName, err)
+		}
+		time.Sleep(syncLoopFrequency)
+		expectNoError(framework.WaitForPodRunning(pod.Name))
+
+		By("getting restart delay after image update")
+		delayAfterUpdate, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+		if err != nil {
+			Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+		}
+
+		if delayAfterUpdate > delay2 || delayAfterUpdate > delay1 {
+			Failf("updating image did not reset the back-off value in pod=%s/%s d3=%s d2=%s d1=%s", podName, containerName, delayAfterUpdate, delay2, delay1)
+		}
+	})
+
+	It("should not back-off restarting a container on LivenessProbe failure", func() {
+		podClient := framework.Client.Pods(framework.Namespace.Name)
+		podName := "pod-back-off-liveness"
+		containerName := "back-off-liveness"
+		pod := &api.Pod{
+			ObjectMeta: api.ObjectMeta{
+				Name: podName,
+				Labels: map[string]string{"test": "liveness"},
+			},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: containerName,
+						Image: "gcr.io/google_containers/busybox",
+						Command: []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 5; rm -rf /tmp/health; sleep 600"},
+						LivenessProbe: &api.Probe{
+							Handler: api.Handler{
+								Exec: &api.ExecAction{
+									Command: []string{"cat", "/tmp/health"},
+								},
+							},
+							InitialDelaySeconds: 5,
+						},
+					},
+				},
+			},
+		}
+
+		defer func() {
+			By("deleting the pod")
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		}()
+
+		delay1, delay2 := startPodAndGetBackOffs(framework, pod, podName, containerName, buildBackOffDuration)
+
+		if math.Abs(float64(delay2-delay1)) > float64(syncLoopFrequency) {
+			Failf("back-off increasing on LivenessProbe failure delay1=%s delay2=%s", delay1, delay2)
+		}
+	})
+
+	It("should cap back-off at MaxContainerBackOff", func() {
+		podClient := framework.Client.Pods(framework.Namespace.Name)
+		podName := "back-off-cap"
+		containerName := "back-off-cap"
+		pod := &api.Pod{
+			ObjectMeta: api.ObjectMeta{
+				Name: podName,
+				Labels: map[string]string{"test": "liveness"},
+			},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: containerName,
+						Image: "gcr.io/google_containers/busybox",
+						Command: []string{"/bin/sh", "-c", "sleep 5", "/crash/missing"},
+					},
+				},
+			},
+		}
+
+		defer func() {
+			By("deleting the pod")
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		}()
+
+		runPodFromStruct(framework, pod)
+		time.Sleep(2 * kubelet.MaxContainerBackOff) // it takes slightly more than 2*x to get to a back-off of x
+
+		// wait for a delay == capped delay of MaxContainerBackOff
+		By("getting restart delay when capped")
+		var (
+			delay1 time.Duration
+			err    error
+		)
+		for i := 0; i < 3; i++ {
+			delay1, err = getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+			if err != nil {
+				Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+			}
+
+			if delay1 < kubelet.MaxContainerBackOff {
+				continue
+			}
+		}
+
+		if (delay1 < kubelet.MaxContainerBackOff) || (delay1 > maxBackOffTolerance) {
+			Failf("expected back-off of %s, got %s for delay1", kubelet.MaxContainerBackOff, delay1)
+		}
+
+		By("getting restart delay after a capped delay")
+		delay2, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+		if err != nil {
+			Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+		}
+
+		if delay2 < kubelet.MaxContainerBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
+			Failf("expected back-off of %s, got %s for delay2", kubelet.MaxContainerBackOff, delay2)
+		}
+	})
+
 	// The following tests for remote command execution and port forwarding are
 	// commented out because the GCE environment does not currently have nsenter
 	// in the kubelet's PATH, nor does it have socat installed. Once we figure