From 78979c77ce3487e9beea4f7f60683cb8c036827e Mon Sep 17 00:00:00 2001 From: Sam Abed Date: Thu, 15 Oct 2015 15:58:20 +1100 Subject: [PATCH] backoff e2e tests Signed-off-by: Sam Abed --- hack/jenkins/e2e.sh | 1 + pkg/kubelet/kubelet.go | 8 +- test/e2e/pods.go | 256 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 261 insertions(+), 4 deletions(-) diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh index bcc8b3c482b..00e6619b72b 100755 --- a/hack/jenkins/e2e.sh +++ b/hack/jenkins/e2e.sh @@ -149,6 +149,7 @@ GCE_SLOW_TESTS=( "resource\susage\stracking" # 1 hour, file: kubelet_perf.go, slow by design "monotonically\sincreasing\srestart\scount" # 1.5 to 5 min, file: pods.go, slow by design "KubeProxy\sshould\stest\skube-proxy" # 9 min 30 sec, file: kubeproxy.go, issue: #14204 + "cap\sback-off\sat\sMaxContainerBackOff" # 20 mins file: manager.go, PR: #12648 ) # Tests which are not able to be run in parallel. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 8d917fd7ce3..0c3229d31d3 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -88,8 +88,8 @@ const ( // Location of container logs. containerLogsDir = "/var/log/containers" - // max backoff period - maxContainerBackOff = 300 * time.Second + // max backoff period, exported for the e2e test + MaxContainerBackOff = 300 * time.Second // Capacity of the channel for storing pods to kill. A small number should // suffice because a goroutine is dedicated to check the channel and does @@ -308,7 +308,7 @@ func NewMainKubelet( } procFs := procfs.NewProcFs() - imageBackOff := util.NewBackOff(resyncInterval, maxContainerBackOff) + imageBackOff := util.NewBackOff(resyncInterval, MaxContainerBackOff) // Initialize the runtime. switch containerRuntime { case "docker": @@ -425,7 +425,7 @@ func NewMainKubelet( } } - klet.backOff = util.NewBackOff(resyncInterval, maxContainerBackOff) + klet.backOff = util.NewBackOff(resyncInterval, MaxContainerBackOff) klet.podKillingCh = make(chan *kubecontainer.Pod, podKillingChannelCapacity) klet.sourcesSeen = sets.NewString() diff --git a/test/e2e/pods.go b/test/e2e/pods.go index 9a1be3724f9..131485d9ca5 100644 --- a/test/e2e/pods.go +++ b/test/e2e/pods.go @@ -20,6 +20,7 @@ import ( "bytes" "fmt" "io" + "math" "strconv" "strings" "time" @@ -31,6 +32,7 @@ import ( "k8s.io/kubernetes/pkg/api/resource" client "k8s.io/kubernetes/pkg/client/unversioned" "k8s.io/kubernetes/pkg/fields" + "k8s.io/kubernetes/pkg/kubelet" "k8s.io/kubernetes/pkg/labels" "k8s.io/kubernetes/pkg/util" "k8s.io/kubernetes/pkg/util/wait" @@ -44,6 +46,12 @@ const ( defaultObservationTimeout = time.Minute * 2 ) +var ( + buildBackOffDuration = time.Minute + syncLoopFrequency = 10 * time.Second + maxBackOffTolerance = time.Duration(1.3 * float64(kubelet.MaxContainerBackOff)) +) + func runLivenessTest(c *client.Client, ns string, podDescr *api.Pod, expectNumRestarts int, timeout time.Duration) { By(fmt.Sprintf("Creating pod %s in namespace %s", podDescr.Name, ns)) _, err := c.Pods(ns).Create(podDescr) @@ -134,6 +142,70 @@ func testHostIP(c *client.Client, ns string, pod *api.Pod) { } } +func runPodFromStruct(framework *Framework, pod *api.Pod) { + By("submitting the pod to kubernetes") + + podClient := framework.Client.Pods(framework.Namespace.Name) + pod, err := podClient.Create(pod) + if err != nil { + Failf("Failed to create pod: %v", err) + } + + expectNoError(framework.WaitForPodRunning(pod.Name)) + + By("verifying the pod is in kubernetes") + pod, err = podClient.Get(pod.Name) + if err != nil { + Failf("failed 
to get pod: %v", err)
+	}
+}
+
+func startPodAndGetBackOffs(framework *Framework, pod *api.Pod, podName string, containerName string, sleepAmount time.Duration) (time.Duration, time.Duration) {
+	runPodFromStruct(framework, pod)
+	time.Sleep(sleepAmount)
+
+	By("getting restart delay-0")
+	_, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+	if err != nil {
+		Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+	}
+
+	By("getting restart delay-1")
+	delay1, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+	if err != nil {
+		Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+	}
+
+	By("getting restart delay-2")
+	delay2, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+	if err != nil {
+		Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+	}
+	return delay1, delay2
+}
+
+func getRestartDelay(c *client.Client, pod *api.Pod, ns string, name string, containerName string) (time.Duration, error) {
+	beginTime := time.Now()
+	for time.Since(beginTime) < (2 * maxBackOffTolerance) { // may just miss the 1st MaxContainerBackOff delay
+		time.Sleep(time.Second)
+		pod, err := c.Pods(ns).Get(name)
+		expectNoError(err, fmt.Sprintf("getting pod %s", name))
+		status, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, containerName)
+		if !ok {
+			Logf("getRestartDelay: status missing")
+			continue
+		}
+
+		if status.State.Waiting == nil && status.State.Running != nil && status.LastTerminationState.Terminated != nil && status.State.Running.StartedAt.Time.After(beginTime) {
+			startedAt := status.State.Running.StartedAt.Time
+			finishedAt := status.LastTerminationState.Terminated.FinishedAt.Time
+			Logf("getRestartDelay: finishedAt=%s restartedAt=%s (%s)", finishedAt, startedAt, startedAt.Sub(finishedAt))
+			return startedAt.Sub(finishedAt), nil
+		}
+	}
+	return 0, fmt.Errorf("timeout getting pod restart delay")
+}
+
 var _ = Describe("Pods", func() {
 	framework := NewFramework("pods")
 
@@ -755,6 +827,190 @@ var _ = Describe("Pods", func() {
 		}
 	})
 
+	It("should have their container restart back-off timer increase exponentially", func() {
+		podName := "pod-back-off-exponentially"
+		containerName := "back-off"
+		podClient := framework.Client.Pods(framework.Namespace.Name)
+		pod := &api.Pod{
+			ObjectMeta: api.ObjectMeta{
+				Name: podName,
+				Labels: map[string]string{"test": "back-off-image"},
+			},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: containerName,
+						Image: "gcr.io/google_containers/busybox",
+						Command: []string{"/bin/sh", "-c", "sleep 5", "/crash/missing"},
+					},
+				},
+			},
+		}
+
+		defer func() {
+			By("deleting the pod")
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		}()
+
+		delay1, delay2 := startPodAndGetBackOffs(framework, pod, podName, containerName, buildBackOffDuration)
+		delay1 += 1 // add 1ns to avoid dividing by zero in the ratio below
+		ratio := float64(delay2) / float64(delay1)
+		if math.Floor(ratio) != 2 && math.Ceil(ratio) != 2 {
+			Failf("back-off gap is not increasing exponentially pod=%s/%s delay1=%s delay2=%s", podName, containerName, delay1, delay2)
+		}
+	})
+
+	It("should have their auto-restart back-off timer reset on image update", func() {
+		podName := "pod-back-off-image"
+		containerName := "back-off"
+		podClient := framework.Client.Pods(framework.Namespace.Name)
+		pod := &api.Pod{
+			ObjectMeta: api.ObjectMeta{
+				Name: podName,
+				Labels: map[string]string{"test": "back-off-image"},
+			},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: containerName,
+						Image: "gcr.io/google_containers/busybox",
+						Command: []string{"/bin/sh", "-c", "sleep 5", "/crash/missing"},
+					},
+				},
+			},
+		}
+
+		defer func() {
+			By("deleting the pod")
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		}()
+
+		delay1, delay2 := startPodAndGetBackOffs(framework, pod, podName, containerName, buildBackOffDuration)
+
+		By("updating the image")
+		pod, err := podClient.Get(pod.Name)
+		if err != nil {
+			Failf("failed to get pod: %v", err)
+		}
+		pod.Spec.Containers[0].Image = "nginx"
+		pod, err = podClient.Update(pod)
+		if err != nil {
+			Failf("error updating pod=%s/%s %v", podName, containerName, err)
+		}
+		time.Sleep(syncLoopFrequency)
+		expectNoError(framework.WaitForPodRunning(pod.Name))
+
+		By("getting restart delay after image update")
+		delayAfterUpdate, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+		if err != nil {
+			Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+		}
+
+		if delayAfterUpdate > delay2 || delayAfterUpdate > delay1 {
+			Failf("updating image did not reset the back-off value in pod=%s/%s d3=%s d2=%s d1=%s", podName, containerName, delayAfterUpdate, delay2, delay1)
+		}
+	})
+
+	It("should not back-off restarting a container on LivenessProbe failure", func() {
+		podClient := framework.Client.Pods(framework.Namespace.Name)
+		podName := "pod-back-off-liveness"
+		containerName := "back-off-liveness"
+		pod := &api.Pod{
+			ObjectMeta: api.ObjectMeta{
+				Name: podName,
+				Labels: map[string]string{"test": "liveness"},
+			},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: containerName,
+						Image: "gcr.io/google_containers/busybox",
+						Command: []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 5; rm -rf /tmp/health; sleep 600"},
+						LivenessProbe: &api.Probe{
+							Handler: api.Handler{
+								Exec: &api.ExecAction{
+									Command: []string{"cat", "/tmp/health"},
+								},
+							},
+							InitialDelaySeconds: 5,
+						},
+					},
+				},
+			},
+		}
+
+		defer func() {
+			By("deleting the pod")
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		}()
+
+		delay1, delay2 := startPodAndGetBackOffs(framework, pod, podName, containerName, buildBackOffDuration)
+
+		if math.Abs(float64(delay2-delay1)) > float64(syncLoopFrequency) {
+			Failf("back-off increasing on LivenessProbe failure delay1=%s delay2=%s", delay1, delay2)
+		}
+	})
+
+	It("should cap back-off at MaxContainerBackOff", func() {
+		podClient := framework.Client.Pods(framework.Namespace.Name)
+		podName := "back-off-cap"
+		containerName := "back-off-cap"
+		pod := &api.Pod{
+			ObjectMeta: api.ObjectMeta{
+				Name: podName,
+				Labels: map[string]string{"test": "liveness"},
+			},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: containerName,
+						Image: "gcr.io/google_containers/busybox",
+						Command: []string{"/bin/sh", "-c", "sleep 5", "/crash/missing"},
+					},
+				},
+			},
+		}
+
+		defer func() {
+			By("deleting the pod")
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+		}()
+
+		runPodFromStruct(framework, pod)
+		time.Sleep(2 * kubelet.MaxContainerBackOff) // it takes slightly more than 2*x to get to a back-off of x
+
+		// wait for a delay == capped delay of MaxContainerBackOff
+		By("getting restart delay when capped")
+		var (
+			delay1 time.Duration
+			err    error
+		)
+		for i := 0; i < 3; i++ {
+			delay1, err = getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+			if err != nil {
+				Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+			}
+
+			if delay1 < kubelet.MaxContainerBackOff {
+				continue
+			}
+		}
+
+		if (delay1 < kubelet.MaxContainerBackOff) || (delay1 > maxBackOffTolerance) {
+			Failf("expected back-off of %s, got %s for delay1", kubelet.MaxContainerBackOff, delay1)
+		}
+
+		By("getting restart delay after a capped delay")
+		delay2, err := getRestartDelay(framework.Client, pod, framework.Namespace.Name, podName, containerName)
+		if err != nil {
+			Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
+		}
+
+		if delay2 < kubelet.MaxContainerBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
+			Failf("expected back-off of %s, got %s for delay2", kubelet.MaxContainerBackOff, delay2)
+		}
+	})
+
 	// The following tests for remote command execution and port forwarding are
 	// commented out because the GCE environment does not currently have nsenter
 	// in the kubelet's PATH, nor does it have socat installed. Once we figure