From 1a3d205faf9e7f7cd0be2af751ed0050e558e61e Mon Sep 17 00:00:00 2001
From: Yu-Ju Hong
Date: Fri, 26 Feb 2016 14:26:58 -0800
Subject: [PATCH] kubelet: stop probing if liveness check fails

This change puts the worker on hold when the liveness check fails. It resumes
probing once it observes a new container ID.
---
 pkg/kubelet/prober/worker.go      | 18 +++++++++++++++
 pkg/kubelet/prober/worker_test.go | 37 ++++++++++++++++++++++++++++++-
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index 3a760541414..5067dd5474f 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -61,6 +61,9 @@ type worker struct {
 	lastResult results.Result
 	// How many times in a row the probe has returned the same result.
 	resultRun int
+
+	// If set, skip probing.
+	onHold bool
 }
 
 // Creates and starts a new probe worker.
@@ -165,6 +168,13 @@ func (w *worker) doProbe() (keepGoing bool) {
 		}
 		w.containerID = kubecontainer.ParseContainerID(c.ContainerID)
 		w.resultsManager.Set(w.containerID, w.initialValue, w.pod)
+		// We've got a new container; resume probing.
+		w.onHold = false
+	}
+
+	if w.onHold {
+		// Worker is on hold until there is a new container.
+		return true
 	}
 
 	if c.State.Running == nil {
@@ -203,5 +213,13 @@ func (w *worker) doProbe() (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
+	if w.probeType == liveness && result == results.Failure {
+		// The container failed the liveness check and will need to be restarted.
+		// Stop probing until we see a new container ID. This is to reduce the
+		// chance of hitting #21751, where running `docker exec` when a
+		// container is being stopped may lead to corrupted container state.
+		w.onHold = true
+	}
+
 	return true
 }
diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index cfc62d355dc..2b23ad36aa9 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -275,7 +275,7 @@ func TestHandleCrash(t *testing.T) {
 }
 
 func expectResult(t *testing.T, w *worker, expectedResult results.Result, msg string) {
-	result, ok := resultsManager(w.probeManager, w.probeType).Get(testContainerID)
+	result, ok := resultsManager(w.probeManager, w.probeType).Get(w.containerID)
 	if !ok {
 		t.Errorf("[%s - %s] Expected result to be set, but was not set", w.probeType, msg)
 	} else if result != expectedResult {
@@ -305,3 +305,38 @@ type crashingExecProber struct{}
 func (p crashingExecProber) Probe(_ exec.Cmd) (probe.Result, string, error) {
 	panic("Intentional Probe crash.")
 }
+
+func TestOnHoldOnLivenessCheckFailure(t *testing.T) {
+	m := newTestManager()
+	w := newTestWorker(m, liveness, api.Probe{SuccessThreshold: 1, FailureThreshold: 1})
+	status := getTestRunningStatus()
+	m.statusManager.SetPodStatus(w.pod, status)
+
+	// First probe should fail.
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg := "first probe"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Failure, msg)
+	if !w.onHold {
+		t.Errorf("Prober should be on hold due to liveness check failure")
+	}
+	// Set fakeExecProber to return success. However, the result will remain
+	// failure because the worker is on hold and won't probe.
+	m.prober.exec = fakeExecProber{probe.Success, nil}
+	msg = "while on hold"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Failure, msg)
+	if !w.onHold {
+		t.Errorf("Prober should be on hold due to liveness check failure")
+	}
+
+	// Set a new container ID to lift the hold. The next probe will succeed.
+	status.ContainerStatuses[0].ContainerID = "test://newCont_ID"
+	m.statusManager.SetPodStatus(w.pod, status)
+	msg = "hold lifted"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.onHold {
+		t.Errorf("Prober should not be on hold anymore")
+	}
+}
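Note (not part of the patch): the sketch below is a simplified, self-contained model of the on-hold behavior the diff adds to doProbe(). The probeResult type and the fakeWorker/probeOnce names are hypothetical stand-ins for the kubelet's results.Result, worker, and doProbe; the real worker also consults the status manager and writes results to the results manager, which are omitted here.

package main

import "fmt"

// probeResult is a hypothetical stand-in for results.Result.
type probeResult int

const (
	failure probeResult = iota
	success
)

// fakeWorker models the two fields the patch cares about: the container ID
// currently being probed and the onHold flag that pauses probing.
type fakeWorker struct {
	containerID string
	lastResult  probeResult
	onHold      bool
}

// probeOnce mirrors the flow added by the patch: a new container ID lifts the
// hold, a held worker skips the probe entirely, and a liveness failure puts
// the worker on hold until the container is restarted under a new ID.
func (w *fakeWorker) probeOnce(containerID string, probe func() probeResult) probeResult {
	if containerID != w.containerID {
		// We've got a new container; resume probing.
		w.containerID = containerID
		w.onHold = false
	}
	if w.onHold {
		// Worker is on hold until there is a new container; do not probe.
		return w.lastResult
	}
	w.lastResult = probe()
	if w.lastResult == failure {
		// Liveness failed; the container will be restarted, so stop probing
		// until a new container ID appears (avoids exec'ing into a dying container).
		w.onHold = true
	}
	return w.lastResult
}

func main() {
	w := &fakeWorker{}
	fail := func() probeResult { return failure }
	ok := func() probeResult { return success }
	fmt.Println(w.probeOnce("docker://old", fail)) // 0: probe failed, worker goes on hold
	fmt.Println(w.probeOnce("docker://old", ok))   // 0: still on hold, probe is not run
	fmt.Println(w.probeOnce("docker://new", ok))   // 1: new container ID lifted the hold
}

In the real worker the result would also be pushed to the results manager so the kubelet restarts the container; the sketch only tracks the last result to stay short.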