From 1a3d205faf9e7f7cd0be2af751ed0050e558e61e Mon Sep 17 00:00:00 2001
From: Yu-Ju Hong
Date: Fri, 26 Feb 2016 14:26:58 -0800
Subject: [PATCH] kubelet: stop probing if liveness check fails

This change puts the worker on hold when the liveness check fails. It resumes
probing once it observes a new container ID.
---
 pkg/kubelet/prober/worker.go      | 18 +++++++++++++++
 pkg/kubelet/prober/worker_test.go | 37 ++++++++++++++++++++++++++++++-
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index 3a760541414..5067dd5474f 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -61,6 +61,9 @@ type worker struct {
 	lastResult results.Result
 	// How many times in a row the probe has returned the same result.
 	resultRun int
+
+	// If set, skip probing.
+	onHold bool
 }
 
 // Creates and starts a new probe worker.
@@ -165,6 +168,13 @@ func (w *worker) doProbe() (keepGoing bool) {
 		}
 		w.containerID = kubecontainer.ParseContainerID(c.ContainerID)
 		w.resultsManager.Set(w.containerID, w.initialValue, w.pod)
+		// We've got a new container; resume probing.
+		w.onHold = false
+	}
+
+	if w.onHold {
+		// Worker is on hold until there is a new container.
+		return true
 	}
 
 	if c.State.Running == nil {
@@ -203,5 +213,13 @@ func (w *worker) doProbe() (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
+	if w.probeType == liveness && result == results.Failure {
+		// The container failed the liveness check and will need to be restarted.
+		// Stop probing until we see a new container ID. This is to reduce the
+		// chance of hitting #21751, where running `docker exec` when a
+		// container is being stopped may lead to corrupted container state.
+		w.onHold = true
+	}
+
 	return true
 }
diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index cfc62d355dc..2b23ad36aa9 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -275,7 +275,7 @@ func TestHandleCrash(t *testing.T) {
 }
 
 func expectResult(t *testing.T, w *worker, expectedResult results.Result, msg string) {
-	result, ok := resultsManager(w.probeManager, w.probeType).Get(testContainerID)
+	result, ok := resultsManager(w.probeManager, w.probeType).Get(w.containerID)
 	if !ok {
 		t.Errorf("[%s - %s] Expected result to be set, but was not set", w.probeType, msg)
 	} else if result != expectedResult {
@@ -305,3 +305,38 @@ type crashingExecProber struct{}
 func (p crashingExecProber) Probe(_ exec.Cmd) (probe.Result, string, error) {
 	panic("Intentional Probe crash.")
 }
+
+func TestOnHoldOnLivenessCheckFailure(t *testing.T) {
+	m := newTestManager()
+	w := newTestWorker(m, liveness, api.Probe{SuccessThreshold: 1, FailureThreshold: 1})
+	status := getTestRunningStatus()
+	m.statusManager.SetPodStatus(w.pod, status)
+
+	// First probe should fail.
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg := "first probe"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Failure, msg)
+	if !w.onHold {
+		t.Errorf("Prober should be on hold due to liveness check failure")
+	}
+	// Set fakeExecProber to return success. However, the result will remain
+	// failure because the worker is on hold and won't probe.
+	m.prober.exec = fakeExecProber{probe.Success, nil}
+	msg = "while on hold"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Failure, msg)
+	if !w.onHold {
+		t.Errorf("Prober should be on hold due to liveness check failure")
+	}
+
+	// Set a new container ID to lift the hold. The next probe will succeed.
+	status.ContainerStatuses[0].ContainerID = "test://newCont_ID"
+	m.statusManager.SetPodStatus(w.pod, status)
+	msg = "hold lifted"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.onHold {
+		t.Errorf("Prober should not be on hold anymore")
+	}
+}
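Note (not part of the patch): the sketch below is a simplified, self-contained model of the on-hold behavior the diff adds to doProbe(). The probeResult type and the fakeWorker/probeOnce names are hypothetical stand-ins for the kubelet's results.Result, worker, and doProbe; the real worker also consults the status manager and writes results to the results manager, which are omitted here.

package main

import "fmt"

// probeResult is a hypothetical stand-in for results.Result.
type probeResult int

const (
	failure probeResult = iota
	success
)

// fakeWorker models the two fields the patch cares about: the container ID
// currently being probed and the onHold flag that pauses probing.
type fakeWorker struct {
	containerID string
	lastResult  probeResult
	onHold      bool
}

// probeOnce mirrors the flow added by the patch: a new container ID lifts the
// hold, a held worker skips the probe entirely, and a liveness failure puts
// the worker on hold until the container is restarted under a new ID.
func (w *fakeWorker) probeOnce(containerID string, probe func() probeResult) probeResult {
	if containerID != w.containerID {
		// We've got a new container; resume probing.
		w.containerID = containerID
		w.onHold = false
	}
	if w.onHold {
		// Worker is on hold until there is a new container; do not probe.
		return w.lastResult
	}
	w.lastResult = probe()
	if w.lastResult == failure {
		// Liveness failed; the container will be restarted, so stop probing
		// until a new container ID appears (avoids exec'ing into a dying container).
		w.onHold = true
	}
	return w.lastResult
}

func main() {
	w := &fakeWorker{}
	fail := func() probeResult { return failure }
	ok := func() probeResult { return success }
	fmt.Println(w.probeOnce("docker://old", fail)) // 0: probe failed, worker goes on hold
	fmt.Println(w.probeOnce("docker://old", ok))   // 0: still on hold, probe is not run
	fmt.Println(w.probeOnce("docker://new", ok))   // 1: new container ID lifted the hold
}

In the real worker the result would also be pushed to the results manager so the kubelet restarts the container; the sketch only tracks the last result to stay short.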