From aa9a9e86e18541abd1c08751910ab9be9d2f29e1 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Fri, 13 Oct 2023 02:53:08 +0000
Subject: [PATCH 1/5] Stop StartupProbe explicitly when successThreshold is reached

---
 pkg/kubelet/prober/common_test.go |  4 +++
 pkg/kubelet/prober/worker.go      |  5 ++-
 pkg/kubelet/prober/worker_test.go | 53 ++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/pkg/kubelet/prober/common_test.go b/pkg/kubelet/prober/common_test.go
index d071392a1c3..be435cd1518 100644
--- a/pkg/kubelet/prober/common_test.go
+++ b/pkg/kubelet/prober/common_test.go
@@ -46,6 +46,10 @@ func getTestRunningStatus() v1.PodStatus {
 	return getTestRunningStatusWithStarted(true)
 }
 
+func getTestNotRunningStatus() v1.PodStatus {
+	return getTestRunningStatusWithStarted(false)
+}
+
 func getTestRunningStatusWithStarted(started bool) v1.PodStatus {
 	containerStatus := v1.ContainerStatus{
 		Name:        testContainerName,
diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index ea2d7243387..5dd1ef271c7 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -316,11 +316,14 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
-	if (w.probeType == liveness || w.probeType == startup) && result == results.Failure {
+	if (w.probeType == liveness && result == results.Failure) || w.probeType == startup {
 		// The container fails a liveness/startup check, it will need to be restarted.
 		// Stop probing until we see a new container ID. This is to reduce the
 		// chance of hitting #21751, where running `docker exec` when a
 		// container is being stopped may lead to corrupted container state.
+		// In addition, if the container succeeds a startup probe, we should stop probing
+		// until the container is restarted.
+		// This is to prevent extra Probe executions #117153.
 		w.onHold = true
 		w.resultRun = 0
 	}
diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index 4d1c6d140c1..c576fc51431 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -268,6 +268,48 @@ func TestSuccessThreshold(t *testing.T) {
 	}
 }
 
+func TestStartupProbeSuccessThreshold(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+	successThreshold := 1
+	failureThreshold := 3
+	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
+	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
+
+	m.prober.exec = fakeExecProber{probe.Success, nil}
+
+	for i := 0; i < successThreshold+1; i++ {
+		msg := fmt.Sprintf("%d success", successThreshold)
+		expectContinue(t, w, w.doProbe(ctx), msg)
+		expectResult(t, w, results.Success, msg)
+		expectResultRun(t, w, 0, msg)
+	}
+}
+
+func TestStartupProbeFailureThreshold(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+	successThreshold := 1
+	failureThreshold := 3
+	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
+	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
+
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+
+	for i := 0; i < failureThreshold+1; i++ {
+		msg := fmt.Sprintf("%d failure", i+1)
+		expectContinue(t, w, w.doProbe(ctx), msg)
+		if i < failureThreshold-1 {
+			expectResult(t, w, results.Unknown, msg)
+			expectResultRun(t, w, i+1, msg)
+		} else {
+			msg := fmt.Sprintf("%d failure", failureThreshold)
+			expectResult(t, w, results.Failure, msg)
+			expectResultRun(t, w, 0, msg)
+		}
+	}
+}
+
 func TestCleanUp(t *testing.T) {
 	m := newTestManager()
 
@@ -315,6 +357,13 @@ func expectResult(t *testing.T, w *worker, expectedResult results.Result, msg st
 	}
 }
 
+func expectResultRun(t *testing.T, w *worker, expectedResultRun int, msg string) {
+	if w.resultRun != expectedResultRun {
+		t.Errorf("[%s - %s] Expected resultRun to be %v, but was %v",
+			w.probeType, msg, expectedResultRun, w.resultRun)
+	}
+}
+
 func expectContinue(t *testing.T, w *worker, c bool, msg string) {
 	if !c {
 		t.Errorf("[%s - %s] Expected to continue, but did not", w.probeType, msg)
@@ -366,8 +415,10 @@ func TestOnHoldOnLivenessOrStartupCheckFailure(t *testing.T) {
 		msg = "hold lifted"
 		expectContinue(t, w, w.doProbe(ctx), msg)
 		expectResult(t, w, results.Success, msg)
-		if w.onHold {
+		if probeType == liveness && w.onHold {
 			t.Errorf("Prober should not be on hold anymore")
+		} else if probeType == startup && !w.onHold {
+			t.Errorf("Prober should be on hold due to %s check success", probeType)
 		}
 	}
 }

From 11ddb97c5ffd60425ccdc07d971e1d33bcb8b387 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Mon, 23 Oct 2023 06:26:01 +0000
Subject: [PATCH 2/5] add status check of startupProbe

---
 pkg/kubelet/prober/worker.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index 5dd1ef271c7..86200fe1538 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -316,12 +316,12 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
-	if (w.probeType == liveness && result == results.Failure) || w.probeType == startup {
+	if (w.probeType == liveness && result == results.Failure) || (w.probeType == startup && (result == results.Success || result == results.Failure)) {
 		// The container fails a liveness/startup check, it will need to be restarted.
 		// Stop probing until we see a new container ID. This is to reduce the
 		// chance of hitting #21751, where running `docker exec` when a
 		// container is being stopped may lead to corrupted container state.
-		// In addition, if the container succeeds a startup probe, we should stop probing
+		// In addition, if the threshold for each result of a startup probe is exceeded, we should stop probing
 		// until the container is restarted.
 		// This is to prevent extra Probe executions #117153.
 		w.onHold = true

From ead21021fdbe82aab9af9ab43e4ef3f44ad6ba96 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Fri, 27 Oct 2023 05:08:09 +0000
Subject: [PATCH 3/5] add threshold check using w.onHold

---
 pkg/kubelet/prober/worker_test.go | 69 ++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index c576fc51431..93a3bfd34ce 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -275,14 +275,29 @@ func TestStartupProbeSuccessThreshold(t *testing.T) {
 	failureThreshold := 3
 	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
 	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
-
 	m.prober.exec = fakeExecProber{probe.Success, nil}
 
 	for i := 0; i < successThreshold+1; i++ {
+		if i < successThreshold {
+			// Probe should not be on hold and will continue to be executed
+			// until successThreshold is met
+			if w.onHold {
+				t.Errorf("Prober should not be on hold")
+			}
+		} else {
+			// Probe should be on hold and will not be executed anymore
+			// when successThreshold is met
+			if !w.onHold {
+				t.Errorf("Prober should be on hold because successThreshold is exceeded")
+			}
+		}
 		msg := fmt.Sprintf("%d success", successThreshold)
 		expectContinue(t, w, w.doProbe(ctx), msg)
 		expectResult(t, w, results.Success, msg)
-		expectResultRun(t, w, 0, msg)
+		// Meeting or exceeding successThreshold should cause resultRun to reset to 0
+		if w.resultRun != 0 {
+			t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+		}
 	}
 }
 
@@ -293,19 +308,48 @@ func TestStartupProbeFailureThreshold(t *testing.T) {
 	failureThreshold := 3
 	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
 	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
-
 	m.prober.exec = fakeExecProber{probe.Failure, nil}
 
 	for i := 0; i < failureThreshold+1; i++ {
-		msg := fmt.Sprintf("%d failure", i+1)
-		expectContinue(t, w, w.doProbe(ctx), msg)
 		if i < failureThreshold-1 {
+			// Probe should not be on hold and will continue to be executed
+			// until failureThreshold is met
+			if w.onHold {
+				t.Errorf("Prober should not be on hold")
+			}
+			msg := fmt.Sprintf("%d failure", i+1)
+			expectContinue(t, w, w.doProbe(ctx), msg)
 			expectResult(t, w, results.Unknown, msg)
-			expectResultRun(t, w, i+1, msg)
-		} else {
-			msg := fmt.Sprintf("%d failure", failureThreshold)
+			// resultRun should be incremented until failureThreshold is met
+			if w.resultRun != i+1 {
+				t.Errorf("Prober resultRun should be %d, but %d", i+1, w.resultRun)
+			}
+		} else if i < failureThreshold {
+			// Probe should not be on hold and will continue to be executed
+			// until failureThreshold is met
+			if w.onHold {
+				t.Errorf("Prober should not be on hold")
+			}
+			msg := fmt.Sprintf("%d failure", i+1)
+			expectContinue(t, w, w.doProbe(ctx), msg)
 			expectResult(t, w, results.Failure, msg)
-			expectResultRun(t, w, 0, msg)
+			// Meeting failureThreshold should cause resultRun to reset to 0
+			if w.resultRun != 0 {
+				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			}
+		} else {
+			// Probe should be on hold and will not be executed anymore
+			// when failureThreshold is met
+			if !w.onHold {
+				t.Errorf("Prober should be on hold because failureThreshold is exceeded")
+			}
+			msg := fmt.Sprintf("%d failure", failureThreshold)
+			expectContinue(t, w, w.doProbe(ctx), msg)
+			expectResult(t, w, results.Failure, msg)
+			// Exceeding failureThreshold should cause resultRun to reset to 0
+			if w.resultRun != 0 {
+				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			}
 		}
 	}
 }
@@ -357,13 +401,6 @@ func expectResult(t *testing.T, w *worker, expectedResult results.Result, msg st
 	}
 }
 
-func expectResultRun(t *testing.T, w *worker, expectedResultRun int, msg string) {
-	if w.resultRun != expectedResultRun {
-		t.Errorf("[%s - %s] Expected resultRun to be %v, but was %v",
-			w.probeType, msg, expectedResultRun, w.resultRun)
-	}
-}
-
 func expectContinue(t *testing.T, w *worker, c bool, msg string) {
 	if !c {
 		t.Errorf("[%s - %s] Expected to continue, but did not", w.probeType, msg)

From a262c806d2e9946db8489b0956d853dc8d152671 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Mon, 26 Aug 2024 03:04:32 +0000
Subject: [PATCH 4/5] fix from review comment

---
 pkg/kubelet/prober/worker.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index 86200fe1538..dbd723332dc 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -316,7 +316,7 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
-	if (w.probeType == liveness && result == results.Failure) || (w.probeType == startup && (result == results.Success || result == results.Failure)) {
+	if (w.probeType == liveness && result == results.Failure) || w.probeType == startup {
 		// The container fails a liveness/startup check, it will need to be restarted.
 		// Stop probing until we see a new container ID. This is to reduce the
 		// chance of hitting #21751, where running `docker exec` when a

From 632f162d719dd3df43a826b1d2a0fd2c0b5cbb72 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Mon, 26 Aug 2024 08:39:22 +0000
Subject: [PATCH 5/5] fix unit test

---
 pkg/kubelet/prober/worker_test.go | 55 +++++++++++++++----------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index 93a3bfd34ce..fa852c5be3d 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -284,19 +284,19 @@ func TestStartupProbeSuccessThreshold(t *testing.T) {
 			if w.onHold {
 				t.Errorf("Prober should not be on hold")
 			}
+			msg := fmt.Sprintf("%d success", i+1)
+			expectContinue(t, w, w.doProbe(ctx), msg)
+			expectResult(t, w, results.Success, msg)
 		} else {
 			// Probe should be on hold and will not be executed anymore
 			// when successThreshold is met
 			if !w.onHold {
 				t.Errorf("Prober should be on hold because successThreshold is exceeded")
 			}
-		}
-		msg := fmt.Sprintf("%d success", successThreshold)
-		expectContinue(t, w, w.doProbe(ctx), msg)
-		expectResult(t, w, results.Success, msg)
-		// Meeting or exceeding successThreshold should cause resultRun to reset to 0
-		if w.resultRun != 0 {
-			t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			// Meeting or exceeding successThreshold should cause resultRun to reset to 0
+			if w.resultRun != 0 {
+				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			}
 		}
 	}
 }
@@ -311,41 +311,40 @@ func TestStartupProbeFailureThreshold(t *testing.T) {
 	m.prober.exec = fakeExecProber{probe.Failure, nil}
 
 	for i := 0; i < failureThreshold+1; i++ {
-		if i < failureThreshold-1 {
+		if i < failureThreshold {
 			// Probe should not be on hold and will continue to be executed
 			// until failureThreshold is met
 			if w.onHold {
 				t.Errorf("Prober should not be on hold")
 			}
 			msg := fmt.Sprintf("%d failure", i+1)
 			expectContinue(t, w, w.doProbe(ctx), msg)
-			expectResult(t, w, results.Unknown, msg)
-			// resultRun should be incremented until failureThreshold is met
-			if w.resultRun != i+1 {
-				t.Errorf("Prober resultRun should be %d, but %d", i+1, w.resultRun)
-			}
-		} else if i < failureThreshold {
-			// Probe should not be on hold and will continue to be executed
-			// until failureThreshold is met
-			if w.onHold {
-				t.Errorf("Prober should not be on hold")
-			}
-			msg := fmt.Sprintf("%d failure", i+1)
-			expectContinue(t, w, w.doProbe(ctx), msg)
-			expectResult(t, w, results.Failure, msg)
-			// Meeting failureThreshold should cause resultRun to reset to 0
-			if w.resultRun != 0 {
-				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			switch i {
+			case 0, 1:
+				// At this point, the expected result is Unknown
+				// because w.resultsManager.Set() will be called after FailureThreshold is reached
+				expectResult(t, w, results.Unknown, msg)
+				// resultRun should be incremented until failureThreshold is met
+				if w.resultRun != i+1 {
+					t.Errorf("Prober resultRun should be %d, but %d", i+1, w.resultRun)
+				}
+			case 2:
+				// The expected result is Failure
+				// because w.resultsManager.Set() will be called due to resultRun reaching failureThreshold,
+				// updating the cached result to Failure.
+				// After that, resultRun will be reset to 0.
+				expectResult(t, w, results.Failure, msg)
+				// Meeting failureThreshold should cause resultRun to reset to 0
+				if w.resultRun != 0 {
+					t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+				}
 			}
 		} else {
 			// Probe should be on hold and will not be executed anymore
 			// when failureThreshold is met
 			if !w.onHold {
 				t.Errorf("Prober should be on hold because failureThreshold is exceeded")
 			}
-			msg := fmt.Sprintf("%d failure", failureThreshold)
-			expectContinue(t, w, w.doProbe(ctx), msg)
-			expectResult(t, w, results.Failure, msg)
 			// Exceeding failureThreshold should cause resultRun to reset to 0
 			if w.resultRun != 0 {
 				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
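
As a reader's companion to the series: below is a minimal, self-contained Go sketch of the threshold/on-hold behavior that doProbe ends up with after patch 4. The probeWorker type, its observe method, and the result constants are simplified stand-ins invented for illustration, not the kubelet's actual worker API; only the successThreshold/failureThreshold/resultRun/onHold interplay is modeled.

package main

import "fmt"

type result int

const (
	unknown result = iota
	success
	failure
)

func (r result) String() string { return [...]string{"Unknown", "Success", "Failure"}[r] }

// probeWorker keeps only the worker fields this patch series touches.
type probeWorker struct {
	successThreshold int // stands in for v1.Probe.SuccessThreshold
	failureThreshold int // stands in for v1.Probe.FailureThreshold
	lastResult       result
	resultRun        int  // consecutive probes with the same raw outcome
	onHold           bool // stop probing until the container is restarted
}

// observe mirrors the threshold handling after patch 4: once a startup probe
// settles (either threshold reached), the worker publishes the result, resets
// resultRun, and goes on hold whether the result is Success or Failure; a
// liveness worker would go on hold only on Failure.
func (w *probeWorker) observe(raw result) result {
	if w.onHold {
		return w.lastResult // held: no probe is executed until restart
	}
	if raw == w.lastResult {
		w.resultRun++
	} else {
		w.lastResult = raw
		w.resultRun = 1
	}
	// Below the threshold the published result is not yet updated; a fresh
	// startup worker's cached result starts out as Unknown.
	if (raw == failure && w.resultRun < w.failureThreshold) ||
		(raw == success && w.resultRun < w.successThreshold) {
		return unknown
	}
	w.onHold = true
	w.resultRun = 0
	return raw
}

func main() {
	w := &probeWorker{successThreshold: 1, failureThreshold: 3, lastResult: unknown}
	for i := 0; i < 4; i++ {
		fmt.Printf("run %d: result=%v onHold=%v resultRun=%d\n",
			i+1, w.observe(failure), w.onHold, w.resultRun)
	}
}

Run with failureThreshold=3 as above, this prints Unknown, Unknown, Failure, and then a held Failure with resultRun pinned at 0 — the same sequence TestStartupProbeFailureThreshold asserts after the final patch.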