From aa9a9e86e18541abd1c08751910ab9be9d2f29e1 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Fri, 13 Oct 2023 02:53:08 +0000
Subject: [PATCH 1/5] Stop StartupProbe explicitly when successThreshold is reached

---
 pkg/kubelet/prober/common_test.go |  4 +++
 pkg/kubelet/prober/worker.go      |  5 ++-
 pkg/kubelet/prober/worker_test.go | 53 ++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/pkg/kubelet/prober/common_test.go b/pkg/kubelet/prober/common_test.go
index d071392a1c3..be435cd1518 100644
--- a/pkg/kubelet/prober/common_test.go
+++ b/pkg/kubelet/prober/common_test.go
@@ -46,6 +46,10 @@ func getTestRunningStatus() v1.PodStatus {
 	return getTestRunningStatusWithStarted(true)
 }
 
+func getTestNotRunningStatus() v1.PodStatus {
+	return getTestRunningStatusWithStarted(false)
+}
+
 func getTestRunningStatusWithStarted(started bool) v1.PodStatus {
 	containerStatus := v1.ContainerStatus{
 		Name:        testContainerName,
diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index ea2d7243387..5dd1ef271c7 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -316,11 +316,14 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
-	if (w.probeType == liveness || w.probeType == startup) && result == results.Failure {
+	if (w.probeType == liveness && result == results.Failure) || w.probeType == startup {
 		// The container fails a liveness/startup check, it will need to be restarted.
 		// Stop probing until we see a new container ID. This is to reduce the
 		// chance of hitting #21751, where running `docker exec` when a
 		// container is being stopped may lead to corrupted container state.
+		// In addition, if the container succeeds a startup probe, we should stop probing
+		// until the container is restarted.
+		// This is to prevent extra Probe executions #117153.
 		w.onHold = true
 		w.resultRun = 0
 	}
diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index 4d1c6d140c1..c576fc51431 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -268,6 +268,48 @@ func TestSuccessThreshold(t *testing.T) {
 	}
 }
 
+func TestStartupProbeSuccessThreshold(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+	successThreshold := 1
+	failureThreshold := 3
+	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
+	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
+
+	m.prober.exec = fakeExecProber{probe.Success, nil}
+
+	for i := 0; i < successThreshold+1; i++ {
+		msg := fmt.Sprintf("%d success", successThreshold)
+		expectContinue(t, w, w.doProbe(ctx), msg)
+		expectResult(t, w, results.Success, msg)
+		expectResultRun(t, w, 0, msg)
+	}
+}
+
+func TestStartupProbeFailureThreshold(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+	successThreshold := 1
+	failureThreshold := 3
+	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
+	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
+
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+
+	for i := 0; i < failureThreshold+1; i++ {
+		msg := fmt.Sprintf("%d failure", i+1)
+		expectContinue(t, w, w.doProbe(ctx), msg)
+		if i < failureThreshold-1 {
+			expectResult(t, w, results.Unknown, msg)
+			expectResultRun(t, w, i+1, msg)
+		} else {
+			msg := fmt.Sprintf("%d failure", failureThreshold)
+			expectResult(t, w, results.Failure, msg)
+			expectResultRun(t, w, 0, msg)
+		}
+	}
+}
+
 func TestCleanUp(t *testing.T) {
 	m := newTestManager()
 
@@ -315,6 +357,13 @@ func expectResult(t *testing.T, w *worker, expectedResult results.Result, msg st
 	}
 }
 
+func expectResultRun(t *testing.T, w *worker, expectedResultRun int, msg string) {
+	if w.resultRun != expectedResultRun {
+		t.Errorf("[%s - %s] Expected resultRun to be %v, but was %v",
+			w.probeType, msg, expectedResultRun, w.resultRun)
+	}
+}
+
 func expectContinue(t *testing.T, w *worker, c bool, msg string) {
 	if !c {
 		t.Errorf("[%s - %s] Expected to continue, but did not", w.probeType, msg)
@@ -366,8 +415,10 @@ func TestOnHoldOnLivenessOrStartupCheckFailure(t *testing.T) {
 		msg = "hold lifted"
 		expectContinue(t, w, w.doProbe(ctx), msg)
 		expectResult(t, w, results.Success, msg)
-		if w.onHold {
+		if probeType == liveness && w.onHold {
 			t.Errorf("Prober should not be on hold anymore")
+		} else if probeType == startup && !w.onHold {
+			t.Errorf("Prober should be on hold due to %s check success", probeType)
 		}
 	}
 }

From 11ddb97c5ffd60425ccdc07d971e1d33bcb8b387 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Mon, 23 Oct 2023 06:26:01 +0000
Subject: [PATCH 2/5] add status check of startupProbe

---
 pkg/kubelet/prober/worker.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index 5dd1ef271c7..86200fe1538 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -316,12 +316,12 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
-	if (w.probeType == liveness && result == results.Failure) || w.probeType == startup {
+	if (w.probeType == liveness && result == results.Failure) || (w.probeType == startup && (result == results.Success || result == results.Failure)) {
 		// The container fails a liveness/startup check, it will need to be restarted.
 		// Stop probing until we see a new container ID. This is to reduce the
 		// chance of hitting #21751, where running `docker exec` when a
 		// container is being stopped may lead to corrupted container state.
-		// In addition, if the container succeeds a startup probe, we should stop probing
+		// In addition, if the threshold for each result of a startup probe is exceeded, we should stop probing
 		// until the container is restarted.
 		// This is to prevent extra Probe executions #117153.
 		w.onHold = true

From ead21021fdbe82aab9af9ab43e4ef3f44ad6ba96 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Fri, 27 Oct 2023 05:08:09 +0000
Subject: [PATCH 3/5] add threshold check using w.onHold

---
 pkg/kubelet/prober/worker_test.go | 69 ++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index c576fc51431..93a3bfd34ce 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -275,14 +275,29 @@ func TestStartupProbeSuccessThreshold(t *testing.T) {
 	failureThreshold := 3
 	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
 	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
-
 	m.prober.exec = fakeExecProber{probe.Success, nil}
 
 	for i := 0; i < successThreshold+1; i++ {
+		if i < successThreshold {
+			// Probe should not be on hold and will continue to be executed
+			// until successThreshold is met
+			if w.onHold {
+				t.Errorf("Prober should not be on hold")
+			}
+		} else {
+			// Probe should be on hold and will not be executed anymore
+			// when successThreshold is met
+			if !w.onHold {
+				t.Errorf("Prober should be on hold because successThreshold is exceeded")
+			}
+		}
 		msg := fmt.Sprintf("%d success", successThreshold)
 		expectContinue(t, w, w.doProbe(ctx), msg)
 		expectResult(t, w, results.Success, msg)
-		expectResultRun(t, w, 0, msg)
+		// Meeting or exceeding successThreshold should cause resultRun to reset to 0
+		if w.resultRun != 0 {
+			t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+		}
 	}
 }
 
@@ -293,19 +308,48 @@ func TestStartupProbeFailureThreshold(t *testing.T) {
 	failureThreshold := 3
 	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: int32(successThreshold), FailureThreshold: int32(failureThreshold)})
 	m.statusManager.SetPodStatus(w.pod, getTestNotRunningStatus())
-
 	m.prober.exec = fakeExecProber{probe.Failure, nil}
 
 	for i := 0; i < failureThreshold+1; i++ {
-		msg := fmt.Sprintf("%d failure", i+1)
-		expectContinue(t, w, w.doProbe(ctx), msg)
 		if i < failureThreshold-1 {
+			// Probe should not be on hold and will continue to be executed
+			// until failureThreshold is met
+			if w.onHold {
+				t.Errorf("Prober should not be on hold")
+			}
+			msg := fmt.Sprintf("%d failure", i+1)
+			expectContinue(t, w, w.doProbe(ctx), msg)
 			expectResult(t, w, results.Unknown, msg)
-			expectResultRun(t, w, i+1, msg)
-		} else {
-			msg := fmt.Sprintf("%d failure", failureThreshold)
+			// resultRun should be incremented until failureThreshold is met
+			if w.resultRun != i+1 {
+				t.Errorf("Prober resultRun should be %d, but %d", i+1, w.resultRun)
+			}
+		} else if i < failureThreshold {
+			// Probe should not be on hold and will continue to be executed
+			// until failureThreshold is met
+			if w.onHold {
+				t.Errorf("Prober should not be on hold")
+			}
+			msg := fmt.Sprintf("%d failure", i+1)
+			expectContinue(t, w, w.doProbe(ctx), msg)
 			expectResult(t, w, results.Failure, msg)
-			expectResultRun(t, w, 0, msg)
+			// Meeting failureThreshold should cause resultRun to reset to 0
+			if w.resultRun != 0 {
+				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			}
+		} else {
+			// Probe should be on hold and will not be executed anymore
+			// when failureThreshold is met
+			if !w.onHold {
+				t.Errorf("Prober should be on hold because failureThreshold is exceeded")
+			}
+			msg := fmt.Sprintf("%d failure", failureThreshold)
+			expectContinue(t, w, w.doProbe(ctx), msg)
+			expectResult(t, w, results.Failure, msg)
+			// Exceeding failureThreshold should cause resultRun to reset to 0
+			if w.resultRun != 0 {
+				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			}
 		}
 	}
 }
@@ -357,13 +401,6 @@ func expectResult(t *testing.T, w *worker, expectedResult results.Result, msg st
 	}
 }
 
-func expectResultRun(t *testing.T, w *worker, expectedResultRun int, msg string) {
-	if w.resultRun != expectedResultRun {
-		t.Errorf("[%s - %s] Expected resultRun to be %v, but was %v",
-			w.probeType, msg, expectedResultRun, w.resultRun)
-	}
-}
-
 func expectContinue(t *testing.T, w *worker, c bool, msg string) {
 	if !c {
 		t.Errorf("[%s - %s] Expected to continue, but did not", w.probeType, msg)

From a262c806d2e9946db8489b0956d853dc8d152671 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Mon, 26 Aug 2024 03:04:32 +0000
Subject: [PATCH 4/5] fix from review comment

---
 pkg/kubelet/prober/worker.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index 86200fe1538..dbd723332dc 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -316,7 +316,7 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 
 	w.resultsManager.Set(w.containerID, result, w.pod)
 
-	if (w.probeType == liveness && result == results.Failure) || (w.probeType == startup && (result == results.Success || result == results.Failure)) {
+	if (w.probeType == liveness && result == results.Failure) || w.probeType == startup {
 		// The container fails a liveness/startup check, it will need to be restarted.
 		// Stop probing until we see a new container ID. This is to reduce the
 		// chance of hitting #21751, where running `docker exec` when a

From 632f162d719dd3df43a826b1d2a0fd2c0b5cbb72 Mon Sep 17 00:00:00 2001
From: mochizuki875
Date: Mon, 26 Aug 2024 08:39:22 +0000
Subject: [PATCH 5/5] fix unit test

---
 pkg/kubelet/prober/worker_test.go | 55 +++++++++++++++----------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index 93a3bfd34ce..fa852c5be3d 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -284,19 +284,19 @@ func TestStartupProbeSuccessThreshold(t *testing.T) {
 			if w.onHold {
 				t.Errorf("Prober should not be on hold")
 			}
+			msg := fmt.Sprintf("%d success", i+1)
+			expectContinue(t, w, w.doProbe(ctx), msg)
+			expectResult(t, w, results.Success, msg)
 		} else {
 			// Probe should be on hold and will not be executed anymore
 			// when successThreshold is met
 			if !w.onHold {
 				t.Errorf("Prober should be on hold because successThreshold is exceeded")
 			}
-		}
-		msg := fmt.Sprintf("%d success", successThreshold)
-		expectContinue(t, w, w.doProbe(ctx), msg)
-		expectResult(t, w, results.Success, msg)
-		// Meeting or exceeding successThreshold should cause resultRun to reset to 0
-		if w.resultRun != 0 {
-			t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			// Meeting or exceeding successThreshold should cause resultRun to reset to 0
+			if w.resultRun != 0 {
+				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			}
 		}
 	}
 }
@@ -311,41 +311,40 @@ func TestStartupProbeFailureThreshold(t *testing.T) {
 	m.prober.exec = fakeExecProber{probe.Failure, nil}
 
 	for i := 0; i < failureThreshold+1; i++ {
-		if i < failureThreshold-1 {
+		if i < failureThreshold {
 			// Probe should not be on hold and will continue to be executed
 			// until failureThreshold is met
 			if w.onHold {
 				t.Errorf("Prober should not be on hold")
 			}
 			msg := fmt.Sprintf("%d failure", i+1)
 			expectContinue(t, w, w.doProbe(ctx), msg)
-			expectResult(t, w, results.Unknown, msg)
-			// resultRun should be incremented until failureThreshold is met
-			if w.resultRun != i+1 {
-				t.Errorf("Prober resultRun should be %d, but %d", i+1, w.resultRun)
-			}
-		} else if i < failureThreshold {
-			// Probe should not be on hold and will continue to be executed
-			// until failureThreshold is met
-			if w.onHold {
-				t.Errorf("Prober should not be on hold")
-			}
-			msg := fmt.Sprintf("%d failure", i+1)
-			expectContinue(t, w, w.doProbe(ctx), msg)
-			expectResult(t, w, results.Failure, msg)
-			// Meeting failureThreshold should cause resultRun to reset to 0
-			if w.resultRun != 0 {
-				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+			switch i {
+			case 0, 1:
+				// At this point, the expected result is Unknown
+				// because w.resultsManager.Set() will be called after FailureThreshold is reached
+				expectResult(t, w, results.Unknown, msg)
+				// resultRun should be incremented until failureThreshold is met
+				if w.resultRun != i+1 {
+					t.Errorf("Prober resultRun should be %d, but %d", i+1, w.resultRun)
+				}
+			case 2:
+				// The expected result is Failure
+				// because w.resultsManager.Set() will be called due to resultRun reaching failureThreshold,
+				// updating the cached result to Failure.
+				// After that, resultRun will be reset to 0.
+				expectResult(t, w, results.Failure, msg)
+				// Meeting failureThreshold should cause resultRun to reset to 0
+				if w.resultRun != 0 {
+					t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
+				}
 			}
 		} else {
 			// Probe should be on hold and will not be executed anymore
 			// when failureThreshold is met
 			if !w.onHold {
 				t.Errorf("Prober should be on hold because failureThreshold is exceeded")
 			}
-			msg := fmt.Sprintf("%d failure", failureThreshold)
-			expectContinue(t, w, w.doProbe(ctx), msg)
-			expectResult(t, w, results.Failure, msg)
 			// Exceeding failureThreshold should cause resultRun to reset to 0
 			if w.resultRun != 0 {
 				t.Errorf("Prober resultRun should be 0, but %d", w.resultRun)
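
As a reader's companion to the series: below is a minimal, self-contained Go sketch of the threshold/on-hold behavior that doProbe ends up with after patch 4. The probeWorker type, its observe method, and the result constants are simplified stand-ins invented for illustration, not the kubelet's actual worker API; only the successThreshold/failureThreshold/resultRun/onHold interplay is modeled.

package main

import "fmt"

type result int

const (
	unknown result = iota
	success
	failure
)

func (r result) String() string { return [...]string{"Unknown", "Success", "Failure"}[r] }

// probeWorker keeps only the worker fields this patch series touches.
type probeWorker struct {
	successThreshold int // stands in for v1.Probe.SuccessThreshold
	failureThreshold int // stands in for v1.Probe.FailureThreshold
	lastResult       result
	resultRun        int  // consecutive probes with the same raw outcome
	onHold           bool // stop probing until the container is restarted
}

// observe mirrors the threshold handling after patch 4: once a startup probe
// settles (either threshold reached), the worker publishes the result, resets
// resultRun, and goes on hold whether the result is Success or Failure; a
// liveness worker would go on hold only on Failure.
func (w *probeWorker) observe(raw result) result {
	if w.onHold {
		return w.lastResult // held: no probe is executed until restart
	}
	if raw == w.lastResult {
		w.resultRun++
	} else {
		w.lastResult = raw
		w.resultRun = 1
	}
	// Below the threshold the published result is not yet updated; a fresh
	// startup worker's cached result starts out as Unknown.
	if (raw == failure && w.resultRun < w.failureThreshold) ||
		(raw == success && w.resultRun < w.successThreshold) {
		return unknown
	}
	w.onHold = true
	w.resultRun = 0
	return raw
}

func main() {
	w := &probeWorker{successThreshold: 1, failureThreshold: 3, lastResult: unknown}
	for i := 0; i < 4; i++ {
		fmt.Printf("run %d: result=%v onHold=%v resultRun=%d\n",
			i+1, w.observe(failure), w.onHold, w.resultRun)
	}
}

Run with failureThreshold=3 as above, this prints Unknown, Unknown, Failure, and then a held Failure with resultRun pinned at 0 — the same sequence TestStartupProbeFailureThreshold asserts after the final patch.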