From 66595d54a0b68cfe7b55f8ff17167b291fa4cf03 Mon Sep 17 00:00:00 2001
From: Matthias Bertschy
Date: Thu, 24 Oct 2019 08:58:16 +0200
Subject: [PATCH] Add startupProbe result handling to kuberuntime

---
 pkg/kubelet/kubelet.go                        |  4 ++
 pkg/kubelet/kubelet_test.go                   |  1 +
 pkg/kubelet/kuberuntime/BUILD                 |  1 +
 .../kuberuntime/fake_kuberuntime_manager.go   |  1 +
 .../kuberuntime/kuberuntime_manager.go        |  6 +++
 .../kuberuntime/kuberuntime_manager_test.go   | 39 ++++++++++++++++++-
 pkg/kubelet/prober/common_test.go             |  1 +
 pkg/kubelet/prober/prober_manager.go          |  2 +-
 pkg/kubelet/prober/results/results_manager.go | 13 ++++---
 pkg/kubelet/prober/worker.go                  |  2 +-
 pkg/kubelet/prober/worker_test.go             | 31 +++++++++------
 11 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 9fa8fcf52ec..b33889bb91f 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -583,6 +583,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
 	imageBackOff := flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff)
 
 	klet.livenessManager = proberesults.NewManager()
+	klet.startupManager = proberesults.NewManager()
 	klet.podCache = kubecontainer.NewCache()
 
 	var checkpointManager checkpointmanager.CheckpointManager
@@ -671,6 +672,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
 	runtime, err := kuberuntime.NewKubeGenericRuntimeManager(
 		kubecontainer.FilterEventRecorder(kubeDeps.Recorder),
 		klet.livenessManager,
+		klet.startupManager,
 		seccompProfileRoot,
 		containerRefManager,
 		machineInfo,
@@ -777,6 +779,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
 	klet.probeManager = prober.NewManager(
 		klet.statusManager,
 		klet.livenessManager,
+		klet.startupManager,
 		klet.runner,
 		containerRefManager,
 		kubeDeps.Recorder)
@@ -976,6 +979,7 @@ type Kubelet struct {
 	probeManager prober.Manager
 	// Manages container health check results.
 	livenessManager proberesults.Manager
+	startupManager proberesults.Manager
 
 	// How long to keep idle streaming command execution/port forwarding
 	// connections open before terminating them
diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go
index 7b92438807e..27ad8a6acbd 100644
--- a/pkg/kubelet/kubelet_test.go
+++ b/pkg/kubelet/kubelet_test.go
@@ -233,6 +233,7 @@ func newTestKubeletWithImageList(
 
 	kubelet.probeManager = probetest.FakeManager{}
 	kubelet.livenessManager = proberesults.NewManager()
+	kubelet.startupManager = proberesults.NewManager()
 	kubelet.containerManager = cm.NewStubContainerManager()
 
 	fakeNodeRef := &v1.ObjectReference{
diff --git a/pkg/kubelet/kuberuntime/BUILD b/pkg/kubelet/kuberuntime/BUILD
index 3813012d891..bc2b804e43e 100644
--- a/pkg/kubelet/kuberuntime/BUILD
+++ b/pkg/kubelet/kuberuntime/BUILD
@@ -107,6 +107,7 @@ go_test(
         "//pkg/kubelet/container/testing:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
         "//pkg/kubelet/metrics:go_default_library",
+        "//pkg/kubelet/prober/results:go_default_library",
         "//pkg/kubelet/runtimeclass:go_default_library",
         "//pkg/kubelet/runtimeclass/testing:go_default_library",
        "//staging/src/k8s.io/api/core/v1:go_default_library",
diff --git a/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go b/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go
index 2f2a64f9d45..2552ebe8d3c 100644
--- a/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go
+++ b/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go
@@ -78,6 +78,7 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS
 		cpuCFSQuota:         false,
 		cpuCFSQuotaPeriod:   metav1.Duration{Duration: time.Microsecond * 100},
 		livenessManager:     proberesults.NewManager(),
+		startupManager:      proberesults.NewManager(),
 		containerRefManager: kubecontainer.NewRefManager(),
 		machineInfo:         machineInfo,
 		osInterface:         osInterface,
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
index 814a7b97cce..2f86ef7a917 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
@@ -100,6 +100,7 @@ type kubeGenericRuntimeManager struct {
 
 	// Health check results.
 	livenessManager proberesults.Manager
+	startupManager proberesults.Manager
 
 	// If true, enforce container cpu limits with CFS quota support
 	cpuCFSQuota bool
@@ -150,6 +151,7 @@ type LegacyLogProvider interface {
 func NewKubeGenericRuntimeManager(
 	recorder record.EventRecorder,
 	livenessManager proberesults.Manager,
+	startupManager proberesults.Manager,
 	seccompProfileRoot string,
 	containerRefManager *kubecontainer.RefManager,
 	machineInfo *cadvisorapi.MachineInfo,
@@ -175,6 +177,7 @@ func NewKubeGenericRuntimeManager(
 		cpuCFSQuotaPeriod:   cpuCFSQuotaPeriod,
 		seccompProfileRoot:  seccompProfileRoot,
 		livenessManager:     livenessManager,
+		startupManager:      startupManager,
 		containerRefManager: containerRefManager,
 		machineInfo:         machineInfo,
 		osInterface:         osInterface,
@@ -590,6 +593,9 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
 		} else if liveness, found := m.livenessManager.Get(containerStatus.ID); found && liveness == proberesults.Failure {
 			// If the container failed the liveness probe, we should kill it.
 			message = fmt.Sprintf("Container %s failed liveness probe", container.Name)
+		} else if startup, found := m.startupManager.Get(containerStatus.ID); found && startup == proberesults.Failure {
+			// If the container failed the startup probe, we should kill it.
+			message = fmt.Sprintf("Container %s failed startup probe", container.Name)
 		} else {
 			// Keep the container.
 			keepCount++
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
index 166eb794547..9548589be9b 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
@@ -41,6 +41,7 @@ import (
 	"k8s.io/kubernetes/pkg/features"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
+	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
 )
 
 var (
@@ -732,6 +733,7 @@ func TestComputePodActions(t *testing.T) {
 		mutatePodFn    func(*v1.Pod)
 		mutateStatusFn func(*kubecontainer.PodStatus)
 		actions        podActions
+		resetStatusFn  func(*kubecontainer.PodStatus)
 	}{
 		"everying is good; do nothing": {
 			actions: noAction,
@@ -850,8 +852,38 @@ func TestComputePodActions(t *testing.T) {
 				ContainersToKill:  getKillMap(basePod, baseStatus, []int{1}),
 				ContainersToStart: []int{1},
 			},
-			// TODO: Add a test case for containers which failed the liveness
-			// check. Will need to fake the livessness check result.
+		},
+		"Kill and recreate the container if the liveness check has failed": {
+			mutatePodFn: func(pod *v1.Pod) {
+				pod.Spec.RestartPolicy = v1.RestartPolicyAlways
+			},
+			mutateStatusFn: func(status *kubecontainer.PodStatus) {
+				m.livenessManager.Set(status.ContainerStatuses[1].ID, proberesults.Failure, basePod)
+			},
+			actions: podActions{
+				SandboxID:         baseStatus.SandboxStatuses[0].Id,
+				ContainersToKill:  getKillMap(basePod, baseStatus, []int{1}),
+				ContainersToStart: []int{1},
+			},
+			resetStatusFn: func(status *kubecontainer.PodStatus) {
+				m.livenessManager.Remove(status.ContainerStatuses[1].ID)
+			},
+		},
+		"Kill and recreate the container if the startup check has failed": {
+			mutatePodFn: func(pod *v1.Pod) {
+				pod.Spec.RestartPolicy = v1.RestartPolicyAlways
+			},
+			mutateStatusFn: func(status *kubecontainer.PodStatus) {
+				m.startupManager.Set(status.ContainerStatuses[1].ID, proberesults.Failure, basePod)
+			},
+			actions: podActions{
+				SandboxID:         baseStatus.SandboxStatuses[0].Id,
+				ContainersToKill:  getKillMap(basePod, baseStatus, []int{1}),
+				ContainersToStart: []int{1},
+			},
+			resetStatusFn: func(status *kubecontainer.PodStatus) {
+				m.startupManager.Remove(status.ContainerStatuses[1].ID)
+			},
 		},
 		"Verify we do not create a pod sandbox if no ready sandbox for pod with RestartPolicy=Never and all containers exited": {
 			mutatePodFn: func(pod *v1.Pod) {
@@ -917,6 +949,9 @@ func TestComputePodActions(t *testing.T) {
 		}
 		actions := m.computePodActions(pod, status)
 		verifyActions(t, &test.actions, &actions, desc)
+		if test.resetStatusFn != nil {
+			test.resetStatusFn(status)
+		}
 	}
 }
 
diff --git a/pkg/kubelet/prober/common_test.go b/pkg/kubelet/prober/common_test.go
index 8cec8bfc8eb..30e2bf3575d 100644
--- a/pkg/kubelet/prober/common_test.go
+++ b/pkg/kubelet/prober/common_test.go
@@ -112,6 +112,7 @@ func newTestManager() *manager {
 	m := NewManager(
 		status.NewManager(&fake.Clientset{}, podManager, &statustest.FakePodDeletionSafetyProvider{}),
 		results.NewManager(),
+		results.NewManager(),
 		nil, // runner
 		refManager,
 		&record.FakeRecorder{},
diff --git a/pkg/kubelet/prober/prober_manager.go b/pkg/kubelet/prober/prober_manager.go
index d3eccadfe9e..f0ca8b94192 100644
--- a/pkg/kubelet/prober/prober_manager.go
+++ b/pkg/kubelet/prober/prober_manager.go
@@ -102,13 +102,13 @@ type manager struct {
 func NewManager(
 	statusManager status.Manager,
 	livenessManager results.Manager,
+	startupManager results.Manager,
 	runner kubecontainer.ContainerCommandRunner,
 	refManager *kubecontainer.RefManager,
 	recorder record.EventRecorder) Manager {
 
 	prober := newProber(runner, refManager, recorder)
 	readinessManager := results.NewManager()
-	startupManager := results.NewManager()
 	return &manager{
 		statusManager: statusManager,
 		prober:        prober,
diff --git a/pkg/kubelet/prober/results/results_manager.go b/pkg/kubelet/prober/results/results_manager.go
index 58d02c588c3..bb9f494bf20 100644
--- a/pkg/kubelet/prober/results/results_manager.go
+++ b/pkg/kubelet/prober/results/results_manager.go
@@ -40,14 +40,17 @@ type Manager interface {
 }
 
 // Result is the type for probe results.
-type Result bool
+type Result int
 
 const (
-	// Success is encoded as "true" (type Result)
-	Success Result = true
+	// Unknown is encoded as -1 (type Result)
+	Unknown Result = iota - 1
 
-	// Failure is encoded as "false" (type Result)
-	Failure Result = false
+	// Success is encoded as 0 (type Result)
+	Success
+
+	// Failure is encoded as 1 (type Result)
+	Failure
 )
 
 func (r Result) String() string {
diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index d8225850042..a7882d72c59 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -101,7 +101,7 @@ func newWorker(
 	case startup:
 		w.spec = container.StartupProbe
 		w.resultsManager = m.startupManager
-		w.initialValue = results.Failure
+		w.initialValue = results.Unknown
 	}
 
 	basicMetricLabels := metrics.Labels{
diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index 0cd6de98df0..473173b9b82 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -79,10 +79,12 @@ func TestDoProbe(t *testing.T) {
 			podStatus:      &pendingStatus,
 			expectContinue: true,
 			expectSet:      true,
+			expectedResult: results.Failure,
 		},
 		{ // Container terminated
-			podStatus: &terminatedStatus,
-			expectSet: true,
+			podStatus:      &terminatedStatus,
+			expectSet:      true,
+			expectedResult: results.Failure,
 		},
 		{ // Probe successful.
 			podStatus: &runningStatus,
@@ -134,8 +136,15 @@ func TestInitialDelay(t *testing.T) {
 		m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(probeType != startup))
 
 		expectContinue(t, w, w.doProbe(), "during initial delay")
-		// Default value depends on probe, true for liveness, otherwise false.
-		expectResult(t, w, results.Result(probeType == liveness), "during initial delay")
+		// Default value depends on probe, Success for liveness, Failure for readiness, Unknown for startup
+		switch probeType {
+		case liveness:
+			expectResult(t, w, results.Success, "during initial delay")
+		case readiness:
+			expectResult(t, w, results.Failure, "during initial delay")
+		case startup:
+			expectResult(t, w, results.Unknown, "during initial delay")
+		}
 
 		// 100 seconds later...
 		laterStatus := getTestRunningStatusWithStarted(probeType != startup)
@@ -397,17 +406,17 @@ func TestResultRunOnStartupCheckFailure(t *testing.T) {
 	// Below FailureThreshold leaves probe state unchanged
 	// which is failed for startup at first.
 	m.prober.exec = fakeExecProber{probe.Failure, nil}
-	msg := "probe failure, result failure"
+	msg := "probe failure, result unknown"
 	expectContinue(t, w, w.doProbe(), msg)
-	expectResult(t, w, results.Failure, msg)
+	expectResult(t, w, results.Unknown, msg)
 	if w.resultRun != 1 {
 		t.Errorf("Prober resultRun should be 1")
 	}
 
 	m.prober.exec = fakeExecProber{probe.Failure, nil}
-	msg = "2nd probe failure, result failure"
+	msg = "2nd probe failure, result unknown"
 	expectContinue(t, w, w.doProbe(), msg)
-	expectResult(t, w, results.Failure, msg)
+	expectResult(t, w, results.Unknown, msg)
 	if w.resultRun != 2 {
 		t.Errorf("Prober resultRun should be 2")
 	}
@@ -446,11 +455,11 @@ func TestStartupProbeDisabledByStarted(t *testing.T) {
 	m := newTestManager()
 	w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: 1, FailureThreshold: 2})
 	m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(false))
-	// startupProbe fails
+	// startupProbe fails < FailureThreshold, stays unknown
 	m.prober.exec = fakeExecProber{probe.Failure, nil}
-	msg := "Not started, probe failure, result failure"
+	msg := "Not started, probe failure, result unknown"
 	expectContinue(t, w, w.doProbe(), msg)
-	expectResult(t, w, results.Failure, msg)
+	expectResult(t, w, results.Unknown, msg)
 	// startupProbe succeeds
 	m.prober.exec = fakeExecProber{probe.Success, nil}
 	msg = "Started, probe success, result success"
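
Note (illustrative, not part of the patch): the change to results_manager.go turns results.Result from a bool into an int so a third value, Unknown, can exist alongside Success and Failure, and startup-probe workers now start at Unknown instead of Failure. A minimal standalone Go sketch of that encoding follows; the package main scaffolding and printed output are hypothetical, only the Result values and String method mirror the patch.

package main

import "fmt"

// Result mirrors the three-valued probe result from pkg/kubelet/prober/results.
type Result int

const (
	// Unknown (-1): no probe outcome has been recorded yet.
	Unknown Result = iota - 1
	// Success (0): the last probe succeeded.
	Success
	// Failure (1): the last probe failed.
	Failure
)

func (r Result) String() string {
	switch r {
	case Success:
		return "Success"
	case Failure:
		return "Failure"
	default:
		return "UNKNOWN"
	}
}

func main() {
	// A startup worker now begins at Unknown, so computePodActions only
	// kills a container once an explicit Failure has been stored for it.
	initial := Unknown
	fmt.Printf("initial=%v (%d), success=%d, failure=%d\n", initial, initial, Success, Failure)
	// prints: initial=UNKNOWN (-1), success=0, failure=1
}

Starting startup probes at Unknown means a container that has not yet reached FailureThreshold is no longer reported as failed, which is what the updated worker tests above assert.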