From ec0e99c2ed9e0cb1c66d2d416f31ec1f6975affc Mon Sep 17 00:00:00 2001
From: Yu-Ju Hong
Date: Thu, 1 Dec 2016 14:56:28 -0800
Subject: [PATCH] Check the health of PLEG when updating the node status

---
 pkg/kubelet/pleg/generic.go | 15 ++++++++-------
 pkg/kubelet/runtime.go | 22 ++++++++++++++++++++++
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/pkg/kubelet/pleg/generic.go b/pkg/kubelet/pleg/generic.go
index 5967185e64c..8edff04c7ff 100644
--- a/pkg/kubelet/pleg/generic.go
+++ b/pkg/kubelet/pleg/generic.go
@@ -75,6 +75,11 @@ const (
 	plegContainerExited plegContainerState = "exited"
 	plegContainerUnknown plegContainerState = "unknown"
 	plegContainerNonExistent plegContainerState = "non-existent"
+
+	// The threshold needs to be greater than the relisting period + the
+	// relisting time, which can vary significantly. Set a conservative
+	// threshold to avoid flipping between healthy and unhealthy.
+	relistThreshold = 3 * time.Minute
 )
 
 func convertState(state kubecontainer.ContainerState) plegContainerState {
@@ -126,13 +131,9 @@ func (g *GenericPLEG) Start() {
 
 func (g *GenericPLEG) Healthy() (bool, error) {
 	relistTime := g.getRelistTime()
-	// TODO: Evaluate if we can reduce this threshold.
-	// The threshold needs to be greater than the relisting period + the
-	// relisting time, which can vary significantly. Set a conservative
-	// threshold so that we don't cause kubelet to be restarted unnecessarily.
-	threshold := 2 * time.Minute
-	if g.clock.Since(relistTime) > threshold {
-		return false, fmt.Errorf("pleg was last seen active at %v", relistTime)
+	elapsed := g.clock.Since(relistTime)
+	if elapsed > relistThreshold {
+		return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)
 	}
 	return true, nil
 }
diff --git a/pkg/kubelet/runtime.go b/pkg/kubelet/runtime.go
index 6cb74fe364c..445b4cf4ebb 100644
--- a/pkg/kubelet/runtime.go
+++ b/pkg/kubelet/runtime.go
@@ -30,6 +30,22 @@ type runtimeState struct {
 	internalError error
 	cidr string
 	initError error
+	healthChecks []*healthCheck
+}
+
+// A health check function should be efficient and not rely on external
+// components (e.g., container runtime).
+type healthCheckFnType func() (bool, error)
+
+type healthCheck struct {
+	name string
+	fn healthCheckFnType
+}
+
+func (s *runtimeState) addHealthCheck(name string, f healthCheckFnType) {
+	s.Lock()
+	defer s.Unlock()
+	s.healthChecks = append(s.healthChecks, &healthCheck{name: name, fn: f})
 }
 
 func (s *runtimeState) setRuntimeSync(t time.Time) {
@@ -81,6 +97,12 @@ func (s *runtimeState) runtimeErrors() []string {
 	if s.internalError != nil {
 		ret = append(ret, s.internalError.Error())
 	}
+	for _, hc := range s.healthChecks {
+		if ok, err := hc.fn(); !ok {
+			ret = append(ret, fmt.Sprintf("%s is not healthy: %v", hc.name, err))
+		}
+	}
+
 	return ret
 }
 