mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-30 15:05:27 +00:00
Merge pull request #37865 from yujuhong/decouple_lifecycle
Automatic merge from submit-queue. kubelet: remove the PLEG health check from healthz. This prevents kubelet from being killed when docker hangs. Also, kubelet will report the node as not ready if PLEG hangs (`docker ps` + `docker inspect`).
This commit is contained in:
commit
9a88687e24
@ -2026,11 +2026,6 @@ func (kl *Kubelet) LatestLoopEntryTime() time.Time {
|
|||||||
return val.(time.Time)
|
return val.(time.Time)
|
||||||
}
|
}
|
||||||
|
|
||||||
// PLEGHealthCheck returns whether the PLEG is healthy.
|
|
||||||
func (kl *Kubelet) PLEGHealthCheck() (bool, error) {
|
|
||||||
return kl.pleg.Healthy()
|
|
||||||
}
|
|
||||||
|
|
||||||
// updateRuntimeUp calls the container runtime status callback, initializing
|
// updateRuntimeUp calls the container runtime status callback, initializing
|
||||||
// the runtime dependent modules when the container runtime first comes up,
|
// the runtime dependent modules when the container runtime first comes up,
|
||||||
// and returns an error if the status check fails. If the status check is OK,
|
// and returns an error if the status check fails. If the status check is OK,
|
||||||
|
@ -75,6 +75,11 @@ const (
|
|||||||
plegContainerExited plegContainerState = "exited"
|
plegContainerExited plegContainerState = "exited"
|
||||||
plegContainerUnknown plegContainerState = "unknown"
|
plegContainerUnknown plegContainerState = "unknown"
|
||||||
plegContainerNonExistent plegContainerState = "non-existent"
|
plegContainerNonExistent plegContainerState = "non-existent"
|
||||||
|
|
||||||
|
// The threshold needs to be greater than the relisting period + the
|
||||||
|
// relisting time, which can vary significantly. Set a conservative
|
||||||
|
// threshold to avoid flipping between healthy and unhealthy.
|
||||||
|
relistThreshold = 3 * time.Minute
|
||||||
)
|
)
|
||||||
|
|
||||||
func convertState(state kubecontainer.ContainerState) plegContainerState {
|
func convertState(state kubecontainer.ContainerState) plegContainerState {
|
||||||
@ -126,13 +131,9 @@ func (g *GenericPLEG) Start() {
|
|||||||
|
|
||||||
func (g *GenericPLEG) Healthy() (bool, error) {
|
func (g *GenericPLEG) Healthy() (bool, error) {
|
||||||
relistTime := g.getRelistTime()
|
relistTime := g.getRelistTime()
|
||||||
// TODO: Evaluate if we can reduce this threshold.
|
elapsed := g.clock.Since(relistTime)
|
||||||
// The threshold needs to be greater than the relisting period + the
|
if elapsed > relistThreshold {
|
||||||
// relisting time, which can vary significantly. Set a conservative
|
return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)
|
||||||
// threshold so that we don't cause kubelet to be restarted unnecessarily.
|
|
||||||
threshold := 2 * time.Minute
|
|
||||||
if g.clock.Since(relistTime) > threshold {
|
|
||||||
return false, fmt.Errorf("pleg was last seen active at %v", relistTime)
|
|
||||||
}
|
}
|
||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
@ -30,6 +30,22 @@ type runtimeState struct {
|
|||||||
internalError error
|
internalError error
|
||||||
cidr string
|
cidr string
|
||||||
initError error
|
initError error
|
||||||
|
healthChecks []*healthCheck
|
||||||
|
}
|
||||||
|
|
||||||
|
// A health check function should be efficient and not rely on external
|
||||||
|
// components (e.g., container runtime).
|
||||||
|
type healthCheckFnType func() (bool, error)
|
||||||
|
|
||||||
|
type healthCheck struct {
|
||||||
|
name string
|
||||||
|
fn healthCheckFnType
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *runtimeState) addHealthCheck(name string, f healthCheckFnType) {
|
||||||
|
s.Lock()
|
||||||
|
defer s.Unlock()
|
||||||
|
s.healthChecks = append(s.healthChecks, &healthCheck{name: name, fn: f})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *runtimeState) setRuntimeSync(t time.Time) {
|
func (s *runtimeState) setRuntimeSync(t time.Time) {
|
||||||
@ -81,6 +97,12 @@ func (s *runtimeState) runtimeErrors() []string {
|
|||||||
if s.internalError != nil {
|
if s.internalError != nil {
|
||||||
ret = append(ret, s.internalError.Error())
|
ret = append(ret, s.internalError.Error())
|
||||||
}
|
}
|
||||||
|
for _, hc := range s.healthChecks {
|
||||||
|
if ok, err := hc.fn(); !ok {
|
||||||
|
ret = append(ret, fmt.Sprintf("%s is not healthy: %v", hc.name, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -182,7 +182,6 @@ type HostInterface interface {
|
|||||||
ImagesFsInfo() (cadvisorapiv2.FsInfo, error)
|
ImagesFsInfo() (cadvisorapiv2.FsInfo, error)
|
||||||
RootFsInfo() (cadvisorapiv2.FsInfo, error)
|
RootFsInfo() (cadvisorapiv2.FsInfo, error)
|
||||||
ListVolumesForPod(podUID types.UID) (map[string]volume.Volume, bool)
|
ListVolumesForPod(podUID types.UID) (map[string]volume.Volume, bool)
|
||||||
PLEGHealthCheck() (bool, error)
|
|
||||||
GetExec(podFullName string, podUID types.UID, containerName string, cmd []string, streamOpts remotecommand.Options) (*url.URL, error)
|
GetExec(podFullName string, podUID types.UID, containerName string, cmd []string, streamOpts remotecommand.Options) (*url.URL, error)
|
||||||
GetAttach(podFullName string, podUID types.UID, containerName string, streamOpts remotecommand.Options) (*url.URL, error)
|
GetAttach(podFullName string, podUID types.UID, containerName string, streamOpts remotecommand.Options) (*url.URL, error)
|
||||||
GetPortForward(podName, podNamespace string, podUID types.UID) (*url.URL, error)
|
GetPortForward(podName, podNamespace string, podUID types.UID) (*url.URL, error)
|
||||||
@ -257,7 +256,6 @@ func (s *Server) InstallDefaultHandlers() {
|
|||||||
healthz.InstallHandler(s.restfulCont,
|
healthz.InstallHandler(s.restfulCont,
|
||||||
healthz.PingHealthz,
|
healthz.PingHealthz,
|
||||||
healthz.NamedCheck("syncloop", s.syncLoopHealthCheck),
|
healthz.NamedCheck("syncloop", s.syncLoopHealthCheck),
|
||||||
healthz.NamedCheck("pleg", s.plegHealthCheck),
|
|
||||||
)
|
)
|
||||||
var ws *restful.WebService
|
var ws *restful.WebService
|
||||||
ws = new(restful.WebService)
|
ws = new(restful.WebService)
|
||||||
@ -417,14 +415,6 @@ func (s *Server) syncLoopHealthCheck(req *http.Request) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Checks if pleg, which lists pods periodically, is healthy.
|
|
||||||
func (s *Server) plegHealthCheck(req *http.Request) error {
|
|
||||||
if ok, err := s.host.PLEGHealthCheck(); !ok {
|
|
||||||
return fmt.Errorf("PLEG took longer than expected: %v", err)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// getContainerLogs handles containerLogs request against the Kubelet
|
// getContainerLogs handles containerLogs request against the Kubelet
|
||||||
func (s *Server) getContainerLogs(request *restful.Request, response *restful.Response) {
|
func (s *Server) getContainerLogs(request *restful.Request, response *restful.Response) {
|
||||||
podNamespace := request.PathParameter("podNamespace")
|
podNamespace := request.PathParameter("podNamespace")
|
||||||
|
@ -159,8 +159,6 @@ func (fk *fakeKubelet) StreamingConnectionIdleTimeout() time.Duration {
|
|||||||
return fk.streamingConnectionIdleTimeoutFunc()
|
return fk.streamingConnectionIdleTimeoutFunc()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (fk *fakeKubelet) PLEGHealthCheck() (bool, error) { return fk.plegHealth, nil }
|
|
||||||
|
|
||||||
// Unused functions
|
// Unused functions
|
||||||
func (_ *fakeKubelet) GetContainerInfoV2(_ string, _ cadvisorapiv2.RequestOptions) (map[string]cadvisorapiv2.ContainerInfo, error) {
|
func (_ *fakeKubelet) GetContainerInfoV2(_ string, _ cadvisorapiv2.RequestOptions) (map[string]cadvisorapiv2.ContainerInfo, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
@ -869,18 +867,6 @@ func TestSyncLoopCheck(t *testing.T) {
|
|||||||
assertHealthFails(t, fw.testHTTPServer.URL+"/healthz", http.StatusInternalServerError)
|
assertHealthFails(t, fw.testHTTPServer.URL+"/healthz", http.StatusInternalServerError)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestPLEGHealthCheck(t *testing.T) {
|
|
||||||
fw := newServerTest()
|
|
||||||
defer fw.testHTTPServer.Close()
|
|
||||||
fw.fakeKubelet.hostnameFunc = func() string {
|
|
||||||
return "127.0.0.1"
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test with failed pleg health check.
|
|
||||||
fw.fakeKubelet.plegHealth = false
|
|
||||||
assertHealthFails(t, fw.testHTTPServer.URL+"/healthz", http.StatusInternalServerError)
|
|
||||||
}
|
|
||||||
|
|
||||||
// returns http response status code from the HTTP GET
|
// returns http response status code from the HTTP GET
|
||||||
func assertHealthIsOk(t *testing.T, httpURL string) {
|
func assertHealthIsOk(t *testing.T, httpURL string) {
|
||||||
resp, err := http.Get(httpURL)
|
resp, err := http.Get(httpURL)
|
||||||
|
Loading…
Reference in New Issue
Block a user