mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 19:56:01 +00:00
Merge pull request #59020 from brendandburns/kubelet-hang
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Remove setInitError. **What this PR does / why we need it**: Removes setInitError; it's not clear it was ever really used, and it causes the kubelet to hang and get wedged. **Which issue(s) this PR fixes** Fixes #46086 **Special notes for your reviewer**: If `initializeModules()` in `kubelet.go` encounters an error, it calls `runtimeState.setInitError(...)` (see 47d61ef472/pkg/kubelet/kubelet.go#L1339).
The trouble with this is that `initError` is never cleared, which means that `runtimeState.runtimeErrors()` always returns this `initError`, and thus pods never start syncing (see 47d61ef472/pkg/kubelet/kubelet.go#L1751). In normal operation, holding back pod sync while runtime errors are reported is expected and desired, because the runtime is eventually expected to become healthy; but in this case `initError` is never updated, and so the system just gets wedged.
We could add some retry to `initializeModules()`, but that seems unnecessary, as eventually we'd want to just die anyway. Instead, just log fatal and die; a supervisor will restart us. Note, I'm happy to add some retry here too, if that makes reviewers happier. **Release note**: ```release-note Prevent kubelet from getting wedged if initialization of modules returns an error. ``` @feiskyer @dchen1107 @janetkuo @kubernetes/sig-node-bugs
This commit is contained in:
commit
a18f086220
@ -1335,8 +1335,7 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) {
|
||||
|
||||
if err := kl.initializeModules(); err != nil {
|
||||
kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.KubeletSetupFailed, err.Error())
|
||||
glog.Error(err)
|
||||
kl.runtimeState.setInitError(err)
|
||||
glog.Fatal(err)
|
||||
}
|
||||
|
||||
// Start volume manager
|
||||
|
@ -29,7 +29,6 @@ type runtimeState struct {
|
||||
networkError error
|
||||
internalError error
|
||||
cidr string
|
||||
initError error
|
||||
healthChecks []*healthCheck
|
||||
}
|
||||
|
||||
@ -78,19 +77,10 @@ func (s *runtimeState) podCIDR() string {
|
||||
return s.cidr
|
||||
}
|
||||
|
||||
func (s *runtimeState) setInitError(err error) {
|
||||
s.Lock()
|
||||
defer s.Unlock()
|
||||
s.initError = err
|
||||
}
|
||||
|
||||
func (s *runtimeState) runtimeErrors() []string {
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
var ret []string
|
||||
if s.initError != nil {
|
||||
ret = append(ret, s.initError.Error())
|
||||
}
|
||||
if !s.lastBaseRuntimeSync.Add(s.baseRuntimeSyncThreshold).After(time.Now()) {
|
||||
ret = append(ret, "container runtime is down")
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user