Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-08-11 04:52:08 +00:00)
Only wait for cache syncs once in NodeController
This commit is contained in:
parent 928b8cbdb8
commit e7befa2a14
@@ -356,97 +356,94 @@ func NewNodeController(
 
 // Run starts an asynchronous loop that monitors the status of cluster nodes.
 func (nc *NodeController) Run() {
-    // Incorporate the results of node status pushed from kubelet to master.
-    go wait.Until(func() {
-        if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
-            glog.Errorf("NodeController timed out while waiting for informers to sync...")
-            return
-        }
-        if err := nc.monitorNodeStatus(); err != nil {
-            glog.Errorf("Error monitoring node status: %v", err)
-        }
-    }, nc.nodeMonitorPeriod, wait.NeverStop)
-
-    // Managing eviction of nodes:
-    // 1. when we delete pods off a node, if the node was not empty at the time we then
-    // queue a termination watcher
-    //    a. If we hit an error, retry deletion
-    // 2. The terminator loop ensures that pods are eventually cleaned and we never
-    // terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
-    // is the time from which we measure "has this pod been terminating too long",
-    // after which we will delete the pod with grace period 0 (force delete).
-    //    a. If we hit errors, retry instantly
-    //    b. If there are no pods left terminating, exit
-    //    c. If there are pods still terminating, wait for their estimated completion
-    //    before retrying
-    go wait.Until(func() {
-        if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
-            glog.Errorf("NodeController timed out while waiting for informers to sync...")
-            return
-        }
-        nc.evictorLock.Lock()
-        defer nc.evictorLock.Unlock()
-        for k := range nc.zonePodEvictor {
-            nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
-                obj, exists, err := nc.nodeStore.GetByKey(value.Value)
-                if err != nil {
-                    glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
-                } else if !exists {
-                    glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
-                } else {
-                    node, _ := obj.(*api.Node)
-                    zone := utilnode.GetZoneKey(node)
-                    EvictionsNumber.WithLabelValues(zone).Inc()
-                }
-
-                nodeUid, _ := value.UID.(string)
-                remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
-                if err != nil {
-                    utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
-                    return false, 0
-                }
-
-                if remaining {
-                    nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
-                }
-                return true, 0
-            })
-        }
-    }, nodeEvictionPeriod, wait.NeverStop)
-
-    // TODO: replace with a controller that ensures pods that are terminating complete
-    // in a particular time period
-    go wait.Until(func() {
-        if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
-            glog.Errorf("NodeController timed out while waiting for informers to sync...")
-            return
-        }
-        nc.evictorLock.Lock()
-        defer nc.evictorLock.Unlock()
-        for k := range nc.zoneTerminationEvictor {
-            nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
-                nodeUid, _ := value.UID.(string)
-                completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
-                if err != nil {
-                    utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
-                    return false, 0
-                }
-
-                if completed {
-                    glog.V(2).Infof("All pods terminated on %s", value.Value)
-                    recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
-                    return true, 0
-                }
-
-                glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
-                // clamp very short intervals
-                if remaining < nodeEvictionPeriod {
-                    remaining = nodeEvictionPeriod
-                }
-                return false, remaining
-            })
-        }
-    }, nodeEvictionPeriod, wait.NeverStop)
+    go func() {
+        defer utilruntime.HandleCrash()
+
+        if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
+            utilruntime.HandleError(errors.New("NodeController timed out while waiting for informers to sync..."))
+            return
+        }
+
+        // Incorporate the results of node status pushed from kubelet to master.
+        go wait.Until(func() {
+            if err := nc.monitorNodeStatus(); err != nil {
+                glog.Errorf("Error monitoring node status: %v", err)
+            }
+        }, nc.nodeMonitorPeriod, wait.NeverStop)
+
+        // Managing eviction of nodes:
+        // 1. when we delete pods off a node, if the node was not empty at the time we then
+        // queue a termination watcher
+        //    a. If we hit an error, retry deletion
+        // 2. The terminator loop ensures that pods are eventually cleaned and we never
+        // terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
+        // is the time from which we measure "has this pod been terminating too long",
+        // after which we will delete the pod with grace period 0 (force delete).
+        //    a. If we hit errors, retry instantly
+        //    b. If there are no pods left terminating, exit
+        //    c. If there are pods still terminating, wait for their estimated completion
+        //    before retrying
+        go wait.Until(func() {
+            nc.evictorLock.Lock()
+            defer nc.evictorLock.Unlock()
+            for k := range nc.zonePodEvictor {
+                nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
+                    obj, exists, err := nc.nodeStore.GetByKey(value.Value)
+                    if err != nil {
+                        glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
+                    } else if !exists {
+                        glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
+                    } else {
+                        node, _ := obj.(*api.Node)
+                        zone := utilnode.GetZoneKey(node)
+                        EvictionsNumber.WithLabelValues(zone).Inc()
+                    }
+
+                    nodeUid, _ := value.UID.(string)
+                    remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
+                    if err != nil {
+                        utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
+                        return false, 0
+                    }
+
+                    if remaining {
+                        nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
+                    }
+                    return true, 0
+                })
+            }
+        }, nodeEvictionPeriod, wait.NeverStop)
+
+        // TODO: replace with a controller that ensures pods that are terminating complete
+        // in a particular time period
+        go wait.Until(func() {
+            nc.evictorLock.Lock()
+            defer nc.evictorLock.Unlock()
+            for k := range nc.zoneTerminationEvictor {
+                nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
+                    nodeUid, _ := value.UID.(string)
+                    completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
+                    if err != nil {
+                        utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
+                        return false, 0
+                    }
+
+                    if completed {
+                        glog.V(2).Infof("All pods terminated on %s", value.Value)
+                        recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
+                        return true, 0
+                    }
+
+                    glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
+                    // clamp very short intervals
+                    if remaining < nodeEvictionPeriod {
+                        remaining = nodeEvictionPeriod
+                    }
+                    return false, remaining
+                })
+            }
+        }, nodeEvictionPeriod, wait.NeverStop)
+    }()
 }
 
 // monitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
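For reference, the pattern this commit applies, waiting for the informer caches to sync once in a wrapping goroutine and only then starting the periodic workers, can be sketched roughly as below. This is a minimal illustration, not the controller's actual wiring: the runLoops helper, the placeholder hasSynced functions, the 5-second period, and the current apimachinery/client-go import paths are assumptions made for the sketch.

package main

import (
    "errors"
    "time"

    utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    "k8s.io/apimachinery/pkg/util/wait"
    "k8s.io/client-go/tools/cache"
)

// runLoops is a hypothetical helper that blocks (inside its own goroutine)
// until every cache reports synced, then starts the periodic monitor loop.
func runLoops(stopCh <-chan struct{}, monitor func() error, hasSynced ...cache.InformerSynced) {
    go func() {
        defer utilruntime.HandleCrash()

        // Wait for the caches exactly once; if they never sync, report and bail out.
        if !cache.WaitForCacheSync(stopCh, hasSynced...) {
            utilruntime.HandleError(errors.New("timed out waiting for caches to sync"))
            return
        }

        // Caches are warm, so the periodic loop needs no per-iteration sync check.
        go wait.Until(func() {
            if err := monitor(); err != nil {
                utilruntime.HandleError(err)
            }
        }, 5*time.Second, stopCh)
    }()
}

func main() {
    stopCh := make(chan struct{})
    alwaysSynced := cache.InformerSynced(func() bool { return true }) // stands in for informer.HasSynced
    runLoops(stopCh, func() error { return nil }, alwaysSynced)

    time.Sleep(50 * time.Millisecond) // let the sketch spin briefly before shutting down
    close(stopCh)
}

Checking sync once up front keeps each wait.Until body cheap and reports a sync timeout a single time, instead of repeating the check and the error on every tick of every loop.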