Only wait for cache syncs once in NodeController

Andy Goldstein 2016-10-14 15:36:31 -04:00
parent 928b8cbdb8
commit e7befa2a14

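The hunk below restructures NodeController.Run so that the three per-loop cache.WaitForCacheSync checks collapse into a single up-front wait inside one goroutine; the periodic wait.Until workers only start once every informer cache has synced. As a rough, self-contained sketch of that control flow (illustrative names such as runOnceSynced, and current client-go import paths rather than the vendored 2016 ones used by this tree):

package main

import (
	"errors"
	"time"

	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/cache"
)

// runOnceSynced waits for every informer cache exactly once, then launches the
// periodic workers. If the caches never sync, no worker is started at all.
func runOnceSynced(stopCh <-chan struct{}, monitorPeriod, evictionPeriod time.Duration,
	monitor, evict func(), synced ...cache.InformerSynced) {
	go func() {
		defer utilruntime.HandleCrash()

		// Single shared sync barrier instead of one check at the top of each loop.
		if !cache.WaitForCacheSync(stopCh, synced...) {
			utilruntime.HandleError(errors.New("timed out waiting for informer caches to sync"))
			return
		}

		go wait.Until(monitor, monitorPeriod, stopCh) // e.g. node status monitoring
		go wait.Until(evict, evictionPeriod, stopCh)  // e.g. pod eviction pass
	}()
}

func main() {
	stop := make(chan struct{})
	defer close(stop)

	alwaysSynced := cache.InformerSynced(func() bool { return true }) // stand-in for Informer().HasSynced
	runOnceSynced(stop, time.Second, time.Second,
		func() { /* monitor node status */ }, func() { /* run an eviction pass */ },
		alwaysSynced)

	time.Sleep(50 * time.Millisecond) // let the demo workers spin up before exiting
}

Starting the workers from inside the goroutine keeps Run non-blocking while still guaranteeing that none of them observes a partially synced cache.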

@@ -356,97 +356,94 @@ func NewNodeController(
 // Run starts an asynchronous loop that monitors the status of cluster nodes.
 func (nc *NodeController) Run() {
-	// Incorporate the results of node status pushed from kubelet to master.
-	go wait.Until(func() {
+	go func() {
+		defer utilruntime.HandleCrash()
+
 		if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
-			glog.Errorf("NodeController timed out while waiting for informers to sync...")
+			utilruntime.HandleError(errors.New("NodeController timed out while waiting for informers to sync..."))
 			return
 		}
-		if err := nc.monitorNodeStatus(); err != nil {
-			glog.Errorf("Error monitoring node status: %v", err)
-		}
-	}, nc.nodeMonitorPeriod, wait.NeverStop)
-
-	// Managing eviction of nodes:
-	// 1. when we delete pods off a node, if the node was not empty at the time we then
-	//    queue a termination watcher
-	//    a. If we hit an error, retry deletion
-	// 2. The terminator loop ensures that pods are eventually cleaned and we never
-	//    terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
-	//    is the time from which we measure "has this pod been terminating too long",
-	//    after which we will delete the pod with grace period 0 (force delete).
-	//    a. If we hit errors, retry instantly
-	//    b. If there are no pods left terminating, exit
-	//    c. If there are pods still terminating, wait for their estimated completion
-	//       before retrying
-	go wait.Until(func() {
-		if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
-			glog.Errorf("NodeController timed out while waiting for informers to sync...")
-			return
-		}
-		nc.evictorLock.Lock()
-		defer nc.evictorLock.Unlock()
-		for k := range nc.zonePodEvictor {
-			nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
-				obj, exists, err := nc.nodeStore.GetByKey(value.Value)
-				if err != nil {
-					glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
-				} else if !exists {
-					glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
-				} else {
-					node, _ := obj.(*api.Node)
-					zone := utilnode.GetZoneKey(node)
-					EvictionsNumber.WithLabelValues(zone).Inc()
-				}
-
-				nodeUid, _ := value.UID.(string)
-				remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
-				if err != nil {
-					utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
-					return false, 0
-				}
-
-				if remaining {
-					nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
-				}
-				return true, 0
-			})
-		}
-	}, nodeEvictionPeriod, wait.NeverStop)
-
-	// TODO: replace with a controller that ensures pods that are terminating complete
-	// in a particular time period
-	go wait.Until(func() {
-		if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformer.Informer().HasSynced, nc.podInformer.Informer().HasSynced, nc.daemonSetInformer.Informer().HasSynced) {
-			glog.Errorf("NodeController timed out while waiting for informers to sync...")
-			return
-		}
-		nc.evictorLock.Lock()
-		defer nc.evictorLock.Unlock()
-		for k := range nc.zoneTerminationEvictor {
-			nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
-				nodeUid, _ := value.UID.(string)
-				completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
-				if err != nil {
-					utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
-					return false, 0
-				}
-
-				if completed {
-					glog.V(2).Infof("All pods terminated on %s", value.Value)
-					recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
-					return true, 0
-				}
-
-				glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
-				// clamp very short intervals
-				if remaining < nodeEvictionPeriod {
-					remaining = nodeEvictionPeriod
-				}
-				return false, remaining
-			})
-		}
-	}, nodeEvictionPeriod, wait.NeverStop)
+
+		// Incorporate the results of node status pushed from kubelet to master.
+		go wait.Until(func() {
+			if err := nc.monitorNodeStatus(); err != nil {
+				glog.Errorf("Error monitoring node status: %v", err)
+			}
+		}, nc.nodeMonitorPeriod, wait.NeverStop)
+
+		// Managing eviction of nodes:
+		// 1. when we delete pods off a node, if the node was not empty at the time we then
+		//    queue a termination watcher
+		//    a. If we hit an error, retry deletion
+		// 2. The terminator loop ensures that pods are eventually cleaned and we never
+		//    terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
+		//    is the time from which we measure "has this pod been terminating too long",
+		//    after which we will delete the pod with grace period 0 (force delete).
+		//    a. If we hit errors, retry instantly
+		//    b. If there are no pods left terminating, exit
+		//    c. If there are pods still terminating, wait for their estimated completion
+		//       before retrying
+		go wait.Until(func() {
+			nc.evictorLock.Lock()
+			defer nc.evictorLock.Unlock()
+			for k := range nc.zonePodEvictor {
+				nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
+					obj, exists, err := nc.nodeStore.GetByKey(value.Value)
+					if err != nil {
+						glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
+					} else if !exists {
+						glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
+					} else {
+						node, _ := obj.(*api.Node)
+						zone := utilnode.GetZoneKey(node)
+						EvictionsNumber.WithLabelValues(zone).Inc()
+					}
+
+					nodeUid, _ := value.UID.(string)
+					remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
+					if err != nil {
+						utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
+						return false, 0
+					}
+
+					if remaining {
+						nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
+					}
+					return true, 0
+				})
+			}
+		}, nodeEvictionPeriod, wait.NeverStop)
+
+		// TODO: replace with a controller that ensures pods that are terminating complete
+		// in a particular time period
+		go wait.Until(func() {
+			nc.evictorLock.Lock()
+			defer nc.evictorLock.Unlock()
+			for k := range nc.zoneTerminationEvictor {
+				nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
+					nodeUid, _ := value.UID.(string)
+					completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
+					if err != nil {
+						utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
+						return false, 0
+					}

+					if completed {
+						glog.V(2).Infof("All pods terminated on %s", value.Value)
+						recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
+						return true, 0
+					}
+
+					glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
+					// clamp very short intervals
+					if remaining < nodeEvictionPeriod {
+						remaining = nodeEvictionPeriod
+					}
+					return false, remaining
+				})
+			}
+		}, nodeEvictionPeriod, wait.NeverStop)
+	}()
 }
 
 // monitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
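The eviction loops in the hunk rely on a simple retry contract from the Try callback: returning (true, 0) drops the value from the queue, while (false, d) asks for another attempt after d, clamped so retries never come faster than nodeEvictionPeriod. A dependency-free sketch of that contract (toy tryEach helper and string queue; not the controller's actual evictor queue implementation):

package main

import (
	"fmt"
	"time"
)

const evictionPeriod = 100 * time.Millisecond // stand-in for nodeEvictionPeriod

// tryEach processes every queued value with fn. fn returns (done, retryAfter):
// done==true removes the value; done==false re-queues it after retryAfter,
// clamped so we never retry faster than evictionPeriod.
func tryEach(queue []string, fn func(v string) (bool, time.Duration)) []string {
	var requeued []string
	for _, v := range queue {
		done, retryAfter := fn(v)
		if done {
			continue
		}
		if retryAfter < evictionPeriod {
			retryAfter = evictionPeriod // clamp very short intervals
		}
		fmt.Printf("will retry %s after %s\n", v, retryAfter)
		requeued = append(requeued, v)
	}
	return requeued
}

func main() {
	queue := []string{"node-a", "node-b"}
	queue = tryEach(queue, func(v string) (bool, time.Duration) {
		if v == "node-a" {
			return true, 0 // all pods gone: drop from the queue
		}
		return false, 30 * time.Millisecond // pods still terminating: retry later
	})
	fmt.Println("remaining:", queue)
}

The clamp mirrors the `remaining < nodeEvictionPeriod` check in the diff, so a pod with a very short estimated completion time does not cause a hot retry loop.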