Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-27 21:47:07 +00:00)
Merge pull request #6156 from gmarek/retry_loop
Add a retry loop to UpdateStatus in NodeController
This commit is contained in commit 81683441b9
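In essence, the change factors the per-node status write out of MonitorNodeStatus into a new tryUpdateNodeStatus helper and wraps the call in a bounded retry loop: when a write fails, the node is re-fetched and the update is retried, up to nodeStatusUpdateRetry times. A condensed sketch of that pattern, distilled from the diff below (Node and NodeClient are illustrative stand-ins, not the real Kubernetes client API):

// Condensed sketch of the retry pattern this PR introduces. Node and
// NodeClient are illustrative stand-ins; the real code works with api.Node
// and nc.kubeClient.Nodes() from the Kubernetes client of that era.
package nodestatus

import (
	"errors"
	"fmt"
)

// nodeStatusUpdateRetry mirrors the constant added by the commit.
const nodeStatusUpdateRetry = 5

// Node stands in for api.Node.
type Node struct{ Name string }

// NodeClient stands in for the node REST client used by the controller.
type NodeClient interface {
	Get(name string) (*Node, error)
	Update(node *Node) (*Node, error)
}

// updateStatusWithRetry mirrors the loop added to MonitorNodeStatus: try the
// status write, and on failure re-fetch the node so the next attempt starts
// from a fresh object; give up after nodeStatusUpdateRetry attempts or as
// soon as the node can no longer be fetched (it was probably deleted).
func updateStatusWithRetry(c NodeClient, node *Node) error {
	var err error
	for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
		if _, err = c.Update(node); err == nil {
			return nil // write succeeded
		}
		name := node.Name
		if node, err = c.Get(name); err != nil {
			return fmt.Errorf("failed to re-get node %s for retry: %v", name, err)
		}
	}
	return errors.New("update status exceeded retry count; skipping node")
}

In the actual controller the retry loop lives in MonitorNodeStatus itself, and each per-node attempt also returns the grace period and the last observed Ready condition, as the diff below shows.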
@@ -60,6 +60,8 @@ const (
 	// Theoretically, this value should be lower than nodeMonitorGracePeriod.
 	// TODO: Change node status monitor to watch based.
 	nodeMonitorPeriod = 5 * time.Second
+	// Constant controlling number of retries of writing NodeStatus update.
+	nodeStatusUpdateRetry = 5
 )

 var (
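For context, nodeMonitorPeriod above is the polling cadence of the node status monitor (the TODO notes it should eventually become watch based), while the new nodeStatusUpdateRetry bounds the retries for a single node within one pass. A minimal, hypothetical driver loop, not part of this commit, that runs a monitor function on that cadence might look like:

// Hypothetical driver loop, not part of this commit: tick every
// nodeMonitorPeriod and invoke the supplied monitor function until the stop
// channel is closed.
package nodestatus

import (
	"log"
	"time"
)

const nodeMonitorPeriod = 5 * time.Second // same value as the constant above

func runMonitor(monitor func() error, stop <-chan struct{}) {
	ticker := time.NewTicker(nodeMonitorPeriod)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			if err := monitor(); err != nil {
				// The real controller logs errors and keeps going; same here.
				log.Printf("node status monitor pass failed: %v", err)
			}
		case <-stop:
			return
		}
	}
}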
@@ -457,18 +459,10 @@ func (nc *NodeController) recordNodeOfflineEvent(node *api.Node) {
 	nc.recorder.Eventf(ref, "offline", "Node %s is now offline", node.Name)
 }

-// MonitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
-// post "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
-// not reachable for a long period of time.
-func (nc *NodeController) MonitorNodeStatus() error {
-	nodes, err := nc.kubeClient.Nodes().List()
-	if err != nil {
-		return err
-	}
-	for i := range nodes.Items {
+func (nc NodeController) tryUpdateNodeStatus(node *api.Node) (error, time.Duration, api.NodeCondition) {
+	var err error
 	var gracePeriod time.Duration
 	var lastReadyCondition api.NodeCondition
-	node := &nodes.Items[i]
 	readyCondition := nc.getCondition(node, api.NodeReady)
 	if readyCondition == nil {
 		// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
@@ -505,7 +499,7 @@ func (nc *NodeController) MonitorNodeStatus() error {
 		node.Name, nc.now().Time.Sub(lastReadyCondition.LastProbeTime.Time), lastReadyCondition)
 	if lastReadyCondition.Status != api.ConditionUnknown {
 		readyCondition.Status = api.ConditionUnknown
-		readyCondition.Reason = fmt.Sprintf("Kubelet stopped posting node status")
+		readyCondition.Reason = fmt.Sprintf("Kubelet stopped posting node status.")
 		// LastProbeTime is the last time we heard from kubelet.
 		readyCondition.LastProbeTime = lastReadyCondition.LastProbeTime
 		readyCondition.LastTransitionTime = nc.now()
@@ -517,10 +511,47 @@ func (nc *NodeController) MonitorNodeStatus() error {
 		}
 		_, err = nc.kubeClient.Nodes().Update(node)
 		if err != nil {
-			glog.Errorf("error updating node %s: %v", node.Name, err)
+			glog.Errorf("Error updating node %s: %v", node.Name, err)
+		} else {
+			return nil, gracePeriod, lastReadyCondition
 		}
 	}

+	return err, gracePeriod, lastReadyCondition
+}
+
+// MonitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
+// post "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
+// not reachable for a long period of time.
+func (nc *NodeController) MonitorNodeStatus() error {
+	nodes, err := nc.kubeClient.Nodes().List()
+	if err != nil {
+		return err
+	}
+	for i := range nodes.Items {
+		var gracePeriod time.Duration
+		var lastReadyCondition api.NodeCondition
+		node := &nodes.Items[i]
+		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
+			err, gracePeriod, lastReadyCondition = nc.tryUpdateNodeStatus(node)
+			if err != nil {
+				name := node.Name
+				node, err = nc.kubeClient.Nodes().Get(name)
+				if err != nil {
+					glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
+					break
+				}
+			} else {
+				break
+			}
+		}
+		if err != nil {
+			glog.Errorf("Update status of Node %v from NodeController exceeds retry count."+
+				"Skipping - no pods will be evicted.", node.Name)
+			continue
+		}
+
+		readyCondition := nc.getCondition(node, api.NodeReady)
 		if readyCondition != nil {
 			// Check eviction timeout.
 			if lastReadyCondition.Status == api.ConditionFalse &&
@@ -1361,7 +1361,7 @@ func TestMonitorNodeStatusUpdateStatus(t *testing.T) {
 			{
 				Type: api.NodeReady,
 				Status: api.ConditionUnknown,
-				Reason: fmt.Sprintf("Kubelet stopped posting node status"),
+				Reason: fmt.Sprintf("Kubelet stopped posting node status."),
 				LastProbeTime: util.Date(2015, 1, 1, 11, 0, 0, 0, time.UTC),
 				LastTransitionTime: fakeNow,
 			},