kubelet: Keep trying fast status update at startup until node is ready

2025-09-08 12:41:58 +00:00 · 2022-10-31 09:48:48 +00:00
parent 4a50fc4b8c
commit 9f5c5b82a9
3 changed files with 272 additions and 27 deletions
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -28,7 +28,6 @@ import (
 	"path/filepath"
 	sysruntime "runtime"
 	"sort"
 	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -128,6 +127,10 @@ const (
 	// nodeStatusUpdateRetry specifies how many times kubelet retries when posting node status failed.
 	nodeStatusUpdateRetry = 5
 	// nodeReadyGracePeriod is the period to allow for before fast status update is
 	// terminated and container runtime not being ready is logged without verbosity guard.
 	nodeReadyGracePeriod = 120 * time.Second
 	// DefaultContainerLogsDir is the location of container logs.
 	DefaultContainerLogsDir = "/var/log/containers"
@@ -1063,6 +1066,12 @@ type Kubelet struct {
 	// used for generating ContainerStatus.
 	reasonCache *ReasonCache
 	// containerRuntimeReadyExpected indicates whether container runtime being ready is expected
 	// so errors are logged without verbosity guard, to avoid excessive error logs at node startup.
 	// It's false during the node initialization period of nodeReadyGracePeriod, and after that
 	// it's set to true by fastStatusUpdateOnce when it exits.
 	containerRuntimeReadyExpected bool
 	// nodeStatusUpdateFrequency specifies how often kubelet computes node status. If node lease
 	// feature is not enabled, it is also the frequency that kubelet posts node status to master.
 	// In that case, be cautious when changing the constant, it must work with nodeMonitorGracePeriod
@@ -1085,15 +1094,15 @@ type Kubelet struct {
 	lastStatusReportTime time.Time
 	// syncNodeStatusMux is a lock on updating the node status, because this path is not thread-safe.
-	// This lock is used by Kubelet.syncNodeStatus function and shouldn't be used anywhere else.
+	// This lock is used by Kubelet.syncNodeStatus and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
 	syncNodeStatusMux sync.Mutex
 	// updatePodCIDRMux is a lock on updating pod CIDR, because this path is not thread-safe.
-	// This lock is used by Kubelet.syncNodeStatus function and shouldn't be used anywhere else.
+	// This lock is used by Kubelet.updatePodCIDR function and shouldn't be used anywhere else.
 	updatePodCIDRMux sync.Mutex
 	// updateRuntimeMux is a lock on updating runtime, because this path is not thread-safe.
-	// This lock is used by Kubelet.updateRuntimeUp function and shouldn't be used anywhere else.
+	// This lock is used by Kubelet.updateRuntimeUp and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
 	updateRuntimeMux sync.Mutex
 	// nodeLeaseController claims and renews the node lease for this Kubelet
@@ -1502,6 +1511,12 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) {
 	go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop)
 	if kl.kubeClient != nil {
 		// Start two go-routines to update the status.
 		//
 		// The first will report to the apiserver every nodeStatusUpdateFrequency and is aimed to provide regular status intervals,
 		// while the second is used to provide a more timely status update during initialization and runs an one-shot update to the apiserver
 		// once the node becomes ready, then exits afterwards.
 		//
 		// Introduce some small jittering to ensure that over time the requests won't start
 		// accumulating at approximately the same time from the set of nodes due to priority and
 		// fairness effect.
@@ -2435,9 +2450,13 @@ func (kl *Kubelet) updateRuntimeUp() {
 	}
 	// Periodically log the whole runtime status for debugging.
 	klog.V(4).InfoS("Container runtime status", "status", s)
 	klogErrorS := klog.ErrorS
 	if !kl.containerRuntimeReadyExpected {
 		klogErrorS = klog.V(4).ErrorS
 	}
 	networkReady := s.GetRuntimeCondition(kubecontainer.NetworkReady)
 	if networkReady == nil || !networkReady.Status {
-		klog.ErrorS(nil, "Container runtime network not ready", "networkReady", networkReady)
+		klogErrorS(nil, "Container runtime network not ready", "networkReady", networkReady)
 		kl.runtimeState.setNetworkState(fmt.Errorf("container runtime network not ready: %v", networkReady))
 	} else {
 		// Set nil if the container runtime network is ready.
@@ -2447,7 +2466,7 @@ func (kl *Kubelet) updateRuntimeUp() {
 	runtimeReady := s.GetRuntimeCondition(kubecontainer.RuntimeReady)
 	// If RuntimeReady is not set or is false, report an error.
 	if runtimeReady == nil || !runtimeReady.Status {
-		klog.ErrorS(nil, "Container runtime not ready", "runtimeReady", runtimeReady)
+		klogErrorS(nil, "Container runtime not ready", "runtimeReady", runtimeReady)
 		kl.runtimeState.setRuntimeState(fmt.Errorf("container runtime not ready: %v", runtimeReady))
 		return
 	}
@@ -2502,31 +2521,25 @@ func (kl *Kubelet) cleanUpContainersInPod(podID types.UID, exitedContainerID str
 	}
 }
-// fastStatusUpdateOnce starts a loop that checks the internal node indexer cache for when a CIDR
+// fastStatusUpdateOnce starts a loop that checks if the current state of kubelet + container runtime
-// is applied  and tries to update pod CIDR immediately. After pod CIDR is updated it fires off
+// would be able to turn the node ready, and sync the ready state to the apiserver as soon as possible.
-// a runtime update and a node status update. Function returns after one successful node status update.
+// Function returns after the node status update after such event, or when the node is already ready.
 // Function is executed only during Kubelet start which improves latency to ready node by updating
-// pod CIDR, runtime status and node statuses ASAP.
+// kubelet state, runtime status and node statuses ASAP.
 func (kl *Kubelet) fastStatusUpdateOnce() {
 	ctx := context.Background()
-	for {
+	start := kl.clock.Now()
-		time.Sleep(100 * time.Millisecond)
+	stopCh := make(chan struct{})
-		node, err := kl.GetNode()
+
-		if err != nil {
+	// Keep trying to make fast node status update until either timeout is reached or an update is successful.
-			klog.ErrorS(err, "Error getting node")
+	wait.Until(func() {
-			continue
+		// fastNodeStatusUpdate returns true when it succeeds or when the grace period has expired
 		// (status was not updated within nodeReadyGracePeriod and the second argument below gets true),
 		// then we close the channel and abort the loop.
 		if kl.fastNodeStatusUpdate(ctx, kl.clock.Since(start) >= nodeReadyGracePeriod) {
 			close(stopCh)
 		}
-		if len(node.Spec.PodCIDRs) != 0 {
+	}, 100*time.Millisecond, stopCh)
 			podCIDRs := strings.Join(node.Spec.PodCIDRs, ",")
 			if _, err := kl.updatePodCIDR(ctx, podCIDRs); err != nil {
 				klog.ErrorS(err, "Pod CIDR update failed", "CIDR", podCIDRs)
 				continue
 			}
 			kl.updateRuntimeUp()
 			kl.syncNodeStatus()
 			return
 		}
 	}
 }
 // CheckpointContainer tries to checkpoint a container. The parameters are used to
--- a/pkg/kubelet/kubelet_node_status.go
+++ b/pkg/kubelet/kubelet_node_status.go
@@ -429,6 +429,85 @@ func (kl *Kubelet) initialNode(ctx context.Context) (*v1.Node, error) {
 	return node, nil
 }
 // fastNodeStatusUpdate is a "lightweight" version of syncNodeStatus which doesn't hit the
 // apiserver except for the final run, to be called by fastStatusUpdateOnce in each loop.
 // It holds the same lock as syncNodeStatus and is thread-safe when called concurrently with
 // syncNodeStatus. Its return value indicates whether the loop running it should exit
 // (final run), and it also sets kl.containerRuntimeReadyExpected.
 func (kl *Kubelet) fastNodeStatusUpdate(ctx context.Context, timeout bool) (completed bool) {
 	kl.syncNodeStatusMux.Lock()
 	defer func() {
 		kl.syncNodeStatusMux.Unlock()
 		if completed {
 			// containerRuntimeReadyExpected is read by updateRuntimeUp().
 			// Not going for a more granular mutex as this path runs only once.
 			kl.updateRuntimeMux.Lock()
 			defer kl.updateRuntimeMux.Unlock()
 			kl.containerRuntimeReadyExpected = true
 		}
 	}()
 	if timeout {
 		klog.ErrorS(nil, "Node not becoming ready in time after startup")
 		return true
 	}
 	originalNode, err := kl.GetNode()
 	if err != nil {
 		klog.ErrorS(err, "Error getting the current node from lister")
 		return false
 	}
 	readyIdx, originalNodeReady := nodeutil.GetNodeCondition(&originalNode.Status, v1.NodeReady)
 	if readyIdx == -1 {
 		klog.ErrorS(nil, "Node does not have NodeReady condition", "originalNode", originalNode)
 		return false
 	}
 	if originalNodeReady.Status == v1.ConditionTrue {
 		return true
 	}
 	// This is in addition to the regular syncNodeStatus logic so we can get the container runtime status earlier.
 	// This function itself has a mutex and it doesn't recursively call fastNodeStatusUpdate or syncNodeStatus.
 	kl.updateRuntimeUp()
 	node, changed := kl.updateNode(ctx, originalNode)
 	if !changed {
 		// We don't do markVolumesFromNode(node) here and leave it to the regular syncNodeStatus().
 		return false
 	}
 	readyIdx, nodeReady := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady)
 	if readyIdx == -1 {
 		klog.ErrorS(nil, "Node does not have NodeReady condition", "node", node)
 		return false
 	}
 	if nodeReady.Status == v1.ConditionFalse {
 		return false
 	}
 	klog.InfoS("Fast updating node status as it just became ready")
 	if _, err := kl.patchNodeStatus(originalNode, node); err != nil {
 		// The originalNode is probably stale, but we know that the current state of kubelet would turn
 		// the node to be ready. Retry using syncNodeStatus() which fetches from the apiserver.
 		klog.ErrorS(err, "Error updating node status, will retry with syncNodeStatus")
 		// The reversed kl.syncNodeStatusMux.Unlock/Lock() below to allow kl.syncNodeStatus() execution.
 		kl.syncNodeStatusMux.Unlock()
 		kl.syncNodeStatus()
 		// This lock action is unnecessary if we add a flag to check in the defer before unlocking it,
 		// but having it here makes the logic a bit easier to read.
 		kl.syncNodeStatusMux.Lock()
 	}
 	// We don't do markVolumesFromNode(node) here and leave it to the regular syncNodeStatus().
 	return true
 }
 // syncNodeStatus should be called periodically from a goroutine.
 // It synchronizes node status to master if there is any change or enough time
 // passed from the last sync, registering the kubelet first if necessary.
--- a/pkg/kubelet/kubelet_node_status_test.go
+++ b/pkg/kubelet/kubelet_node_status_test.go
@@ -1134,6 +1134,159 @@ func TestUpdateNodeStatusAndVolumesInUseWithNodeLease(t *testing.T) {
 	}
 }
 func TestFastStatusUpdateOnce(t *testing.T) {
 	tests := []struct {
 		name            string
 		beforeMarkReady int
 		beforeNextReady int
 		beforeTimeout   int
 		wantCalls       int
 		patchFailures   int
 		wantPatches     int
 	}{
 		{
 			name:            "timeout after third loop",
 			beforeMarkReady: 9,
 			beforeNextReady: 9,
 			beforeTimeout:   2,
 			wantCalls:       3,
 		},
 		{
 			name:            "already ready on third loop",
 			beforeMarkReady: 9,
 			beforeNextReady: 1,
 			beforeTimeout:   9,
 			wantCalls:       2,
 		},
 		{
 			name:            "turns ready on third loop",
 			beforeMarkReady: 2,
 			beforeNextReady: 9,
 			beforeTimeout:   9,
 			wantCalls:       3,
 			wantPatches:     1,
 		},
 		{
 			name:            "turns ready on second loop then first patch fails",
 			beforeMarkReady: 1,
 			beforeNextReady: 9,
 			beforeTimeout:   9,
 			wantCalls:       3,
 			patchFailures:   1,
 			wantPatches:     2,
 		},
 		{
 			name:            "turns ready on second loop then all patches fail",
 			beforeMarkReady: 1,
 			beforeNextReady: 9,
 			beforeTimeout:   9,
 			wantCalls:       nodeStatusUpdateRetry + 2,
 			patchFailures:   nodeStatusUpdateRetry + 2,
 			wantPatches:     nodeStatusUpdateRetry + 1,
 		},
 	}
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
 			testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
 			defer testKubelet.Cleanup()
 			kubelet := testKubelet.kubelet
 			// Ensure we capture actions on the heartbeat client only.
 			// We don't set it to nil or GetNode() doesn't read from nodeLister.
 			kubelet.kubeClient = &fake.Clientset{}
 			kubeClient := testKubelet.fakeKubeClient
 			node := &v1.Node{
 				ObjectMeta: metav1.ObjectMeta{
 					Name: string(kubelet.nodeName),
 				},
 				Status: v1.NodeStatus{
 					Conditions: []v1.NodeCondition{
 						{
 							Type:    v1.NodeReady,
 							Status:  v1.ConditionFalse,
 							Reason:  "NotReady",
 							Message: "Node not ready",
 						},
 					},
 				},
 			}
 			nodeLister := testNodeLister{[]*v1.Node{node.DeepCopy()}}
 			kubelet.nodeLister = nodeLister
 			callCount := 0
 			// The original node status functions turn the node ready.
 			nodeStatusFuncs := kubelet.setNodeStatusFuncs
 			kubelet.setNodeStatusFuncs = []func(context.Context, *v1.Node) error{func(ctx context.Context, node *v1.Node) error {
 				assert.False(t, kubelet.containerRuntimeReadyExpected)
 				callCount++
 				var lastErr error
 				if callCount > tc.beforeMarkReady {
 					for _, f := range nodeStatusFuncs {
 						if err := f(ctx, node); err != nil {
 							lastErr = err
 						}
 					}
 				}
 				if callCount > tc.beforeNextReady {
 					nodeLister.nodes[0].Status.Conditions[0].Status = v1.ConditionTrue
 				}
 				if callCount > tc.beforeTimeout {
 					testKubelet.fakeClock.Step(nodeReadyGracePeriod)
 				}
 				return lastErr
 			}}
 			patchCount := 0
 			kubeClient.AddReactor("patch", "nodes", func(action core.Action) (bool, runtime.Object, error) {
 				assert.False(t, kubelet.containerRuntimeReadyExpected)
 				patchCount++
 				if patchCount > tc.patchFailures {
 					return false, nil, nil
 				}
 				return true, nil, fmt.Errorf("try again")
 			})
 			kubelet.fastStatusUpdateOnce()
 			assert.True(t, kubelet.containerRuntimeReadyExpected)
 			assert.Equal(t, tc.wantCalls, callCount)
 			assert.Equal(t, tc.wantPatches, patchCount)
 			actions := kubeClient.Actions()
 			if tc.wantPatches == 0 {
 				require.Len(t, actions, 0)
 				return
 			}
 			// patch, get, patch, get, patch, ... up to initial patch + nodeStatusUpdateRetry patches
 			require.Len(t, actions, 2*tc.wantPatches-1)
 			for i, action := range actions {
 				if i%2 == 1 {
 					require.IsType(t, core.GetActionImpl{}, action)
 					continue
 				}
 				require.IsType(t, core.PatchActionImpl{}, action)
 				patchAction := action.(core.PatchActionImpl)
 				updatedNode, err := applyNodeStatusPatch(node, patchAction.GetPatch())
 				require.NoError(t, err)
 				seenNodeReady := false
 				for _, c := range updatedNode.Status.Conditions {
 					if c.Type == v1.NodeReady {
 						assert.Equal(t, v1.ConditionTrue, c.Status)
 						seenNodeReady = true
 					}
 				}
 				assert.True(t, seenNodeReady)
 			}
 		})
 	}
 }
 func TestRegisterWithApiServer(t *testing.T) {
 	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
 	defer testKubelet.Cleanup()