mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-21 02:41:25 +00:00
kubelet: Keep trying fast status update at startup until node is ready
This commit is contained in:
parent
4a50fc4b8c
commit
9f5c5b82a9
@ -28,7 +28,6 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
sysruntime "runtime"
|
sysruntime "runtime"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
@ -128,6 +127,10 @@ const (
|
|||||||
// nodeStatusUpdateRetry specifies how many times kubelet retries when posting node status failed.
|
// nodeStatusUpdateRetry specifies how many times kubelet retries when posting node status failed.
|
||||||
nodeStatusUpdateRetry = 5
|
nodeStatusUpdateRetry = 5
|
||||||
|
|
||||||
|
// nodeReadyGracePeriod is the period to allow for before fast status update is
|
||||||
|
// terminated and container runtime not being ready is logged without verbosity guard.
|
||||||
|
nodeReadyGracePeriod = 120 * time.Second
|
||||||
|
|
||||||
// DefaultContainerLogsDir is the location of container logs.
|
// DefaultContainerLogsDir is the location of container logs.
|
||||||
DefaultContainerLogsDir = "/var/log/containers"
|
DefaultContainerLogsDir = "/var/log/containers"
|
||||||
|
|
||||||
@ -1063,6 +1066,12 @@ type Kubelet struct {
|
|||||||
// used for generating ContainerStatus.
|
// used for generating ContainerStatus.
|
||||||
reasonCache *ReasonCache
|
reasonCache *ReasonCache
|
||||||
|
|
||||||
|
// containerRuntimeReadyExpected indicates whether container runtime being ready is expected
|
||||||
|
// so errors are logged without verbosity guard, to avoid excessive error logs at node startup.
|
||||||
|
// It's false during the node initialization period of nodeReadyGracePeriod, and after that
|
||||||
|
// it's set to true by fastStatusUpdateOnce when it exits.
|
||||||
|
containerRuntimeReadyExpected bool
|
||||||
|
|
||||||
// nodeStatusUpdateFrequency specifies how often kubelet computes node status. If node lease
|
// nodeStatusUpdateFrequency specifies how often kubelet computes node status. If node lease
|
||||||
// feature is not enabled, it is also the frequency that kubelet posts node status to master.
|
// feature is not enabled, it is also the frequency that kubelet posts node status to master.
|
||||||
// In that case, be cautious when changing the constant, it must work with nodeMonitorGracePeriod
|
// In that case, be cautious when changing the constant, it must work with nodeMonitorGracePeriod
|
||||||
@ -1085,15 +1094,15 @@ type Kubelet struct {
|
|||||||
lastStatusReportTime time.Time
|
lastStatusReportTime time.Time
|
||||||
|
|
||||||
// syncNodeStatusMux is a lock on updating the node status, because this path is not thread-safe.
|
// syncNodeStatusMux is a lock on updating the node status, because this path is not thread-safe.
|
||||||
// This lock is used by Kubelet.syncNodeStatus function and shouldn't be used anywhere else.
|
// This lock is used by Kubelet.syncNodeStatus and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
|
||||||
syncNodeStatusMux sync.Mutex
|
syncNodeStatusMux sync.Mutex
|
||||||
|
|
||||||
// updatePodCIDRMux is a lock on updating pod CIDR, because this path is not thread-safe.
|
// updatePodCIDRMux is a lock on updating pod CIDR, because this path is not thread-safe.
|
||||||
// This lock is used by Kubelet.syncNodeStatus function and shouldn't be used anywhere else.
|
// This lock is used by Kubelet.updatePodCIDR function and shouldn't be used anywhere else.
|
||||||
updatePodCIDRMux sync.Mutex
|
updatePodCIDRMux sync.Mutex
|
||||||
|
|
||||||
// updateRuntimeMux is a lock on updating runtime, because this path is not thread-safe.
|
// updateRuntimeMux is a lock on updating runtime, because this path is not thread-safe.
|
||||||
// This lock is used by Kubelet.updateRuntimeUp function and shouldn't be used anywhere else.
|
// This lock is used by Kubelet.updateRuntimeUp and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
|
||||||
updateRuntimeMux sync.Mutex
|
updateRuntimeMux sync.Mutex
|
||||||
|
|
||||||
// nodeLeaseController claims and renews the node lease for this Kubelet
|
// nodeLeaseController claims and renews the node lease for this Kubelet
|
||||||
@ -1502,6 +1511,12 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) {
|
|||||||
go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop)
|
go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop)
|
||||||
|
|
||||||
if kl.kubeClient != nil {
|
if kl.kubeClient != nil {
|
||||||
|
// Start two go-routines to update the status.
|
||||||
|
//
|
||||||
|
// The first will report to the apiserver every nodeStatusUpdateFrequency and is aimed to provide regular status intervals,
|
||||||
|
// while the second is used to provide a more timely status update during initialization and runs a one-shot update to the apiserver
|
||||||
|
// once the node becomes ready, then exits afterwards.
|
||||||
|
//
|
||||||
// Introduce some small jittering to ensure that over time the requests won't start
|
// Introduce some small jittering to ensure that over time the requests won't start
|
||||||
// accumulating at approximately the same time from the set of nodes due to priority and
|
// accumulating at approximately the same time from the set of nodes due to priority and
|
||||||
// fairness effect.
|
// fairness effect.
|
||||||
@ -2435,9 +2450,13 @@ func (kl *Kubelet) updateRuntimeUp() {
|
|||||||
}
|
}
|
||||||
// Periodically log the whole runtime status for debugging.
|
// Periodically log the whole runtime status for debugging.
|
||||||
klog.V(4).InfoS("Container runtime status", "status", s)
|
klog.V(4).InfoS("Container runtime status", "status", s)
|
||||||
|
klogErrorS := klog.ErrorS
|
||||||
|
if !kl.containerRuntimeReadyExpected {
|
||||||
|
klogErrorS = klog.V(4).ErrorS
|
||||||
|
}
|
||||||
networkReady := s.GetRuntimeCondition(kubecontainer.NetworkReady)
|
networkReady := s.GetRuntimeCondition(kubecontainer.NetworkReady)
|
||||||
if networkReady == nil || !networkReady.Status {
|
if networkReady == nil || !networkReady.Status {
|
||||||
klog.ErrorS(nil, "Container runtime network not ready", "networkReady", networkReady)
|
klogErrorS(nil, "Container runtime network not ready", "networkReady", networkReady)
|
||||||
kl.runtimeState.setNetworkState(fmt.Errorf("container runtime network not ready: %v", networkReady))
|
kl.runtimeState.setNetworkState(fmt.Errorf("container runtime network not ready: %v", networkReady))
|
||||||
} else {
|
} else {
|
||||||
// Set nil if the container runtime network is ready.
|
// Set nil if the container runtime network is ready.
|
||||||
@ -2447,7 +2466,7 @@ func (kl *Kubelet) updateRuntimeUp() {
|
|||||||
runtimeReady := s.GetRuntimeCondition(kubecontainer.RuntimeReady)
|
runtimeReady := s.GetRuntimeCondition(kubecontainer.RuntimeReady)
|
||||||
// If RuntimeReady is not set or is false, report an error.
|
// If RuntimeReady is not set or is false, report an error.
|
||||||
if runtimeReady == nil || !runtimeReady.Status {
|
if runtimeReady == nil || !runtimeReady.Status {
|
||||||
klog.ErrorS(nil, "Container runtime not ready", "runtimeReady", runtimeReady)
|
klogErrorS(nil, "Container runtime not ready", "runtimeReady", runtimeReady)
|
||||||
kl.runtimeState.setRuntimeState(fmt.Errorf("container runtime not ready: %v", runtimeReady))
|
kl.runtimeState.setRuntimeState(fmt.Errorf("container runtime not ready: %v", runtimeReady))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -2502,31 +2521,25 @@ func (kl *Kubelet) cleanUpContainersInPod(podID types.UID, exitedContainerID str
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// fastStatusUpdateOnce starts a loop that checks the internal node indexer cache for when a CIDR
|
// fastStatusUpdateOnce starts a loop that checks if the current state of kubelet + container runtime
|
||||||
// is applied and tries to update pod CIDR immediately. After pod CIDR is updated it fires off
|
// would be able to turn the node ready, and sync the ready state to the apiserver as soon as possible.
|
||||||
// a runtime update and a node status update. Function returns after one successful node status update.
|
// The function returns once the node status update triggered by that event has been made, when the node is already ready, or when nodeReadyGracePeriod expires.
|
||||||
// Function is executed only during Kubelet start which improves latency to ready node by updating
|
// Function is executed only during Kubelet start which improves latency to ready node by updating
|
||||||
// pod CIDR, runtime status and node statuses ASAP.
|
// kubelet state, runtime status and node statuses ASAP.
|
||||||
func (kl *Kubelet) fastStatusUpdateOnce() {
|
func (kl *Kubelet) fastStatusUpdateOnce() {
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
for {
|
start := kl.clock.Now()
|
||||||
time.Sleep(100 * time.Millisecond)
|
stopCh := make(chan struct{})
|
||||||
node, err := kl.GetNode()
|
|
||||||
if err != nil {
|
// Keep trying to make fast node status update until either timeout is reached or an update is successful.
|
||||||
klog.ErrorS(err, "Error getting node")
|
wait.Until(func() {
|
||||||
continue
|
// fastNodeStatusUpdate returns true when it succeeds or when the grace period has expired
|
||||||
|
// (status was not updated within nodeReadyGracePeriod and the second argument below gets true),
|
||||||
|
// then we close the channel and abort the loop.
|
||||||
|
if kl.fastNodeStatusUpdate(ctx, kl.clock.Since(start) >= nodeReadyGracePeriod) {
|
||||||
|
close(stopCh)
|
||||||
}
|
}
|
||||||
if len(node.Spec.PodCIDRs) != 0 {
|
}, 100*time.Millisecond, stopCh)
|
||||||
podCIDRs := strings.Join(node.Spec.PodCIDRs, ",")
|
|
||||||
if _, err := kl.updatePodCIDR(ctx, podCIDRs); err != nil {
|
|
||||||
klog.ErrorS(err, "Pod CIDR update failed", "CIDR", podCIDRs)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
kl.updateRuntimeUp()
|
|
||||||
kl.syncNodeStatus()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// CheckpointContainer tries to checkpoint a container. The parameters are used to
|
// CheckpointContainer tries to checkpoint a container. The parameters are used to
|
||||||
|
@ -429,6 +429,85 @@ func (kl *Kubelet) initialNode(ctx context.Context) (*v1.Node, error) {
|
|||||||
return node, nil
|
return node, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// fastNodeStatusUpdate is a "lightweight" version of syncNodeStatus which doesn't hit the
|
||||||
|
// apiserver except for the final run, to be called by fastStatusUpdateOnce in each loop.
|
||||||
|
// It holds the same lock as syncNodeStatus and is thread-safe when called concurrently with
|
||||||
|
// syncNodeStatus. Its return value indicates whether the loop running it should exit
|
||||||
|
// (final run), and it also sets kl.containerRuntimeReadyExpected.
|
||||||
|
func (kl *Kubelet) fastNodeStatusUpdate(ctx context.Context, timeout bool) (completed bool) {
|
||||||
|
kl.syncNodeStatusMux.Lock()
|
||||||
|
defer func() {
|
||||||
|
kl.syncNodeStatusMux.Unlock()
|
||||||
|
|
||||||
|
if completed {
|
||||||
|
// containerRuntimeReadyExpected is read by updateRuntimeUp().
|
||||||
|
// Not going for a more granular mutex as this path runs only once.
|
||||||
|
kl.updateRuntimeMux.Lock()
|
||||||
|
defer kl.updateRuntimeMux.Unlock()
|
||||||
|
kl.containerRuntimeReadyExpected = true
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if timeout {
|
||||||
|
klog.ErrorS(nil, "Node not becoming ready in time after startup")
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
originalNode, err := kl.GetNode()
|
||||||
|
if err != nil {
|
||||||
|
klog.ErrorS(err, "Error getting the current node from lister")
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
readyIdx, originalNodeReady := nodeutil.GetNodeCondition(&originalNode.Status, v1.NodeReady)
|
||||||
|
if readyIdx == -1 {
|
||||||
|
klog.ErrorS(nil, "Node does not have NodeReady condition", "originalNode", originalNode)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
if originalNodeReady.Status == v1.ConditionTrue {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is in addition to the regular syncNodeStatus logic so we can get the container runtime status earlier.
|
||||||
|
// This function itself has a mutex and it doesn't recursively call fastNodeStatusUpdate or syncNodeStatus.
|
||||||
|
kl.updateRuntimeUp()
|
||||||
|
|
||||||
|
node, changed := kl.updateNode(ctx, originalNode)
|
||||||
|
|
||||||
|
if !changed {
|
||||||
|
// We don't do markVolumesFromNode(node) here and leave it to the regular syncNodeStatus().
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
readyIdx, nodeReady := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady)
|
||||||
|
if readyIdx == -1 {
|
||||||
|
klog.ErrorS(nil, "Node does not have NodeReady condition", "node", node)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
if nodeReady.Status == v1.ConditionFalse {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
klog.InfoS("Fast updating node status as it just became ready")
|
||||||
|
if _, err := kl.patchNodeStatus(originalNode, node); err != nil {
|
||||||
|
// The originalNode is probably stale, but we know that the current state of kubelet would turn
|
||||||
|
// the node to be ready. Retry using syncNodeStatus() which fetches from the apiserver.
|
||||||
|
klog.ErrorS(err, "Error updating node status, will retry with syncNodeStatus")
|
||||||
|
|
||||||
|
// The reversed kl.syncNodeStatusMux.Unlock/Lock() below to allow kl.syncNodeStatus() execution.
|
||||||
|
kl.syncNodeStatusMux.Unlock()
|
||||||
|
kl.syncNodeStatus()
|
||||||
|
// This lock action is unnecessary if we add a flag to check in the defer before unlocking it,
|
||||||
|
// but having it here makes the logic a bit easier to read.
|
||||||
|
kl.syncNodeStatusMux.Lock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't do markVolumesFromNode(node) here and leave it to the regular syncNodeStatus().
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
// syncNodeStatus should be called periodically from a goroutine.
|
// syncNodeStatus should be called periodically from a goroutine.
|
||||||
// It synchronizes node status to master if there is any change or enough time
|
// It synchronizes node status to master if there is any change or enough time
|
||||||
// passed from the last sync, registering the kubelet first if necessary.
|
// passed from the last sync, registering the kubelet first if necessary.
|
||||||
|
@ -1134,6 +1134,159 @@ func TestUpdateNodeStatusAndVolumesInUseWithNodeLease(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFastStatusUpdateOnce(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
beforeMarkReady int
|
||||||
|
beforeNextReady int
|
||||||
|
beforeTimeout int
|
||||||
|
wantCalls int
|
||||||
|
patchFailures int
|
||||||
|
wantPatches int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "timeout after third loop",
|
||||||
|
beforeMarkReady: 9,
|
||||||
|
beforeNextReady: 9,
|
||||||
|
beforeTimeout: 2,
|
||||||
|
wantCalls: 3,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "already ready on third loop",
|
||||||
|
beforeMarkReady: 9,
|
||||||
|
beforeNextReady: 1,
|
||||||
|
beforeTimeout: 9,
|
||||||
|
wantCalls: 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "turns ready on third loop",
|
||||||
|
beforeMarkReady: 2,
|
||||||
|
beforeNextReady: 9,
|
||||||
|
beforeTimeout: 9,
|
||||||
|
wantCalls: 3,
|
||||||
|
wantPatches: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "turns ready on second loop then first patch fails",
|
||||||
|
beforeMarkReady: 1,
|
||||||
|
beforeNextReady: 9,
|
||||||
|
beforeTimeout: 9,
|
||||||
|
wantCalls: 3,
|
||||||
|
patchFailures: 1,
|
||||||
|
wantPatches: 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "turns ready on second loop then all patches fail",
|
||||||
|
beforeMarkReady: 1,
|
||||||
|
beforeNextReady: 9,
|
||||||
|
beforeTimeout: 9,
|
||||||
|
wantCalls: nodeStatusUpdateRetry + 2,
|
||||||
|
patchFailures: nodeStatusUpdateRetry + 2,
|
||||||
|
wantPatches: nodeStatusUpdateRetry + 1,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
|
||||||
|
defer testKubelet.Cleanup()
|
||||||
|
kubelet := testKubelet.kubelet
|
||||||
|
// Ensure we capture actions on the heartbeat client only.
|
||||||
|
// We don't set it to nil or GetNode() doesn't read from nodeLister.
|
||||||
|
kubelet.kubeClient = &fake.Clientset{}
|
||||||
|
kubeClient := testKubelet.fakeKubeClient
|
||||||
|
|
||||||
|
node := &v1.Node{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{
|
||||||
|
Name: string(kubelet.nodeName),
|
||||||
|
},
|
||||||
|
Status: v1.NodeStatus{
|
||||||
|
Conditions: []v1.NodeCondition{
|
||||||
|
{
|
||||||
|
Type: v1.NodeReady,
|
||||||
|
Status: v1.ConditionFalse,
|
||||||
|
Reason: "NotReady",
|
||||||
|
Message: "Node not ready",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
nodeLister := testNodeLister{[]*v1.Node{node.DeepCopy()}}
|
||||||
|
kubelet.nodeLister = nodeLister
|
||||||
|
|
||||||
|
callCount := 0
|
||||||
|
// The original node status functions turn the node ready.
|
||||||
|
nodeStatusFuncs := kubelet.setNodeStatusFuncs
|
||||||
|
kubelet.setNodeStatusFuncs = []func(context.Context, *v1.Node) error{func(ctx context.Context, node *v1.Node) error {
|
||||||
|
assert.False(t, kubelet.containerRuntimeReadyExpected)
|
||||||
|
callCount++
|
||||||
|
var lastErr error
|
||||||
|
if callCount > tc.beforeMarkReady {
|
||||||
|
for _, f := range nodeStatusFuncs {
|
||||||
|
if err := f(ctx, node); err != nil {
|
||||||
|
lastErr = err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if callCount > tc.beforeNextReady {
|
||||||
|
nodeLister.nodes[0].Status.Conditions[0].Status = v1.ConditionTrue
|
||||||
|
}
|
||||||
|
if callCount > tc.beforeTimeout {
|
||||||
|
testKubelet.fakeClock.Step(nodeReadyGracePeriod)
|
||||||
|
}
|
||||||
|
return lastErr
|
||||||
|
}}
|
||||||
|
|
||||||
|
patchCount := 0
|
||||||
|
kubeClient.AddReactor("patch", "nodes", func(action core.Action) (bool, runtime.Object, error) {
|
||||||
|
assert.False(t, kubelet.containerRuntimeReadyExpected)
|
||||||
|
patchCount++
|
||||||
|
if patchCount > tc.patchFailures {
|
||||||
|
return false, nil, nil
|
||||||
|
}
|
||||||
|
return true, nil, fmt.Errorf("try again")
|
||||||
|
})
|
||||||
|
|
||||||
|
kubelet.fastStatusUpdateOnce()
|
||||||
|
|
||||||
|
assert.True(t, kubelet.containerRuntimeReadyExpected)
|
||||||
|
assert.Equal(t, tc.wantCalls, callCount)
|
||||||
|
assert.Equal(t, tc.wantPatches, patchCount)
|
||||||
|
|
||||||
|
actions := kubeClient.Actions()
|
||||||
|
if tc.wantPatches == 0 {
|
||||||
|
require.Len(t, actions, 0)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// patch, get, patch, get, patch, ... up to initial patch + nodeStatusUpdateRetry patches
|
||||||
|
require.Len(t, actions, 2*tc.wantPatches-1)
|
||||||
|
|
||||||
|
for i, action := range actions {
|
||||||
|
if i%2 == 1 {
|
||||||
|
require.IsType(t, core.GetActionImpl{}, action)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
require.IsType(t, core.PatchActionImpl{}, action)
|
||||||
|
patchAction := action.(core.PatchActionImpl)
|
||||||
|
|
||||||
|
updatedNode, err := applyNodeStatusPatch(node, patchAction.GetPatch())
|
||||||
|
require.NoError(t, err)
|
||||||
|
seenNodeReady := false
|
||||||
|
for _, c := range updatedNode.Status.Conditions {
|
||||||
|
if c.Type == v1.NodeReady {
|
||||||
|
assert.Equal(t, v1.ConditionTrue, c.Status)
|
||||||
|
seenNodeReady = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert.True(t, seenNodeReady)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRegisterWithApiServer(t *testing.T) {
|
func TestRegisterWithApiServer(t *testing.T) {
|
||||||
testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
|
testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
|
||||||
defer testKubelet.Cleanup()
|
defer testKubelet.Cleanup()
|
||||||
|
Loading…
Reference in New Issue
Block a user