mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-27 05:27:21 +00:00
Merge pull request #6156 from gmarek/retry_loop
Add a retry loop to UpdateStatus in NodeController
This commit is contained in:
commit
81683441b9
@ -60,6 +60,8 @@ const (
|
||||
// Theoretically, this value should be lower than nodeMonitorGracePeriod.
|
||||
// TODO: Change node status monitor to watch based.
|
||||
nodeMonitorPeriod = 5 * time.Second
|
||||
// nodeStatusUpdateRetry controls the number of attempts made to write a NodeStatus update.
|
||||
nodeStatusUpdateRetry = 5
|
||||
)
|
||||
|
||||
var (
|
||||
@ -457,18 +459,10 @@ func (nc *NodeController) recordNodeOfflineEvent(node *api.Node) {
|
||||
nc.recorder.Eventf(ref, "offline", "Node %s is now offline", node.Name)
|
||||
}
|
||||
|
||||
// MonitorNodeStatus verifies that node statuses are constantly updated by kubelet, and if not,
|
||||
// posts "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
|
||||
// not reachable for a long period of time.
|
||||
func (nc *NodeController) MonitorNodeStatus() error {
|
||||
nodes, err := nc.kubeClient.Nodes().List()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for i := range nodes.Items {
|
||||
func (nc NodeController) tryUpdateNodeStatus(node *api.Node) (error, time.Duration, api.NodeCondition) {
|
||||
var err error
|
||||
var gracePeriod time.Duration
|
||||
var lastReadyCondition api.NodeCondition
|
||||
node := &nodes.Items[i]
|
||||
readyCondition := nc.getCondition(node, api.NodeReady)
|
||||
if readyCondition == nil {
|
||||
// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
|
||||
@ -505,7 +499,7 @@ func (nc *NodeController) MonitorNodeStatus() error {
|
||||
node.Name, nc.now().Time.Sub(lastReadyCondition.LastProbeTime.Time), lastReadyCondition)
|
||||
if lastReadyCondition.Status != api.ConditionUnknown {
|
||||
readyCondition.Status = api.ConditionUnknown
|
||||
readyCondition.Reason = fmt.Sprintf("Kubelet stopped posting node status")
|
||||
readyCondition.Reason = fmt.Sprintf("Kubelet stopped posting node status.")
|
||||
// LastProbeTime is the last time we heard from kubelet.
|
||||
readyCondition.LastProbeTime = lastReadyCondition.LastProbeTime
|
||||
readyCondition.LastTransitionTime = nc.now()
|
||||
@ -517,10 +511,47 @@ func (nc *NodeController) MonitorNodeStatus() error {
|
||||
}
|
||||
_, err = nc.kubeClient.Nodes().Update(node)
|
||||
if err != nil {
|
||||
glog.Errorf("error updating node %s: %v", node.Name, err)
|
||||
glog.Errorf("Error updating node %s: %v", node.Name, err)
|
||||
} else {
|
||||
return nil, gracePeriod, lastReadyCondition
|
||||
}
|
||||
}
|
||||
|
||||
return err, gracePeriod, lastReadyCondition
|
||||
}
|
||||
|
||||
// MonitorNodeStatus verifies that node statuses are constantly updated by kubelet, and if not,
|
||||
// posts "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
|
||||
// not reachable for a long period of time.
|
||||
func (nc *NodeController) MonitorNodeStatus() error {
|
||||
nodes, err := nc.kubeClient.Nodes().List()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for i := range nodes.Items {
|
||||
var gracePeriod time.Duration
|
||||
var lastReadyCondition api.NodeCondition
|
||||
node := &nodes.Items[i]
|
||||
for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
|
||||
err, gracePeriod, lastReadyCondition = nc.tryUpdateNodeStatus(node)
|
||||
if err != nil {
|
||||
name := node.Name
|
||||
node, err = nc.kubeClient.Nodes().Get(name)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
|
||||
break
|
||||
}
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
glog.Errorf("Update status of Node %v from NodeController exceeds retry count."+
|
||||
"Skipping - no pods will be evicted.", node.Name)
|
||||
continue
|
||||
}
|
||||
|
||||
readyCondition := nc.getCondition(node, api.NodeReady)
|
||||
if readyCondition != nil {
|
||||
// Check eviction timeout.
|
||||
if lastReadyCondition.Status == api.ConditionFalse &&
|
||||
|
@ -1361,7 +1361,7 @@ func TestMonitorNodeStatusUpdateStatus(t *testing.T) {
|
||||
{
|
||||
Type: api.NodeReady,
|
||||
Status: api.ConditionUnknown,
|
||||
Reason: fmt.Sprintf("Kubelet stopped posting node status"),
|
||||
Reason: fmt.Sprintf("Kubelet stopped posting node status."),
|
||||
LastProbeTime: util.Date(2015, 1, 1, 11, 0, 0, 0, time.UTC),
|
||||
LastTransitionTime: fakeNow,
|
||||
},
|
||||
|
Loading…
Reference in New Issue
Block a user