Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-27 21:47:07 +00:00)
Merge pull request #6156 from gmarek/retry_loop
Add a retry loop to UpdateStatus in NodeController
This commit is contained in commit 81683441b9
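In essence, the change factors the per-node status write out of MonitorNodeStatus into a new tryUpdateNodeStatus helper and wraps the call in a bounded retry loop: when a write fails, the node is re-fetched and the update is retried, up to nodeStatusUpdateRetry times. A condensed sketch of that pattern, distilled from the diff below (Node and NodeClient are illustrative stand-ins, not the real Kubernetes client API):

// Condensed sketch of the retry pattern this PR introduces. Node and
// NodeClient are illustrative stand-ins; the real code works with api.Node
// and nc.kubeClient.Nodes() from the Kubernetes client of that era.
package nodestatus

import (
	"errors"
	"fmt"
)

// nodeStatusUpdateRetry mirrors the constant added by the commit.
const nodeStatusUpdateRetry = 5

// Node stands in for api.Node.
type Node struct{ Name string }

// NodeClient stands in for the node REST client used by the controller.
type NodeClient interface {
	Get(name string) (*Node, error)
	Update(node *Node) (*Node, error)
}

// updateStatusWithRetry mirrors the loop added to MonitorNodeStatus: try the
// status write, and on failure re-fetch the node so the next attempt starts
// from a fresh object; give up after nodeStatusUpdateRetry attempts or as
// soon as the node can no longer be fetched (it was probably deleted).
func updateStatusWithRetry(c NodeClient, node *Node) error {
	var err error
	for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
		if _, err = c.Update(node); err == nil {
			return nil // write succeeded
		}
		name := node.Name
		if node, err = c.Get(name); err != nil {
			return fmt.Errorf("failed to re-get node %s for retry: %v", name, err)
		}
	}
	return errors.New("update status exceeded retry count; skipping node")
}

In the actual controller the retry loop lives in MonitorNodeStatus itself, and each per-node attempt also returns the grace period and the last observed Ready condition, as the diff below shows.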
@@ -60,6 +60,8 @@ const (
 	// Theoretically, this value should be lower than nodeMonitorGracePeriod.
 	// TODO: Change node status monitor to watch based.
 	nodeMonitorPeriod = 5 * time.Second
+	// Constant controlling number of retries of writing NodeStatus update.
+	nodeStatusUpdateRetry = 5
 )

 var (
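For context, nodeMonitorPeriod above is the polling cadence of the node status monitor (the TODO notes it should eventually become watch based), while the new nodeStatusUpdateRetry bounds the retries for a single node within one pass. A minimal, hypothetical driver loop, not part of this commit, that runs a monitor function on that cadence might look like:

// Hypothetical driver loop, not part of this commit: tick every
// nodeMonitorPeriod and invoke the supplied monitor function until the stop
// channel is closed.
package nodestatus

import (
	"log"
	"time"
)

const nodeMonitorPeriod = 5 * time.Second // same value as the constant above

func runMonitor(monitor func() error, stop <-chan struct{}) {
	ticker := time.NewTicker(nodeMonitorPeriod)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			if err := monitor(); err != nil {
				// The real controller logs errors and keeps going; same here.
				log.Printf("node status monitor pass failed: %v", err)
			}
		case <-stop:
			return
		}
	}
}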
@@ -457,18 +459,10 @@ func (nc *NodeController) recordNodeOfflineEvent(node *api.Node) {
 	nc.recorder.Eventf(ref, "offline", "Node %s is now offline", node.Name)
 }

-// MonitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
-// post "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
-// not reachable for a long period of time.
-func (nc *NodeController) MonitorNodeStatus() error {
-	nodes, err := nc.kubeClient.Nodes().List()
-	if err != nil {
-		return err
-	}
-	for i := range nodes.Items {
+func (nc NodeController) tryUpdateNodeStatus(node *api.Node) (error, time.Duration, api.NodeCondition) {
+	var err error
 	var gracePeriod time.Duration
 	var lastReadyCondition api.NodeCondition
-	node := &nodes.Items[i]
 	readyCondition := nc.getCondition(node, api.NodeReady)
 	if readyCondition == nil {
 		// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
@@ -505,7 +499,7 @@ func (nc *NodeController) MonitorNodeStatus() error {
 		node.Name, nc.now().Time.Sub(lastReadyCondition.LastProbeTime.Time), lastReadyCondition)
 	if lastReadyCondition.Status != api.ConditionUnknown {
 		readyCondition.Status = api.ConditionUnknown
-		readyCondition.Reason = fmt.Sprintf("Kubelet stopped posting node status")
+		readyCondition.Reason = fmt.Sprintf("Kubelet stopped posting node status.")
 		// LastProbeTime is the last time we heard from kubelet.
 		readyCondition.LastProbeTime = lastReadyCondition.LastProbeTime
 		readyCondition.LastTransitionTime = nc.now()
@@ -517,10 +511,47 @@ func (nc *NodeController) MonitorNodeStatus() error {
 		}
 		_, err = nc.kubeClient.Nodes().Update(node)
 		if err != nil {
-			glog.Errorf("error updating node %s: %v", node.Name, err)
+			glog.Errorf("Error updating node %s: %v", node.Name, err)
+		} else {
+			return nil, gracePeriod, lastReadyCondition
 		}
 	}

+	return err, gracePeriod, lastReadyCondition
+}
+
+// MonitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
+// post "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
+// not reachable for a long period of time.
+func (nc *NodeController) MonitorNodeStatus() error {
+	nodes, err := nc.kubeClient.Nodes().List()
+	if err != nil {
+		return err
+	}
+	for i := range nodes.Items {
+		var gracePeriod time.Duration
+		var lastReadyCondition api.NodeCondition
+		node := &nodes.Items[i]
+		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
+			err, gracePeriod, lastReadyCondition = nc.tryUpdateNodeStatus(node)
+			if err != nil {
+				name := node.Name
+				node, err = nc.kubeClient.Nodes().Get(name)
+				if err != nil {
+					glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
+					break
+				}
+			} else {
+				break
+			}
+		}
+		if err != nil {
+			glog.Errorf("Update status of Node %v from NodeController exceeds retry count."+
+				"Skipping - no pods will be evicted.", node.Name)
+			continue
+		}
+
+		readyCondition := nc.getCondition(node, api.NodeReady)
 		if readyCondition != nil {
 			// Check eviction timeout.
 			if lastReadyCondition.Status == api.ConditionFalse &&
@@ -1361,7 +1361,7 @@ func TestMonitorNodeStatusUpdateStatus(t *testing.T) {
 			{
 				Type: api.NodeReady,
 				Status: api.ConditionUnknown,
-				Reason: fmt.Sprintf("Kubelet stopped posting node status"),
+				Reason: fmt.Sprintf("Kubelet stopped posting node status."),
 				LastProbeTime: util.Date(2015, 1, 1, 11, 0, 0, 0, time.UTC),
 				LastTransitionTime: fakeNow,
 			},