Merge pull request #82884 from krzysied/node_controller_cleanup3

NodeLifecycleController - eviction processing refactor

Commit: d8a420c0d8
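This refactor moves the per-node eviction handling out of monitorNodeHealth and into two new helpers, processTaintBaseEviction and processNoTaintBaseEviction, leaving only a dispatch on nc.useTaintBasedEvictions in the monitoring loop. The sketch below only illustrates that dispatch shape with simplified stand-in types; it is not the real controller, and the fields and method bodies are placeholders rather than the code added by this commit.

package main

import "fmt"

// controller is a simplified stand-in for the NodeLifecycleController, holding
// only the flag needed to illustrate the dispatch introduced by this commit.
type controller struct {
	useTaintBasedEvictions bool
}

// nodeCondition is a stand-in for the Status field of v1.NodeCondition.
type nodeCondition struct {
	status string // "True", "False", or "Unknown"
}

func (c *controller) processTaintBaseEviction(nodeName string, cond *nodeCondition) {
	// In the real helper: swap NotReady/Unreachable taints or queue the node for tainting.
	fmt.Printf("taint-based eviction path for %s (ready=%s)\n", nodeName, cond.status)
}

func (c *controller) processNoTaintBaseEviction(nodeName string, cond *nodeCondition) {
	// In the real helper: compare timestamps against podEvictionTimeout and queue pod eviction.
	fmt.Printf("timeout-based eviction path for %s (ready=%s)\n", nodeName, cond.status)
}

func main() {
	c := &controller{useTaintBasedEvictions: true}
	cond := &nodeCondition{status: "Unknown"}
	// The dispatch that replaces the inline switch in monitorNodeHealth:
	if c.useTaintBasedEvictions {
		c.processTaintBaseEviction("node-1", cond)
	} else {
		c.processNoTaintBaseEviction("node-1", cond)
	}
}

The actual change is in the diff that follows.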
@@ -758,81 +758,16 @@ func (nc *Controller) monitorNodeHealth() error {
 			zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
 		}
 
-		decisionTimestamp := nc.now()
 		nodeHealthData := nc.nodeHealthMap.getDeepCopy(node.Name)
 		if nodeHealthData == nil {
 			klog.Errorf("Skipping %v node processing: health data doesn't exist.", node.Name)
 			continue
 		}
 		if currentReadyCondition != nil {
-			// Check eviction timeout against decisionTimestamp
-			switch observedReadyCondition.Status {
-			case v1.ConditionFalse:
-				if nc.useTaintBasedEvictions {
-					// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
-					if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
-						taintToAdd := *NotReadyTaintTemplate
-						if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
-							klog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
-						}
-					} else if nc.markNodeForTainting(node) {
-						klog.V(2).Infof("Node %v is NotReady as of %v. Adding it to the Taint queue.",
-							node.Name,
-							decisionTimestamp,
-						)
-					}
-				} else {
-					if decisionTimestamp.After(nodeHealthData.readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
-						if nc.evictPods(node) {
-							klog.V(2).Infof("Node is NotReady. Adding Pods on Node %s to eviction queue: %v is later than %v + %v",
-								node.Name,
-								decisionTimestamp,
-								nodeHealthData.readyTransitionTimestamp,
-								nc.podEvictionTimeout,
-							)
-						}
-					}
-				}
-			case v1.ConditionUnknown:
-				if nc.useTaintBasedEvictions {
-					// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
-					if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
-						taintToAdd := *UnreachableTaintTemplate
-						if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
-							klog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
-						}
-					} else if nc.markNodeForTainting(node) {
-						klog.V(2).Infof("Node %v is unresponsive as of %v. Adding it to the Taint queue.",
-							node.Name,
-							decisionTimestamp,
-						)
-					}
-				} else {
-					if decisionTimestamp.After(nodeHealthData.probeTimestamp.Add(nc.podEvictionTimeout)) {
-						if nc.evictPods(node) {
-							klog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v",
-								node.Name,
-								decisionTimestamp,
-								nodeHealthData.readyTransitionTimestamp,
-								nc.podEvictionTimeout-gracePeriod,
-							)
-						}
-					}
-				}
-			case v1.ConditionTrue:
-				if nc.useTaintBasedEvictions {
-					removed, err := nc.markNodeAsReachable(node)
-					if err != nil {
-						klog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
-					}
-					if removed {
-						klog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name)
-					}
-				} else {
-					if nc.cancelPodEviction(node) {
-						klog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
-					}
-				}
-			}
+			if nc.useTaintBasedEvictions {
+				nc.processTaintBaseEviction(node, &observedReadyCondition)
+			} else {
+				nc.processNoTaintBaseEviction(node, &observedReadyCondition, gracePeriod)
+			}
 		}
 
 		// Report node event.
@@ -849,6 +784,85 @@ func (nc *Controller) monitorNodeHealth() error {
 	return nil
 }
 
+func (nc *Controller) processTaintBaseEviction(node *v1.Node, observedReadyCondition *v1.NodeCondition) {
+	decisionTimestamp := nc.now()
+	// Check eviction timeout against decisionTimestamp
+	switch observedReadyCondition.Status {
+	case v1.ConditionFalse:
+		// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
+		if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
+			taintToAdd := *NotReadyTaintTemplate
+			if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
+				klog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
+			}
+		} else if nc.markNodeForTainting(node) {
+			klog.V(2).Infof("Node %v is NotReady as of %v. Adding it to the Taint queue.",
+				node.Name,
+				decisionTimestamp,
+			)
+		}
+	case v1.ConditionUnknown:
+		// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
+		if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
+			taintToAdd := *UnreachableTaintTemplate
+			if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
+				klog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
+			}
+		} else if nc.markNodeForTainting(node) {
+			klog.V(2).Infof("Node %v is unresponsive as of %v. Adding it to the Taint queue.",
+				node.Name,
+				decisionTimestamp,
+			)
+		}
+	case v1.ConditionTrue:
+		removed, err := nc.markNodeAsReachable(node)
+		if err != nil {
+			klog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
+		}
+		if removed {
+			klog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name)
+		}
+	}
+}
+
+func (nc *Controller) processNoTaintBaseEviction(node *v1.Node, observedReadyCondition *v1.NodeCondition, gracePeriod time.Duration) {
+	decisionTimestamp := nc.now()
+	nodeHealthData := nc.nodeHealthMap.getDeepCopy(node.Name)
+	if nodeHealthData == nil {
+		klog.Errorf("Skipping %v node processing: health data doesn't exist.", node.Name)
+		return
+	}
+	// Check eviction timeout against decisionTimestamp
+	switch observedReadyCondition.Status {
+	case v1.ConditionFalse:
+		if decisionTimestamp.After(nodeHealthData.readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
+			if nc.evictPods(node) {
+				klog.V(2).Infof("Node is NotReady. Adding Pods on Node %s to eviction queue: %v is later than %v + %v",
+					node.Name,
+					decisionTimestamp,
+					nodeHealthData.readyTransitionTimestamp,
+					nc.podEvictionTimeout,
+				)
+			}
+		}
+	case v1.ConditionUnknown:
+		if decisionTimestamp.After(nodeHealthData.probeTimestamp.Add(nc.podEvictionTimeout)) {
+			if nc.evictPods(node) {
+				klog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v",
+					node.Name,
+					decisionTimestamp,
+					nodeHealthData.readyTransitionTimestamp,
+					nc.podEvictionTimeout-gracePeriod,
+				)
+			}
+		}
+	case v1.ConditionTrue:
+		if nc.cancelPodEviction(node) {
+			klog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
+		}
+	}
+}
+
 // labelNodeDisruptionExclusion is a label on nodes that controls whether they are
 // excluded from being considered for disruption checks by the node controller.
 const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption"
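The unchanged trailing context above shows labelNodeDisruptionExclusion, the label that marks a node as excluded from the node controller's disruption checks. As a rough illustration only (the helper below is hypothetical, not part of this commit, and the real controller may apply further conditions), testing for such a label on a *v1.Node typically looks like:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// labelNodeDisruptionExclusion mirrors the constant shown in the diff's context.
const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption"

// isExcludedFromDisruptionChecks is a hypothetical helper used only to show how
// presence of the exclusion label would be tested; it is not from this commit.
func isExcludedFromDisruptionChecks(node *v1.Node) bool {
	_, exists := node.Labels[labelNodeDisruptionExclusion]
	return exists
}

func main() {
	node := &v1.Node{}
	node.Labels = map[string]string{labelNodeDisruptionExclusion: ""}
	fmt.Println(isExcludedFromDisruptionChecks(node)) // true
}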