ignore failed pods so a DaemonSet rolling update does not get stuck

daihao 2019-05-21 17:38:53 +08:00
parent 8211cabfb2
commit 166147d5c4
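
Why the removed error return could stall a rollout: in the controller's sync path, manage runs before the rolling-update step, so an error from manage requeues the DaemonSet and the update logic is never reached. On a node whose daemon pod keeps failing, every sync would abort at the same point and the rollout could not make progress. Below is a minimal, self-contained sketch of that control flow; the function names and signatures are illustrative stand-ins, not the controller's real ones.

package main

import "fmt"

// manage mimics the old behavior: after deleting failed daemon pods it
// returned an error so the workqueue rate limiter would space out retries.
func manage(failedPodsObserved int) error {
	if failedPodsObserved > 0 {
		return fmt.Errorf("deleted %d failed pods", failedPodsObserved)
	}
	return nil
}

// rollingUpdate stands in for the update step that runs only after manage.
func rollingUpdate() error {
	fmt.Println("rolling update proceeds")
	return nil
}

func sync(failedPodsObserved int) error {
	if err := manage(failedPodsObserved); err != nil {
		return err // the rolling update below is never reached
	}
	return rollingUpdate()
}

func main() {
	// A permanently failing daemon pod makes every sync abort early,
	// so the rollout appears stuck until the pod stops failing.
	if err := sync(1); err != nil {
		fmt.Println("sync aborted:", err)
	}
}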


@@ -862,13 +862,12 @@ func (dsc *DaemonSetsController) resolveControllerRef(namespace string, controll
 // podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node:
 // - nodesNeedingDaemonPods: the pods need to start on the node
 // - podsToDelete: the Pods need to be deleted on the node
-// - failedPodsObserved: the number of failed pods on node
 // - err: unexpected error
 func (dsc *DaemonSetsController) podsShouldBeOnNode(
 	node *v1.Node,
 	nodeToDaemonPods map[string][]*v1.Pod,
 	ds *apps.DaemonSet,
-) (nodesNeedingDaemonPods, podsToDelete []string, failedPodsObserved int, err error) {
+) (nodesNeedingDaemonPods, podsToDelete []string, err error) {
 	wantToRun, shouldSchedule, shouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(node, ds)
 	if err != nil {
@@ -896,8 +895,6 @@ func (dsc *DaemonSetsController) podsShouldBeOnNode(
 			continue
 		}
 		if pod.Status.Phase == v1.PodFailed {
-			failedPodsObserved++
-
 			// This is a critical place where DS is often fighting with kubelet that rejects pods.
 			// We need to avoid hot looping and backoff.
 			backoffKey := failedPodsBackoffKey(ds, node.Name)
@@ -938,7 +935,7 @@ func (dsc *DaemonSetsController) podsShouldBeOnNode(
 		}
 	}
 
-	return nodesNeedingDaemonPods, podsToDelete, failedPodsObserved, nil
+	return nodesNeedingDaemonPods, podsToDelete, nil
 }
 
 // manage manages the scheduling and running of Pods of ds on nodes.
@@ -955,9 +952,8 @@ func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node,
 	// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
 	// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
 	var nodesNeedingDaemonPods, podsToDelete []string
-	var failedPodsObserved int
 	for _, node := range nodeList {
-		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, failedPodsObservedOnNode, err := dsc.podsShouldBeOnNode(
+		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, err := dsc.podsShouldBeOnNode(
 			node, nodeToDaemonPods, ds)
 		if err != nil {
@@ -966,7 +962,6 @@ func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node,
 		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...)
 		podsToDelete = append(podsToDelete, podsToDeleteOnNode...)
-		failedPodsObserved += failedPodsObservedOnNode
 	}
 
 	// Remove unscheduled pods assigned to not existing nodes when daemonset pods are scheduled by scheduler.
@@ -980,11 +975,6 @@ func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node,
 		return err
 	}
 
-	// Throw an error when the daemon pods fail, to use ratelimiter to prevent kill-recreate hot loop
-	if failedPodsObserved > 0 {
-		return fmt.Errorf("deleted %d failed pods of DaemonSet %s/%s", failedPodsObserved, ds.Namespace, ds.Name)
-	}
-
 	return nil
 }
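
Dropping the error does not reintroduce the kill-recreate hot loop: the per-node backoff kept in podsShouldBeOnNode (failedPodsBackoffKey together with dsc.failedPodsBackoff) still delays repeated deletions of failed pods. Here is a standalone sketch of that mechanism using client-go's flowcontrol.Backoff; the key string and durations below are illustrative, not necessarily the controller's exact values.

package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	// Exponential per-key backoff, similar to what the daemon controller
	// keeps for failed pods (durations are illustrative).
	backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Minute)
	key := "default/my-ds/node-1" // hypothetical per-DaemonSet/per-node key

	for i := 0; i < 3; i++ {
		now := backoff.Clock.Now()
		if backoff.IsInBackOffSinceUpdate(key, now) {
			// Still inside the backoff window: skip the delete this time.
			fmt.Println("in backoff, requeue after", backoff.Get(key))
			continue
		}
		backoff.Next(key, now) // start or extend the backoff window for this key
		fmt.Println("delete failed pod and let it be recreated")
	}
}

In the real controller, a pod still inside its backoff window causes the DaemonSet to be requeued after the remaining delay rather than deleted again, which is what the retained "avoid hot looping" comment in the diff refers to.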