Merge pull request #78170 from DaiHao/daemon

Ignore failed pods so that they do not block a DaemonSet rolling update.
Commit bbdd0557b6, authored by Kubernetes Prow Robot on 2019-07-23 21:36:14 -07:00, committed by GitHub.

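Context for the change: before this commit, manage() returned a synthetic error whenever failed daemon pods had been observed, purely so the work queue's rate limiter would throttle the kill-recreate loop. Since the sync aborts on any error from manage(), a single crash-looping pod on one node could keep the controller from ever reaching the rolling-update step. A minimal Go sketch of that control flow (simplified, hypothetical names; not the controller's exact code):

package main

import "fmt"

// syncSketch mimics the shape of the DaemonSet sync loop: manage() makes the
// per-node create/delete decisions, and only afterwards does the rolling
// update run. Any error from manage() aborts the sync early.
func syncSketch(manage, rollingUpdate func() error) error {
    if err := manage(); err != nil {
        return err // before this commit, "deleted N failed pods" landed here on every sync
    }
    return rollingUpdate() // unreachable while failed pods were reported as errors
}

func main() {
    manage := func() error { return fmt.Errorf("deleted 1 failed pods of DaemonSet default/ds") }
    if err := syncSketch(manage, func() error { return nil }); err != nil {
        fmt.Println("sync aborted, rollout blocked:", err)
    }
}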

@@ -862,13 +862,12 @@ func (dsc *DaemonSetsController) resolveControllerRef(namespace string, controll
 // podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node:
 //   - nodesNeedingDaemonPods: the pods that need to start on the node
 //   - podsToDelete: the pods that need to be deleted from the node
-//   - failedPodsObserved: the number of failed pods on the node
 //   - err: unexpected error
 func (dsc *DaemonSetsController) podsShouldBeOnNode(
     node *v1.Node,
     nodeToDaemonPods map[string][]*v1.Pod,
     ds *apps.DaemonSet,
-) (nodesNeedingDaemonPods, podsToDelete []string, failedPodsObserved int, err error) {
+) (nodesNeedingDaemonPods, podsToDelete []string, err error) {
     wantToRun, shouldSchedule, shouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(node, ds)
     if err != nil {
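Note on the signature change above: podsShouldBeOnNode uses Go named return values, so removing failedPodsObserved touches only the declaration and the return statements; the result slices are still accumulated in place. A tiny standalone illustration of the pattern (hypothetical logic, not the controller's):

// podsShouldBeOnNodeSketch mirrors the named-return style: results are
// appended to the named slices and handed back by the final return.
func podsShouldBeOnNodeSketch(podHealthy bool) (nodesNeedingDaemonPods, podsToDelete []string, err error) {
    if podHealthy {
        nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, "node-1")
    } else {
        podsToDelete = append(podsToDelete, "daemon-pod-on-node-1")
    }
    return nodesNeedingDaemonPods, podsToDelete, nil
}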
@@ -900,8 +899,6 @@
             continue
         }
         if pod.Status.Phase == v1.PodFailed {
-            failedPodsObserved++
-
             // This is a critical place where DS is often fighting with kubelet that rejects pods.
             // We need to avoid hot looping, so we back off.
             backoffKey := failedPodsBackoffKey(ds, node.Name)
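The backoff referenced here (failedPodsBackoff, keyed per DaemonSet and node via failedPodsBackoffKey) is what continues to guard against the kill-recreate hot loop once the synthetic error below is removed. A hedged sketch of how such a key can gate deletion of failed pods, assuming client-go's flowcontrol.Backoff; the key format and durations are illustrative, not the controller's exact values:

package main

import (
    "fmt"
    "time"

    "k8s.io/client-go/util/flowcontrol"
)

func main() {
    // Grow the delay from 1s toward a 15m cap for repeated failures of the
    // same DaemonSet/node pair.
    backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Minute)

    // Hypothetical stand-in for failedPodsBackoffKey(ds, node.Name).
    key := "default/my-daemonset/node-1"

    now := backoff.Clock.Now()
    if backoff.IsInBackOffSinceUpdate(key, now) {
        // Still inside the window: leave the failed pod alone and let a
        // delayed requeue retry later.
        fmt.Println("in backoff, retry in", backoff.Get(key))
        return
    }
    // Record this attempt so the next failure for this key waits longer.
    backoff.Next(key, now)
    fmt.Println("deleting failed pod so it can be recreated")
}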
@@ -945,7 +942,7 @@
         }
     }
-    return nodesNeedingDaemonPods, podsToDelete, failedPodsObserved, nil
+    return nodesNeedingDaemonPods, podsToDelete, nil
 }

 // manage manages the scheduling and running of Pods of ds on nodes.
@@ -962,9 +959,8 @@ func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node,
     // For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
     // pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
     var nodesNeedingDaemonPods, podsToDelete []string
-    var failedPodsObserved int
     for _, node := range nodeList {
-        nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, failedPodsObservedOnNode, err := dsc.podsShouldBeOnNode(
+        nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, err := dsc.podsShouldBeOnNode(
             node, nodeToDaemonPods, ds)
         if err != nil {
@@ -973,7 +969,6 @@
         nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...)
         podsToDelete = append(podsToDelete, podsToDeleteOnNode...)
-        failedPodsObserved += failedPodsObservedOnNode
     }

     // Remove unscheduled pods assigned to non-existent nodes when daemonset pods are scheduled by the scheduler.
@@ -987,11 +982,6 @@
         return err
     }

-    // Throw an error when the daemon pods fail, to use ratelimiter to prevent kill-recreate hot loop
-    if failedPodsObserved > 0 {
-        return fmt.Errorf("deleted %d failed pods of DaemonSet %s/%s", failedPodsObserved, ds.Namespace, ds.Name)
-    }
-
     return nil
 }
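Net effect: failed pods are still added to podsToDelete and cleaned up node by node, but manage() no longer converts their count into an error, so the sync proceeds into the rolling-update step and rate-limited requeueing is reserved for real failures. A simplified sketch of the caller-side consequence (hypothetical names; the real controller reacts to sync errors through its work queue):

package main

import "fmt"

// handleSyncResult mimics a work-queue controller's reaction to a sync result:
// an error triggers a rate-limited requeue, success resets the retry counter.
// After this commit, a node full of failed pods takes the success path, so the
// DaemonSet rollout keeps moving; recreate churn is bounded by failedPodsBackoff.
func handleSyncResult(err error, requeueRateLimited, forget func()) {
    if err != nil {
        requeueRateLimited()
        return
    }
    forget()
}

func main() {
    handleSyncResult(nil,
        func() { fmt.Println("requeue with rate limiter") },
        func() { fmt.Println("forget: sync succeeded, rollout continues") },
    )
}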