Merge pull request #35235 from foxish/node-controller-no-force-deletion

Automatic merge from submit-queue Node controller to not force delete pods Fixes https://github.com/kubernetes/kubernetes/issues/35145 - [x] e2e tests to test Petset, RC, Job. - [x] Remove and cover other locations where we force-delete pods within the NodeController. **Release note**:  ``` release-note Node controller no longer force-deletes pods from the api-server. * For StatefulSet (previously PetSet), this change means creation of replacement pods is blocked until old pods are definitely not running (indicated either by the kubelet returning from partitioned state, or deletion of the Node object, or deletion of the instance in the cloud provider, or force deletion of the pod from the api-server). This has the desirable outcome of "fencing" to prevent "split brain" scenarios. * For all other existing controllers except StatefulSet , this has no effect on the ability of the controller to replace pods because the controllers do not reuse pod names (they use generate-name). * User-written controllers that reuse names of pod objects should evaluate this change. ```
2026-01-13 11:25:19 +00:00 · 2016-11-01 20:08:57 -07:00
parent 7760c2f5fa 71941016c1
commit 49e7d640d9
10 changed files with 621 additions and 511 deletions
--- a/pkg/controller/node/controller_utils.go
+++ b/pkg/controller/node/controller_utils.go
@@ -19,7 +19,6 @@ package node
 import (
 	"fmt"
 	"strings"
-	"time"

 	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/client/cache"
@@ -90,23 +89,9 @@ func forcefullyDeletePod(c clientset.Interface, pod *api.Pod) error {
 	return err
 }

-// forcefullyDeleteNode immediately deletes all pods on the node, and then
-// deletes the node itself.
-func forcefullyDeleteNode(kubeClient clientset.Interface, nodeName string, forcefulDeletePodFunc func(*api.Pod) error) error {
-	selector := fields.OneTermEqualSelector(api.PodHostField, nodeName)
-	options := api.ListOptions{FieldSelector: selector}
-	pods, err := kubeClient.Core().Pods(api.NamespaceAll).List(options)
-	if err != nil {
-		return fmt.Errorf("unable to list pods on node %q: %v", nodeName, err)
-	}
-	for _, pod := range pods.Items {
-		if pod.Spec.NodeName != nodeName {
-			continue
-		}
-		if err := forcefulDeletePodFunc(&pod); err != nil {
-			return fmt.Errorf("unable to delete pod %q on node %q: %v", pod.Name, nodeName, err)
-		}
-	}
+// forcefullyDeleteNode immediately the node. The pods on the node are cleaned
+// up by the podGC.
+func forcefullyDeleteNode(kubeClient clientset.Interface, nodeName string) error {
 	if err := kubeClient.Core().Nodes().Delete(nodeName, nil); err != nil {
 		return fmt.Errorf("unable to delete node %q: %v", nodeName, err)
 	}
@@ -265,59 +250,3 @@ func recordNodeStatusChange(recorder record.EventRecorder, node *api.Node, new_s
 	// and event is recorded or neither should happen, see issue #6055.
 	recorder.Eventf(ref, api.EventTypeNormal, new_status, "Node %s status is now: %s", node.Name, new_status)
 }
-
-// terminatePods will ensure all pods on the given node that are in terminating state are eventually
-// cleaned up. Returns true if the node has no pods in terminating state, a duration that indicates how
-// long before we should check again (the next deadline for a pod to complete), or an error.
-func terminatePods(kubeClient clientset.Interface, recorder record.EventRecorder, nodeName string, nodeUID string, since time.Time, maxGracePeriod time.Duration) (bool, time.Duration, error) {
-	// the time before we should try again
-	nextAttempt := time.Duration(0)
-	// have we deleted all pods
-	complete := true
-
-	selector := fields.OneTermEqualSelector(api.PodHostField, nodeName)
-	options := api.ListOptions{FieldSelector: selector}
-	pods, err := kubeClient.Core().Pods(api.NamespaceAll).List(options)
-	if err != nil {
-		return false, nextAttempt, err
-	}
-
-	now := time.Now()
-	elapsed := now.Sub(since)
-	for _, pod := range pods.Items {
-		// Defensive check, also needed for tests.
-		if pod.Spec.NodeName != nodeName {
-			continue
-		}
-		// only clean terminated pods
-		if pod.DeletionGracePeriodSeconds == nil {
-			continue
-		}
-
-		// the user's requested grace period
-		grace := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second
-		if grace > maxGracePeriod {
-			grace = maxGracePeriod
-		}
-
-		// the time remaining before the pod should have been deleted
-		remaining := grace - elapsed
-		if remaining < 0 {
-			remaining = 0
-			glog.V(2).Infof("Removing pod %v after %s grace period", pod.Name, grace)
-			recordNodeEvent(recorder, nodeName, nodeUID, api.EventTypeNormal, "TerminatingEvictedPod", fmt.Sprintf("Pod %s has exceeded the grace period for deletion after being evicted from Node %q and is being force killed", pod.Name, nodeName))
-			if err := kubeClient.Core().Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil {
-				glog.Errorf("Error completing deletion of pod %s: %v", pod.Name, err)
-				complete = false
-			}
-		} else {
-			glog.V(2).Infof("Pod %v still terminating, requested grace period %s, %s remaining", pod.Name, grace, remaining)
-			complete = false
-		}
-
-		if nextAttempt < remaining {
-			nextAttempt = remaining
-		}
-	}
-	return complete, nextAttempt, nil
-}
--- a/pkg/controller/node/nodecontroller.go
+++ b/pkg/controller/node/nodecontroller.go
@@ -125,9 +125,8 @@ type NodeController struct {
 	// Lock to access evictor workers
 	evictorLock sync.Mutex
 	// workers that evicts pods from unresponsive nodes.
-	zonePodEvictor         map[string]*RateLimitedTimedQueue
-	zoneTerminationEvictor map[string]*RateLimitedTimedQueue
-	podEvictionTimeout     time.Duration
+	zonePodEvictor     map[string]*RateLimitedTimedQueue
+	podEvictionTimeout time.Duration
 	// The maximum duration before a pod evicted from a node can be forcefully terminated.
 	maximumGracePeriod time.Duration
 	recorder           record.EventRecorder
@@ -215,7 +214,6 @@ func NewNodeController(
 		podEvictionTimeout:          podEvictionTimeout,
 		maximumGracePeriod:          5 * time.Minute,
 		zonePodEvictor:              make(map[string]*RateLimitedTimedQueue),
-		zoneTerminationEvictor:      make(map[string]*RateLimitedTimedQueue),
 		nodeStatusMap:               make(map[string]nodeStatusData),
 		nodeMonitorGracePeriod:      nodeMonitorGracePeriod,
 		nodeMonitorPeriod:           nodeMonitorPeriod,
@@ -370,17 +368,8 @@ func (nc *NodeController) Run() {
 		}, nc.nodeMonitorPeriod, wait.NeverStop)

 		// Managing eviction of nodes:
-		// 1. when we delete pods off a node, if the node was not empty at the time we then
-		//    queue a termination watcher
-		//    a. If we hit an error, retry deletion
-		// 2. The terminator loop ensures that pods are eventually cleaned and we never
-		//    terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
-		//    is the time from which we measure "has this pod been terminating too long",
-		//    after which we will delete the pod with grace period 0 (force delete).
-		//    a. If we hit errors, retry instantly
-		//    b. If there are no pods left terminating, exit
-		//    c. If there are pods still terminating, wait for their estimated completion
-		//       before retrying
+		// When we delete pods off a node, if the node was not empty at the time we then
+		// queue an eviction watcher. If we hit an error, retry deletion.
 		go wait.Until(func() {
 			nc.evictorLock.Lock()
 			defer nc.evictorLock.Unlock()
@@ -405,42 +394,12 @@ func (nc *NodeController) Run() {
 					}

 					if remaining {
-						nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
+						glog.Infof("Pods awaiting deletion due to NodeController eviction")
 					}
 					return true, 0
 				})
 			}
 		}, nodeEvictionPeriod, wait.NeverStop)
-
-		// TODO: replace with a controller that ensures pods that are terminating complete
-		// in a particular time period
-		go wait.Until(func() {
-			nc.evictorLock.Lock()
-			defer nc.evictorLock.Unlock()
-			for k := range nc.zoneTerminationEvictor {
-				nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
-					nodeUid, _ := value.UID.(string)
-					completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
-					if err != nil {
-						utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
-						return false, 0
-					}
-
-					if completed {
-						glog.V(2).Infof("All pods terminated on %s", value.Value)
-						recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
-						return true, 0
-					}
-
-					glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
-					// clamp very short intervals
-					if remaining < nodeEvictionPeriod {
-						remaining = nodeEvictionPeriod
-					}
-					return false, remaining
-				})
-			}
-		}, nodeEvictionPeriod, wait.NeverStop)
 	}()
 }

@@ -470,10 +429,6 @@ func (nc *NodeController) monitorNodeStatus() error {
 			glog.Infof("Initializing eviction metric for zone: %v", zone)
 			EvictionsNumber.WithLabelValues(zone).Add(0)
 		}
-		if _, found := nc.zoneTerminationEvictor[zone]; !found {
-			nc.zoneTerminationEvictor[zone] = NewRateLimitedTimedQueue(
-				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
-		}
 		nc.cancelPodEviction(added[i])
 	}

@@ -557,7 +512,7 @@ func (nc *NodeController) monitorNodeStatus() error {
 						// Kubelet is not reporting and Cloud Provider says node
 						// is gone. Delete it without worrying about grace
 						// periods.
-						if err := forcefullyDeleteNode(nc.kubeClient, nodeName, nc.forcefullyDeletePod); err != nil {
+						if err := forcefullyDeleteNode(nc.kubeClient, nodeName); err != nil {
 							glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
 						}
 					}(node.Name)
@@ -618,7 +573,6 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*ap
 			// We stop all evictions.
 			for k := range nc.zonePodEvictor {
 				nc.zonePodEvictor[k].SwapLimiter(0)
-				nc.zoneTerminationEvictor[k].SwapLimiter(0)
 			}
 			for k := range nc.zoneStates {
 				nc.zoneStates[k] = stateFullDisruption
@@ -662,17 +616,12 @@ func (nc *NodeController) setLimiterInZone(zone string, zoneSize int, state zone
 	switch state {
 	case stateNormal:
 		nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
-		nc.zoneTerminationEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
 	case statePartialDisruption:
 		nc.zonePodEvictor[zone].SwapLimiter(
 			nc.enterPartialDisruptionFunc(zoneSize))
-		nc.zoneTerminationEvictor[zone].SwapLimiter(
-			nc.enterPartialDisruptionFunc(zoneSize))
 	case stateFullDisruption:
 		nc.zonePodEvictor[zone].SwapLimiter(
 			nc.enterFullDisruptionFunc(zoneSize))
-		nc.zoneTerminationEvictor[zone].SwapLimiter(
-			nc.enterFullDisruptionFunc(zoneSize))
 	}
 }

@@ -871,8 +820,7 @@ func (nc *NodeController) cancelPodEviction(node *api.Node) bool {
 	nc.evictorLock.Lock()
 	defer nc.evictorLock.Unlock()
 	wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
-	wasTerminating := nc.zoneTerminationEvictor[zone].Remove(node.Name)
-	if wasDeleting || wasTerminating {
+	if wasDeleting {
 		glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
 		return true
 	}
--- a/pkg/controller/node/nodecontroller_test.go
+++ b/pkg/controller/node/nodecontroller_test.go
@@ -519,15 +519,7 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) {
 		for _, zone := range zones {
 			nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
 				nodeUid, _ := value.UID.(string)
-				remaining, _ := deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetStore)
-				if remaining {
-					nodeController.zoneTerminationEvictor[zone].Add(value.Value, nodeUid)
-				}
-				return true, 0
-			})
-			nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
-				nodeUid, _ := value.UID.(string)
-				terminatePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, value.AddedAt, nodeController.maximumGracePeriod)
+				deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetStore)
 				return true, 0
 			})
 		}
@@ -1054,15 +1046,7 @@ func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) {
 		for _, zone := range zones {
 			nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
 				uid, _ := value.UID.(string)
-				remaining, _ := deletePods(fakeNodeHandler, nodeController.recorder, value.Value, uid, nodeController.daemonSetStore)
-				if remaining {
-					nodeController.zoneTerminationEvictor[zone].Add(value.Value, value.UID)
-				}
-				return true, 0
-			})
-			nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
-				uid, _ := value.UID.(string)
-				terminatePods(fakeNodeHandler, nodeController.recorder, value.Value, uid, value.AddedAt, nodeController.maximumGracePeriod)
+				deletePods(fakeNodeHandler, nodeController.recorder, value.Value, uid, nodeController.daemonSetStore)
 				return true, 0
 			})
 		}