Merge pull request #35235 from foxish/node-controller-no-force-deletion

Automatic merge from submit-queue

Node controller to not force delete pods

Fixes https://github.com/kubernetes/kubernetes/issues/35145

- [x] e2e tests to test Petset, RC, Job.
- [x] Remove and cover other locations where we force-delete pods within the NodeController.

**Release note**:

<!--  Steps to write your release note:
1. Use the release-note-* labels to set the release note state (if you have access) 
2. Enter your extended release note in the below block; leaving it blank means using the PR title as the release note. If no release note is required, just write `NONE`. 
-->

``` release-note
Node controller no longer force-deletes pods from the api-server.

* For StatefulSet (previously PetSet), this change means creation of replacement pods is blocked until old pods are definitely not running (indicated either by the kubelet returning from partitioned state, or deletion of the Node object, or deletion of the instance in the cloud provider, or force deletion of the pod from the api-server). This has the desirable outcome of "fencing" to prevent "split brain" scenarios.
* For all other existing controllers except StatefulSet , this has no effect on the ability of the controller to replace pods because the controllers do not reuse pod names (they use generate-name).
* User-written controllers that reuse names of pod objects should evaluate this change.
```
This commit is contained in:
Kubernetes Submit Queue
2016-11-01 20:08:57 -07:00
committed by GitHub
10 changed files with 621 additions and 511 deletions

View File

@@ -19,7 +19,6 @@ package node
import (
"fmt"
"strings"
"time"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
@@ -90,23 +89,9 @@ func forcefullyDeletePod(c clientset.Interface, pod *api.Pod) error {
return err
}
// forcefullyDeleteNode immediately deletes all pods on the node, and then
// deletes the node itself.
func forcefullyDeleteNode(kubeClient clientset.Interface, nodeName string, forcefulDeletePodFunc func(*api.Pod) error) error {
selector := fields.OneTermEqualSelector(api.PodHostField, nodeName)
options := api.ListOptions{FieldSelector: selector}
pods, err := kubeClient.Core().Pods(api.NamespaceAll).List(options)
if err != nil {
return fmt.Errorf("unable to list pods on node %q: %v", nodeName, err)
}
for _, pod := range pods.Items {
if pod.Spec.NodeName != nodeName {
continue
}
if err := forcefulDeletePodFunc(&pod); err != nil {
return fmt.Errorf("unable to delete pod %q on node %q: %v", pod.Name, nodeName, err)
}
}
// forcefullyDeleteNode immediately the node. The pods on the node are cleaned
// up by the podGC.
func forcefullyDeleteNode(kubeClient clientset.Interface, nodeName string) error {
if err := kubeClient.Core().Nodes().Delete(nodeName, nil); err != nil {
return fmt.Errorf("unable to delete node %q: %v", nodeName, err)
}
@@ -265,59 +250,3 @@ func recordNodeStatusChange(recorder record.EventRecorder, node *api.Node, new_s
// and event is recorded or neither should happen, see issue #6055.
recorder.Eventf(ref, api.EventTypeNormal, new_status, "Node %s status is now: %s", node.Name, new_status)
}
// terminatePods will ensure all pods on the given node that are in terminating state are eventually
// cleaned up. Returns true if the node has no pods in terminating state, a duration that indicates how
// long before we should check again (the next deadline for a pod to complete), or an error.
func terminatePods(kubeClient clientset.Interface, recorder record.EventRecorder, nodeName string, nodeUID string, since time.Time, maxGracePeriod time.Duration) (bool, time.Duration, error) {
// the time before we should try again
nextAttempt := time.Duration(0)
// have we deleted all pods
complete := true
selector := fields.OneTermEqualSelector(api.PodHostField, nodeName)
options := api.ListOptions{FieldSelector: selector}
pods, err := kubeClient.Core().Pods(api.NamespaceAll).List(options)
if err != nil {
return false, nextAttempt, err
}
now := time.Now()
elapsed := now.Sub(since)
for _, pod := range pods.Items {
// Defensive check, also needed for tests.
if pod.Spec.NodeName != nodeName {
continue
}
// only clean terminated pods
if pod.DeletionGracePeriodSeconds == nil {
continue
}
// the user's requested grace period
grace := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second
if grace > maxGracePeriod {
grace = maxGracePeriod
}
// the time remaining before the pod should have been deleted
remaining := grace - elapsed
if remaining < 0 {
remaining = 0
glog.V(2).Infof("Removing pod %v after %s grace period", pod.Name, grace)
recordNodeEvent(recorder, nodeName, nodeUID, api.EventTypeNormal, "TerminatingEvictedPod", fmt.Sprintf("Pod %s has exceeded the grace period for deletion after being evicted from Node %q and is being force killed", pod.Name, nodeName))
if err := kubeClient.Core().Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil {
glog.Errorf("Error completing deletion of pod %s: %v", pod.Name, err)
complete = false
}
} else {
glog.V(2).Infof("Pod %v still terminating, requested grace period %s, %s remaining", pod.Name, grace, remaining)
complete = false
}
if nextAttempt < remaining {
nextAttempt = remaining
}
}
return complete, nextAttempt, nil
}

View File

@@ -125,9 +125,8 @@ type NodeController struct {
// Lock to access evictor workers
evictorLock sync.Mutex
// workers that evicts pods from unresponsive nodes.
zonePodEvictor map[string]*RateLimitedTimedQueue
zoneTerminationEvictor map[string]*RateLimitedTimedQueue
podEvictionTimeout time.Duration
zonePodEvictor map[string]*RateLimitedTimedQueue
podEvictionTimeout time.Duration
// The maximum duration before a pod evicted from a node can be forcefully terminated.
maximumGracePeriod time.Duration
recorder record.EventRecorder
@@ -215,7 +214,6 @@ func NewNodeController(
podEvictionTimeout: podEvictionTimeout,
maximumGracePeriod: 5 * time.Minute,
zonePodEvictor: make(map[string]*RateLimitedTimedQueue),
zoneTerminationEvictor: make(map[string]*RateLimitedTimedQueue),
nodeStatusMap: make(map[string]nodeStatusData),
nodeMonitorGracePeriod: nodeMonitorGracePeriod,
nodeMonitorPeriod: nodeMonitorPeriod,
@@ -370,17 +368,8 @@ func (nc *NodeController) Run() {
}, nc.nodeMonitorPeriod, wait.NeverStop)
// Managing eviction of nodes:
// 1. when we delete pods off a node, if the node was not empty at the time we then
// queue a termination watcher
// a. If we hit an error, retry deletion
// 2. The terminator loop ensures that pods are eventually cleaned and we never
// terminate a pod in a time period less than nc.maximumGracePeriod. AddedAt
// is the time from which we measure "has this pod been terminating too long",
// after which we will delete the pod with grace period 0 (force delete).
// a. If we hit errors, retry instantly
// b. If there are no pods left terminating, exit
// c. If there are pods still terminating, wait for their estimated completion
// before retrying
// When we delete pods off a node, if the node was not empty at the time we then
// queue an eviction watcher. If we hit an error, retry deletion.
go wait.Until(func() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
@@ -405,42 +394,12 @@ func (nc *NodeController) Run() {
}
if remaining {
nc.zoneTerminationEvictor[k].Add(value.Value, value.UID)
glog.Infof("Pods awaiting deletion due to NodeController eviction")
}
return true, 0
})
}
}, nodeEvictionPeriod, wait.NeverStop)
// TODO: replace with a controller that ensures pods that are terminating complete
// in a particular time period
go wait.Until(func() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
for k := range nc.zoneTerminationEvictor {
nc.zoneTerminationEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
nodeUid, _ := value.UID.(string)
completed, remaining, err := terminatePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, value.AddedAt, nc.maximumGracePeriod)
if err != nil {
utilruntime.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
return false, 0
}
if completed {
glog.V(2).Infof("All pods terminated on %s", value.Value)
recordNodeEvent(nc.recorder, value.Value, nodeUid, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
return true, 0
}
glog.V(2).Infof("Pods terminating since %s on %q, estimated completion %s", value.AddedAt, value.Value, remaining)
// clamp very short intervals
if remaining < nodeEvictionPeriod {
remaining = nodeEvictionPeriod
}
return false, remaining
})
}
}, nodeEvictionPeriod, wait.NeverStop)
}()
}
@@ -470,10 +429,6 @@ func (nc *NodeController) monitorNodeStatus() error {
glog.Infof("Initializing eviction metric for zone: %v", zone)
EvictionsNumber.WithLabelValues(zone).Add(0)
}
if _, found := nc.zoneTerminationEvictor[zone]; !found {
nc.zoneTerminationEvictor[zone] = NewRateLimitedTimedQueue(
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
}
nc.cancelPodEviction(added[i])
}
@@ -557,7 +512,7 @@ func (nc *NodeController) monitorNodeStatus() error {
// Kubelet is not reporting and Cloud Provider says node
// is gone. Delete it without worrying about grace
// periods.
if err := forcefullyDeleteNode(nc.kubeClient, nodeName, nc.forcefullyDeletePod); err != nil {
if err := forcefullyDeleteNode(nc.kubeClient, nodeName); err != nil {
glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
}
}(node.Name)
@@ -618,7 +573,6 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*ap
// We stop all evictions.
for k := range nc.zonePodEvictor {
nc.zonePodEvictor[k].SwapLimiter(0)
nc.zoneTerminationEvictor[k].SwapLimiter(0)
}
for k := range nc.zoneStates {
nc.zoneStates[k] = stateFullDisruption
@@ -662,17 +616,12 @@ func (nc *NodeController) setLimiterInZone(zone string, zoneSize int, state zone
switch state {
case stateNormal:
nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
nc.zoneTerminationEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
case statePartialDisruption:
nc.zonePodEvictor[zone].SwapLimiter(
nc.enterPartialDisruptionFunc(zoneSize))
nc.zoneTerminationEvictor[zone].SwapLimiter(
nc.enterPartialDisruptionFunc(zoneSize))
case stateFullDisruption:
nc.zonePodEvictor[zone].SwapLimiter(
nc.enterFullDisruptionFunc(zoneSize))
nc.zoneTerminationEvictor[zone].SwapLimiter(
nc.enterFullDisruptionFunc(zoneSize))
}
}
@@ -871,8 +820,7 @@ func (nc *NodeController) cancelPodEviction(node *api.Node) bool {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
wasTerminating := nc.zoneTerminationEvictor[zone].Remove(node.Name)
if wasDeleting || wasTerminating {
if wasDeleting {
glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
return true
}

View File

@@ -519,15 +519,7 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) {
for _, zone := range zones {
nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
nodeUid, _ := value.UID.(string)
remaining, _ := deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetStore)
if remaining {
nodeController.zoneTerminationEvictor[zone].Add(value.Value, nodeUid)
}
return true, 0
})
nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
nodeUid, _ := value.UID.(string)
terminatePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, value.AddedAt, nodeController.maximumGracePeriod)
deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetStore)
return true, 0
})
}
@@ -1054,15 +1046,7 @@ func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) {
for _, zone := range zones {
nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
uid, _ := value.UID.(string)
remaining, _ := deletePods(fakeNodeHandler, nodeController.recorder, value.Value, uid, nodeController.daemonSetStore)
if remaining {
nodeController.zoneTerminationEvictor[zone].Add(value.Value, value.UID)
}
return true, 0
})
nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
uid, _ := value.UID.(string)
terminatePods(fakeNodeHandler, nodeController.recorder, value.Value, uid, value.AddedAt, nodeController.maximumGracePeriod)
deletePods(fakeNodeHandler, nodeController.recorder, value.Value, uid, nodeController.daemonSetStore)
return true, 0
})
}