From e5c812bbe746eeaac453eb753bf227e754df8159 Mon Sep 17 00:00:00 2001 From: kerthcet Date: Wed, 14 Sep 2022 17:04:08 +0800 Subject: [PATCH 1/4] Remove CLI flag enable-taint-manager Signed-off-by: kerthcet --- api/api-rules/violation_exceptions.list | 1 - cmd/kube-controller-manager/app/core.go | 1 - .../app/options/nodelifecyclecontroller.go | 3 - .../app/options/options_test.go | 4 - pkg/controller/nodelifecycle/config/types.go | 3 - .../nodelifecycle/config/v1alpha1/defaults.go | 4 - .../v1alpha1/zz_generated.conversion.go | 6 - .../node_lifecycle_controller.go | 359 ++---------------- .../node_lifecycle_controller_test.go | 46 +-- pkg/generated/openapi/zz_generated.openapi.go | 9 +- .../config/v1alpha1/types.go | 3 - .../config/v1alpha1/zz_generated.deepcopy.go | 7 +- test/integration/node/lifecycle_test.go | 2 - .../integration/scheduler/taint/taint_test.go | 1 - 14 files changed, 61 insertions(+), 388 deletions(-) diff --git a/api/api-rules/violation_exceptions.list b/api/api-rules/violation_exceptions.list index 50fb2b584da..ee0a9657704 100644 --- a/api/api-rules/violation_exceptions.list +++ b/api/api-rules/violation_exceptions.list @@ -537,7 +537,6 @@ API rule violation: names_match,k8s.io/kube-controller-manager/config/v1alpha1,N API rule violation: names_match,k8s.io/kube-controller-manager/config/v1alpha1,NodeIPAMControllerConfiguration,NodeCIDRMaskSizeIPv6 API rule violation: names_match,k8s.io/kube-controller-manager/config/v1alpha1,NodeIPAMControllerConfiguration,SecondaryServiceCIDR API rule violation: names_match,k8s.io/kube-controller-manager/config/v1alpha1,NodeIPAMControllerConfiguration,ServiceCIDR -API rule violation: names_match,k8s.io/kube-controller-manager/config/v1alpha1,NodeLifecycleControllerConfiguration,EnableTaintManager API rule violation: names_match,k8s.io/kube-controller-manager/config/v1alpha1,NodeLifecycleControllerConfiguration,LargeClusterSizeThreshold API rule violation: names_match,k8s.io/kube-controller-manager/config/v1alpha1,NodeLifecycleControllerConfiguration,NodeEvictionRate API rule violation: names_match,k8s.io/kube-controller-manager/config/v1alpha1,NodeLifecycleControllerConfiguration,NodeMonitorGracePeriod diff --git a/cmd/kube-controller-manager/app/core.go b/cmd/kube-controller-manager/app/core.go index e665b3f8125..72e02b7d33d 100644 --- a/cmd/kube-controller-manager/app/core.go +++ b/cmd/kube-controller-manager/app/core.go @@ -191,7 +191,6 @@ func startNodeLifecycleController(ctx context.Context, controllerContext Control controllerContext.ComponentConfig.NodeLifecycleController.SecondaryNodeEvictionRate, controllerContext.ComponentConfig.NodeLifecycleController.LargeClusterSizeThreshold, controllerContext.ComponentConfig.NodeLifecycleController.UnhealthyZoneThreshold, - controllerContext.ComponentConfig.NodeLifecycleController.EnableTaintManager, ) if err != nil { return nil, true, err diff --git a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go index 916e9aa96f6..0952bdd49b6 100644 --- a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go +++ b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go @@ -44,8 +44,6 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) { fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of 
healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.") fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, "Number of nodes from which NodeController treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller.") fs.Float32Var(&o.UnhealthyZoneThreshold, "unhealthy-zone-threshold", 0.55, "Fraction of Nodes in a zone which needs to be not Ready (minimum 3) for zone to be treated as unhealthy. ") - fs.BoolVar(&o.EnableTaintManager, "enable-taint-manager", o.EnableTaintManager, "If set to true enables NoExecute Taints and will evict all not-tolerating Pod running on Nodes tainted with this kind of Taints.") - fs.MarkDeprecated("enable-taint-manager", "This flag is deprecated and it will be removed in 1.27. The taint-manager is enabled by default and will remain implicitly enabled once this flag is removed.") fs.MarkDeprecated("pod-eviction-timeout", "This flag is deprecated and it will be removed in 1.27. Once taint manager is enabled, this flag has no effect.") } @@ -55,7 +53,6 @@ func (o *NodeLifecycleControllerOptions) ApplyTo(cfg *nodelifecycleconfig.NodeLi return nil } - cfg.EnableTaintManager = o.EnableTaintManager cfg.NodeStartupGracePeriod = o.NodeStartupGracePeriod cfg.NodeMonitorGracePeriod = o.NodeMonitorGracePeriod cfg.PodEvictionTimeout = o.PodEvictionTimeout diff --git a/cmd/kube-controller-manager/app/options/options_test.go b/cmd/kube-controller-manager/app/options/options_test.go index 6220d226837..44d3c073ab2 100644 --- a/cmd/kube-controller-manager/app/options/options_test.go +++ b/cmd/kube-controller-manager/app/options/options_test.go @@ -107,7 +107,6 @@ var args = []string{ "--enable-dynamic-provisioning=false", "--enable-garbage-collector=false", "--enable-hostpath-provisioner=true", - "--enable-taint-manager=false", "--cluster-signing-duration=10h", "--flex-volume-plugin-dir=/flex-volume-plugin", "--volume-host-cidr-denylist=127.0.0.1/28,feed::/16", @@ -344,7 +343,6 @@ func TestAddFlags(t *testing.T) { }, NodeLifecycleController: &NodeLifecycleControllerOptions{ &nodelifecycleconfig.NodeLifecycleControllerConfiguration{ - EnableTaintManager: false, NodeEvictionRate: 0.2, SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, @@ -589,7 +587,6 @@ func TestApplyTo(t *testing.T) { NodeCIDRMaskSizeIPv6: 108, }, NodeLifecycleController: nodelifecycleconfig.NodeLifecycleControllerConfiguration{ - EnableTaintManager: false, NodeEvictionRate: 0.2, SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, @@ -1164,7 +1161,6 @@ func TestValidateControllersOptions(t *testing.T) { expectErrors: false, validate: (&NodeLifecycleControllerOptions{ &nodelifecycleconfig.NodeLifecycleControllerConfiguration{ - EnableTaintManager: false, NodeEvictionRate: 0.2, SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, diff --git a/pkg/controller/nodelifecycle/config/types.go b/pkg/controller/nodelifecycle/config/types.go index b6c856f2332..dfba420f7cb 100644 --- a/pkg/controller/nodelifecycle/config/types.go +++ b/pkg/controller/nodelifecycle/config/types.go @@ -22,9 +22,6 @@ import ( // NodeLifecycleControllerConfiguration contains elements describing NodeLifecycleController. 
type NodeLifecycleControllerConfiguration struct { - // If set to true enables NoExecute Taints and will evict all not-tolerating - // Pod running on Nodes tainted with this kind of Taints. - EnableTaintManager bool // nodeEvictionRate is the number of nodes per second on which pods are deleted in case of node failure when a zone is healthy NodeEvictionRate float32 // secondaryNodeEvictionRate is the number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy diff --git a/pkg/controller/nodelifecycle/config/v1alpha1/defaults.go b/pkg/controller/nodelifecycle/config/v1alpha1/defaults.go index 6631aad1b94..bf80ce31e07 100644 --- a/pkg/controller/nodelifecycle/config/v1alpha1/defaults.go +++ b/pkg/controller/nodelifecycle/config/v1alpha1/defaults.go @@ -21,7 +21,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubectrlmgrconfigv1alpha1 "k8s.io/kube-controller-manager/config/v1alpha1" - "k8s.io/utils/pointer" ) // RecommendedDefaultNodeLifecycleControllerConfiguration defaults a pointer to a @@ -44,7 +43,4 @@ func RecommendedDefaultNodeLifecycleControllerConfiguration(obj *kubectrlmgrconf if obj.NodeStartupGracePeriod == zero { obj.NodeStartupGracePeriod = metav1.Duration{Duration: 60 * time.Second} } - if obj.EnableTaintManager == nil { - obj.EnableTaintManager = pointer.Bool(true) - } } diff --git a/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go b/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go index 30e20e36353..912f8d20e74 100644 --- a/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go +++ b/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go @@ -82,9 +82,6 @@ func Convert_v1_GroupResource_To_v1alpha1_GroupResource(in *v1.GroupResource, ou } func autoConvert_v1alpha1_NodeLifecycleControllerConfiguration_To_config_NodeLifecycleControllerConfiguration(in *v1alpha1.NodeLifecycleControllerConfiguration, out *config.NodeLifecycleControllerConfiguration, s conversion.Scope) error { - if err := v1.Convert_Pointer_bool_To_bool(&in.EnableTaintManager, &out.EnableTaintManager, s); err != nil { - return err - } out.NodeEvictionRate = in.NodeEvictionRate out.SecondaryNodeEvictionRate = in.SecondaryNodeEvictionRate out.NodeStartupGracePeriod = in.NodeStartupGracePeriod @@ -96,9 +93,6 @@ func autoConvert_v1alpha1_NodeLifecycleControllerConfiguration_To_config_NodeLif } func autoConvert_config_NodeLifecycleControllerConfiguration_To_v1alpha1_NodeLifecycleControllerConfiguration(in *config.NodeLifecycleControllerConfiguration, out *v1alpha1.NodeLifecycleControllerConfiguration, s conversion.Scope) error { - if err := v1.Convert_bool_To_Pointer_bool(&in.EnableTaintManager, &out.EnableTaintManager, s); err != nil { - return err - } out.NodeEvictionRate = in.NodeEvictionRate out.SecondaryNodeEvictionRate = in.SecondaryNodeEvictionRate out.NodeStartupGracePeriod = in.NodeStartupGracePeriod diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go index f32e6b1255e..a3c0e3d1658 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go @@ -205,57 +205,6 @@ type podUpdateItem struct { name string } -type evictionStatus int - -const ( - unmarked = iota - toBeEvicted - evicted -) - -// nodeEvictionMap stores evictionStatus data for each node. 
-type nodeEvictionMap struct { - lock sync.Mutex - nodeEvictions map[string]evictionStatus -} - -func newNodeEvictionMap() *nodeEvictionMap { - return &nodeEvictionMap{ - nodeEvictions: make(map[string]evictionStatus), - } -} - -func (n *nodeEvictionMap) registerNode(nodeName string) { - n.lock.Lock() - defer n.lock.Unlock() - n.nodeEvictions[nodeName] = unmarked -} - -func (n *nodeEvictionMap) unregisterNode(nodeName string) { - n.lock.Lock() - defer n.lock.Unlock() - delete(n.nodeEvictions, nodeName) -} - -func (n *nodeEvictionMap) setStatus(nodeName string, status evictionStatus) bool { - n.lock.Lock() - defer n.lock.Unlock() - if _, exists := n.nodeEvictions[nodeName]; !exists { - return false - } - n.nodeEvictions[nodeName] = status - return true -} - -func (n *nodeEvictionMap) getStatus(nodeName string) (evictionStatus, bool) { - n.lock.Lock() - defer n.lock.Unlock() - if _, exists := n.nodeEvictions[nodeName]; !exists { - return unmarked, false - } - return n.nodeEvictions[nodeName], true -} - // Controller is the controller that manages node's life cycle. type Controller struct { taintManager *scheduler.NoExecuteTaintManager @@ -277,10 +226,7 @@ type Controller struct { nodeHealthMap *nodeHealthMap // evictorLock protects zonePodEvictor and zoneNoExecuteTainter. - evictorLock sync.Mutex - nodeEvictionMap *nodeEvictionMap - // workers that evicts pods from unresponsive nodes. - zonePodEvictor map[string]*scheduler.RateLimitedTimedQueue + evictorLock sync.Mutex // workers that are responsible for tainting nodes. zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue @@ -342,10 +288,6 @@ type Controller struct { largeClusterThreshold int32 unhealthyZoneThreshold float32 - // if set to true Controller will start TaintManager that will evict Pods from - // tainted nodes, if they're not tolerated. 
- runTaintManager bool - nodeUpdateQueue workqueue.Interface podUpdateQueue workqueue.RateLimitingInterface } @@ -366,7 +308,6 @@ func NewNodeLifecycleController( secondaryEvictionLimiterQPS float32, largeClusterThreshold int32, unhealthyZoneThreshold float32, - runTaintManager bool, ) (*Controller, error) { logger := klog.LoggerWithName(klog.FromContext(ctx), "NodeLifecycleController") if kubeClient == nil { @@ -382,14 +323,12 @@ func NewNodeLifecycleController( now: metav1.Now, knownNodeSet: make(map[string]*v1.Node), nodeHealthMap: newNodeHealthMap(), - nodeEvictionMap: newNodeEvictionMap(), broadcaster: eventBroadcaster, recorder: recorder, nodeMonitorPeriod: nodeMonitorPeriod, nodeStartupGracePeriod: nodeStartupGracePeriod, nodeMonitorGracePeriod: nodeMonitorGracePeriod, nodeUpdateWorkerSize: scheduler.UpdateWorkerSize, - zonePodEvictor: make(map[string]*scheduler.RateLimitedTimedQueue), zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue), nodesToRetry: sync.Map{}, zoneStates: make(map[string]ZoneState), @@ -398,7 +337,6 @@ func NewNodeLifecycleController( secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS, largeClusterThreshold: largeClusterThreshold, unhealthyZoneThreshold: unhealthyZoneThreshold, - runTaintManager: runTaintManager, nodeUpdateQueue: workqueue.NewNamed("node_lifecycle_controller"), podUpdateQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node_lifecycle_controller_pods"), } @@ -477,29 +415,26 @@ func NewNodeLifecycleController( nc.podLister = podInformer.Lister() nc.nodeLister = nodeInformer.Lister() - if nc.runTaintManager { - nc.taintManager = scheduler.NewNoExecuteTaintManager(ctx, kubeClient, nc.podLister, nc.nodeLister, nc.getPodsAssignedToNode) - nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { - nc.taintManager.NodeUpdated(nil, node) - return nil - }), - UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(oldNode, newNode *v1.Node) error { - nc.taintManager.NodeUpdated(oldNode, newNode) - return nil - }), - DeleteFunc: controllerutil.CreateDeleteNodeHandler(func(node *v1.Node) error { - nc.taintManager.NodeUpdated(node, nil) - return nil - }), - }) - } + nc.taintManager = scheduler.NewNoExecuteTaintManager(ctx, kubeClient, nc.podLister, nc.nodeLister, nc.getPodsAssignedToNode) + nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { + nc.taintManager.NodeUpdated(nil, node) + return nil + }), + UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(oldNode, newNode *v1.Node) error { + nc.taintManager.NodeUpdated(oldNode, newNode) + return nil + }), + DeleteFunc: controllerutil.CreateDeleteNodeHandler(func(node *v1.Node) error { + nc.taintManager.NodeUpdated(node, nil) + return nil + }), + }) logger.Info("Controller will reconcile labels") nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { nc.nodeUpdateQueue.Add(node.Name) - nc.nodeEvictionMap.registerNode(node.Name) return nil }), UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error { @@ -508,7 +443,6 @@ func NewNodeLifecycleController( }), DeleteFunc: controllerutil.CreateDeleteNodeHandler(func(node *v1.Node) error { nc.nodesToRetry.Delete(node.Name) - nc.nodeEvictionMap.unregisterNode(node.Name) return nil }), }) @@ -549,9 +483,7 
@@ func (nc *Controller) Run(ctx context.Context) { return } - if nc.runTaintManager { - go nc.taintManager.Run(ctx) - } + go nc.taintManager.Run(ctx) // Start workers to reconcile labels and/or update NoSchedule taint for nodes. for i := 0; i < scheduler.UpdateWorkerSize; i++ { @@ -566,16 +498,9 @@ func (nc *Controller) Run(ctx context.Context) { go wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second) } - if nc.runTaintManager { - // Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated - // taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints. - go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod) - } else { - // Managing eviction of nodes: - // When we delete pods off a node, if the node was not empty at the time we then - // queue an eviction watcher. If we hit an error, retry deletion. - go wait.UntilWithContext(ctx, nc.doEvictionPass, scheduler.NodeEvictionPeriod) - } + // Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated + // taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints. + go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod) // Incorporate the results of node health signal pushed from kubelet to master. go wait.UntilWithContext(ctx, func(ctx context.Context) { @@ -732,73 +657,6 @@ func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) { } } -func (nc *Controller) doEvictionPass(ctx context.Context) { - // Extract out the keys of the map in order to not hold - // the evictorLock for the entire function and hold it - // only when nescessary. - var zonePodEvictorKeys []string - func() { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - - zonePodEvictorKeys = make([]string, 0, len(nc.zonePodEvictor)) - for k := range nc.zonePodEvictor { - zonePodEvictorKeys = append(zonePodEvictorKeys, k) - } - }() - logger := klog.FromContext(ctx) - for _, k := range zonePodEvictorKeys { - var zonePodEvictionWorker *scheduler.RateLimitedTimedQueue - func() { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - // Extracting the value without checking if the key - // exists or not is safe to do here since zones do - // not get removed, and consequently pod evictors for - // these zones also do not get removed, only added. - zonePodEvictionWorker = nc.zonePodEvictor[k] - }() - - // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded). - zonePodEvictionWorker.Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { - node, err := nc.nodeLister.Get(value.Value) - if apierrors.IsNotFound(err) { - logger.Info("Node no longer present in nodeLister", "node", klog.KRef("", value.Value)) - } else if err != nil { - logger.Info("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err) - } - nodeUID, _ := value.UID.(string) - pods, err := nc.getPodsAssignedToNode(value.Value) - if err != nil { - utilruntime.HandleError(fmt.Errorf("unable to list pods from node %q: %v", value.Value, err)) - return false, 0 - } - remaining, err := controllerutil.DeletePods(ctx, nc.kubeClient, pods, nc.recorder, value.Value, nodeUID, nc.daemonSetStore) - if err != nil { - // We are not setting eviction status here. - // New pods will be handled by zonePodEvictor retry - // instead of immediate pod eviction. 
- utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err)) - return false, 0 - } - if !nc.nodeEvictionMap.setStatus(value.Value, evicted) { - logger.V(2).Info("Node was unregistered in the meantime - skipping setting status", "node", klog.KRef("", value.Value)) - } - if remaining { - logger.Info("Pods awaiting deletion due to Controller eviction") - } - - if node != nil { - zone := nodetopology.GetZoneKey(node) - evictionsNumber.WithLabelValues(zone).Inc() - evictionsTotal.WithLabelValues(zone).Inc() - } - - return true, 0 - }) - } -} - // monitorNodeHealth verifies node health are constantly updated by kubelet, and // if not, post "NodeReady==ConditionUnknown". // This function will taint nodes who are not ready or not reachable for a long period of time. @@ -824,11 +682,7 @@ func (nc *Controller) monitorNodeHealth(ctx context.Context) error { controllerutil.RecordNodeEvent(nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name)) nc.knownNodeSet[added[i].Name] = added[i] nc.addPodEvictorForNewZone(logger, added[i]) - if nc.runTaintManager { - nc.markNodeAsReachable(ctx, added[i]) - } else { - nc.cancelPodEviction(logger, added[i]) - } + nc.markNodeAsReachable(ctx, added[i]) } for i := range deleted { @@ -845,14 +699,13 @@ func (nc *Controller) monitorNodeHealth(ctx context.Context) error { updateNodeHealthDuration.Observe(time.Since(start.Time).Seconds()) }() - var gracePeriod time.Duration var observedReadyCondition v1.NodeCondition var currentReadyCondition *v1.NodeCondition node := nodes[piece].DeepCopy() if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) { var err error - gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node) + _, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node) if err == nil { return true, nil } @@ -887,13 +740,7 @@ func (nc *Controller) monitorNodeHealth(ctx context.Context) error { } return } - if nc.runTaintManager { - nc.processTaintBaseEviction(ctx, node, &observedReadyCondition) - } else { - if err := nc.processNoTaintBaseEviction(ctx, node, &observedReadyCondition, gracePeriod, pods); err != nil { - utilruntime.HandleError(fmt.Errorf("unable to evict all pods from node %v: %v; queuing for retry", node.Name, err)) - } - } + nc.processTaintBaseEviction(ctx, node, &observedReadyCondition) _, needsRetry := nc.nodesToRetry.Load(node.Name) switch { @@ -960,53 +807,6 @@ func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Nod } } -func (nc *Controller) processNoTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition, gracePeriod time.Duration, pods []*v1.Pod) error { - decisionTimestamp := nc.now() - nodeHealthData := nc.nodeHealthMap.getDeepCopy(node.Name) - if nodeHealthData == nil { - return fmt.Errorf("health data doesn't exist for node %q", node.Name) - } - // Check eviction timeout against decisionTimestamp - logger := klog.FromContext(ctx) - switch observedReadyCondition.Status { - case v1.ConditionFalse: - if decisionTimestamp.After(nodeHealthData.readyTransitionTimestamp.Add(nc.podEvictionTimeout)) { - enqueued, err := nc.evictPods(ctx, node, pods) - if err != nil { - return err - } - if enqueued { - logger.V(2).Info("Node is NotReady. 
Adding Pods on Node to eviction queue: decisionTimestamp is later than readyTransitionTimestamp + podEvictionTimeout", - "node", klog.KObj(node), - "decisionTimestamp", decisionTimestamp, - "readyTransitionTimestamp", nodeHealthData.readyTransitionTimestamp, - "podEvictionTimeout", nc.podEvictionTimeout, - ) - } - } - case v1.ConditionUnknown: - if decisionTimestamp.After(nodeHealthData.probeTimestamp.Add(nc.podEvictionTimeout)) { - enqueued, err := nc.evictPods(ctx, node, pods) - if err != nil { - return err - } - if enqueued { - logger.V(2).Info("Node is unresponsive. Adding Pods on Node to eviction queues: decisionTimestamp is later than readyTransitionTimestamp + podEvictionTimeout-gracePeriod", - "node", klog.KObj(node), - "decisionTimestamp", decisionTimestamp, - "readyTransitionTimestamp", nodeHealthData.readyTransitionTimestamp, - "podEvictionTimeoutGracePeriod", nc.podEvictionTimeout-gracePeriod, - ) - } - } - case v1.ConditionTrue: - if nc.cancelPodEviction(logger, node) { - logger.V(2).Info("Node is ready again, cancelled pod eviction", "node", klog.KObj(node)) - } - } - return nil -} - // labelNodeDisruptionExclusion is a label on nodes that controls whether they are // excluded from being considered for disruption checks by the node controller. const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption" @@ -1230,22 +1030,14 @@ func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions if allAreFullyDisrupted { logger.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode") for i := range nodes { - if nc.runTaintManager { - _, err := nc.markNodeAsReachable(ctx, nodes[i]) - if err != nil { - logger.Error(nil, "Failed to remove taints from Node", "node", klog.KObj(nodes[i])) - } - } else { - nc.cancelPodEviction(logger, nodes[i]) + _, err := nc.markNodeAsReachable(ctx, nodes[i]) + if err != nil { + logger.Error(nil, "Failed to remove taints from Node", "node", klog.KObj(nodes[i])) } } // We stop all evictions. for k := range nc.zoneStates { - if nc.runTaintManager { - nc.zoneNoExecuteTainter[k].SwapLimiter(0) - } else { - nc.zonePodEvictor[k].SwapLimiter(0) - } + nc.zoneNoExecuteTainter[k].SwapLimiter(0) } for k := range nc.zoneStates { nc.zoneStates[k] = stateFullDisruption @@ -1312,7 +1104,7 @@ func (nc *Controller) doPodProcessingWorker(ctx context.Context) { // processPod is processing events of assigning pods to nodes. In particular: // 1. for NodeReady=true node, taint eviction for this pod will be cancelled // 2. for NodeReady=false or unknown node, taint eviction of pod will happen and pod will be marked as not ready -// 3. if node doesn't exist in cache, it will be skipped and handled later by doEvictionPass +// 3. if node doesn't exist in cache, it will be skipped. func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { defer nc.podUpdateQueue.Done(podItem) pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name) @@ -1331,12 +1123,11 @@ func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName) if nodeHealth == nil { - // Node data is not gathered yet or node has beed removed in the meantime. - // Pod will be handled by doEvictionPass method. + // Node data is not gathered yet or node has been removed in the meantime. 
return } - node, err := nc.nodeLister.Get(nodeName) + _, err = nc.nodeLister.Get(nodeName) if err != nil { logger.Info("Failed to read node", "node", klog.KRef("", nodeName), "err", err) nc.podUpdateQueue.AddRateLimited(podItem) @@ -1352,16 +1143,6 @@ func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { } pods := []*v1.Pod{pod} - // In taint-based eviction mode, only node updates are processed by NodeLifecycleController. - // Pods are processed by TaintManager. - if !nc.runTaintManager { - if err := nc.processNoTaintBaseEviction(ctx, node, currentReadyCondition, nc.nodeMonitorGracePeriod, pods); err != nil { - logger.Info("Unable to process pod eviction from node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err) - nc.podUpdateQueue.AddRateLimited(podItem) - return - } - } - if currentReadyCondition.Status != v1.ConditionTrue { if err := controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, nodeName); err != nil { logger.Info("Unable to mark pod NotReady on node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err) @@ -1373,27 +1154,13 @@ func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) { switch state { case stateNormal: - if nc.runTaintManager { - nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS) - } else { - nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS) - } + nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS) case statePartialDisruption: - if nc.runTaintManager { - nc.zoneNoExecuteTainter[zone].SwapLimiter( - nc.enterPartialDisruptionFunc(zoneSize)) - } else { - nc.zonePodEvictor[zone].SwapLimiter( - nc.enterPartialDisruptionFunc(zoneSize)) - } + nc.zoneNoExecuteTainter[zone].SwapLimiter( + nc.enterPartialDisruptionFunc(zoneSize)) case stateFullDisruption: - if nc.runTaintManager { - nc.zoneNoExecuteTainter[zone].SwapLimiter( - nc.enterFullDisruptionFunc(zoneSize)) - } else { - nc.zonePodEvictor[zone].SwapLimiter( - nc.enterFullDisruptionFunc(zoneSize)) - } + nc.zoneNoExecuteTainter[zone].SwapLimiter( + nc.enterFullDisruptionFunc(zoneSize)) } } @@ -1453,15 +1220,9 @@ func (nc *Controller) addPodEvictorForNewZone(logger klog.Logger, node *v1.Node) zone := nodetopology.GetZoneKey(node) if _, found := nc.zoneStates[zone]; !found { nc.zoneStates[zone] = stateInitial - if !nc.runTaintManager { - nc.zonePodEvictor[zone] = - scheduler.NewRateLimitedTimedQueue( - flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst)) - } else { - nc.zoneNoExecuteTainter[zone] = - scheduler.NewRateLimitedTimedQueue( - flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst)) - } + nc.zoneNoExecuteTainter[zone] = + scheduler.NewRateLimitedTimedQueue( + flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst)) // Init the metric for the new zone. logger.Info("Initializing eviction metric for zone", "zone", zone) evictionsNumber.WithLabelValues(zone).Add(0) @@ -1469,50 +1230,6 @@ func (nc *Controller) addPodEvictorForNewZone(logger klog.Logger, node *v1.Node) } } -// cancelPodEviction removes any queued evictions, typically because the node is available again. It -// returns true if an eviction was queued. 
-func (nc *Controller) cancelPodEviction(logger klog.Logger, node *v1.Node) bool { - zone := nodetopology.GetZoneKey(node) - if !nc.nodeEvictionMap.setStatus(node.Name, unmarked) { - logger.V(2).Info("Node was unregistered in the meantime - skipping setting status", "node", klog.KObj(node)) - } - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name) - if wasDeleting { - logger.V(2).Info("Cancelling pod Eviction on Node", "node", klog.KObj(node)) - return true - } - return false -} - -// evictPods: -// - adds node to evictor queue if the node is not marked as evicted. -// Returns false if the node name was already enqueued. -// - deletes pods immediately if node is already marked as evicted. -// Returns false, because the node wasn't added to the queue. -func (nc *Controller) evictPods(ctx context.Context, node *v1.Node, pods []*v1.Pod) (bool, error) { - status, ok := nc.nodeEvictionMap.getStatus(node.Name) - if ok && status == evicted { - // Node eviction already happened for this node. - // Handling immediate pod deletion. - _, err := controllerutil.DeletePods(ctx, nc.kubeClient, pods, nc.recorder, node.Name, string(node.UID), nc.daemonSetStore) - if err != nil { - return false, fmt.Errorf("unable to delete pods from node %q: %v", node.Name, err) - } - return false, nil - } - logger := klog.FromContext(ctx) - if !nc.nodeEvictionMap.setStatus(node.Name, toBeEvicted) { - logger.V(2).Info("Node was unregistered in the meantime - skipping setting status", "node", klog.KObj(node)) - } - - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - - return nc.zonePodEvictor[nodetopology.GetZoneKey(node)].Add(node.Name, string(node.UID)), nil -} - func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool { nc.evictorLock.Lock() defer nc.evictorLock.Unlock() diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go index f9944d1b563..fdf567329d9 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go @@ -95,11 +95,10 @@ func (nc *nodeLifecycleController) doEviction(logger klog.Logger, fakeNodeHandle defer nc.evictorLock.Unlock() zones := testutil.GetZones(fakeNodeHandler) for _, zone := range zones { - nc.zonePodEvictor[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { + nc.zoneNoExecuteTainter[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { uid, _ := value.UID.(string) pods, _ := nc.getPodsAssignedToNode(value.Value) controllerutil.DeletePods(context.TODO(), fakeNodeHandler, pods, nc.recorder, value.Value, uid, nc.daemonSetStore) - _ = nc.nodeEvictionMap.setStatus(value.Value, evicted) return true, 0 }) } @@ -157,7 +156,6 @@ func newNodeLifecycleControllerFromClient( nodeMonitorGracePeriod time.Duration, nodeStartupGracePeriod time.Duration, nodeMonitorPeriod time.Duration, - useTaints bool, ) (*nodeLifecycleController, error) { factory := informers.NewSharedInformerFactory(kubeClient, controller.NoResyncPeriodFunc()) @@ -181,7 +179,6 @@ func newNodeLifecycleControllerFromClient( secondaryEvictionLimiterQPS, largeClusterThreshold, unhealthyZoneThreshold, - useTaints, ) if err != nil { return nil, err @@ -378,7 +375,7 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { }, }, secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: false, + expectedEvictPods: true, description: 
"Node created long time ago, and kubelet posted NotReady for a short period of time.", }, // Pod is ds-managed, and kubelet posted NotReady for a long period of time. @@ -609,7 +606,7 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { }, }, secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: false, + expectedEvictPods: true, description: "Node created long time ago, node controller posted Unknown for a short period of time.", }, // Node created long time ago, node controller posted Unknown for a long period of time. @@ -694,7 +691,7 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - false) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) @@ -725,8 +722,8 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { zones := testutil.GetZones(item.fakeNodeHandler) logger, _ := ktesting.NewTestContext(t) for _, zone := range zones { - if _, ok := nodeController.zonePodEvictor[zone]; ok { - nodeController.zonePodEvictor[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { + if _, ok := nodeController.zoneNoExecuteTainter[zone]; ok { + nodeController.zoneNoExecuteTainter[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { nodeUID, _ := value.UID.(string) pods, err := nodeController.getPodsAssignedToNode(value.Value) if err != nil { @@ -864,7 +861,7 @@ func TestPodStatusChange(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - false) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) @@ -888,7 +885,7 @@ func TestPodStatusChange(t *testing.T) { zones := testutil.GetZones(item.fakeNodeHandler) logger, _ := ktesting.NewTestContext(t) for _, zone := range zones { - nodeController.zonePodEvictor[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { + nodeController.zoneNoExecuteTainter[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { nodeUID, _ := value.UID.(string) pods, err := nodeController.getPodsAssignedToNode(value.Value) if err != nil { @@ -1427,7 +1424,7 @@ func TestMonitorNodeHealthEvictPodsWithDisruption(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - false) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) @@ -1715,7 +1712,7 @@ func TestMonitorNodeHealthUpdateStatus(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - false) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) @@ -2259,7 +2256,7 @@ func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - false) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = 
fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) @@ -2424,7 +2421,7 @@ func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - false) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) @@ -2525,8 +2522,7 @@ func TestMonitorNodeHealthMarkPodsNotReadyWithWorkerSize(t *testing.T) { testUnhealthyThreshold, testNodeMonitorGracePeriod, testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) + testNodeMonitorPeriod) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) @@ -2730,7 +2726,7 @@ func TestMonitorNodeHealthMarkPodsNotReadyRetry(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - false) + ) if item.updateReactor != nil { item.fakeNodeHandler.Clientset.PrependReactor("update", "pods", item.updateReactor) } @@ -2869,7 +2865,7 @@ func TestApplyNoExecuteTaints(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - true) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) @@ -3024,7 +3020,7 @@ func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - true) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) @@ -3249,7 +3245,7 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - true) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) @@ -3354,7 +3350,7 @@ func TestTaintsNodeByCondition(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - true) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) @@ -3557,7 +3553,7 @@ func TestNodeEventGeneration(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - false) + ) nodeController.now = func() metav1.Time { return fakeNow } fakeRecorder := testutil.NewFakeRecorder() nodeController.recorder = fakeRecorder @@ -3631,7 +3627,7 @@ func TestReconcileNodeLabels(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - true) + ) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) @@ -3775,7 +3771,7 @@ func TestTryUpdateNodeHealth(t *testing.T) { testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, - true) + ) nodeController.now = func() metav1.Time { return fakeNow } 
nodeController.recorder = testutil.NewFakeRecorder() nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 72623475c66..fa4f5ac78ac 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -52473,13 +52473,6 @@ func schema_k8sio_kube_controller_manager_config_v1alpha1_NodeLifecycleControlle Description: "NodeLifecycleControllerConfiguration contains elements describing NodeLifecycleController.", Type: []string{"object"}, Properties: map[string]spec.Schema{ - "EnableTaintManager": { - SchemaProps: spec.SchemaProps{ - Description: "If set to true enables NoExecute Taints and will evict all not-tolerating Pod running on Nodes tainted with this kind of Taints.", - Type: []string{"boolean"}, - Format: "", - }, - }, "NodeEvictionRate": { SchemaProps: spec.SchemaProps{ Description: "nodeEvictionRate is the number of nodes per second on which pods are deleted in case of node failure when a zone is healthy", @@ -52534,7 +52527,7 @@ func schema_k8sio_kube_controller_manager_config_v1alpha1_NodeLifecycleControlle }, }, }, - Required: []string{"EnableTaintManager", "NodeEvictionRate", "SecondaryNodeEvictionRate", "NodeStartupGracePeriod", "NodeMonitorGracePeriod", "PodEvictionTimeout", "LargeClusterSizeThreshold", "UnhealthyZoneThreshold"}, + Required: []string{"NodeEvictionRate", "SecondaryNodeEvictionRate", "NodeStartupGracePeriod", "NodeMonitorGracePeriod", "PodEvictionTimeout", "LargeClusterSizeThreshold", "UnhealthyZoneThreshold"}, }, }, Dependencies: []string{ diff --git a/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/types.go b/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/types.go index 3d02f6d4cd1..1ab74d57a77 100644 --- a/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/types.go +++ b/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/types.go @@ -383,9 +383,6 @@ type NodeIPAMControllerConfiguration struct { // NodeLifecycleControllerConfiguration contains elements describing NodeLifecycleController. type NodeLifecycleControllerConfiguration struct { - // If set to true enables NoExecute Taints and will evict all not-tolerating - // Pod running on Nodes tainted with this kind of Taints. 
- EnableTaintManager *bool // nodeEvictionRate is the number of nodes per second on which pods are deleted in case of node failure when a zone is healthy NodeEvictionRate float32 // secondaryNodeEvictionRate is the number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy diff --git a/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/zz_generated.deepcopy.go b/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/zz_generated.deepcopy.go index e752178db42..441e3bb2f75 100644 --- a/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/zz_generated.deepcopy.go @@ -312,7 +312,7 @@ func (in *KubeControllerManagerConfiguration) DeepCopyInto(out *KubeControllerMa out.CronJobController = in.CronJobController out.NamespaceController = in.NamespaceController out.NodeIPAMController = in.NodeIPAMController - in.NodeLifecycleController.DeepCopyInto(&out.NodeLifecycleController) + out.NodeLifecycleController = in.NodeLifecycleController in.PersistentVolumeBinderController.DeepCopyInto(&out.PersistentVolumeBinderController) out.PodGCController = in.PodGCController out.ReplicaSetController = in.ReplicaSetController @@ -378,11 +378,6 @@ func (in *NodeIPAMControllerConfiguration) DeepCopy() *NodeIPAMControllerConfigu // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NodeLifecycleControllerConfiguration) DeepCopyInto(out *NodeLifecycleControllerConfiguration) { *out = *in - if in.EnableTaintManager != nil { - in, out := &in.EnableTaintManager, &out.EnableTaintManager - *out = new(bool) - **out = **in - } out.NodeStartupGracePeriod = in.NodeStartupGracePeriod out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod out.PodEvictionTimeout = in.PodEvictionTimeout diff --git a/test/integration/node/lifecycle_test.go b/test/integration/node/lifecycle_test.go index 2128ec44157..9b807095286 100644 --- a/test/integration/node/lifecycle_test.go +++ b/test/integration/node/lifecycle_test.go @@ -131,7 +131,6 @@ func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T) { 100, // Secondary eviction limiter QPS 50, // Large cluster threshold 0.55, // Unhealthy zone threshold - true, // Run taint manager ) if err != nil { t.Fatalf("Failed to create node controller: %v", err) @@ -284,7 +283,6 @@ func TestTaintBasedEvictions(t *testing.T) { 100, // Secondary eviction limiter QPS 50, // Large cluster threshold 0.55, // Unhealthy zone threshold - true, // Run taint manager ) if err != nil { t.Fatalf("Failed to create node controller: %v", err) diff --git a/test/integration/scheduler/taint/taint_test.go b/test/integration/scheduler/taint/taint_test.go index 4805797a8ff..3ed68b7fc99 100644 --- a/test/integration/scheduler/taint/taint_test.go +++ b/test/integration/scheduler/taint/taint_test.go @@ -100,7 +100,6 @@ func TestTaintNodeByCondition(t *testing.T) { 100, // Secondary eviction limiter QPS 100, // Large cluster threshold 100, // Unhealthy zone threshold - true, // Run taint manager ) if err != nil { t.Errorf("Failed to create node controller: %v", err) From 15daa48e30bd914eb621b8da1581ea304733bbe4 Mon Sep 17 00:00:00 2001 From: kerthcet Date: Wed, 14 Sep 2022 17:07:11 +0800 Subject: [PATCH 2/4] Deprecate CLI flag pod-eviction-timeout Signed-off-by: kerthcet --- .../app/options/nodelifecyclecontroller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go index 0952bdd49b6..04d4d3cedef 100644 --- a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go +++ b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go @@ -44,7 +44,7 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) { fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.") fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, "Number of nodes from which NodeController treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller.") fs.Float32Var(&o.UnhealthyZoneThreshold, "unhealthy-zone-threshold", 0.55, "Fraction of Nodes in a zone which needs to be not Ready (minimum 3) for zone to be treated as unhealthy. ") - fs.MarkDeprecated("pod-eviction-timeout", "This flag is deprecated and it will be removed in 1.27. Once taint manager is enabled, this flag has no effect.") + fs.MarkDeprecated("pod-eviction-timeout", "This flag is deprecated and it will be removed in 1.27. Once taint-manager is always enabled, this flag will have no effect.") } // ApplyTo fills up NodeLifecycleController config with options. From 98bbeae99f065e2c650225b941bc175bc4ccd3fc Mon Sep 17 00:00:00 2001 From: kerthcet Date: Thu, 16 Feb 2023 17:21:52 +0100 Subject: [PATCH 3/4] address comments Signed-off-by: kerthcet --- .../app/options/nodelifecyclecontroller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go index 04d4d3cedef..9f9da5d485b 100644 --- a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go +++ b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go @@ -44,7 +44,7 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) { fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.") fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, "Number of nodes from which NodeController treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller.") fs.Float32Var(&o.UnhealthyZoneThreshold, "unhealthy-zone-threshold", 0.55, "Fraction of Nodes in a zone which needs to be not Ready (minimum 3) for zone to be treated as unhealthy. ") - fs.MarkDeprecated("pod-eviction-timeout", "This flag is deprecated and it will be removed in 1.27. 
Once taint-manager is always enabled, this flag will have no effect.") + fs.MarkDeprecated("pod-eviction-timeout", "This flag is deprecated and it will be removed in 1.28. Since taint manager is always enabled, this flag will have no effect.") } // ApplyTo fills up NodeLifecycleController config with options. From cae19f9e851864b03bc0a6cfde301bea77a4fece Mon Sep 17 00:00:00 2001 From: Andrea Tosatto Date: Tue, 7 Mar 2023 14:27:14 +0000 Subject: [PATCH 4/4] Remove deprecated pod-eviction-timeout flag from controller-manager --- cmd/kube-controller-manager/app/core.go | 1 - .../app/options/nodelifecyclecontroller.go | 3 - .../app/options/options_test.go | 4 - pkg/controller/nodelifecycle/config/types.go | 2 - .../v1alpha1/zz_generated.conversion.go | 3 +- .../config/zz_generated.deepcopy.go | 1 - .../node_lifecycle_controller.go | 5 +- .../node_lifecycle_controller_test.go | 621 +----------------- test/integration/node/lifecycle_test.go | 2 - .../integration/scheduler/taint/taint_test.go | 1 - 10 files changed, 4 insertions(+), 639 deletions(-) diff --git a/cmd/kube-controller-manager/app/core.go b/cmd/kube-controller-manager/app/core.go index 72e02b7d33d..cad229e12be 100644 --- a/cmd/kube-controller-manager/app/core.go +++ b/cmd/kube-controller-manager/app/core.go @@ -186,7 +186,6 @@ func startNodeLifecycleController(ctx context.Context, controllerContext Control controllerContext.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration, controllerContext.ComponentConfig.NodeLifecycleController.NodeStartupGracePeriod.Duration, controllerContext.ComponentConfig.NodeLifecycleController.NodeMonitorGracePeriod.Duration, - controllerContext.ComponentConfig.NodeLifecycleController.PodEvictionTimeout.Duration, controllerContext.ComponentConfig.NodeLifecycleController.NodeEvictionRate, controllerContext.ComponentConfig.NodeLifecycleController.SecondaryNodeEvictionRate, controllerContext.ComponentConfig.NodeLifecycleController.LargeClusterSizeThreshold, diff --git a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go index 9f9da5d485b..d3c84d63826 100644 --- a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go +++ b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go @@ -39,12 +39,10 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) { "Amount of time which we allow running Node to be unresponsive before marking it unhealthy. "+ "Must be N times more than kubelet's nodeStatusUpdateFrequency, "+ "where N means number of retries allowed for kubelet to post node status.") - fs.DurationVar(&o.PodEvictionTimeout.Duration, "pod-eviction-timeout", o.PodEvictionTimeout.Duration, "The grace period for deleting pods on failed nodes.") fs.Float32Var(&o.NodeEvictionRate, "node-eviction-rate", 0.1, "Number of nodes per second on which pods are deleted in case of node failure when a zone is healthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters.") fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. 
This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.") fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, "Number of nodes from which NodeController treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller.") fs.Float32Var(&o.UnhealthyZoneThreshold, "unhealthy-zone-threshold", 0.55, "Fraction of Nodes in a zone which needs to be not Ready (minimum 3) for zone to be treated as unhealthy. ") - fs.MarkDeprecated("pod-eviction-timeout", "This flag is deprecated and it will be removed in 1.28. Since taint manager is always enabled, this flag will have no effect.") } // ApplyTo fills up NodeLifecycleController config with options. @@ -55,7 +53,6 @@ func (o *NodeLifecycleControllerOptions) ApplyTo(cfg *nodelifecycleconfig.NodeLi cfg.NodeStartupGracePeriod = o.NodeStartupGracePeriod cfg.NodeMonitorGracePeriod = o.NodeMonitorGracePeriod - cfg.PodEvictionTimeout = o.PodEvictionTimeout cfg.NodeEvictionRate = o.NodeEvictionRate cfg.SecondaryNodeEvictionRate = o.SecondaryNodeEvictionRate cfg.LargeClusterSizeThreshold = o.LargeClusterSizeThreshold diff --git a/cmd/kube-controller-manager/app/options/options_test.go b/cmd/kube-controller-manager/app/options/options_test.go index 44d3c073ab2..d7ee588116a 100644 --- a/cmd/kube-controller-manager/app/options/options_test.go +++ b/cmd/kube-controller-manager/app/options/options_test.go @@ -141,7 +141,6 @@ var args = []string{ "--node-monitor-grace-period=30s", "--node-monitor-period=10s", "--node-startup-grace-period=30s", - "--pod-eviction-timeout=2m", "--profiling=false", "--pv-recycler-increment-timeout-nfs=45", "--pv-recycler-minimum-timeout-hostpath=45", @@ -347,7 +346,6 @@ func TestAddFlags(t *testing.T) { SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second}, - PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute}, LargeClusterSizeThreshold: 100, UnhealthyZoneThreshold: 0.6, }, @@ -591,7 +589,6 @@ func TestApplyTo(t *testing.T) { SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second}, - PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute}, LargeClusterSizeThreshold: 100, UnhealthyZoneThreshold: 0.6, }, @@ -1165,7 +1162,6 @@ func TestValidateControllersOptions(t *testing.T) { SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second}, - PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute}, LargeClusterSizeThreshold: 100, UnhealthyZoneThreshold: 0.6, }, diff --git a/pkg/controller/nodelifecycle/config/types.go b/pkg/controller/nodelifecycle/config/types.go index dfba420f7cb..176b0b9b637 100644 --- a/pkg/controller/nodelifecycle/config/types.go +++ b/pkg/controller/nodelifecycle/config/types.go @@ -34,8 +34,6 @@ type NodeLifecycleControllerConfiguration struct { // nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet // to post node status. NodeMonitorGracePeriod metav1.Duration - // podEvictionTimeout is the grace period for deleting pods on failed nodes. 
- PodEvictionTimeout metav1.Duration // secondaryNodeEvictionRate is implicitly overridden to 0 for clusters smaller than or equal to largeClusterSizeThreshold LargeClusterSizeThreshold int32 // Zone is treated as unhealthy in nodeEvictionRate and secondaryNodeEvictionRate when at least diff --git a/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go b/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go index 912f8d20e74..5998751dbee 100644 --- a/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go +++ b/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go @@ -86,7 +86,7 @@ func autoConvert_v1alpha1_NodeLifecycleControllerConfiguration_To_config_NodeLif out.SecondaryNodeEvictionRate = in.SecondaryNodeEvictionRate out.NodeStartupGracePeriod = in.NodeStartupGracePeriod out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod - out.PodEvictionTimeout = in.PodEvictionTimeout + // WARNING: in.PodEvictionTimeout requires manual conversion: does not exist in peer-type out.LargeClusterSizeThreshold = in.LargeClusterSizeThreshold out.UnhealthyZoneThreshold = in.UnhealthyZoneThreshold return nil @@ -97,7 +97,6 @@ func autoConvert_config_NodeLifecycleControllerConfiguration_To_v1alpha1_NodeLif out.SecondaryNodeEvictionRate = in.SecondaryNodeEvictionRate out.NodeStartupGracePeriod = in.NodeStartupGracePeriod out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod - out.PodEvictionTimeout = in.PodEvictionTimeout out.LargeClusterSizeThreshold = in.LargeClusterSizeThreshold out.UnhealthyZoneThreshold = in.UnhealthyZoneThreshold return nil diff --git a/pkg/controller/nodelifecycle/config/zz_generated.deepcopy.go b/pkg/controller/nodelifecycle/config/zz_generated.deepcopy.go index 0e0375c6cbf..8f4a87c06fd 100644 --- a/pkg/controller/nodelifecycle/config/zz_generated.deepcopy.go +++ b/pkg/controller/nodelifecycle/config/zz_generated.deepcopy.go @@ -26,7 +26,6 @@ func (in *NodeLifecycleControllerConfiguration) DeepCopyInto(out *NodeLifecycleC *out = *in out.NodeStartupGracePeriod = in.NodeStartupGracePeriod out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod - out.PodEvictionTimeout = in.PodEvictionTimeout return } diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go index a3c0e3d1658..ad1c05ad21c 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go @@ -273,7 +273,7 @@ type Controller struct { // post node status/lease. It is pointless to make nodeMonitorGracePeriod // be less than the node health signal update frequency, since there will // only be fresh values from Kubelet at an interval of node health signal - // update frequency. The constant must be less than podEvictionTimeout. + // update frequency. // 2. nodeMonitorGracePeriod can't be too large for user experience - larger // value takes longer for user to see up-to-date node health. nodeMonitorGracePeriod time.Duration @@ -282,7 +282,6 @@ type Controller struct { // Defaults to scheduler.UpdateWorkerSize. 
nodeUpdateWorkerSize int - podEvictionTimeout time.Duration evictionLimiterQPS float32 secondaryEvictionLimiterQPS float32 largeClusterThreshold int32 @@ -303,7 +302,6 @@ func NewNodeLifecycleController( nodeMonitorPeriod time.Duration, nodeStartupGracePeriod time.Duration, nodeMonitorGracePeriod time.Duration, - podEvictionTimeout time.Duration, evictionLimiterQPS float32, secondaryEvictionLimiterQPS float32, largeClusterThreshold int32, @@ -332,7 +330,6 @@ func NewNodeLifecycleController( zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue), nodesToRetry: sync.Map{}, zoneStates: make(map[string]ZoneState), - podEvictionTimeout: podEvictionTimeout, evictionLimiterQPS: evictionLimiterQPS, secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS, largeClusterThreshold: largeClusterThreshold, diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go index fdf567329d9..c782a9fd372 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go @@ -41,7 +41,6 @@ import ( clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/fake" testcore "k8s.io/client-go/testing" - "k8s.io/klog/v2" "k8s.io/klog/v2/ktesting" kubeletapis "k8s.io/kubelet/pkg/apis" "k8s.io/kubernetes/pkg/controller" @@ -89,28 +88,6 @@ type nodeLifecycleController struct { daemonSetInformer appsinformers.DaemonSetInformer } -// doEviction does the fake eviction and returns the status of eviction operation. -func (nc *nodeLifecycleController) doEviction(logger klog.Logger, fakeNodeHandler *testutil.FakeNodeHandler) bool { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - zones := testutil.GetZones(fakeNodeHandler) - for _, zone := range zones { - nc.zoneNoExecuteTainter[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { - uid, _ := value.UID.(string) - pods, _ := nc.getPodsAssignedToNode(value.Value) - controllerutil.DeletePods(context.TODO(), fakeNodeHandler, pods, nc.recorder, value.Value, uid, nc.daemonSetStore) - return true, 0 - }) - } - - for _, action := range fakeNodeHandler.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - return true - } - } - return false -} - func createNodeLease(nodeName string, renewTime metav1.MicroTime) *coordv1.Lease { return &coordv1.Lease{ ObjectMeta: metav1.ObjectMeta{ @@ -148,7 +125,6 @@ func (nc *nodeLifecycleController) syncNodeStore(fakeNodeHandler *testutil.FakeN func newNodeLifecycleControllerFromClient( ctx context.Context, kubeClient clientset.Interface, - podEvictionTimeout time.Duration, evictionLimiterQPS float32, secondaryEvictionLimiterQPS float32, largeClusterThreshold int32, @@ -174,7 +150,6 @@ func newNodeLifecycleControllerFromClient( nodeMonitorPeriod, nodeStartupGracePeriod, nodeMonitorGracePeriod, - podEvictionTimeout, evictionLimiterQPS, secondaryEvictionLimiterQPS, largeClusterThreshold, @@ -194,7 +169,6 @@ func newNodeLifecycleControllerFromClient( func TestMonitorNodeHealthEvictPods(t *testing.T) { fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute labels := map[string]string{ v1.LabelTopologyRegion: "region1", v1.LabelTopologyZone: "zone1", @@ -362,7 +336,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), }, daemonSets: nil, - timeToPass: 
evictionTimeout, newNodeStatus: v1.NodeStatus{ Conditions: []v1.NodeCondition{ { @@ -593,7 +566,7 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), }, daemonSets: nil, - timeToPass: evictionTimeout - testNodeMonitorGracePeriod, + timeToPass: testNodeMonitorGracePeriod, newNodeStatus: v1.NodeStatus{ Conditions: []v1.NodeCondition{ { @@ -683,7 +656,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -754,7 +726,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { func TestPodStatusChange(t *testing.T) { fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady // we need second healthy node in tests. Because of how the tests are written we need to update @@ -853,7 +824,6 @@ func TestPodStatusChange(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -913,574 +883,6 @@ func TestPodStatusChange(t *testing.T) { } } -func TestMonitorNodeHealthEvictPodsWithDisruption(t *testing.T) { - fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - timeToPass := 60 * time.Minute - - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need second healthy node in tests. Because of how the tests are written we need to update - // the status of this Node. - healthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 13, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - unhealthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - // Node status was updated by nodecontroller 1hr ago - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - - table := []struct { - nodeList []*v1.Node - podList []v1.Pod - updatedNodeStatuses []v1.NodeStatus - expectedInitialStates map[string]ZoneState - expectedFollowingStates map[string]ZoneState - expectedEvictPods bool - description string - }{ - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Only zone is down - eviction shouldn't take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption}, - expectedFollowingStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption}, - expectedEvictPods: false, - description: "Network Disruption: Only zone is down - eviction shouldn't take place.", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Both zones down - eviction shouldn't take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region2", - v1.LabelTopologyZone: "zone2", - v1.LabelFailureDomainBetaRegion: "region2", - v1.LabelFailureDomainBetaZone: "zone2", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, - }, - expectedEvictPods: false, - description: "Network Disruption: Both zones down - eviction shouldn't take place.", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // One zone is down - eviction should take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone2", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone2", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateNormal, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateNormal, - }, - expectedEvictPods: true, - description: "Network Disruption: One zone is down - eviction should take place.", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period - // of on first Node, eviction should stop even though Node with label - // node.kubernetes.io/exclude-disruption is healthy. 
- { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node-master", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - labelNodeDisruptionExclusion: "", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - }, - expectedEvictPods: false, - description: "NetworkDisruption: eviction should stop, only Node with label node.kubernetes.io/exclude-disruption is healthy", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Initially both zones down, one comes back - eviction should take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone2", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone2", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateFullDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateNormal, - }, - expectedEvictPods: true, - description: "Initially both zones down, one comes back - eviction should take place", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Zone is partially disrupted - eviction should take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node3", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node4", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - healthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, - }, - expectedEvictPods: 
true, - description: "Zone is partially disrupted - eviction should take place.", - }, - } - - for _, item := range table { - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: item.nodeList, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: item.podList}), - } - nodeController, _ := newNodeLifecycleControllerFromClient( - context.TODO(), - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - ) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - nodeController.enterPartialDisruptionFunc = func(nodeNum int) float32 { - return testRateLimiterQPS - } - nodeController.enterFullDisruptionFunc = func(nodeNum int) float32 { - return testRateLimiterQPS - } - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { - t.Errorf("%v: unexpected error: %v", item.description, err) - } - - for zone, state := range item.expectedInitialStates { - if state != nodeController.zoneStates[zone] { - t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state) - } - } - - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} } - for i := range item.updatedNodeStatuses { - fakeNodeHandler.Existing[i].Status = item.updatedNodeStatuses[i] - } - - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { - t.Errorf("%v: unexpected error: %v", item.description, err) - } - for zone, state := range item.expectedFollowingStates { - if state != nodeController.zoneStates[zone] { - t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state) - } - } - var podEvicted bool - start := time.Now() - // Infinite loop, used for retrying in case ratelimiter fails to reload for Try function. - // this breaks when we have the status that we need for test case or when we don't see the - // intended result after 1 minute. 
- logger, _ := ktesting.NewTestContext(t) - for { - podEvicted = nodeController.doEviction(logger, fakeNodeHandler) - if podEvicted == item.expectedEvictPods || time.Since(start) > 1*time.Minute { - break - } - } - if item.expectedEvictPods != podEvicted { - t.Errorf("%v: expected pod eviction: %+v, got %+v", item.description, item.expectedEvictPods, podEvicted) - } - } -} - func TestMonitorNodeHealthUpdateStatus(t *testing.T) { fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) table := []struct { @@ -1704,7 +1106,6 @@ func TestMonitorNodeHealthUpdateStatus(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2248,7 +1649,6 @@ func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2413,7 +1813,6 @@ func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2515,7 +1914,6 @@ func TestMonitorNodeHealthMarkPodsNotReadyWithWorkerSize(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2718,7 +2116,6 @@ func TestMonitorNodeHealthMarkPodsNotReadyRetry(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2766,7 +2163,6 @@ func TestApplyNoExecuteTaints(t *testing.T) { t.Skip("Skipping test on Windows.") } fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -2857,7 +2253,6 @@ func TestApplyNoExecuteTaints(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2922,7 +2317,6 @@ func TestApplyNoExecuteTaints(t *testing.T) { // TestApplyNoExecuteTaintsToNodesEnqueueTwice ensures we taint every node with NoExecute even if enqueued twice func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3012,7 +2406,6 @@ func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3155,7 +2548,6 @@ func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { func TestSwapUnreachableNotReadyTaints(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3209,7 +2601,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { }, Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), } - timeToPass := 
evictionTimeout newNodeStatus := v1.NodeStatus{ Conditions: []v1.NodeCondition{ { @@ -3237,7 +2628,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3272,7 +2662,7 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints) } - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} } + nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Time} } node0.Status = newNodeStatus node1.Status = healthyNodeNewStatus @@ -3309,7 +2699,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { func TestTaintsNodeByCondition(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3342,7 +2731,6 @@ func TestTaintsNodeByCondition(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3545,7 +2933,6 @@ func TestNodeEventGeneration(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3586,7 +2973,6 @@ func TestNodeEventGeneration(t *testing.T) { func TestReconcileNodeLabels(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3619,7 +3005,6 @@ func TestReconcileNodeLabels(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3736,7 +3121,6 @@ func TestReconcileNodeLabels(t *testing.T) { func TestTryUpdateNodeHealth(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) fakeOld := metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3763,7 +3147,6 @@ func TestTryUpdateNodeHealth(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, diff --git a/test/integration/node/lifecycle_test.go b/test/integration/node/lifecycle_test.go index 9b807095286..497517fda5f 100644 --- a/test/integration/node/lifecycle_test.go +++ b/test/integration/node/lifecycle_test.go @@ -126,7 +126,6 @@ func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T) { 1*time.Second, // Node monitor grace period time.Minute, // Node startup grace period time.Millisecond, // Node monitor period - 1, // Pod eviction timeout 100, // Eviction limiter QPS 100, // Secondary eviction limiter QPS 50, // Large cluster threshold @@ -278,7 +277,6 @@ func TestTaintBasedEvictions(t *testing.T) { 1*time.Second, // Node monitor grace period time.Minute, // Node startup grace period time.Millisecond, // Node monitor period - time.Second, // Pod eviction timeout 100, // Eviction limiter QPS 100, // Secondary eviction limiter QPS 50, // Large cluster threshold diff --git 
a/test/integration/scheduler/taint/taint_test.go b/test/integration/scheduler/taint/taint_test.go index 3ed68b7fc99..a9b2ef6ea46 100644 --- a/test/integration/scheduler/taint/taint_test.go +++ b/test/integration/scheduler/taint/taint_test.go @@ -95,7 +95,6 @@ func TestTaintNodeByCondition(t *testing.T) { time.Hour, // Node monitor grace period time.Second, // Node startup grace period time.Second, // Node monitor period - time.Second, // Pod eviction timeout 100, // Eviction limiter QPS 100, // Secondary eviction limiter QPS 100, // Large cluster threshold
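
Editor's note, not part of the patch above: with --pod-eviction-timeout and the PodEvictionTimeout config field removed, the delay before pods are evicted from a NotReady or unreachable node is governed entirely by the NoExecute taints applied by the node lifecycle controller together with each pod's tolerationSeconds. The following is a minimal, illustrative Go sketch, not code from this patch; it assumes the standard k8s.io/api/core/v1 types, and the 300-second value mirrors the DefaultTolerationSeconds admission plugin default rather than anything introduced here.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	// How long this pod keeps running on a node tainted
	// node.kubernetes.io/not-ready:NoExecute before the taint manager
	// evicts it. This per-pod setting is what stands in for the removed
	// cluster-wide pod-eviction-timeout.
	seconds := int64(300)
	tol := v1.Toleration{
		Key:               v1.TaintNodeNotReady,
		Operator:          v1.TolerationOpExists,
		Effect:            v1.TaintEffectNoExecute,
		TolerationSeconds: &seconds,
	}
	fmt.Printf("pod tolerates %s for %d seconds before eviction\n", tol.Key, *tol.TolerationSeconds)
}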