diff --git a/cmd/kube-controller-manager/app/core.go b/cmd/kube-controller-manager/app/core.go index 72e02b7d33d..cad229e12be 100644 --- a/cmd/kube-controller-manager/app/core.go +++ b/cmd/kube-controller-manager/app/core.go @@ -186,7 +186,6 @@ func startNodeLifecycleController(ctx context.Context, controllerContext Control controllerContext.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration, controllerContext.ComponentConfig.NodeLifecycleController.NodeStartupGracePeriod.Duration, controllerContext.ComponentConfig.NodeLifecycleController.NodeMonitorGracePeriod.Duration, - controllerContext.ComponentConfig.NodeLifecycleController.PodEvictionTimeout.Duration, controllerContext.ComponentConfig.NodeLifecycleController.NodeEvictionRate, controllerContext.ComponentConfig.NodeLifecycleController.SecondaryNodeEvictionRate, controllerContext.ComponentConfig.NodeLifecycleController.LargeClusterSizeThreshold, diff --git a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go index 9f9da5d485b..d3c84d63826 100644 --- a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go +++ b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go @@ -39,12 +39,10 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) { "Amount of time which we allow running Node to be unresponsive before marking it unhealthy. "+ "Must be N times more than kubelet's nodeStatusUpdateFrequency, "+ "where N means number of retries allowed for kubelet to post node status.") - fs.DurationVar(&o.PodEvictionTimeout.Duration, "pod-eviction-timeout", o.PodEvictionTimeout.Duration, "The grace period for deleting pods on failed nodes.") fs.Float32Var(&o.NodeEvictionRate, "node-eviction-rate", 0.1, "Number of nodes per second on which pods are deleted in case of node failure when a zone is healthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters.") fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.") fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, "Number of nodes from which NodeController treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller.") fs.Float32Var(&o.UnhealthyZoneThreshold, "unhealthy-zone-threshold", 0.55, "Fraction of Nodes in a zone which needs to be not Ready (minimum 3) for zone to be treated as unhealthy. ") - fs.MarkDeprecated("pod-eviction-timeout", "This flag is deprecated and it will be removed in 1.28. Since taint manager is always enabled, this flag will have no effect.") } // ApplyTo fills up NodeLifecycleController config with options. 
@@ -55,7 +53,6 @@ func (o *NodeLifecycleControllerOptions) ApplyTo(cfg *nodelifecycleconfig.NodeLi cfg.NodeStartupGracePeriod = o.NodeStartupGracePeriod cfg.NodeMonitorGracePeriod = o.NodeMonitorGracePeriod - cfg.PodEvictionTimeout = o.PodEvictionTimeout cfg.NodeEvictionRate = o.NodeEvictionRate cfg.SecondaryNodeEvictionRate = o.SecondaryNodeEvictionRate cfg.LargeClusterSizeThreshold = o.LargeClusterSizeThreshold diff --git a/cmd/kube-controller-manager/app/options/options_test.go b/cmd/kube-controller-manager/app/options/options_test.go index 44d3c073ab2..d7ee588116a 100644 --- a/cmd/kube-controller-manager/app/options/options_test.go +++ b/cmd/kube-controller-manager/app/options/options_test.go @@ -141,7 +141,6 @@ var args = []string{ "--node-monitor-grace-period=30s", "--node-monitor-period=10s", "--node-startup-grace-period=30s", - "--pod-eviction-timeout=2m", "--profiling=false", "--pv-recycler-increment-timeout-nfs=45", "--pv-recycler-minimum-timeout-hostpath=45", @@ -347,7 +346,6 @@ func TestAddFlags(t *testing.T) { SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second}, - PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute}, LargeClusterSizeThreshold: 100, UnhealthyZoneThreshold: 0.6, }, @@ -591,7 +589,6 @@ func TestApplyTo(t *testing.T) { SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second}, - PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute}, LargeClusterSizeThreshold: 100, UnhealthyZoneThreshold: 0.6, }, @@ -1165,7 +1162,6 @@ func TestValidateControllersOptions(t *testing.T) { SecondaryNodeEvictionRate: 0.05, NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second}, NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second}, - PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute}, LargeClusterSizeThreshold: 100, UnhealthyZoneThreshold: 0.6, }, diff --git a/pkg/controller/nodelifecycle/config/types.go b/pkg/controller/nodelifecycle/config/types.go index dfba420f7cb..176b0b9b637 100644 --- a/pkg/controller/nodelifecycle/config/types.go +++ b/pkg/controller/nodelifecycle/config/types.go @@ -34,8 +34,6 @@ type NodeLifecycleControllerConfiguration struct { // nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet // to post node status. NodeMonitorGracePeriod metav1.Duration - // podEvictionTimeout is the grace period for deleting pods on failed nodes. 
- PodEvictionTimeout metav1.Duration // secondaryNodeEvictionRate is implicitly overridden to 0 for clusters smaller than or equal to largeClusterSizeThreshold LargeClusterSizeThreshold int32 // Zone is treated as unhealthy in nodeEvictionRate and secondaryNodeEvictionRate when at least diff --git a/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go b/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go index 912f8d20e74..5998751dbee 100644 --- a/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go +++ b/pkg/controller/nodelifecycle/config/v1alpha1/zz_generated.conversion.go @@ -86,7 +86,7 @@ func autoConvert_v1alpha1_NodeLifecycleControllerConfiguration_To_config_NodeLif out.SecondaryNodeEvictionRate = in.SecondaryNodeEvictionRate out.NodeStartupGracePeriod = in.NodeStartupGracePeriod out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod - out.PodEvictionTimeout = in.PodEvictionTimeout + // WARNING: in.PodEvictionTimeout requires manual conversion: does not exist in peer-type out.LargeClusterSizeThreshold = in.LargeClusterSizeThreshold out.UnhealthyZoneThreshold = in.UnhealthyZoneThreshold return nil @@ -97,7 +97,6 @@ func autoConvert_config_NodeLifecycleControllerConfiguration_To_v1alpha1_NodeLif out.SecondaryNodeEvictionRate = in.SecondaryNodeEvictionRate out.NodeStartupGracePeriod = in.NodeStartupGracePeriod out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod - out.PodEvictionTimeout = in.PodEvictionTimeout out.LargeClusterSizeThreshold = in.LargeClusterSizeThreshold out.UnhealthyZoneThreshold = in.UnhealthyZoneThreshold return nil diff --git a/pkg/controller/nodelifecycle/config/zz_generated.deepcopy.go b/pkg/controller/nodelifecycle/config/zz_generated.deepcopy.go index 0e0375c6cbf..8f4a87c06fd 100644 --- a/pkg/controller/nodelifecycle/config/zz_generated.deepcopy.go +++ b/pkg/controller/nodelifecycle/config/zz_generated.deepcopy.go @@ -26,7 +26,6 @@ func (in *NodeLifecycleControllerConfiguration) DeepCopyInto(out *NodeLifecycleC *out = *in out.NodeStartupGracePeriod = in.NodeStartupGracePeriod out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod - out.PodEvictionTimeout = in.PodEvictionTimeout return } diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go index a3c0e3d1658..ad1c05ad21c 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go @@ -273,7 +273,7 @@ type Controller struct { // post node status/lease. It is pointless to make nodeMonitorGracePeriod // be less than the node health signal update frequency, since there will // only be fresh values from Kubelet at an interval of node health signal - // update frequency. The constant must be less than podEvictionTimeout. + // update frequency. // 2. nodeMonitorGracePeriod can't be too large for user experience - larger // value takes longer for user to see up-to-date node health. nodeMonitorGracePeriod time.Duration @@ -282,7 +282,6 @@ type Controller struct { // Defaults to scheduler.UpdateWorkerSize. 
nodeUpdateWorkerSize int - podEvictionTimeout time.Duration evictionLimiterQPS float32 secondaryEvictionLimiterQPS float32 largeClusterThreshold int32 @@ -303,7 +302,6 @@ func NewNodeLifecycleController( nodeMonitorPeriod time.Duration, nodeStartupGracePeriod time.Duration, nodeMonitorGracePeriod time.Duration, - podEvictionTimeout time.Duration, evictionLimiterQPS float32, secondaryEvictionLimiterQPS float32, largeClusterThreshold int32, @@ -332,7 +330,6 @@ func NewNodeLifecycleController( zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue), nodesToRetry: sync.Map{}, zoneStates: make(map[string]ZoneState), - podEvictionTimeout: podEvictionTimeout, evictionLimiterQPS: evictionLimiterQPS, secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS, largeClusterThreshold: largeClusterThreshold, diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go index fdf567329d9..c782a9fd372 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go @@ -41,7 +41,6 @@ import ( clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/fake" testcore "k8s.io/client-go/testing" - "k8s.io/klog/v2" "k8s.io/klog/v2/ktesting" kubeletapis "k8s.io/kubelet/pkg/apis" "k8s.io/kubernetes/pkg/controller" @@ -89,28 +88,6 @@ type nodeLifecycleController struct { daemonSetInformer appsinformers.DaemonSetInformer } -// doEviction does the fake eviction and returns the status of eviction operation. -func (nc *nodeLifecycleController) doEviction(logger klog.Logger, fakeNodeHandler *testutil.FakeNodeHandler) bool { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - zones := testutil.GetZones(fakeNodeHandler) - for _, zone := range zones { - nc.zoneNoExecuteTainter[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { - uid, _ := value.UID.(string) - pods, _ := nc.getPodsAssignedToNode(value.Value) - controllerutil.DeletePods(context.TODO(), fakeNodeHandler, pods, nc.recorder, value.Value, uid, nc.daemonSetStore) - return true, 0 - }) - } - - for _, action := range fakeNodeHandler.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - return true - } - } - return false -} - func createNodeLease(nodeName string, renewTime metav1.MicroTime) *coordv1.Lease { return &coordv1.Lease{ ObjectMeta: metav1.ObjectMeta{ @@ -148,7 +125,6 @@ func (nc *nodeLifecycleController) syncNodeStore(fakeNodeHandler *testutil.FakeN func newNodeLifecycleControllerFromClient( ctx context.Context, kubeClient clientset.Interface, - podEvictionTimeout time.Duration, evictionLimiterQPS float32, secondaryEvictionLimiterQPS float32, largeClusterThreshold int32, @@ -174,7 +150,6 @@ func newNodeLifecycleControllerFromClient( nodeMonitorPeriod, nodeStartupGracePeriod, nodeMonitorGracePeriod, - podEvictionTimeout, evictionLimiterQPS, secondaryEvictionLimiterQPS, largeClusterThreshold, @@ -194,7 +169,6 @@ func newNodeLifecycleControllerFromClient( func TestMonitorNodeHealthEvictPods(t *testing.T) { fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute labels := map[string]string{ v1.LabelTopologyRegion: "region1", v1.LabelTopologyZone: "zone1", @@ -362,7 +336,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), }, daemonSets: nil, - timeToPass: 
evictionTimeout, newNodeStatus: v1.NodeStatus{ Conditions: []v1.NodeCondition{ { @@ -593,7 +566,7 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), }, daemonSets: nil, - timeToPass: evictionTimeout - testNodeMonitorGracePeriod, + timeToPass: testNodeMonitorGracePeriod, newNodeStatus: v1.NodeStatus{ Conditions: []v1.NodeCondition{ { @@ -683,7 +656,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -754,7 +726,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { func TestPodStatusChange(t *testing.T) { fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady // we need second healthy node in tests. Because of how the tests are written we need to update @@ -853,7 +824,6 @@ func TestPodStatusChange(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -913,574 +883,6 @@ func TestPodStatusChange(t *testing.T) { } } -func TestMonitorNodeHealthEvictPodsWithDisruption(t *testing.T) { - fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - timeToPass := 60 * time.Minute - - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need second healthy node in tests. Because of how the tests are written we need to update - // the status of this Node. - healthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 13, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - unhealthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - // Node status was updated by nodecontroller 1hr ago - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - - table := []struct { - nodeList []*v1.Node - podList []v1.Pod - updatedNodeStatuses []v1.NodeStatus - expectedInitialStates map[string]ZoneState - expectedFollowingStates map[string]ZoneState - expectedEvictPods bool - description string - }{ - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Only zone is down - eviction shouldn't take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption}, - expectedFollowingStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption}, - expectedEvictPods: false, - description: "Network Disruption: Only zone is down - eviction shouldn't take place.", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Both zones down - eviction shouldn't take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region2", - v1.LabelTopologyZone: "zone2", - v1.LabelFailureDomainBetaRegion: "region2", - v1.LabelFailureDomainBetaZone: "zone2", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, - }, - expectedEvictPods: false, - description: "Network Disruption: Both zones down - eviction shouldn't take place.", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // One zone is down - eviction should take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone2", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone2", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateNormal, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateNormal, - }, - expectedEvictPods: true, - description: "Network Disruption: One zone is down - eviction should take place.", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period - // of on first Node, eviction should stop even though Node with label - // node.kubernetes.io/exclude-disruption is healthy. 
- { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node-master", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - labelNodeDisruptionExclusion: "", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - }, - expectedEvictPods: false, - description: "NetworkDisruption: eviction should stop, only Node with label node.kubernetes.io/exclude-disruption is healthy", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Initially both zones down, one comes back - eviction should take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone2", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone2", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateFullDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateNormal, - }, - expectedEvictPods: true, - description: "Initially both zones down, one comes back - eviction should take place", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Zone is partially disrupted - eviction should take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node3", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node4", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - healthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, - }, - expectedEvictPods: 
true, - description: "Zone is partially disrupted - eviction should take place.", - }, - } - - for _, item := range table { - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: item.nodeList, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: item.podList}), - } - nodeController, _ := newNodeLifecycleControllerFromClient( - context.TODO(), - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - ) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - nodeController.enterPartialDisruptionFunc = func(nodeNum int) float32 { - return testRateLimiterQPS - } - nodeController.enterFullDisruptionFunc = func(nodeNum int) float32 { - return testRateLimiterQPS - } - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { - t.Errorf("%v: unexpected error: %v", item.description, err) - } - - for zone, state := range item.expectedInitialStates { - if state != nodeController.zoneStates[zone] { - t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state) - } - } - - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} } - for i := range item.updatedNodeStatuses { - fakeNodeHandler.Existing[i].Status = item.updatedNodeStatuses[i] - } - - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(context.TODO()); err != nil { - t.Errorf("%v: unexpected error: %v", item.description, err) - } - for zone, state := range item.expectedFollowingStates { - if state != nodeController.zoneStates[zone] { - t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state) - } - } - var podEvicted bool - start := time.Now() - // Infinite loop, used for retrying in case ratelimiter fails to reload for Try function. - // this breaks when we have the status that we need for test case or when we don't see the - // intended result after 1 minute. 
- logger, _ := ktesting.NewTestContext(t) - for { - podEvicted = nodeController.doEviction(logger, fakeNodeHandler) - if podEvicted == item.expectedEvictPods || time.Since(start) > 1*time.Minute { - break - } - } - if item.expectedEvictPods != podEvicted { - t.Errorf("%v: expected pod eviction: %+v, got %+v", item.description, item.expectedEvictPods, podEvicted) - } - } -} - func TestMonitorNodeHealthUpdateStatus(t *testing.T) { fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) table := []struct { @@ -1704,7 +1106,6 @@ func TestMonitorNodeHealthUpdateStatus(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2248,7 +1649,6 @@ func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2413,7 +1813,6 @@ func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2515,7 +1914,6 @@ func TestMonitorNodeHealthMarkPodsNotReadyWithWorkerSize(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2718,7 +2116,6 @@ func TestMonitorNodeHealthMarkPodsNotReadyRetry(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), item.fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2766,7 +2163,6 @@ func TestApplyNoExecuteTaints(t *testing.T) { t.Skip("Skipping test on Windows.") } fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -2857,7 +2253,6 @@ func TestApplyNoExecuteTaints(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -2922,7 +2317,6 @@ func TestApplyNoExecuteTaints(t *testing.T) { // TestApplyNoExecuteTaintsToNodesEnqueueTwice ensures we taint every node with NoExecute even if enqueued twice func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3012,7 +2406,6 @@ func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3155,7 +2548,6 @@ func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { func TestSwapUnreachableNotReadyTaints(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3209,7 +2601,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { }, Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), } - timeToPass := 
evictionTimeout newNodeStatus := v1.NodeStatus{ Conditions: []v1.NodeCondition{ { @@ -3237,7 +2628,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3272,7 +2662,7 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints) } - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} } + nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Time} } node0.Status = newNodeStatus node1.Status = healthyNodeNewStatus @@ -3309,7 +2699,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) { func TestTaintsNodeByCondition(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3342,7 +2731,6 @@ func TestTaintsNodeByCondition(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3545,7 +2933,6 @@ func TestNodeEventGeneration(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - 5*time.Minute, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3586,7 +2973,6 @@ func TestNodeEventGeneration(t *testing.T) { func TestReconcileNodeLabels(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3619,7 +3005,6 @@ func TestReconcileNodeLabels(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, @@ -3736,7 +3121,6 @@ func TestReconcileNodeLabels(t *testing.T) { func TestTryUpdateNodeHealth(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) fakeOld := metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -3763,7 +3147,6 @@ func TestTryUpdateNodeHealth(t *testing.T) { nodeController, _ := newNodeLifecycleControllerFromClient( context.TODO(), fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, diff --git a/test/integration/node/lifecycle_test.go b/test/integration/node/lifecycle_test.go index 9b807095286..497517fda5f 100644 --- a/test/integration/node/lifecycle_test.go +++ b/test/integration/node/lifecycle_test.go @@ -126,7 +126,6 @@ func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T) { 1*time.Second, // Node monitor grace period time.Minute, // Node startup grace period time.Millisecond, // Node monitor period - 1, // Pod eviction timeout 100, // Eviction limiter QPS 100, // Secondary eviction limiter QPS 50, // Large cluster threshold @@ -278,7 +277,6 @@ func TestTaintBasedEvictions(t *testing.T) { 1*time.Second, // Node monitor grace period time.Minute, // Node startup grace period time.Millisecond, // Node monitor period - time.Second, // Pod eviction timeout 100, // Eviction limiter QPS 100, // Secondary eviction limiter QPS 50, // Large cluster threshold diff --git 
a/test/integration/scheduler/taint/taint_test.go b/test/integration/scheduler/taint/taint_test.go index 3ed68b7fc99..a9b2ef6ea46 100644 --- a/test/integration/scheduler/taint/taint_test.go +++ b/test/integration/scheduler/taint/taint_test.go @@ -95,7 +95,6 @@ func TestTaintNodeByCondition(t *testing.T) { time.Hour, // Node monitor grace period time.Second, // Node startup grace period time.Second, // Node monitor period - time.Second, // Pod eviction timeout 100, // Eviction limiter QPS 100, // Secondary eviction limiter QPS 100, // Large cluster threshold
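
The hunks above delete both the registration of --pod-eviction-timeout and the call that marked it deprecated in cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go. For context, the pattern being removed looks roughly like the standalone sketch below, written against github.com/spf13/pflag. The flag name and the usage/deprecation wording are taken from the removed lines; the default value, the flag set name, and the main() wiring are illustrative assumptions, not part of this patch.

// Minimal sketch (not part of this patch) of the flag registration and
// deprecation pattern that the diff removes.
package main

import (
	"fmt"
	"time"

	"github.com/spf13/pflag"
)

func main() {
	fs := pflag.NewFlagSet("kube-controller-manager", pflag.ExitOnError)

	// Before this patch: the flag was still registered on the flag set
	// (default value here is illustrative only).
	var podEvictionTimeout time.Duration
	fs.DurationVar(&podEvictionTimeout, "pod-eviction-timeout", 5*time.Minute,
		"The grace period for deleting pods on failed nodes.")

	// ...but it was marked deprecated, so passing it only printed a warning.
	if err := fs.MarkDeprecated("pod-eviction-timeout",
		"Since taint manager is always enabled, this flag will have no effect."); err != nil {
		panic(err)
	}

	// After this patch both calls are gone: kube-controller-manager no longer
	// accepts the flag, and NodeLifecycleControllerConfiguration no longer has
	// a PodEvictionTimeout field, so callers of NewNodeLifecycleController
	// (see the test and integration hunks above) drop that argument as well.
	_ = fs.Parse([]string{"--pod-eviction-timeout=2m"})
	fmt.Println("parsed (deprecated) value:", podEvictionTimeout)
}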