diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go index 33f4c35b59c..4f7fe5e1c66 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go @@ -1477,13 +1477,13 @@ func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStat defer nc.evictorLock.Unlock() if status == v1.ConditionFalse { if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { - nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].SetRemove(node.Name) + nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name) } } if status == v1.ConditionUnknown { if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { - nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].SetRemove(node.Name) + nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name) } } diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go index acb14c9f14d..4fa3f761996 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go @@ -2782,6 +2782,239 @@ func TestApplyNoExecuteTaints(t *testing.T) { } } +// TestApplyNoExecuteTaintsToNodesEnqueueTwice ensures we taint every node with NoExecute even if enqueued twice +func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { + fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) + evictionTimeout := 10 * time.Minute + + fakeNodeHandler := &testutil.FakeNodeHandler{ + Existing: []*v1.Node{ + // Unreachable Taint with effect 'NoExecute' should be applied to this node. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady + // we need second healthy node in tests. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + // NotReady Taint with NoExecute effect should be applied to this node. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionFalse, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), + } + healthyNodeNewStatus := v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + } + nodeController, _ := newNodeLifecycleControllerFromClient( + fakeNodeHandler, + evictionTimeout, + testRateLimiterQPS, + testRateLimiterQPS, + testLargeClusterThreshold, + testUnhealthyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod, + true) + nodeController.now = func() metav1.Time { return fakeNow } + nodeController.recorder = testutil.NewFakeRecorder() + nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) + if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { + t.Errorf("unexpected error: %v", err) + } + // 1. monitor node health twice, add untainted node once + if err := nodeController.monitorNodeHealth(); err != nil { + t.Errorf("unexpected error: %v", err) + } + if err := nodeController.monitorNodeHealth(); err != nil { + t.Errorf("unexpected error: %v", err) + } + + // 2. mark node0 healthy + node0, err := fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{}) + if err != nil { + t.Errorf("Can't get current node0...") + return + } + node0.Status = healthyNodeNewStatus + _, err = fakeNodeHandler.UpdateStatus(context.TODO(), node0, metav1.UpdateOptions{}) + if err != nil { + t.Errorf(err.Error()) + return + } + + // add other notReady nodes + fakeNodeHandler.Existing = append(fakeNodeHandler.Existing, []*v1.Node{ + // Unreachable Taint with effect 'NoExecute' should be applied to this node. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node3", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady + // we need second healthy node in tests. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node4", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + // NotReady Taint with NoExecute effect should be applied to this node. + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node5", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + Labels: map[string]string{ + v1.LabelTopologyRegion: "region1", + v1.LabelTopologyZone: "zone1", + v1.LabelFailureDomainBetaRegion: "region1", + v1.LabelFailureDomainBetaZone: "zone1", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionFalse, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }...) + if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { + t.Errorf("unexpected error: %v", err) + } + // 3. start monitor node health again, add untainted node twice, construct UniqueQueue with duplicated node cache + if err := nodeController.monitorNodeHealth(); err != nil { + t.Errorf("unexpected error: %v", err) + } + + // 4. do NoExecute taint pass + // when processing with node0, condition.Status is NodeReady, and return true with default case + // then remove the set value and queue value both, the taint job never stuck + nodeController.doNoExecuteTaintingPass() + + // 5. get node3 and node5, see if it has ready got NoExecute taint + node3, err := fakeNodeHandler.Get(context.TODO(), "node3", metav1.GetOptions{}) + if err != nil { + t.Errorf("Can't get current node3...") + return + } + if !taintutils.TaintExists(node3.Spec.Taints, UnreachableTaintTemplate) || len(node3.Spec.Taints) == 0 { + t.Errorf("Not found taint %v in %v, which should be present in %s", UnreachableTaintTemplate, node3.Spec.Taints, node3.Name) + } + node5, err := fakeNodeHandler.Get(context.TODO(), "node5", metav1.GetOptions{}) + if err != nil { + t.Errorf("Can't get current node5...") + return + } + if !taintutils.TaintExists(node5.Spec.Taints, NotReadyTaintTemplate) || len(node5.Spec.Taints) == 0 { + t.Errorf("Not found taint %v in %v, which should be present in %s", NotReadyTaintTemplate, node5.Spec.Taints, node5.Name) + } +} + func TestSwapUnreachableNotReadyTaints(t *testing.T) { fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) evictionTimeout := 10 * time.Minute diff --git a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go index 26bcd29d840..e343fa59667 100644 --- a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go +++ b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go @@ -194,15 +194,6 @@ func (q *UniqueQueue) Clear() { } } -// SetRemove remove value from the set if value existed -func (q *UniqueQueue) SetRemove(value string) { - q.lock.Lock() - defer q.lock.Unlock() - if q.set.Has(value) { - q.set.Delete(value) - } -} - // RateLimitedTimedQueue is a unique item priority queue ordered by // the expected next time of execution. It is also rate limited. type RateLimitedTimedQueue struct { @@ -289,11 +280,6 @@ func (q *RateLimitedTimedQueue) Clear() { q.queue.Clear() } -// SetRemove remove value from the set of the queue -func (q *RateLimitedTimedQueue) SetRemove(value string) { - q.queue.SetRemove(value) -} - // SwapLimiter safely swaps current limiter for this queue with the // passed one if capacities or qps's differ. func (q *RateLimitedTimedQueue) SwapLimiter(newQPS float32) { diff --git a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go index 00411792df8..644b6569039 100644 --- a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go +++ b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go @@ -282,17 +282,6 @@ func TestClear(t *testing.T) { } } -func TestSetRemove(t *testing.T) { - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - - evictor.SetRemove("first") - - if evictor.queue.set.Len() != 0 { - t.Fatalf("SetRemove should remove element from the set.") - } -} - func TestSwapLimiter(t *testing.T) { evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) fakeAlways := flowcontrol.NewFakeAlwaysRateLimiter()