fix nodelifecyle controller not add NoExecute taint bug

This commit is contained in:
Hao Yuan 2020-11-26 09:55:43 +08:00 committed by howieyuen
parent 87984d84d1
commit 5569db4902
4 changed files with 235 additions and 27 deletions

View File

@ -1477,13 +1477,13 @@ func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStat
defer nc.evictorLock.Unlock() defer nc.evictorLock.Unlock()
if status == v1.ConditionFalse { if status == v1.ConditionFalse {
if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].SetRemove(node.Name) nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name)
} }
} }
if status == v1.ConditionUnknown { if status == v1.ConditionUnknown {
if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].SetRemove(node.Name) nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name)
} }
} }

View File

@ -2782,6 +2782,239 @@ func TestApplyNoExecuteTaints(t *testing.T) {
} }
} }
// TestApplyNoExecuteTaintsToNodesEnqueueTwice ensures we taint every node with NoExecute even if enqueued twice
func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) {
fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
fakeNodeHandler := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
// Unreachable Taint with effect 'NoExecute' should be applied to this node.
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
// Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
// we need second healthy node in tests.
{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
// NotReady Taint with NoExecute effect should be applied to this node.
{
ObjectMeta: metav1.ObjectMeta{
Name: "node2",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
}
healthyNodeNewStatus := v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
}
nodeController, _ := newNodeLifecycleControllerFromClient(
fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
testUnhealthyThreshold,
testNodeMonitorGracePeriod,
testNodeStartupGracePeriod,
testNodeMonitorPeriod,
true)
nodeController.now = func() metav1.Time { return fakeNow }
nodeController.recorder = testutil.NewFakeRecorder()
nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
t.Errorf("unexpected error: %v", err)
}
// 1. monitor node health twice, add untainted node once
if err := nodeController.monitorNodeHealth(); err != nil {
t.Errorf("unexpected error: %v", err)
}
if err := nodeController.monitorNodeHealth(); err != nil {
t.Errorf("unexpected error: %v", err)
}
// 2. mark node0 healthy
node0, err := fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{})
if err != nil {
t.Errorf("Can't get current node0...")
return
}
node0.Status = healthyNodeNewStatus
_, err = fakeNodeHandler.UpdateStatus(context.TODO(), node0, metav1.UpdateOptions{})
if err != nil {
t.Errorf(err.Error())
return
}
// add other notReady nodes
fakeNodeHandler.Existing = append(fakeNodeHandler.Existing, []*v1.Node{
// Unreachable Taint with effect 'NoExecute' should be applied to this node.
{
ObjectMeta: metav1.ObjectMeta{
Name: "node3",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
// Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
// we need second healthy node in tests.
{
ObjectMeta: metav1.ObjectMeta{
Name: "node4",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
// NotReady Taint with NoExecute effect should be applied to this node.
{
ObjectMeta: metav1.ObjectMeta{
Name: "node5",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
}...)
if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
t.Errorf("unexpected error: %v", err)
}
// 3. start monitor node health again, add untainted node twice, construct UniqueQueue with duplicated node cache
if err := nodeController.monitorNodeHealth(); err != nil {
t.Errorf("unexpected error: %v", err)
}
// 4. do NoExecute taint pass
// when processing with node0, condition.Status is NodeReady, and return true with default case
// then remove the set value and queue value both, the taint job never stuck
nodeController.doNoExecuteTaintingPass()
// 5. get node3 and node5, see if it has ready got NoExecute taint
node3, err := fakeNodeHandler.Get(context.TODO(), "node3", metav1.GetOptions{})
if err != nil {
t.Errorf("Can't get current node3...")
return
}
if !taintutils.TaintExists(node3.Spec.Taints, UnreachableTaintTemplate) || len(node3.Spec.Taints) == 0 {
t.Errorf("Not found taint %v in %v, which should be present in %s", UnreachableTaintTemplate, node3.Spec.Taints, node3.Name)
}
node5, err := fakeNodeHandler.Get(context.TODO(), "node5", metav1.GetOptions{})
if err != nil {
t.Errorf("Can't get current node5...")
return
}
if !taintutils.TaintExists(node5.Spec.Taints, NotReadyTaintTemplate) || len(node5.Spec.Taints) == 0 {
t.Errorf("Not found taint %v in %v, which should be present in %s", NotReadyTaintTemplate, node5.Spec.Taints, node5.Name)
}
}
func TestSwapUnreachableNotReadyTaints(t *testing.T) { func TestSwapUnreachableNotReadyTaints(t *testing.T) {
fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute evictionTimeout := 10 * time.Minute

View File

@ -194,15 +194,6 @@ func (q *UniqueQueue) Clear() {
} }
} }
// SetRemove remove value from the set if value existed
func (q *UniqueQueue) SetRemove(value string) {
q.lock.Lock()
defer q.lock.Unlock()
if q.set.Has(value) {
q.set.Delete(value)
}
}
// RateLimitedTimedQueue is a unique item priority queue ordered by // RateLimitedTimedQueue is a unique item priority queue ordered by
// the expected next time of execution. It is also rate limited. // the expected next time of execution. It is also rate limited.
type RateLimitedTimedQueue struct { type RateLimitedTimedQueue struct {
@ -289,11 +280,6 @@ func (q *RateLimitedTimedQueue) Clear() {
q.queue.Clear() q.queue.Clear()
} }
// SetRemove remove value from the set of the queue
func (q *RateLimitedTimedQueue) SetRemove(value string) {
q.queue.SetRemove(value)
}
// SwapLimiter safely swaps current limiter for this queue with the // SwapLimiter safely swaps current limiter for this queue with the
// passed one if capacities or qps's differ. // passed one if capacities or qps's differ.
func (q *RateLimitedTimedQueue) SwapLimiter(newQPS float32) { func (q *RateLimitedTimedQueue) SwapLimiter(newQPS float32) {

View File

@ -282,17 +282,6 @@ func TestClear(t *testing.T) {
} }
} }
func TestSetRemove(t *testing.T) {
evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter())
evictor.Add("first", "11111")
evictor.SetRemove("first")
if evictor.queue.set.Len() != 0 {
t.Fatalf("SetRemove should remove element from the set.")
}
}
func TestSwapLimiter(t *testing.T) { func TestSwapLimiter(t *testing.T) {
evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter())
fakeAlways := flowcontrol.NewFakeAlwaysRateLimiter() fakeAlways := flowcontrol.NewFakeAlwaysRateLimiter()