diff --git a/test/e2e/scheduling/BUILD b/test/e2e/scheduling/BUILD
index 653bac5ecbf..fa07f98d381 100644
--- a/test/e2e/scheduling/BUILD
+++ b/test/e2e/scheduling/BUILD
@@ -10,7 +10,6 @@ go_library(
         "predicates.go",
         "preemption.go",
         "priorities.go",
-        "taint_based_evictions.go",
         "taints.go",
         "ubernetes_lite.go",
         "ubernetes_lite_volumes.go",
@@ -22,14 +21,12 @@ go_library(
         "//pkg/apis/extensions:go_default_library",
         "//pkg/apis/scheduling:go_default_library",
         "//pkg/scheduler/algorithm/priorities/util:go_default_library",
-        "//pkg/scheduler/api:go_default_library",
         "//staging/src/k8s.io/api/apps/v1:go_default_library",
         "//staging/src/k8s.io/api/core/v1:go_default_library",
         "//staging/src/k8s.io/api/scheduling/v1:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
-        "//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
diff --git a/test/e2e/scheduling/taint_based_evictions.go b/test/e2e/scheduling/taint_based_evictions.go
deleted file mode 100644
index 0727850bbdb..00000000000
--- a/test/e2e/scheduling/taint_based_evictions.go
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package scheduling
-
-import (
-	"errors"
-	"fmt"
-	"time"
-
-	"k8s.io/api/core/v1"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/fields"
-	clientset "k8s.io/client-go/kubernetes"
-	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
-	"k8s.io/kubernetes/test/e2e/framework"
-	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
-	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
-
-	"github.com/onsi/ginkgo"
-)
-
-func newUnreachableNoExecuteTaint() *v1.Taint {
-	return &v1.Taint{
-		Key: schedulerapi.TaintNodeUnreachable,
-		Effect: v1.TaintEffectNoExecute,
-	}
-}
-
-func getTolerationSeconds(tolerations []v1.Toleration) (int64, error) {
-	for _, t := range tolerations {
-		if t.Key == schedulerapi.TaintNodeUnreachable && t.Effect == v1.TaintEffectNoExecute && t.Operator == v1.TolerationOpExists {
-			return *t.TolerationSeconds, nil
-		}
-	}
-	return 0, errors.New("cannot find toleration")
-}
-
-var _ = SIGDescribe("TaintBasedEvictions [Serial]", func() {
-	f := framework.NewDefaultFramework("sched-taint-based-evictions")
-	var cs clientset.Interface
-	var ns string
-
-	ginkgo.BeforeEach(func() {
-		cs = f.ClientSet
-		ns = f.Namespace.Name
-		// skip if TaintBasedEvictions is not enabled
-		// TODO(Huang-Wei): remove this when TaintBasedEvictions is GAed
-		framework.SkipUnlessTaintBasedEvictionsEnabled()
-		// it's required to run on a cluster that has more than 1 node
-		// otherwise node lifecycle manager enters a fully disruption mode
-		framework.SkipUnlessNodeCountIsAtLeast(2)
-	})
-
-	// This test verifies that when a node becomes unreachable
-	// 1. node lifecycle manager generate a status change: [NodeReady=true, status=ConditionUnknown]
-	// 1. it's applied with node.kubernetes.io/unreachable=:NoExecute taint
-	// 2. pods without toleration are applied with toleration with tolerationSeconds=300
-	// 3. pods with toleration and without tolerationSeconds won't be modified, and won't be evicted
-	// 4. pods with toleration and with tolerationSeconds won't be modified, and will be evicted after tolerationSeconds
-	// When network issue recovers, it's expected to see:
-	// 5. node lifecycle manager generate a status change: [NodeReady=true, status=ConditionTrue]
-	// 6. node.kubernetes.io/unreachable=:NoExecute taint is taken off the node
-	ginkgo.It("Checks that the node becomes unreachable", func() {
-		framework.SkipUnlessSSHKeyPresent()
-
-		// find an available node
-		nodeName := GetNodeThatCanRunPod(f)
-		ginkgo.By("Finding an available node " + nodeName)
-
-		// pod0 is a pod with unschedulable=:NoExecute toleration, and tolerationSeconds=0s
-		// pod1 is a pod with unschedulable=:NoExecute toleration, and tolerationSeconds=200s
-		// pod2 is a pod without any toleration
-		base := "taint-based-eviction"
-		tolerationSeconds := []int64{0, 200}
-		numPods := len(tolerationSeconds) + 1
-		ginkgo.By(fmt.Sprintf("Preparing %v pods", numPods))
-		pods := make([]*v1.Pod, numPods)
-		zero := int64(0)
-		// build pod0, pod1
-		for i := 0; i < numPods-1; i++ {
-			pods[i] = createPausePod(f, pausePodConfig{
-				Name: fmt.Sprintf("%v-%v", base, i),
-				NodeName: nodeName,
-				Tolerations: []v1.Toleration{
-					{
-						Key: schedulerapi.TaintNodeUnreachable,
-						Operator: v1.TolerationOpExists,
-						Effect: v1.TaintEffectNoExecute,
-						TolerationSeconds: &tolerationSeconds[i],
-					},
-				},
-				DeletionGracePeriodSeconds: &zero,
-			})
-		}
-		// build pod2
-		pods[numPods-1] = createPausePod(f, pausePodConfig{
-			Name: fmt.Sprintf("%v-%v", base, numPods-1),
-			NodeName: nodeName,
-		})
-
-		ginkgo.By("Verifying all pods are running properly")
-		for _, pod := range pods {
-			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(cs, pod))
-		}
-
-		// get the node API object
-		nodeSelector := fields.OneTermEqualSelector("metadata.name", nodeName)
-		nodeList, err := cs.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: nodeSelector.String()})
-		if err != nil || len(nodeList.Items) != 1 {
-			framework.Failf("expected no err, got %v; expected len(nodes) = 1, got %v", err, len(nodeList.Items))
-		}
-		node := nodeList.Items[0]
-
-		ginkgo.By(fmt.Sprintf("Blocking traffic from node %s to the master", nodeName))
-		host, err := e2enode.GetExternalIP(&node)
-		if err != nil {
-			host, err = e2enode.GetInternalIP(&node)
-		}
-		framework.ExpectNoError(err)
-		masterAddresses := framework.GetAllMasterAddresses(cs)
-		taint := newUnreachableNoExecuteTaint()
-
-		defer func() {
-			ginkgo.By(fmt.Sprintf("Unblocking traffic from node %s to the master", node.Name))
-			for _, masterAddress := range masterAddresses {
-				framework.UnblockNetwork(host, masterAddress)
-			}
-
-			if ginkgo.CurrentGinkgoTestDescription().Failed {
-				framework.Failf("Current e2e test has failed, so return from here.")
-				return
-			}
-
-			ginkgo.By(fmt.Sprintf("Expecting to see node %q becomes Ready", nodeName))
-			e2enode.WaitForNodeToBeReady(cs, nodeName, time.Minute*1)
-			ginkgo.By("Expecting to see unreachable=:NoExecute taint is taken off")
-			err := framework.WaitForNodeHasTaintOrNot(cs, nodeName, taint, false, time.Second*30)
-			framework.ExpectNoError(err)
-		}()
-
-		for _, masterAddress := range masterAddresses {
-			framework.BlockNetwork(host, masterAddress)
-		}
-
-		ginkgo.By(fmt.Sprintf("Expecting to see node %q becomes NotReady", nodeName))
-		if !e2enode.WaitForNodeToBeNotReady(cs, nodeName, time.Minute*3) {
-			framework.Failf("node %q doesn't turn to NotReady after 3 minutes", nodeName)
-		}
-		ginkgo.By("Expecting to see unreachable=:NoExecute taint is applied")
-		err = framework.WaitForNodeHasTaintOrNot(cs, nodeName, taint, true, time.Second*30)
-		framework.ExpectNoError(err)
-
-		ginkgo.By("Expecting pod0 to be evicted immediately")
-		err = e2epod.WaitForPodCondition(cs, ns, pods[0].Name, "pod0 terminating", time.Second*15, func(pod *v1.Pod) (bool, error) {
-			// as node is unreachable, pod0 is expected to be in Terminating status
-			// rather than getting deleted
-			if pod.DeletionTimestamp != nil {
-				return true, nil
-			}
-			return false, nil
-		})
-		framework.ExpectNoError(err)
-
-		ginkgo.By("Expecting pod2 to be updated with a toleration with tolerationSeconds=300")
-		err = e2epod.WaitForPodCondition(cs, ns, pods[2].Name, "pod2 updated with tolerationSeconds=300", time.Second*15, func(pod *v1.Pod) (bool, error) {
-			if seconds, err := getTolerationSeconds(pod.Spec.Tolerations); err == nil {
-				return seconds == 300, nil
-			}
-			return false, nil
-		})
-		framework.ExpectNoError(err)
-
-		ginkgo.By("Expecting pod1 to be unchanged")
-		livePod1, err := cs.CoreV1().Pods(pods[1].Namespace).Get(pods[1].Name, metav1.GetOptions{})
-		framework.ExpectNoError(err)
-		seconds, err := getTolerationSeconds(livePod1.Spec.Tolerations)
-		framework.ExpectNoError(err)
-		if seconds != 200 {
-			framework.Failf("expect tolerationSeconds of pod1 is 200, but got %v", seconds)
-		}
-	})
-})
diff --git a/test/integration/scheduler/BUILD b/test/integration/scheduler/BUILD
index a251fd7b98c..6bd3b8c71dd 100644
--- a/test/integration/scheduler/BUILD
+++ b/test/integration/scheduler/BUILD
@@ -35,6 +35,7 @@ go_test(
         "//pkg/scheduler/framework/v1alpha1:go_default_library",
         "//pkg/scheduler/nodeinfo:go_default_library",
         "//pkg/scheduler/testing:go_default_library",
+        "//plugin/pkg/admission/defaulttolerationseconds:go_default_library",
         "//plugin/pkg/admission/podtolerationrestriction:go_default_library",
         "//plugin/pkg/admission/podtolerationrestriction/apis/podtolerationrestriction:go_default_library",
         "//staging/src/k8s.io/api/core/v1:go_default_library",
@@ -48,6 +49,7 @@ go_test(
         "//staging/src/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
+        "//staging/src/k8s.io/apiserver/pkg/admission:go_default_library",
         "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
         "//staging/src/k8s.io/client-go/informers:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
@@ -56,6 +58,7 @@ go_test(
         "//staging/src/k8s.io/client-go/tools/cache:go_default_library",
         "//staging/src/k8s.io/client-go/tools/events:go_default_library",
         "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library",
+        "//test/e2e/framework/pod:go_default_library",
         "//test/integration/framework:go_default_library",
         "//test/utils:go_default_library",
         "//test/utils/image:go_default_library",
diff --git a/test/integration/scheduler/taint_test.go b/test/integration/scheduler/taint_test.go
index 5f42fe35eef..cd5086453b0 100644
--- a/test/integration/scheduler/taint_test.go
+++ b/test/integration/scheduler/taint_test.go
@@ -27,6 +27,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apiserver/pkg/admission"
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
@@ -36,8 +37,11 @@ import (
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/scheduler/algorithmprovider"
 	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
+	"k8s.io/kubernetes/plugin/pkg/admission/defaulttolerationseconds"
 	"k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction"
 	pluginapi "k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction/apis/podtolerationrestriction"
+	"k8s.io/kubernetes/test/e2e/framework/pod"
+	imageutils "k8s.io/kubernetes/test/utils/image"
 )
 
 func newPod(nsName, name string, req, limit v1.ResourceList) *v1.Pod {
@@ -571,3 +575,230 @@ func TestTaintNodeByCondition(t *testing.T) {
 		})
 	}
 }
+
+// TestTaintBasedEvictions tests related cases for the TaintBasedEvictions feature
+func TestTaintBasedEvictions(t *testing.T) {
+	// we need at least 2 nodes to prevent lifecycle manager from entering "fully-disrupted" mode
+	nodeCount := 3
+	zero := int64(0)
+	gracePeriod := int64(1)
+	testPod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "testpod1", DeletionGracePeriodSeconds: &zero},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{
+				{Name: "container", Image: imageutils.GetPauseImageName()},
+			},
+			Tolerations: []v1.Toleration{
+				{
+					Key: schedulerapi.TaintNodeNotReady,
+					Operator: v1.TolerationOpExists,
+					Effect: v1.TaintEffectNoExecute,
+				},
+			},
+			TerminationGracePeriodSeconds: &gracePeriod,
+		},
+	}
+	tolerationSeconds := []int64{200, 300, 0}
+	tests := []struct {
+		name string
+		nodeTaints []v1.Taint
+		nodeConditions []v1.NodeCondition
+		pod *v1.Pod
+		waitForPodCondition string
+	}{
+		{
+			name: "Taint based evictions for NodeNotReady and 200 tolerationseconds",
+			nodeTaints: []v1.Taint{{Key: schedulerapi.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
+			nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
+			pod: testPod,
+			waitForPodCondition: "updated with tolerationSeconds of 200",
+		},
+		{
+			name: "Taint based evictions for NodeNotReady with no pod tolerations",
+			nodeTaints: []v1.Taint{{Key: schedulerapi.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
+			nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
+			pod: &v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{Name: "testpod1"},
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{Name: "container", Image: imageutils.GetPauseImageName()},
+					},
+				},
+			},
+			waitForPodCondition: "updated with tolerationSeconds=300",
+		},
+		{
+			name: "Taint based evictions for NodeNotReady and 0 tolerationseconds",
+			nodeTaints: []v1.Taint{{Key: schedulerapi.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
+			nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
+			pod: testPod,
+			waitForPodCondition: "terminating",
+		},
+		{
+			name: "Taint based evictions for NodeUnreachable",
+			nodeTaints: []v1.Taint{{Key: schedulerapi.TaintNodeUnreachable, Effect: v1.TaintEffectNoExecute}},
+			nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionUnknown}},
+		},
+	}
+
+	// Enable TaintBasedEvictions
+	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.TaintBasedEvictions, true)()
+	// ApplyFeatureGates() is called to ensure TaintNodesByCondition related logic is applied/restored properly.
+	defer algorithmprovider.ApplyFeatureGates()()
+
+	// Build admission chain handler.
+	podTolerations := podtolerationrestriction.NewPodTolerationsPlugin(&pluginapi.Configuration{})
+	admission := admission.NewChainHandler(
+		podTolerations,
+		defaulttolerationseconds.NewDefaultTolerationSeconds(),
+	)
+	for i, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			context := initTestMaster(t, "taint-based-evictions", admission)
+			// Build clientset and informers for controllers.
+			externalClientset := kubernetes.NewForConfigOrDie(&restclient.Config{
+				QPS: -1,
+				Host: context.httpServer.URL,
+				ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
+			externalInformers := informers.NewSharedInformerFactory(externalClientset, time.Second)
+			podTolerations.SetExternalKubeClientSet(externalClientset)
+			podTolerations.SetExternalKubeInformerFactory(externalInformers)
+
+			context = initTestScheduler(t, context, true, nil)
+			cs := context.clientSet
+			informers := context.informerFactory
+			_, err := cs.CoreV1().Namespaces().Create(context.ns)
+			if err != nil {
+				t.Errorf("Failed to create namespace %+v", err)
+			}
+
+			// Start NodeLifecycleController for taint.
+			nc, err := nodelifecycle.NewNodeLifecycleController(
+				informers.Coordination().V1beta1().Leases(),
+				informers.Core().V1().Pods(),
+				informers.Core().V1().Nodes(),
+				informers.Apps().V1().DaemonSets(),
+				cs,
+				5*time.Second, // Node monitor grace period
+				time.Minute, // Node startup grace period
+				time.Millisecond, // Node monitor period
+				time.Second, // Pod eviction timeout
+				100, // Eviction limiter QPS
+				100, // Secondary eviction limiter QPS
+				50, // Large cluster threshold
+				0.55, // Unhealthy zone threshold
+				true, // Run taint manager
+				true, // Use taint based evictions
+				false, // Enabled TaintNodeByCondition feature
+			)
+			if err != nil {
+				t.Errorf("Failed to create node controller: %v", err)
+				return
+			}
+
+			go nc.Run(context.stopCh)
+
+			// Waiting for all controller sync.
+			externalInformers.Start(context.stopCh)
+			externalInformers.WaitForCacheSync(context.stopCh)
+			informers.Start(context.stopCh)
+			informers.WaitForCacheSync(context.stopCh)
+
+			nodeRes := v1.ResourceList{
+				v1.ResourceCPU: resource.MustParse("4000m"),
+				v1.ResourceMemory: resource.MustParse("16Gi"),
+				v1.ResourcePods: resource.MustParse("110"),
+			}
+
+			var nodes []*v1.Node
+			for i := 0; i < nodeCount; i++ {
+				nodes = append(nodes, &v1.Node{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: fmt.Sprintf("node-%d", i),
+						Labels: map[string]string{v1.LabelZoneRegion: "region1", v1.LabelZoneFailureDomain: "zone1"},
+					},
+					Spec: v1.NodeSpec{},
+					Status: v1.NodeStatus{
+						Capacity: nodeRes,
+						Allocatable: nodeRes,
+						Conditions: []v1.NodeCondition{
+							{
+								Type: v1.NodeReady,
+								Status: v1.ConditionTrue,
+							},
+						},
+					},
+				})
+				if _, err := cs.CoreV1().Nodes().Create(nodes[i]); err != nil {
+					t.Errorf("Failed to create node, err: %v", err)
+				}
+			}
+			neededNode := nodes[1]
+			if test.pod != nil {
+				test.pod.Name = fmt.Sprintf("testpod-%d", i)
+				if len(test.pod.Spec.Tolerations) > 0 {
+					test.pod.Spec.Tolerations[0].TolerationSeconds = &tolerationSeconds[i]
+				}
+
+				test.pod, err = cs.CoreV1().Pods(context.ns.Name).Create(test.pod)
+				if err != nil {
+					t.Fatalf("Test Failed: error: %v, while creating pod", err)
+				}
+
+				if err := waitForPodToSchedule(cs, test.pod); err != nil {
+					t.Errorf("Failed to schedule pod %s/%s on the node, err: %v",
+						test.pod.Namespace, test.pod.Name, err)
+				}
+				test.pod, err = cs.CoreV1().Pods(context.ns.Name).Get(test.pod.Name, metav1.GetOptions{})
+				if err != nil {
+					t.Fatalf("Test Failed: error: %v, while getting pod", err)
+				}
+				neededNode, err = cs.CoreV1().Nodes().Get(test.pod.Spec.NodeName, metav1.GetOptions{})
+				if err != nil {
+					t.Fatalf("Error while getting node associated with pod %v with err %v", test.pod.Name, err)
+				}
+			}
+
+			neededNode.Status.Conditions = test.nodeConditions
+			// Update node condition.
+			err = updateNodeStatus(cs, neededNode)
+			if err != nil {
+				t.Fatalf("Cannot update node: %v", err)
+			}
+
+			if err := waitForNodeTaints(cs, neededNode, test.nodeTaints); err != nil {
+				t.Errorf("Failed to taint node in test %d <%s>, err: %v", i, neededNode.Name, err)
+			}
+
+			if test.pod != nil {
+				err = pod.WaitForPodCondition(cs, context.ns.Name, test.pod.Name, test.waitForPodCondition, time.Second*15, func(pod *v1.Pod) (bool, error) {
+					// as the node is tainted, a pod with tolerationSeconds=0 is expected to be in
+					// Terminating status rather than getting deleted
+					if tolerationSeconds[i] == 0 {
+						return pod.DeletionTimestamp != nil, nil
+					}
+					if seconds, err := getTolerationSeconds(pod.Spec.Tolerations); err == nil {
+						return seconds == tolerationSeconds[i], nil
+					}
+					return false, nil
+				})
+				if err != nil {
+					pod, _ := cs.CoreV1().Pods(context.ns.Name).Get(test.pod.Name, metav1.GetOptions{})
+					t.Fatalf("Error: %v, Expected test pod to be %s but it's %v", err, test.waitForPodCondition, pod)
+				}
+				cleanupPods(cs, t, []*v1.Pod{test.pod})
+			}
+			cleanupNodes(cs, t)
+			waitForSchedulerCacheCleanup(context.scheduler, t)
+		})
+	}
+}
+
+func getTolerationSeconds(tolerations []v1.Toleration) (int64, error) {
+	for _, t := range tolerations {
+		if t.Key == schedulerapi.TaintNodeNotReady && t.Effect == v1.TaintEffectNoExecute && t.Operator == v1.TolerationOpExists {
+			return *t.TolerationSeconds, nil
+		}
+	}
+	return 0, fmt.Errorf("cannot find toleration")
+}
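Both the removed e2e test and the new integration test ultimately assert the same thing: once a NoExecute taint lands on the node, a pod either carries an explicit tolerationSeconds or is defaulted to 300s by the DefaultTolerationSeconds admission plugin, and it is evicted when that window expires. The standalone sketch below is not part of the patch; the helper name tolerationSecondsFor is illustrative, and the taint key is written out as the literal string rather than the schedulerapi constant. It only mirrors the toleration lookup that the getTolerationSeconds helpers in both tests perform.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// tolerationSecondsFor mirrors the tests' getTolerationSeconds helpers: it reports
// how long a pod tolerates the given NoExecute taint before it is evicted.
func tolerationSecondsFor(p *v1.Pod, taintKey string) (int64, bool) {
	for _, t := range p.Spec.Tolerations {
		if t.Key == taintKey && t.Effect == v1.TaintEffectNoExecute && t.Operator == v1.TolerationOpExists && t.TolerationSeconds != nil {
			return *t.TolerationSeconds, true
		}
	}
	return 0, false
}

func main() {
	// A pod as it looks after the DefaultTolerationSeconds admission plugin has
	// defaulted it: it tolerates the not-ready taint for 300s before eviction.
	seconds := int64(300)
	p := &v1.Pod{
		Spec: v1.PodSpec{
			Tolerations: []v1.Toleration{
				{
					Key:               "node.kubernetes.io/not-ready",
					Operator:          v1.TolerationOpExists,
					Effect:            v1.TaintEffectNoExecute,
					TolerationSeconds: &seconds,
				},
			},
		},
	}
	if s, ok := tolerationSecondsFor(p, "node.kubernetes.io/not-ready"); ok {
		fmt.Printf("pod is evicted %ds after the taint is applied\n", s)
	}
}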