diff --git a/test/e2e/lifecycle/cluster_upgrade.go b/test/e2e/lifecycle/cluster_upgrade.go
index d70a62d67a7..3a2d7ce7235 100644
--- a/test/e2e/lifecycle/cluster_upgrade.go
+++ b/test/e2e/lifecycle/cluster_upgrade.go
@@ -54,6 +54,10 @@ var upgradeTests = []upgrades.Test{
 	&upgrades.AppArmorUpgradeTest{},
 }
 
+var gpuUpgradeTests = []upgrades.Test{
+	&upgrades.NvidiaGPUUpgradeTest{}, // used by the "gpu Upgrade" suite below
+}
+
 var statefulsetUpgradeTests = []upgrades.Test{
 	&upgrades.MySqlUpgradeTest{},
 	&upgrades.EtcdUpgradeTest{},
@@ -256,6 +260,52 @@ var _ = SIGDescribe("ingress Downgrade [Feature:IngressDowngrade]", func() {
 	})
 })
 
+var _ = SIGDescribe("gpu Upgrade [Feature:GPUUpgrade]", func() {
+	f := framework.NewDefaultFramework("gpu-upgrade")
+
+	// Build the frameworks for gpuUpgradeTests up front: they can only be
+	// created inside a "Describe" block.
+	testFrameworks := createUpgradeFrameworks(gpuUpgradeTests)
+	Describe("master upgrade", func() {
+		It("should NOT disrupt gpu pod [Feature:GPUMasterUpgrade]", func() {
+			upgCtx, err := getUpgradeContext(f.ClientSet.Discovery(), framework.TestContext.UpgradeTarget)
+			framework.ExpectNoError(err)
+			// JUnit suite holding one test case that stands for the upgrade step itself.
+			testSuite := &junit.TestSuite{Name: "GPU master upgrade"}
+			gpuUpgradeTest := &junit.TestCase{Name: "[sig-node] gpu-master-upgrade", Classname: "upgrade_tests"}
+			testSuite.TestCases = append(testSuite.TestCases, gpuUpgradeTest)
+			upgradeFunc := func() {
+				start := time.Now()
+				defer finalizeUpgradeTest(start, gpuUpgradeTest)
+				target := upgCtx.Versions[1].Version.String() // NOTE(review): Versions[1] is presumably the target version — confirm
+				framework.ExpectNoError(framework.MasterUpgrade(target))
+				framework.ExpectNoError(framework.CheckMasterVersion(f.ClientSet, target))
+			}
+			runUpgradeSuite(f, gpuUpgradeTests, testFrameworks, testSuite, upgCtx, upgrades.MasterUpgrade, upgradeFunc)
+		})
+	})
+	Describe("cluster upgrade", func() {
+		It("should be able to run gpu pod after upgrade [Feature:GPUClusterUpgrade]", func() {
+			upgCtx, err := getUpgradeContext(f.ClientSet.Discovery(), framework.TestContext.UpgradeTarget)
+			framework.ExpectNoError(err)
+			// JUnit suite holding one test case that stands for the upgrade step itself.
+			testSuite := &junit.TestSuite{Name: "GPU cluster upgrade"}
+			gpuUpgradeTest := &junit.TestCase{Name: "[sig-node] gpu-cluster-upgrade", Classname: "upgrade_tests"}
+			testSuite.TestCases = append(testSuite.TestCases, gpuUpgradeTest)
+			upgradeFunc := func() {
+				start := time.Now()
+				defer finalizeUpgradeTest(start, gpuUpgradeTest)
+				target := upgCtx.Versions[1].Version.String() // NOTE(review): Versions[1] is presumably the target version — confirm
+				framework.ExpectNoError(framework.MasterUpgrade(target))
+				framework.ExpectNoError(framework.CheckMasterVersion(f.ClientSet, target))
+				framework.ExpectNoError(framework.NodeUpgrade(f, target, framework.TestContext.UpgradeImage))
+				framework.ExpectNoError(framework.CheckNodesVersions(f.ClientSet, target))
+			}
+			runUpgradeSuite(f, gpuUpgradeTests, testFrameworks, testSuite, upgCtx, upgrades.ClusterUpgrade, upgradeFunc)
+		})
+	})
+})
+
 var _ = Describe("[sig-apps] stateful Upgrade [Feature:StatefulUpgrade]", func() {
 	f := framework.NewDefaultFramework("stateful-upgrade")
 
diff --git a/test/e2e/upgrades/BUILD b/test/e2e/upgrades/BUILD
index a3db3528724..142471ea1b4 100644
--- a/test/e2e/upgrades/BUILD
+++ b/test/e2e/upgrades/BUILD
@@ -16,6 +16,7 @@ go_library(
         "ingress.go",
         "kube_proxy_migration.go",
         "mysql.go",
+        "nvidia-gpu.go",
         "secrets.go",
         "services.go",
         "sysctl.go",
@@ -28,6 +29,7 @@ go_library(
         "//pkg/util/version:go_default_library",
         "//test/e2e/common:go_default_library",
         "//test/e2e/framework:go_default_library",
+        "//test/e2e/scheduling:go_default_library",
         "//test/utils/image:go_default_library",
         "//vendor/github.com/davecgh/go-spew/spew:go_default_library",
         "//vendor/github.com/onsi/ginkgo:go_default_library",
@@ -38,6 +40,7 @@ go_library(
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
diff --git a/test/e2e/upgrades/nvidia-gpu.go b/test/e2e/upgrades/nvidia-gpu.go
new file mode 100644
index 00000000000..12abd0e34ac
--- /dev/null
+++ b/test/e2e/upgrades/nvidia-gpu.go
@@ -0,0 +1,112 @@
+/*
+Copyright 2018 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package upgrades
+
+import (
+	"regexp"
+	"time"
+
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/kubernetes/test/e2e/framework"
+	"k8s.io/kubernetes/test/e2e/scheduling"
+	imageutils "k8s.io/kubernetes/test/utils/image"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+// NvidiaGPUUpgradeTest is an upgrade test for gpu resources: it starts a gpu job
+// before the upgrade and verifies that a pod of that job succeeds after it.
+type NvidiaGPUUpgradeTest struct {
+}
+// Name returns the tracking name of the test.
+func (NvidiaGPUUpgradeTest) Name() string { return "nvidia-gpu-upgrade [sig-node] [sig-scheduling]" }
+
+// Setup calls scheduling.SetupNVIDIAGPUNode and then creates a job requesting gpu.
+func (t *NvidiaGPUUpgradeTest) Setup(f *framework.Framework) {
+	scheduling.SetupNVIDIAGPUNode(f, false)
+	By("Creating a job requesting gpu")
+	t.startJob(f)
+}
+
+// Test waits for the upgrade to complete, then verifies that a cuda pod of the gpu
+// job finishes successfully; after a master upgrade no job pod may have failed.
+func (t *NvidiaGPUUpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade UpgradeType) {
+	// Block until the upgrade has finished.
+	<-done
+	By("Verifying gpu job success")
+	t.verifyJobPodSuccess(f)
+	if upgrade != MasterUpgrade {
+		return
+	}
+	// A master upgrade is expected to be hitless, so no job pod may have failed.
+	gpuJob, getErr := framework.GetJob(f.ClientSet, f.Namespace.Name, "cuda-add")
+	Expect(getErr).NotTo(HaveOccurred())
+	Expect(gpuJob.Status.Failed).To(BeZero(), "Job pods failed during master upgrade: %v", gpuJob.Status.Failed)
+}
+
+// Teardown performs no explicit cleanup.
+func (t *NvidiaGPUUpgradeTest) Teardown(f *framework.Framework) {
+	// Nothing to do here: namespace deletion is relied upon to remove everything.
+}
+
+// startJob submits the "cuda-add" job, whose pods request one gpu and run a simple
+// cuda container, and waits until one of its pods is running.
+func (t *NvidiaGPUUpgradeTest) startJob(f *framework.Framework) {
+	ns := f.Namespace.Name
+	activeDeadline := int64(3600)
+	// 100 completions make sure the job's life spans across the upgrade.
+	gpuJob := framework.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, 100, &activeDeadline, 6)
+	gpuLimits := v1.ResourceList{framework.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI)}
+	cudaContainer := v1.Container{
+		Name:      "vector-addition",
+		Image:     imageutils.GetE2EImage(imageutils.CudaVectorAdd),
+		Command:   []string{"/bin/sh", "-c", "./vectorAdd && sleep 60"},
+		Resources: v1.ResourceRequirements{Limits: gpuLimits},
+	}
+	// The pod spec built by NewTestJob is replaced wholesale by the cuda pod spec.
+	gpuJob.Spec.Template.Spec = v1.PodSpec{
+		RestartPolicy: v1.RestartPolicyOnFailure,
+		Containers:    []v1.Container{cudaContainer},
+	}
+	_, err := framework.CreateJob(f.ClientSet, ns, gpuJob)
+	Expect(err).NotTo(HaveOccurred())
+	framework.Logf("Created job %v", gpuJob)
+	By("Waiting for gpu job pod start")
+	err = framework.WaitForAllJobPodsRunning(f.ClientSet, ns, gpuJob.Name, 1)
+	Expect(err).NotTo(HaveOccurred())
+	By("Done with gpu job pod start")
+}
+
+// verifyJobPodSuccess verifies that the started cuda pod successfully passes.
+func (t *NvidiaGPUUpgradeTest) verifyJobPodSuccess(f *framework.Framework) {
+	ns := f.Namespace.Name
+	// A pod of the job has to be running before any job pod is inspected.
+	framework.ExpectNoError(framework.WaitForAllJobPodsRunning(f.ClientSet, ns, "cuda-add", 1))
+	pods, err := framework.GetJobPods(f.ClientSet, ns, "cuda-add")
+	framework.ExpectNoError(err)
+	// Fail with a readable message instead of panicking on an empty pod list.
+	Expect(pods.Items).NotTo(BeEmpty(), "no pods found for job cuda-add in namespace %s", ns)
+	createdPod := pods.Items[0].Name
+	framework.Logf("Created pod %v", createdPod)
+	f.PodClient().WaitForSuccess(createdPod, 5*time.Minute)
+	logs, err := framework.GetPodLogs(f.ClientSet, ns, createdPod, "vector-addition")
+	framework.ExpectNoError(err, "Should be able to get pod logs")
+	framework.Logf("Got pod logs: %v", logs)
+	Expect(regexp.MustCompile("PASSED").MatchString(logs)).To(BeTrue(), "logs of pod %s do not contain PASSED", createdPod)
+}