mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 03:41:45 +00:00
Merge pull request #63631 from jiayingz/upgrade-test
Automatic merge from submit-queue (batch tested with PRs 64344, 64709, 64717, 63631, 58647). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Add gpu cluster upgrade test. **What this PR does / why we need it**: **Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*: Fixes # **Special notes for your reviewer**: Currently running GPUMasterUpgrade test should pass with gpu nodes but running GPUClusterUpgrade test will run into https://github.com/kubernetes/kubernetes/issues/63506 **Release note**: ```release-note ```
This commit is contained in:
commit
0bd77a2884
@ -54,6 +54,10 @@ var upgradeTests = []upgrades.Test{
|
|||||||
&upgrades.AppArmorUpgradeTest{},
|
&upgrades.AppArmorUpgradeTest{},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var gpuUpgradeTests = []upgrades.Test{
|
||||||
|
&upgrades.NvidiaGPUUpgradeTest{},
|
||||||
|
}
|
||||||
|
|
||||||
var statefulsetUpgradeTests = []upgrades.Test{
|
var statefulsetUpgradeTests = []upgrades.Test{
|
||||||
&upgrades.MySqlUpgradeTest{},
|
&upgrades.MySqlUpgradeTest{},
|
||||||
&upgrades.EtcdUpgradeTest{},
|
&upgrades.EtcdUpgradeTest{},
|
||||||
@ -256,6 +260,52 @@ var _ = SIGDescribe("ingress Downgrade [Feature:IngressDowngrade]", func() {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
var _ = SIGDescribe("gpu Upgrade [Feature:GPUUpgrade]", func() {
|
||||||
|
f := framework.NewDefaultFramework("gpu-upgrade")
|
||||||
|
|
||||||
|
// Create the frameworks here because we can only create them
|
||||||
|
// in a "Describe".
|
||||||
|
testFrameworks := createUpgradeFrameworks(gpuUpgradeTests)
|
||||||
|
Describe("master upgrade", func() {
|
||||||
|
It("should NOT disrupt gpu pod [Feature:GPUMasterUpgrade]", func() {
|
||||||
|
upgCtx, err := getUpgradeContext(f.ClientSet.Discovery(), framework.TestContext.UpgradeTarget)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
|
testSuite := &junit.TestSuite{Name: "GPU master upgrade"}
|
||||||
|
gpuUpgradeTest := &junit.TestCase{Name: "[sig-node] gpu-master-upgrade", Classname: "upgrade_tests"}
|
||||||
|
testSuite.TestCases = append(testSuite.TestCases, gpuUpgradeTest)
|
||||||
|
upgradeFunc := func() {
|
||||||
|
start := time.Now()
|
||||||
|
defer finalizeUpgradeTest(start, gpuUpgradeTest)
|
||||||
|
target := upgCtx.Versions[1].Version.String()
|
||||||
|
framework.ExpectNoError(framework.MasterUpgrade(target))
|
||||||
|
framework.ExpectNoError(framework.CheckMasterVersion(f.ClientSet, target))
|
||||||
|
}
|
||||||
|
runUpgradeSuite(f, gpuUpgradeTests, testFrameworks, testSuite, upgCtx, upgrades.MasterUpgrade, upgradeFunc)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
Describe("cluster upgrade", func() {
|
||||||
|
It("should be able to run gpu pod after upgrade [Feature:GPUClusterUpgrade]", func() {
|
||||||
|
upgCtx, err := getUpgradeContext(f.ClientSet.Discovery(), framework.TestContext.UpgradeTarget)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
|
testSuite := &junit.TestSuite{Name: "GPU cluster upgrade"}
|
||||||
|
gpuUpgradeTest := &junit.TestCase{Name: "[sig-node] gpu-cluster-upgrade", Classname: "upgrade_tests"}
|
||||||
|
testSuite.TestCases = append(testSuite.TestCases, gpuUpgradeTest)
|
||||||
|
upgradeFunc := func() {
|
||||||
|
start := time.Now()
|
||||||
|
defer finalizeUpgradeTest(start, gpuUpgradeTest)
|
||||||
|
target := upgCtx.Versions[1].Version.String()
|
||||||
|
framework.ExpectNoError(framework.MasterUpgrade(target))
|
||||||
|
framework.ExpectNoError(framework.CheckMasterVersion(f.ClientSet, target))
|
||||||
|
framework.ExpectNoError(framework.NodeUpgrade(f, target, framework.TestContext.UpgradeImage))
|
||||||
|
framework.ExpectNoError(framework.CheckNodesVersions(f.ClientSet, target))
|
||||||
|
}
|
||||||
|
runUpgradeSuite(f, gpuUpgradeTests, testFrameworks, testSuite, upgCtx, upgrades.ClusterUpgrade, upgradeFunc)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
var _ = Describe("[sig-apps] stateful Upgrade [Feature:StatefulUpgrade]", func() {
|
var _ = Describe("[sig-apps] stateful Upgrade [Feature:StatefulUpgrade]", func() {
|
||||||
f := framework.NewDefaultFramework("stateful-upgrade")
|
f := framework.NewDefaultFramework("stateful-upgrade")
|
||||||
|
|
||||||
|
@ -16,6 +16,7 @@ go_library(
|
|||||||
"ingress.go",
|
"ingress.go",
|
||||||
"kube_proxy_migration.go",
|
"kube_proxy_migration.go",
|
||||||
"mysql.go",
|
"mysql.go",
|
||||||
|
"nvidia-gpu.go",
|
||||||
"secrets.go",
|
"secrets.go",
|
||||||
"services.go",
|
"services.go",
|
||||||
"sysctl.go",
|
"sysctl.go",
|
||||||
@ -28,6 +29,7 @@ go_library(
|
|||||||
"//pkg/util/version:go_default_library",
|
"//pkg/util/version:go_default_library",
|
||||||
"//test/e2e/common:go_default_library",
|
"//test/e2e/common:go_default_library",
|
||||||
"//test/e2e/framework:go_default_library",
|
"//test/e2e/framework:go_default_library",
|
||||||
|
"//test/e2e/scheduling:go_default_library",
|
||||||
"//test/utils/image:go_default_library",
|
"//test/utils/image:go_default_library",
|
||||||
"//vendor/github.com/davecgh/go-spew/spew:go_default_library",
|
"//vendor/github.com/davecgh/go-spew/spew:go_default_library",
|
||||||
"//vendor/github.com/onsi/ginkgo:go_default_library",
|
"//vendor/github.com/onsi/ginkgo:go_default_library",
|
||||||
@ -38,6 +40,7 @@ go_library(
|
|||||||
"//vendor/k8s.io/api/core/v1:go_default_library",
|
"//vendor/k8s.io/api/core/v1:go_default_library",
|
||||||
"//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
|
"//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
|
||||||
|
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
|
||||||
|
112
test/e2e/upgrades/nvidia-gpu.go
Normal file
112
test/e2e/upgrades/nvidia-gpu.go
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2018 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package upgrades
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"k8s.io/api/core/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/api/resource"
|
||||||
|
"k8s.io/kubernetes/test/e2e/framework"
|
||||||
|
"k8s.io/kubernetes/test/e2e/scheduling"
|
||||||
|
imageutils "k8s.io/kubernetes/test/utils/image"
|
||||||
|
|
||||||
|
. "github.com/onsi/ginkgo"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NvidiaGPUUpgradeTest is an upgrade test checking that the gpu resource is
// usable both before and after a cluster upgrade. The type has no fields.
type NvidiaGPUUpgradeTest struct{}
|
||||||
|
|
||||||
|
// Name returns the name of this upgrade test, which carries the sig-node and
// sig-scheduling tags.
func (NvidiaGPUUpgradeTest) Name() string {
	return "nvidia-gpu-upgrade [sig-node] [sig-scheduling]"
}
|
||||||
|
|
||||||
|
// Setup creates a job requesting gpu.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) Setup(f *framework.Framework) {
|
||||||
|
scheduling.SetupNVIDIAGPUNode(f, false)
|
||||||
|
By("Creating a job requesting gpu")
|
||||||
|
t.startJob(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test waits for the upgrade to complete, and then verifies that the
|
||||||
|
// cuda pod started by the gpu job can successfully finish.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade UpgradeType) {
|
||||||
|
<-done
|
||||||
|
By("Verifying gpu job success")
|
||||||
|
t.verifyJobPodSuccess(f)
|
||||||
|
if upgrade == MasterUpgrade {
|
||||||
|
// MasterUpgrade should be totally hitless.
|
||||||
|
job, err := framework.GetJob(f.ClientSet, f.Namespace.Name, "cuda-add")
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
Expect(job.Status.Failed).To(BeZero(), "Job pods failed during master upgrade: %v", job.Status.Failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Teardown cleans up any remaining resources.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) Teardown(f *framework.Framework) {
|
||||||
|
// rely on the namespace deletion to clean up everything
|
||||||
|
}
|
||||||
|
|
||||||
|
// startJob creates a job that requests gpu and runs a simple cuda container.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) startJob(f *framework.Framework) {
|
||||||
|
var activeSeconds int64 = 3600
|
||||||
|
// Specifies 100 completions to make sure the job life spans across the upgrade.
|
||||||
|
testJob := framework.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, 100, &activeSeconds, 6)
|
||||||
|
testJob.Spec.Template.Spec = v1.PodSpec{
|
||||||
|
RestartPolicy: v1.RestartPolicyOnFailure,
|
||||||
|
Containers: []v1.Container{
|
||||||
|
{
|
||||||
|
Name: "vector-addition",
|
||||||
|
Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
|
||||||
|
Command: []string{"/bin/sh", "-c", "./vectorAdd && sleep 60"},
|
||||||
|
Resources: v1.ResourceRequirements{
|
||||||
|
Limits: v1.ResourceList{
|
||||||
|
framework.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
ns := f.Namespace.Name
|
||||||
|
_, err := framework.CreateJob(f.ClientSet, ns, testJob)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
framework.Logf("Created job %v", testJob)
|
||||||
|
By("Waiting for gpu job pod start")
|
||||||
|
err = framework.WaitForAllJobPodsRunning(f.ClientSet, ns, testJob.Name, 1)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
By("Done with gpu job pod start")
|
||||||
|
}
|
||||||
|
|
||||||
|
// verifyJobPodSuccess verifies that the started cuda pod successfully passes.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) verifyJobPodSuccess(f *framework.Framework) {
|
||||||
|
// Wait for client pod to complete.
|
||||||
|
ns := f.Namespace.Name
|
||||||
|
err := framework.WaitForAllJobPodsRunning(f.ClientSet, f.Namespace.Name, "cuda-add", 1)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
pods, err := framework.GetJobPods(f.ClientSet, f.Namespace.Name, "cuda-add")
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
createdPod := pods.Items[0].Name
|
||||||
|
framework.Logf("Created pod %v", createdPod)
|
||||||
|
f.PodClient().WaitForSuccess(createdPod, 5*time.Minute)
|
||||||
|
logs, err := framework.GetPodLogs(f.ClientSet, ns, createdPod, "vector-addition")
|
||||||
|
framework.ExpectNoError(err, "Should be able to get pod logs")
|
||||||
|
framework.Logf("Got pod logs: %v", logs)
|
||||||
|
regex := regexp.MustCompile("PASSED")
|
||||||
|
Expect(regex.MatchString(logs)).To(BeTrue())
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user