mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-09-14 21:53:52 +00:00
Merge pull request #127475 from dims/resurrect-gpu-tests-that-use-jobs
Resurrect GPU tests that use Jobs
This commit is contained in:
@@ -19,6 +19,7 @@ package node
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"os"
|
"os"
|
||||||
|
"regexp"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
appsv1 "k8s.io/api/apps/v1"
|
appsv1 "k8s.io/api/apps/v1"
|
||||||
@@ -32,6 +33,7 @@ import (
|
|||||||
"k8s.io/kubernetes/test/e2e/framework"
|
"k8s.io/kubernetes/test/e2e/framework"
|
||||||
e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
|
e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
|
||||||
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
|
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
|
||||||
|
e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
|
||||||
e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
|
e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
|
||||||
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
|
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
|
||||||
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
||||||
@@ -44,9 +46,13 @@ import (
|
|||||||
"github.com/onsi/gomega"
|
"github.com/onsi/gomega"
|
||||||
)
|
)
|
||||||
|
|
||||||
var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", func() {
|
// NOTE: All the tests in this file are run serially because they share a limited set of GPU(s), please inspect
|
||||||
|
// the CI job definitions to see how many GPU(s) are available in the environment
|
||||||
|
// Currently the CI jobs have 2 nodes, each with 4 Nvidia T4s, on both the GCE and AWS harnesses.
|
||||||
|
|
||||||
f := framework.NewDefaultFramework("nvidia-gpu")
|
var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Sanity test using nvidia-smi", func() {
|
||||||
|
|
||||||
|
f := framework.NewDefaultFramework("nvidia-gpu1")
|
||||||
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
|
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
|
||||||
var podClient *e2epod.PodClient
|
var podClient *e2epod.PodClient
|
||||||
|
|
||||||
@@ -72,6 +78,18 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
|
|||||||
gomega.Expect(log).To(gomega.ContainSubstring("Driver Version:"))
|
gomega.Expect(log).To(gomega.ContainSubstring("Driver Version:"))
|
||||||
gomega.Expect(log).To(gomega.ContainSubstring("CUDA Version:"))
|
gomega.Expect(log).To(gomega.ContainSubstring("CUDA Version:"))
|
||||||
})
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using a Pod", func() {
|
||||||
|
|
||||||
|
f := framework.NewDefaultFramework("nvidia-gpu2")
|
||||||
|
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
|
||||||
|
var podClient *e2epod.PodClient
|
||||||
|
|
||||||
|
ginkgo.BeforeEach(func() {
|
||||||
|
e2eskipper.SkipUnlessProviderIs("aws", "gce")
|
||||||
|
podClient = e2epod.NewPodClient(f)
|
||||||
|
})
|
||||||
|
|
||||||
f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
|
f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
|
||||||
SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
|
SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
|
||||||
@@ -91,6 +109,46 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using a Job", func() {
|
||||||
|
|
||||||
|
f := framework.NewDefaultFramework("nvidia-gpu2")
|
||||||
|
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
|
||||||
|
|
||||||
|
ginkgo.BeforeEach(func() {
|
||||||
|
e2eskipper.SkipUnlessProviderIs("aws", "gce")
|
||||||
|
})
|
||||||
|
|
||||||
|
f.It("should run gpu based jobs", func(ctx context.Context) {
|
||||||
|
SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
|
||||||
|
|
||||||
|
// Job set to have 5 completions with parallelism of 1 to ensure that it lasts long enough to experience the node recreation
|
||||||
|
completions := int32(5)
|
||||||
|
ginkgo.By("Starting GPU job")
|
||||||
|
StartJob(ctx, f, completions)
|
||||||
|
|
||||||
|
job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
|
// make sure job is running by waiting for its first pod to start running
|
||||||
|
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
|
numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
_, err = e2enode.CheckReady(ctx, f.ClientSet, numNodes, framework.NodeReadyInitialTimeout)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
|
ginkgo.By("Waiting for gpu job to finish")
|
||||||
|
err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
ginkgo.By("Done with gpu job")
|
||||||
|
|
||||||
|
gomega.Expect(job.Status.Failed).To(gomega.BeZero(), "Job pods failed during node recreation: %v", job.Status.Failed)
|
||||||
|
|
||||||
|
VerifyJobNCompletions(ctx, f, completions)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
|
func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
|
||||||
pod = podClient.Create(ctx, pod)
|
pod = podClient.Create(ctx, pod)
|
||||||
|
|
||||||
@@ -304,3 +362,109 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
|
|||||||
|
|
||||||
return rsgather
|
return rsgather
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// StartJob starts a simple CUDA job that requests gpu and the specified number of completions
//
// The job is named "cuda-add" and is created in the framework's test namespace.
// Its single container, "vector-addition", runs an inline Python (CuPy) script that
// adds vectors on the GPU, compares against a NumPy result, and prints "Test PASSED"
// at the end. Any error from job creation fails the test via framework.ExpectNoError.
func StartJob(ctx context.Context, f *framework.Framework, completions int32) {
	// Passed by pointer to NewTestJob; presumably the job's active deadline in seconds — confirm.
	var activeSeconds int64 = 3600
	// NOTE(review): the positional 1 is presumably parallelism and 6 the backoff limit — verify
	// against e2ejob.NewTestJob. The restart policy given here is superseded below because the
	// whole pod template spec is replaced.
	testJob := e2ejob.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, completions, &activeSeconds, 6)
	// Replace the generated pod spec wholesale with the GPU workload.
	testJob.Spec.Template.Spec = v1.PodSpec{
		RestartPolicy: v1.RestartPolicyOnFailure,
		Containers: []v1.Container{
			{
				Name:  "vector-addition",
				Image: "cupy/cupy:v13.3.0",
				Command: []string{
					"python3",
					"-c",
					`
import cupy as cp
import numpy as np
import time

# Set the number of elements to test
num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]

for num_elements in num_elements_list:
    # Create random input vectors on the CPU
    h_A = np.random.rand(num_elements).astype(np.float32)
    h_B = np.random.rand(num_elements).astype(np.float32)

    # Transfer the input vectors to the GPU
    d_A = cp.asarray(h_A)
    d_B = cp.asarray(h_B)

    # Perform vector addition on the GPU
    start_gpu = time.time()
    d_C = d_A + d_B
    gpu_time = time.time() - start_gpu

    # Transfer the result back to the CPU
    h_C = cp.asnumpy(d_C)

    # Compute the expected result on the CPU
    start_cpu = time.time()
    h_C_expected = h_A + h_B
    cpu_time = time.time() - start_cpu

    # Verify the result
    if np.allclose(h_C_expected, h_C, atol=1e-5):
        print(f"GPU time: {gpu_time:.6f} seconds")
        print(f"CPU time: {cpu_time:.6f} seconds")
        print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
    else:
        print(f"Test FAILED for {num_elements} elements.")

    # Print the first few elements for verification
    print("First few elements of A:", h_A[:5])
    print("First few elements of B:", h_B[:5])
    print("First few elements of C:", h_C[:5])

print(f"Test PASSED")
`,
				},
				Resources: v1.ResourceRequirements{
					// Request exactly one NVIDIA GPU through the resource limit.
					Limits: v1.ResourceList{
						e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
					},
				},
			},
		},
	}
	ns := f.Namespace.Name
	// The object returned by CreateJob is discarded; only the error is checked.
	_, err := e2ejob.CreateJob(ctx, f.ClientSet, ns, testJob)
	framework.ExpectNoError(err)
	// Logs the locally built job definition, not the object returned by the API call.
	framework.Logf("Created job %v", testJob)
}
|
||||||
|
|
||||||
|
func podNames(pods []v1.Pod) []string {
|
||||||
|
originalPodNames := make([]string, len(pods))
|
||||||
|
for i, p := range pods {
|
||||||
|
originalPodNames[i] = p.ObjectMeta.Name
|
||||||
|
}
|
||||||
|
return originalPodNames
|
||||||
|
}
|
||||||
|
|
||||||
|
// VerifyJobNCompletions verifies that the job has completions number of successful pods
|
||||||
|
func VerifyJobNCompletions(ctx context.Context, f *framework.Framework, completions int32) {
|
||||||
|
ns := f.Namespace.Name
|
||||||
|
pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
createdPods := pods.Items
|
||||||
|
createdPodNames := podNames(createdPods)
|
||||||
|
framework.Logf("Got the following pods for job cuda-add: %v", createdPodNames)
|
||||||
|
|
||||||
|
successes := int32(0)
|
||||||
|
regex := regexp.MustCompile("PASSED")
|
||||||
|
for _, podName := range createdPodNames {
|
||||||
|
e2epod.NewPodClient(f).WaitForFinish(ctx, podName, 5*time.Minute)
|
||||||
|
logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, ns, podName, "vector-addition")
|
||||||
|
framework.ExpectNoError(err, "Should be able to get logs for pod %v", podName)
|
||||||
|
if regex.MatchString(logs) {
|
||||||
|
successes++
|
||||||
|
}
|
||||||
|
gomega.Expect(logs).To(gomega.Not(gomega.ContainSubstring("FAILED")))
|
||||||
|
}
|
||||||
|
if successes != completions {
|
||||||
|
framework.Failf("Only got %v completions. Expected %v completions.", successes, completions)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user