Merge pull request #127547 from dims/skip-reinstallation-of-gpu-daemonset

Skip re-installation of GPU daemonset
This commit is contained in:
Kubernetes Prow Robot 2024-09-23 15:28:00 +01:00 committed by GitHub
commit ff391cefe2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 28 additions and 27 deletions

View File

@ -42,17 +42,23 @@ type JobState func(job *batchv1.Job) string
// WaitForJobPodsRunning wait for all pods for the Job named JobName in namespace ns to become Running. Only use // WaitForJobPodsRunning wait for all pods for the Job named JobName in namespace ns to become Running. Only use
// when pods will run for a long time, or it will be racy. // when pods will run for a long time, or it will be racy.
func WaitForJobPodsRunning(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32) error { func WaitForJobPodsRunning(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32) error {
return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning) return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning, JobTimeout)
}
// WaitForJobPodsRunningWithTimeout wait for all pods for the Job named JobName in namespace ns to become Running. Only use
// when pods will run for a long time, or it will be racy. same as WaitForJobPodsRunning but with an additional timeout parameter
func WaitForJobPodsRunningWithTimeout(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, timeout time.Duration) error {
return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning, timeout)
} }
// WaitForJobPodsSucceeded wait for all pods for the Job named JobName in namespace ns to become Succeeded. // WaitForJobPodsSucceeded wait for all pods for the Job named JobName in namespace ns to become Succeeded.
func WaitForJobPodsSucceeded(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32) error { func WaitForJobPodsSucceeded(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32) error {
return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodSucceeded) return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodSucceeded, JobTimeout)
} }
// waitForJobPodsInPhase wait for all pods for the Job named JobName in namespace ns to be in a given phase. // waitForJobPodsInPhase wait for all pods for the Job named JobName in namespace ns to be in a given phase.
func waitForJobPodsInPhase(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, phase v1.PodPhase) error { func waitForJobPodsInPhase(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, phase v1.PodPhase, timeout time.Duration) error {
return wait.PollUntilContextTimeout(ctx, framework.Poll, JobTimeout, false, func(ctx context.Context) (bool, error) { return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, false, func(ctx context.Context) (bool, error) {
pods, err := GetJobPods(ctx, c, ns, jobName) pods, err := GetJobPods(ctx, c, ns, jobName)
if err != nil { if err != nil {
return false, err return false, err
@ -157,7 +163,12 @@ func isJobFailed(j *batchv1.Job) bool {
// WaitForJobFinish uses c to wait for the Job jobName in namespace ns to finish (either Failed or Complete). // WaitForJobFinish uses c to wait for the Job jobName in namespace ns to finish (either Failed or Complete).
func WaitForJobFinish(ctx context.Context, c clientset.Interface, ns, jobName string) error { func WaitForJobFinish(ctx context.Context, c clientset.Interface, ns, jobName string) error {
return wait.PollUntilContextTimeout(ctx, framework.Poll, JobTimeout, true, func(ctx context.Context) (bool, error) { return WaitForJobFinishWithTimeout(ctx, c, ns, jobName, JobTimeout)
}
// WaitForJobFinishWithTimeout uses c to wait for the Job jobName in namespace ns to finish (either Failed or Complete).
func WaitForJobFinishWithTimeout(ctx context.Context, c clientset.Interface, ns, jobName string, timeout time.Duration) error {
return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, true, func(ctx context.Context) (bool, error) {
curr, err := c.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{}) curr, err := c.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
if err != nil { if err != nil {
return false, err return false, err

View File

@ -31,7 +31,6 @@ import (
extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions" extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
"k8s.io/kubernetes/test/e2e/feature" "k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework" "k8s.io/kubernetes/test/e2e/framework"
e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu" e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
e2ejob "k8s.io/kubernetes/test/e2e/framework/job" e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest" e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
@ -131,7 +130,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
framework.ExpectNoError(err) framework.ExpectNoError(err)
// make sure job is running by waiting for its first pod to start running // make sure job is running by waiting for its first pod to start running
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1) err = e2ejob.WaitForJobPodsRunningWithTimeout(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1, e2ejob.JobTimeout*2)
framework.ExpectNoError(err) framework.ExpectNoError(err)
numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet) numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
@ -140,7 +139,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
framework.ExpectNoError(err) framework.ExpectNoError(err)
ginkgo.By("Waiting for gpu job to finish") ginkgo.By("Waiting for gpu job to finish")
err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name) err = e2ejob.WaitForJobFinishWithTimeout(ctx, f.ClientSet, f.Namespace.Name, job.Name, e2ejob.JobTimeout*2)
framework.ExpectNoError(err) framework.ExpectNoError(err)
ginkgo.By("Done with gpu job") ginkgo.By("Done with gpu job")
@ -154,7 +153,7 @@ func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient
pod = podClient.Create(ctx, pod) pod = podClient.Create(ctx, pod)
ginkgo.By("Watching for error events or started pod") ginkgo.By("Watching for error events or started pod")
ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*3) ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*6)
framework.ExpectNoError(err) framework.ExpectNoError(err)
gomega.Expect(ev).To(gomega.BeNil()) gomega.Expect(ev).To(gomega.BeNil())
@ -263,15 +262,7 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) { func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
if framework.ProviderIs("gce") { if framework.ProviderIs("gce") {
rsgather := SetupNVIDIAGPUNode(ctx, f) SetupNVIDIAGPUNode(ctx, f)
defer func() {
framework.Logf("Stopping ResourceUsageGather")
constraints := make(map[string]e2edebug.ResourceConstraint)
// For now, just gets summary. Can pass valid constraints in the future.
summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
f.TestSummaries = append(f.TestSummaries, summary)
framework.ExpectNoError(err, "getting resource usage summary")
}()
} }
nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet) nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
framework.ExpectNoError(err) framework.ExpectNoError(err)
@ -329,7 +320,7 @@ const (
) )
// SetupNVIDIAGPUNode install Nvidia Drivers and wait for Nvidia GPUs to be available on nodes // SetupNVIDIAGPUNode install Nvidia Drivers and wait for Nvidia GPUs to be available on nodes
func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer { func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) {
logOSImages(ctx, f) logOSImages(ctx, f)
var err error var err error
@ -348,6 +339,13 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
ds, err = e2emanifest.DaemonSetFromData(data) ds, err = e2emanifest.DaemonSetFromData(data)
framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset") framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
} }
prev, err := f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Get(ctx, ds.Name, metav1.GetOptions{})
if err == nil && prev != nil {
framework.Logf("Daemonset already installed, skipping...")
return
}
ds.Namespace = f.Namespace.Name ds.Namespace = f.Namespace.Name
_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{}) _, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{})
framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset") framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
@ -362,19 +360,11 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
pods.Items = append(pods.Items, devicepluginPods.Items...) pods.Items = append(pods.Items, devicepluginPods.Items...)
} }
framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
rsgather, err := e2edebug.NewResourceUsageGatherer(ctx, f.ClientSet,
e2edebug.ResourceGathererOptions{InKubemark: false, Nodes: e2edebug.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
go rsgather.StartGatheringData(ctx)
// Wait for Nvidia GPUs to be available on nodes // Wait for Nvidia GPUs to be available on nodes
framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...") framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
gomega.Eventually(ctx, func(ctx context.Context) bool { gomega.Eventually(ctx, func(ctx context.Context) bool {
return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet) return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet)
}, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timout")) }, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timout"))
return rsgather
} }
// StartJob starts a simple CUDA job that requests gpu and the specified number of completions // StartJob starts a simple CUDA job that requests gpu and the specified number of completions