diff --git a/test/e2e/framework/job/wait.go b/test/e2e/framework/job/wait.go
index a7c3d265e5f..27355ec3501 100644
--- a/test/e2e/framework/job/wait.go
+++ b/test/e2e/framework/job/wait.go
@@ -42,17 +42,23 @@ type JobState func(job *batchv1.Job) string
 // WaitForJobPodsRunning wait for all pods for the Job named JobName in namespace ns to become Running. Only use
 // when pods will run for a long time, or it will be racy.
 func WaitForJobPodsRunning(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32) error {
-	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning)
+	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning, JobTimeout)
+}
+
+// WaitForJobPodsRunningWithTimeout wait for all pods for the Job named JobName in namespace ns to become Running. Only use
+// when pods will run for a long time, or it will be racy. Same as WaitForJobPodsRunning but with an additional timeout parameter.
+func WaitForJobPodsRunningWithTimeout(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, timeout time.Duration) error {
+	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodRunning, timeout)
 }
 
 // WaitForJobPodsSucceeded wait for all pods for the Job named JobName in namespace ns to become Succeeded.
 func WaitForJobPodsSucceeded(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32) error {
-	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodSucceeded)
+	return waitForJobPodsInPhase(ctx, c, ns, jobName, expectedCount, v1.PodSucceeded, JobTimeout)
 }
 
 // waitForJobPodsInPhase wait for all pods for the Job named JobName in namespace ns to be in a given phase.
-func waitForJobPodsInPhase(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, phase v1.PodPhase) error {
-	return wait.PollUntilContextTimeout(ctx, framework.Poll, JobTimeout, false, func(ctx context.Context) (bool, error) {
+func waitForJobPodsInPhase(ctx context.Context, c clientset.Interface, ns, jobName string, expectedCount int32, phase v1.PodPhase, timeout time.Duration) error {
+	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, false, func(ctx context.Context) (bool, error) {
 		pods, err := GetJobPods(ctx, c, ns, jobName)
 		if err != nil {
 			return false, err
@@ -157,7 +163,12 @@ func isJobFailed(j *batchv1.Job) bool {
 
 // WaitForJobFinish uses c to wait for the Job jobName in namespace ns to finish (either Failed or Complete).
 func WaitForJobFinish(ctx context.Context, c clientset.Interface, ns, jobName string) error {
-	return wait.PollUntilContextTimeout(ctx, framework.Poll, JobTimeout, true, func(ctx context.Context) (bool, error) {
+	return WaitForJobFinishWithTimeout(ctx, c, ns, jobName, JobTimeout)
+}
+
+// WaitForJobFinishWithTimeout uses c to wait for the Job jobName in namespace ns to finish (either Failed or Complete).
+func WaitForJobFinishWithTimeout(ctx context.Context, c clientset.Interface, ns, jobName string, timeout time.Duration) error {
+	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, true, func(ctx context.Context) (bool, error) {
 		curr, err := c.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
 		if err != nil {
 			return false, err
diff --git a/test/e2e/node/gpu.go b/test/e2e/node/gpu.go
index 09f82804339..3e0a0376ab3 100644
--- a/test/e2e/node/gpu.go
+++ b/test/e2e/node/gpu.go
@@ -31,7 +31,6 @@ import (
 	extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
-	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
 	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
 	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
 	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
@@ -131,7 +130,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
 		framework.ExpectNoError(err)
 
 		// make sure job is running by waiting for its first pod to start running
-		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		err = e2ejob.WaitForJobPodsRunningWithTimeout(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1, e2ejob.JobTimeout*2)
 		framework.ExpectNoError(err)
 
 		numNodes, err := e2enode.TotalRegistered(ctx, f.ClientSet)
@@ -140,7 +139,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
 		framework.ExpectNoError(err)
 
 		ginkgo.By("Waiting for gpu job to finish")
-		err = e2ejob.WaitForJobFinish(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		err = e2ejob.WaitForJobFinishWithTimeout(ctx, f.ClientSet, f.Namespace.Name, job.Name, e2ejob.JobTimeout*2)
 		framework.ExpectNoError(err)
 		ginkgo.By("Done with gpu job")
 
@@ -154,7 +153,7 @@ func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient
 	pod = podClient.Create(ctx, pod)
 
 	ginkgo.By("Watching for error events or started pod")
-	ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*3)
+	ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*6)
 	framework.ExpectNoError(err)
 	gomega.Expect(ev).To(gomega.BeNil())
 
@@ -263,15 +262,7 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
 
 func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
 	if framework.ProviderIs("gce") {
-		rsgather := SetupNVIDIAGPUNode(ctx, f)
-		defer func() {
-			framework.Logf("Stopping ResourceUsageGather")
-			constraints := make(map[string]e2edebug.ResourceConstraint)
-			// For now, just gets summary. Can pass valid constraints in the future.
-			summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
-			f.TestSummaries = append(f.TestSummaries, summary)
-			framework.ExpectNoError(err, "getting resource usage summary")
-		}()
+		SetupNVIDIAGPUNode(ctx, f)
 	}
 	nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
 	framework.ExpectNoError(err)
@@ -329,7 +320,7 @@ const (
 )
 
 // SetupNVIDIAGPUNode install Nvidia Drivers and wait for Nvidia GPUs to be available on nodes
-func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer {
+func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) {
 	logOSImages(ctx, f)
 
 	var err error
@@ -348,6 +339,13 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 		ds, err = e2emanifest.DaemonSetFromData(data)
 		framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
 	}
+
+	prev, err := f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Get(ctx, ds.Name, metav1.GetOptions{})
+	if err == nil && prev != nil {
+		framework.Logf("Daemonset already installed, skipping...")
+		return
+	}
+
 	ds.Namespace = f.Namespace.Name
 	_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{})
 	framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
@@ -362,19 +360,11 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.C
 		pods.Items = append(pods.Items, devicepluginPods.Items...)
 	}
 
-	framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
-	rsgather, err := e2edebug.NewResourceUsageGatherer(ctx, f.ClientSet,
-		e2edebug.ResourceGathererOptions{InKubemark: false, Nodes: e2edebug.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
-	framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
-	go rsgather.StartGatheringData(ctx)
-
 	// Wait for Nvidia GPUs to be available on nodes
 	framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
 	gomega.Eventually(ctx, func(ctx context.Context) bool {
 		return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet)
 	}, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timout"))
-
-	return rsgather
 }
 
 // StartJob starts a simple CUDA job that requests gpu and the specified number of completions
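
Usage note (not part of the patch): a minimal sketch of how another e2e test could drive the new timeout-aware helpers. The exported functions, e2ejob.JobTimeout, and framework.ExpectNoError come from the diff above or the existing framework; the wrapper name waitForSlowGPUJob and its arguments are illustrative only.

```go
package example

import (
	"context"

	"k8s.io/kubernetes/test/e2e/framework"
	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
)

// waitForSlowGPUJob waits for a single-pod Job to start and then finish,
// using a larger budget than the package default.
func waitForSlowGPUJob(ctx context.Context, f *framework.Framework, jobName string) {
	// Driver installation can push pod startup well past the default
	// JobTimeout, so allow twice the usual budget (mirroring the gpu.go change).
	timeout := 2 * e2ejob.JobTimeout

	// Wait for the Job's single pod to reach Running.
	err := e2ejob.WaitForJobPodsRunningWithTimeout(ctx, f.ClientSet, f.Namespace.Name, jobName, 1, timeout)
	framework.ExpectNoError(err, "waiting for job pod to start running")

	// Then wait for the Job itself to finish (Complete or Failed).
	err = e2ejob.WaitForJobFinishWithTimeout(ctx, f.ClientSet, f.Namespace.Name, jobName, timeout)
	framework.ExpectNoError(err, "waiting for job to finish")
}
```

The default-timeout wrappers (WaitForJobPodsRunning, WaitForJobFinish) keep their existing signatures, so current callers are unaffected.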
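For the DaemonSet guard added in SetupNVIDIAGPUNode, here is a hedged, standalone sketch of the same create-only-if-absent pattern. Unlike the patch, which skips creation on any successful Get, this variant distinguishes NotFound from other errors; ensureDaemonSet, cs, and ns are hypothetical names, while the client-go calls match those used in the diff.

```go
package example

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// ensureDaemonSet creates ds in ns unless a DaemonSet with the same name
// already exists, making repeated setup calls idempotent.
func ensureDaemonSet(ctx context.Context, cs kubernetes.Interface, ns string, ds *appsv1.DaemonSet) error {
	_, err := cs.AppsV1().DaemonSets(ns).Get(ctx, ds.Name, metav1.GetOptions{})
	if err == nil {
		// Already present; nothing to do.
		return nil
	}
	if !apierrors.IsNotFound(err) {
		// Surface API errors other than "not found" instead of
		// silently treating them as "not installed".
		return err
	}
	ds.Namespace = ns
	_, err = cs.AppsV1().DaemonSets(ns).Create(ctx, ds, metav1.CreateOptions{})
	return err
}
```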