handle container restarts for GPUs

Signed-off-by: Vishnu Kannan <vishnuk@google.com>
2026-01-05 15:37:24 +00:00 · 2017-03-10 16:27:08 -08:00
parent ab9b299c30
commit 8ed9bff073
4 changed files with 110 additions and 25 deletions
--- a/test/e2e_node/gpus.go
+++ b/test/e2e_node/gpus.go
@@ -18,6 +18,7 @@ package e2e_node

 import (
 	"fmt"
+	"time"

 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -79,6 +80,20 @@ var _ = framework.KubeDescribe("GPU [Serial]", func() {
 			podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
 			podSuccess = f.PodClient().CreateSync(podSuccess)

+			By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused")
+			const minContainerRestartCount = 2
+			Eventually(func() bool {
+				p, err := f.ClientSet.Core().Pods(f.Namespace.Name).Get(podSuccess.Name, metav1.GetOptions{})
+				if err != nil {
+					framework.Logf("failed to get pod status: %v", err)
+					return false
+				}
+				if p.Status.ContainerStatuses[0].RestartCount < minContainerRestartCount {
+					return false
+				}
+				return true
+			}, time.Minute, time.Second).Should(BeTrue())
+
 			By("Checking if the pod outputted Success to its logs")
 			framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))

@@ -115,12 +130,13 @@ func makePod(gpus int64, name string) *v1.Pod {
 			v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
 		},
 	}
-	gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
+	gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success", gpus)
 	return &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: name,
 		},
 		Spec: v1.PodSpec{
+			RestartPolicy: v1.RestartPolicyAlways,
 			Containers: []v1.Container{
 				{
 					Image:     "gcr.io/google_containers/busybox:1.24",