handle container restarts for GPUs

Signed-off-by: Vishnu Kannan <vishnuk@google.com>
This commit is contained in:
Vishnu Kannan
2017-03-10 16:27:08 -08:00
committed by Vishnu kannan
parent ab9b299c30
commit 8ed9bff073
4 changed files with 110 additions and 25 deletions

View File

@@ -18,6 +18,7 @@ package e2e_node
import (
"fmt"
"time"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -79,6 +80,20 @@ var _ = framework.KubeDescribe("GPU [Serial]", func() {
podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
podSuccess = f.PodClient().CreateSync(podSuccess)
By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused")
const minContainerRestartCount = 2
Eventually(func() bool {
p, err := f.ClientSet.Core().Pods(f.Namespace.Name).Get(podSuccess.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("failed to get pod status: %v", err)
return false
}
if p.Status.ContainerStatuses[0].RestartCount < minContainerRestartCount {
return false
}
return true
}, time.Minute, time.Second).Should(BeTrue())
By("Checking if the pod outputted Success to its logs")
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
@@ -115,12 +130,13 @@ func makePod(gpus int64, name string) *v1.Pod {
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
},
}
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success", gpus)
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyAlways,
Containers: []v1.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",