Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-09-08 12:41:58 +00:00)
Merge pull request #104606 from endocrimes/dani/device-driver-deflake
[Failing Test] Fix GPU Device Driver test in kubelet-serial
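Several hunks below replace bare `Should(gomega.BeTrue())` polls with assertions that carry an explicit failure description (for example "GPUs never became available on the local node"), so a timeout reports what never happened. As a rough illustration only, here is a minimal sketch of that pattern as a standalone Go test; `gpuCount` and the intervals are placeholders, not the e2e framework's helpers:

package example

import (
	"testing"
	"time"

	"github.com/onsi/gomega"
)

// gpuCount is a hypothetical probe standing in for the framework's
// numberOfNVIDIAGPUs(getLocalNode(f)) used in the test below.
func gpuCount() int { return 1 }

func TestGPUsBecomeAvailable(t *testing.T) {
	g := gomega.NewWithT(t)

	// Attaching a description means a timeout explains *what* never happened
	// instead of producing a bare "expected true to be true" failure.
	g.Eventually(func() bool {
		return gpuCount() > 0
	}, 30*time.Second, time.Second).Should(gomega.BeTrue(),
		"GPUs never became available on the local node")
}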
@@ -19,18 +19,14 @@ package e2enode
import (
"context"
"os/exec"
"strconv"
"time"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/component-base/metrics/testutil"
kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/test/e2e/framework"
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"

"github.com/onsi/ginkgo"
"github.com/onsi/gomega"
@@ -54,8 +50,7 @@ func NVIDIADevicePlugin() *v1.Pod {
framework.ExpectNoError(err)
p := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
Namespace: metav1.NamespaceSystem,
Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
},
Spec: ds.Spec.Template.Spec,
}
@@ -70,7 +65,6 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat

ginkgo.Context("DevicePlugin", func() {
var devicePluginPod *v1.Pod
var err error
ginkgo.BeforeEach(func() {
ginkgo.By("Ensuring that Nvidia GPUs exists on the node")
if !checkIfNvidiaGPUsExistOnNode() {
@@ -81,14 +75,13 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
ginkgo.Skip("Test works only with in-tree dockershim. Skipping test.")
}

ginkgo.By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), NVIDIADevicePlugin(), metav1.CreateOptions{})
framework.ExpectNoError(err)
ginkgo.By("Creating the Google Device Plugin pod for NVIDIA GPU")
devicePluginPod = f.PodClient().Create(NVIDIADevicePlugin())

ginkgo.By("Waiting for GPUs to become available on the local node")
gomega.Eventually(func() bool {
return numberOfNVIDIAGPUs(getLocalNode(f)) > 0
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue(), "GPUs never became available on the local node")

if numberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
ginkgo.Skip("Not enough GPUs to execute this test (at least two needed)")
@@ -99,18 +92,33 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
l, err := f.PodClient().List(context.TODO(), metav1.ListOptions{})
framework.ExpectNoError(err)

f.PodClient().DeleteSync(devicePluginPod.Name, metav1.DeleteOptions{}, 2*time.Minute)

for _, p := range l.Items {
if p.Namespace != f.Namespace.Name {
continue
}

f.PodClient().Delete(context.TODO(), p.Name, metav1.DeleteOptions{})
framework.Logf("Deleting pod: %s", p.Name)
f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute)
}

restartKubelet()

ginkgo.By("Waiting for GPUs to become unavailable on the local node")
gomega.Eventually(func() bool {
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
framework.ExpectNoError(err)
return numberOfNVIDIAGPUs(node) <= 0
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
})

ginkgo.It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
ginkgo.By("Creating one GPU pod on a node with at least two GPUs")
podRECMD := "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs"
// This test is disabled as this behaviour has not existed since at least
// kubernetes 0.19. If this is a bug, then this test should pass when the
// issue is resolved. If the behaviour is intentional then it can be removed.
ginkgo.XIt("keeps GPU assignation to pods after the device plugin has been removed.", func() {
ginkgo.By("Creating one GPU pod")
podRECMD := "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs && sleep 180"
p1 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))

deviceIDRE := "gpu devices: (nvidia[0-9]+)"
@@ -118,7 +126,52 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
p1, err := f.PodClient().Get(context.TODO(), p1.Name, metav1.GetOptions{})
framework.ExpectNoError(err)

ginkgo.By("Restarting Kubelet and waiting for the current running pod to restart")
ginkgo.By("Deleting the device plugin")
f.PodClient().DeleteSync(devicePluginPod.Name, metav1.DeleteOptions{}, 2*time.Minute)

ginkgo.By("Waiting for GPUs to become unavailable on the local node")
gomega.Eventually(func() int64 {
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
framework.ExpectNoError(err)
return numberOfNVIDIAGPUs(node)
}, 10*time.Minute, framework.Poll).Should(gomega.BeZero(), "Expected GPUs to eventually be unavailable")

ginkgo.By("Checking that scheduled pods can continue to run even after we delete the device plugin")
ensurePodContainerRestart(f, p1.Name, p1.Name)
devIDRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
framework.ExpectEqual(devIDRestart1, devID1)

ginkgo.By("Restarting Kubelet")
restartKubelet()
framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)

ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
ensurePodContainerRestart(f, p1.Name, p1.Name)
devIDRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
framework.ExpectEqual(devIDRestart1, devID1)

// Cleanup
f.PodClient().DeleteSync(p1.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
})

ginkgo.It("keeps GPU assignment to pods across pod and kubelet restarts.", func() {
ginkgo.By("Creating one GPU pod on a node with at least two GPUs")
podRECMD := "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs && sleep 40"
p1 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))

deviceIDRE := "gpu devices: (nvidia[0-9]+)"
devID1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
p1, err := f.PodClient().Get(context.TODO(), p1.Name, metav1.GetOptions{})
framework.ExpectNoError(err)

ginkgo.By("Confirming that after many pod restarts, GPU assignment is kept")
for i := 0; i < 3; i++ {
ensurePodContainerRestart(f, p1.Name, p1.Name)
devIDRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
framework.ExpectEqual(devIDRestart1, devID1)
}

ginkgo.By("Restarting Kubelet")
restartKubelet()

ginkgo.By("Confirming that after a kubelet and pod restart, GPU assignment is kept")
@@ -127,48 +180,22 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
framework.ExpectEqual(devIDRestart1, devID1)

ginkgo.By("Restarting Kubelet and creating another pod")

restartKubelet()
framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout)
framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)

ensurePodContainerRestart(f, p1.Name, p1.Name)

gomega.Eventually(func() bool {
return numberOfNVIDIAGPUs(getLocalNode(f)) > 0
return numberOfNVIDIAGPUs(getLocalNode(f)) >= 2
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())

p2 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))

ginkgo.By("Checking that pods got a different GPU")
devID2 := parseLog(f, p2.Name, p2.Name, deviceIDRE)

framework.ExpectEqual(devID1, devID2)

ginkgo.By("Deleting device plugin.")
f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete(context.TODO(), devicePluginPod.Name, metav1.DeleteOptions{})
ginkgo.By("Waiting for GPUs to become unavailable on the local node")
gomega.Eventually(func() bool {
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
framework.ExpectNoError(err)
return numberOfNVIDIAGPUs(node) <= 0
}, 10*time.Minute, framework.Poll).Should(gomega.BeTrue())
ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin.")
ensurePodContainerRestart(f, p1.Name, p1.Name)
devIDRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
framework.ExpectEqual(devIDRestart1, devID1)

ensurePodContainerRestart(f, p2.Name, p2.Name)
devIDRestart2 := parseLog(f, p2.Name, p2.Name, deviceIDRE)
framework.ExpectEqual(devIDRestart2, devID2)
ginkgo.By("Restarting Kubelet.")
restartKubelet()
ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
ensurePodContainerRestart(f, p1.Name, p1.Name)
devIDRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
framework.ExpectEqual(devIDRestart1, devID1)
ensurePodContainerRestart(f, p2.Name, p2.Name)
devIDRestart2 = parseLog(f, p2.Name, p2.Name, deviceIDRE)
framework.ExpectEqual(devIDRestart2, devID2)
logDevicePluginMetrics()

// Cleanup
f.PodClient().DeleteSync(p1.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
f.PodClient().DeleteSync(p2.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
framework.ExpectNotEqual(devID1, devID2)
})
})
})
@@ -182,31 +209,3 @@ func checkIfNvidiaGPUsExistOnNode() bool {
}
return true
}

func logDevicePluginMetrics() {
ms, err := e2emetrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName+":10255", "/metrics")
framework.ExpectNoError(err)
for msKey, samples := range ms {
switch msKey {
case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationDurationKey:
for _, sample := range samples {
latency := sample.Value
resource := string(sample.Metric["resource_name"])
var quantile float64
if val, ok := sample.Metric[testutil.QuantileLabel]; ok {
var err error
if quantile, err = strconv.ParseFloat(string(val), 64); err != nil {
continue
}
framework.Logf("Metric: %v ResourceName: %v Quantile: %v Latency: %v", msKey, resource, quantile, latency)
}
}
case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginRegistrationCountKey:
for _, sample := range samples {
resource := string(sample.Metric["resource_name"])
count := sample.Value
framework.Logf("Metric: %v ResourceName: %v Count: %v", msKey, resource, count)
}
}
}
}
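The hunk above drops the logDevicePluginMetrics helper (the new side of @@ -182,31 +209,3 @@ keeps only three lines), alongside the metrics-related imports removed at the top of the file. For context, the helper scraped the kubelet's read-only metrics endpoint via GrabKubeletMetricsWithoutProxy. A rough standalone sketch of an equivalent scrape follows; the localhost address and the assumption that port 10255 is open are illustrative, mirroring the ":10255" and "/metrics" arguments above:

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// Fetch the raw Prometheus text exposition from the kubelet's read-only port
	// (assumed reachable on localhost for this sketch).
	resp, err := http.Get("http://127.0.0.1:10255/metrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}

	// Keep only the device-plugin series, the same families the removed helper
	// logged (allocation duration and registration count).
	for _, line := range strings.Split(string(body), "\n") {
		if strings.Contains(line, "device_plugin") {
			fmt.Println(line)
		}
	}
}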
@@ -2,7 +2,18 @@

runcmd:
- modprobe configs
- docker run -v /dev:/dev -v /home/kubernetes/bin/nvidia:/rootfs/nvidia -v /etc/os-release:/rootfs/etc/os-release -v /proc/sysrq-trigger:/sysrq -e BASE_DIR=/rootfs/nvidia --privileged k8s.gcr.io/cos-nvidia-driver-install@sha256:cb55c7971c337fece62f2bfe858662522a01e43ac9984a2dd1dd5c71487d225c
# Setup the installation target at make it executable
- mkdir -p /home/kubernetes/bin/nvidia
- mount --bind /home/kubernetes/bin/nvidia /home/kubernetes/bin/nvidia
- mount -o remount,exec /home/kubernetes/bin/nvidia
# Compile and install the nvidia driver (precompiled driver installation currently fails)
- docker run --net=host --pid=host -v /dev:/dev -v /:/root -v /home/kubernetes/bin/nvidia:/usr/local/nvidia -e NVIDIA_INSTALL_DIR_HOST=/home/kubernetes/bin/nvidia -e NVIDIA_INSTALL_DIR_CONTAINER=/usr/local/nvidia -e NVIDIA_DRIVER_VERSION=460.91.03 --privileged gcr.io/cos-cloud/cos-gpu-installer:latest
# Run the installer again, as on the first try it doesn't detect the libnvidia-ml.so
# on the second attempt we detect it and update the ld cache.
- docker run --net=host --pid=host -v /dev:/dev -v /:/root -v /home/kubernetes/bin/nvidia:/usr/local/nvidia -e NVIDIA_INSTALL_DIR_HOST=/home/kubernetes/bin/nvidia -e NVIDIA_INSTALL_DIR_CONTAINER=/usr/local/nvidia -e NVIDIA_DRIVER_VERSION=460.91.03 --privileged gcr.io/cos-cloud/cos-gpu-installer:latest
# Remove build containers. They're very large.
- docker rm -f $(docker ps -aq)
# Standard installation proceeds
- mount /tmp /tmp -o remount,exec,suid
- usermod -a -G docker jenkins
- mkdir -p /var/lib/kubelet