From 21565fe16fc7610afd6dcc301d099f7121effd8d Mon Sep 17 00:00:00 2001
From: Davanum Srinivas
Date: Wed, 18 Sep 2024 16:57:27 -0400
Subject: [PATCH] Install Nvidia Daemonset in test harness for GCE

Signed-off-by: Davanum Srinivas
---
 test/e2e/node/gpu.go | 108 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 104 insertions(+), 4 deletions(-)

diff --git a/test/e2e/node/gpu.go b/test/e2e/node/gpu.go
index 39f5c66e3e7..81e2ae4b8d3 100644
--- a/test/e2e/node/gpu.go
+++ b/test/e2e/node/gpu.go
@@ -18,17 +18,26 @@ package node
 
 import (
 	"context"
+	"os"
+	"time"
+
+	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	clientset "k8s.io/client-go/kubernetes"
+	extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
+	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
 	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
+	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
+	e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
 	admissionapi "k8s.io/pod-security-admission/api"
 
 	"github.com/onsi/ginkgo/v2"
@@ -42,12 +51,12 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 	var podClient *e2epod.PodClient
 
 	ginkgo.BeforeEach(func() {
-		e2eskipper.SkipUnlessProviderIs("aws")
+		e2eskipper.SkipUnlessProviderIs("aws", "gce")
 		podClient = e2epod.NewPodClient(f)
 	})
 
 	f.It("should run nvidia-smi cli", func(ctx context.Context) {
-		checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
 		pod := testNvidiaCLIPod()
 		pod.Spec.Containers[0].Command = []string{"nvidia-smi"}
 
@@ -65,7 +74,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 	})
 
 	f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
-		checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
+		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
 		pod := testMatrixMultiplicationPod()
 
 		ginkgo.By("Creating a pod that runs matrix multiplication")
@@ -180,7 +189,18 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
 	return &pod
 }
 
-func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.Interface) {
+func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
+	if framework.ProviderIs("gce") {
+		rsgather := SetupNVIDIAGPUNode(ctx, f)
+		defer func() {
+			framework.Logf("Stopping ResourceUsageGather")
+			constraints := make(map[string]e2edebug.ResourceConstraint)
+			// For now, just gets summary. Can pass valid constraints in the future.
+			summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
+			f.TestSummaries = append(f.TestSummaries, summary)
+			framework.ExpectNoError(err, "getting resource usage summary")
+		}()
+	}
 	nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
 	framework.ExpectNoError(err)
 	capacity := 0
@@ -204,3 +224,83 @@ func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.In
 		e2eskipper.Skipf("%d ready nodes do not have any allocatable Nvidia GPU(s). Skipping...", len(nodes.Items))
Skipping...", len(nodes.Items)) } } + +func areGPUsAvailableOnAllSchedulableNodes(ctx context.Context, clientSet clientset.Interface) bool { + framework.Logf("Getting list of Nodes from API server") + nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + framework.ExpectNoError(err, "getting node list") + for _, node := range nodeList.Items { + if node.Spec.Unschedulable { + continue + } + framework.Logf("gpuResourceName %s", e2egpu.NVIDIAGPUResourceName) + if val, ok := node.Status.Capacity[e2egpu.NVIDIAGPUResourceName]; !ok || val.Value() == 0 { + framework.Logf("Nvidia GPUs not available on Node: %q", node.Name) + return false + } + } + framework.Logf("Nvidia GPUs exist on all schedulable nodes") + return true +} + +func logOSImages(ctx context.Context, f *framework.Framework) { + nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + framework.ExpectNoError(err, "getting node list") + for _, node := range nodeList.Items { + framework.Logf("Nodename: %v, OS Image: %v", node.Name, node.Status.NodeInfo.OSImage) + } +} + +const ( + // Nvidia driver installation can take upwards of 5 minutes. + driverInstallTimeout = 10 * time.Minute +) + +// SetupNVIDIAGPUNode install Nvidia Drivers and wait for Nvidia GPUs to be available on nodes +func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer { + logOSImages(ctx, f) + + var err error + var ds *appsv1.DaemonSet + dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET") + if dsYamlURLFromEnv != "" { + // Using DaemonSet from remote URL + framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv) + ds, err = e2emanifest.DaemonSetFromURL(ctx, dsYamlURLFromEnv) + framework.ExpectNoError(err, "failed get remote") + } else { + // Using default local DaemonSet + framework.Logf("Using default local nvidia-driver-installer daemonset manifest.") + data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml") + framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset") + ds, err = e2emanifest.DaemonSetFromData(data) + framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset") + } + ds.Namespace = f.Namespace.Name + _, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{}) + framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset") + framework.Logf("Successfully created daemonset to install Nvidia drivers.") + + pods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, ds.Namespace, ds.Name, extensionsinternal.Kind("DaemonSet")) + framework.ExpectNoError(err, "failed to get pods controlled by the nvidia-driver-installer daemonset") + + devicepluginPods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, "kube-system", "nvidia-gpu-device-plugin", extensionsinternal.Kind("DaemonSet")) + if err == nil { + framework.Logf("Adding deviceplugin addon pod.") + pods.Items = append(pods.Items, devicepluginPods.Items...) 
+	}
+
+	framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
+	rsgather, err := e2edebug.NewResourceUsageGatherer(ctx, f.ClientSet,
+		e2edebug.ResourceGathererOptions{InKubemark: false, Nodes: e2edebug.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
+	framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
+	go rsgather.StartGatheringData(ctx)
+
+	// Wait for Nvidia GPUs to be available on nodes
+	framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
+	gomega.Eventually(ctx, func(ctx context.Context) bool {
+		return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet)
+	}, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timeout"))
+
+	return rsgather
+}