mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-11-04 07:49:35 +00:00
Updates sig-scheduling e2e Nvidia GPU tests to install drivers using a local manifest by default. Currently the DaemonSet is fetched from the GoogleCloudPlatform/container-engine-accelerators repo by default. Using a local manifest allows the cos-gpu-installer image to be specified manually rather than always using the latest one. A remote manifest can still be fetched by setting the NVIDIA_DRIVER_INSTALLER_DAEMONSET env var.

Signed-off-by: hasheddan <georgedanielmangum@gmail.com>
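The change hinges on a single environment variable: when NVIDIA_DRIVER_INSTALLER_DAEMONSET is set, its value is read as the URL of a remote DaemonSet manifest; when it is unset, the manifest bundled at test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml is used. A minimal sketch of how a caller could opt back into a remote manifest (the URL is a placeholder and f is assumed to be an existing framework.Framework; neither is part of this change):

	// Hypothetical caller-side sketch, not part of this file: setting the env var
	// before node setup switches the driver installer from the bundled local
	// manifest to a remote one fetched from the given (placeholder) URL.
	os.Setenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET", "https://example.com/nvidia-driver-installer.yaml")
	rsgather := SetupNVIDIAGPUNode(f, true)
	_ = rsgather // resource gatherer for the installer pods; unused in this sketch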
329 lines | 12 KiB | Go
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduling

import (
	"context"
	"os"
	"regexp"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/uuid"
	extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
	"k8s.io/kubernetes/test/e2e/framework"
	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	"k8s.io/kubernetes/test/e2e/framework/providers/gce"
	e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
	imageutils "k8s.io/kubernetes/test/utils/image"

	"github.com/onsi/ginkgo"
	"github.com/onsi/gomega"
)

const (
	testPodNamePrefix = "nvidia-gpu-"
	// Nvidia driver installation can take upwards of 5 minutes.
	driverInstallTimeout = 10 * time.Minute
)

var (
	gpuResourceName v1.ResourceName
)

// makeCudaAdditionDevicePluginTestPod returns a pod that runs two CUDA
// vector-addition containers, each requesting one GPU via the device plugin.
func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
	podName := testPodNamePrefix + string(uuid.NewUUID())
	testPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				{
					Name:  "vector-addition-cuda8",
					Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
						},
					},
				},
				{
					Name:  "vector-addition-cuda10",
					Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd2),
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
						},
					},
				},
			},
		},
	}
	return testPod
}

// logOSImages logs the OS image reported by each node in the cluster.
func logOSImages(f *framework.Framework) {
	nodeList, err := f.ClientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for _, node := range nodeList.Items {
		framework.Logf("Nodename: %v, OS Image: %v", node.Name, node.Status.NodeInfo.OSImage)
	}
}

// areGPUsAvailableOnAllSchedulableNodes returns true only when every schedulable
// node reports a non-zero capacity for the Nvidia GPU resource.
func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool {
	framework.Logf("Getting list of Nodes from API server")
	nodeList, err := f.ClientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for _, node := range nodeList.Items {
		if node.Spec.Unschedulable {
			continue
		}
		framework.Logf("gpuResourceName %s", gpuResourceName)
		if val, ok := node.Status.Capacity[gpuResourceName]; !ok || val.Value() == 0 {
			framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
			return false
		}
	}
	framework.Logf("Nvidia GPUs exist on all schedulable nodes")
	return true
}

// getGPUsAvailable returns the total number of allocatable GPUs across all nodes.
func getGPUsAvailable(f *framework.Framework) int64 {
	nodeList, err := f.ClientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	var gpusAvailable int64
	for _, node := range nodeList.Items {
		if val, ok := node.Status.Allocatable[gpuResourceName]; ok {
			gpusAvailable += (&val).Value()
		}
	}
	return gpusAvailable
}

// SetupNVIDIAGPUNode installs Nvidia drivers and waits for Nvidia GPUs to be available on nodes.
func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *framework.ContainerResourceGatherer {
	logOSImages(f)

	var err error
	var ds *appsv1.DaemonSet
	dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
	if dsYamlURLFromEnv != "" {
		// Using DaemonSet from remote URL
		framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
		ds, err = e2emanifest.DaemonSetFromURL(dsYamlURLFromEnv)
		framework.ExpectNoError(err, "failed to get remote nvidia-driver-installer daemonset manifest")
	} else {
		// Using default local DaemonSet
		framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
		data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml")
		framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
		ds, err = e2emanifest.DaemonSetFromData(data)
		framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
	}
	gpuResourceName = e2egpu.NVIDIAGPUResourceName
	ds.Namespace = f.Namespace.Name
	_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(context.TODO(), ds, metav1.CreateOptions{})
	framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
	framework.Logf("Successfully created daemonset to install Nvidia drivers.")

	pods, err := e2eresource.WaitForControlledPods(f.ClientSet, ds.Namespace, ds.Name, extensionsinternal.Kind("DaemonSet"))
	framework.ExpectNoError(err, "failed to get pods controlled by the nvidia-driver-installer daemonset")

	devicepluginPods, err := e2eresource.WaitForControlledPods(f.ClientSet, "kube-system", "nvidia-gpu-device-plugin", extensionsinternal.Kind("DaemonSet"))
	if err == nil {
		framework.Logf("Adding deviceplugin addon pod.")
		pods.Items = append(pods.Items, devicepluginPods.Items...)
	}

	var rsgather *framework.ContainerResourceGatherer
	if setupResourceGatherer {
		framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
		rsgather, err = framework.NewResourceUsageGatherer(f.ClientSet, framework.ResourceGathererOptions{InKubemark: false, Nodes: framework.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
		framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
		go rsgather.StartGatheringData()
	}

	// Wait for Nvidia GPUs to be available on nodes
	framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
	gomega.Eventually(func() bool {
		return areGPUsAvailableOnAllSchedulableNodes(f)
	}, driverInstallTimeout, time.Second).Should(gomega.BeTrue())

	return rsgather
}

// getGPUsPerPod returns the number of GPUs requested by a single test pod.
func getGPUsPerPod() int64 {
	var gpusPerPod int64
	gpuPod := makeCudaAdditionDevicePluginTestPod()
	for _, container := range gpuPod.Spec.Containers {
		if val, ok := container.Resources.Limits[gpuResourceName]; ok {
			gpusPerPod += (&val).Value()
		}
	}
	return gpusPerPod
}

// testNvidiaGPUs installs the drivers, then saturates the available GPUs with
// CUDA vector-addition pods and waits for all of them to succeed.
func testNvidiaGPUs(f *framework.Framework) {
	rsgather := SetupNVIDIAGPUNode(f, true)
	gpuPodNum := getGPUsAvailable(f) / getGPUsPerPod()
	framework.Logf("Creating %d pods and have the pods run a CUDA app", gpuPodNum)
	podList := []*v1.Pod{}
	for i := int64(0); i < gpuPodNum; i++ {
		podList = append(podList, f.PodClient().Create(makeCudaAdditionDevicePluginTestPod()))
	}
	framework.Logf("Wait for all test pods to succeed")
	// Wait for all pods to succeed
	for _, pod := range podList {
		f.PodClient().WaitForSuccess(pod.Name, 5*time.Minute)
		logContainers(f, pod)
	}

	framework.Logf("Stopping ResourceUsageGather")
	constraints := make(map[string]framework.ResourceConstraint)
	// For now, just gets summary. Can pass valid constraints in the future.
	summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
	f.TestSummaries = append(f.TestSummaries, summary)
	framework.ExpectNoError(err, "getting resource usage summary")
}

// logContainers dumps the logs of every container in the given pod.
func logContainers(f *framework.Framework, pod *v1.Pod) {
	for _, container := range pod.Spec.Containers {
		logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, container.Name)
		framework.ExpectNoError(err, "Should be able to get container logs for container: %s", container.Name)
		framework.Logf("Got container logs for %s:\n%v", container.Name, logs)
	}
}

var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
	f := framework.NewDefaultFramework("device-plugin-gpus")
	ginkgo.It("run Nvidia GPU Device Plugin tests", func() {
		testNvidiaGPUs(f)
	})
})

// testNvidiaGPUsJob runs a long-lived GPU job, recreates the nodes while it is
// running, and verifies that the job still reaches the expected completions.
func testNvidiaGPUsJob(f *framework.Framework) {
	_ = SetupNVIDIAGPUNode(f, false)
	// Job set to have 5 completions with parallelism of 1 to ensure that it lasts long enough to experience the node recreation
	completions := int32(5)
	ginkgo.By("Starting GPU job")
	StartJob(f, completions)

	job, err := e2ejob.GetJob(f.ClientSet, f.Namespace.Name, "cuda-add")
	framework.ExpectNoError(err)

	// make sure job is running by waiting for its first pod to start running
	err = e2ejob.WaitForAllJobPodsRunning(f.ClientSet, f.Namespace.Name, job.Name, 1)
	framework.ExpectNoError(err)

	numNodes, err := e2enode.TotalRegistered(f.ClientSet)
	framework.ExpectNoError(err)
	nodes, err := e2enode.CheckReady(f.ClientSet, numNodes, framework.NodeReadyInitialTimeout)
	framework.ExpectNoError(err)

	ginkgo.By("Recreating nodes")
	err = gce.RecreateNodes(f.ClientSet, nodes)
	framework.ExpectNoError(err)
	ginkgo.By("Done recreating nodes")

	ginkgo.By("Waiting for gpu job to finish")
	err = e2ejob.WaitForJobFinish(f.ClientSet, f.Namespace.Name, job.Name)
	framework.ExpectNoError(err)
	ginkgo.By("Done with gpu job")

	gomega.Expect(job.Status.Failed).To(gomega.BeZero(), "Job pods failed during node recreation: %v", job.Status.Failed)

	VerifyJobNCompletions(f, completions)
}

// StartJob starts a simple CUDA job that requests one GPU and runs the specified number of completions.
func StartJob(f *framework.Framework, completions int32) {
	var activeSeconds int64 = 3600
	testJob := e2ejob.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, completions, &activeSeconds, 6)
	testJob.Spec.Template.Spec = v1.PodSpec{
		RestartPolicy: v1.RestartPolicyOnFailure,
		Containers: []v1.Container{
			{
				Name:    "vector-addition",
				Image:   imageutils.GetE2EImage(imageutils.CudaVectorAdd),
				Command: []string{"/bin/sh", "-c", "./vectorAdd && sleep 60"},
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
					},
				},
			},
		},
	}
	ns := f.Namespace.Name
	_, err := e2ejob.CreateJob(f.ClientSet, ns, testJob)
	framework.ExpectNoError(err)
	framework.Logf("Created job %v", testJob)
}

// VerifyJobNCompletions verifies that the cuda-add job produced the given number of successful completions.
func VerifyJobNCompletions(f *framework.Framework, completions int32) {
	ns := f.Namespace.Name
	pods, err := e2ejob.GetJobPods(f.ClientSet, f.Namespace.Name, "cuda-add")
	framework.ExpectNoError(err)
	createdPods := pods.Items
	createdPodNames := podNames(createdPods)
	framework.Logf("Got the following pods for job cuda-add: %v", createdPodNames)

	successes := int32(0)
	regex := regexp.MustCompile("PASSED")
	for _, podName := range createdPodNames {
		f.PodClient().WaitForFinish(podName, 5*time.Minute)
		logs, err := e2epod.GetPodLogs(f.ClientSet, ns, podName, "vector-addition")
		framework.ExpectNoError(err, "Should be able to get logs for pod %v", podName)
		if regex.MatchString(logs) {
			successes++
		}
	}
	if successes != completions {
		framework.Failf("Only got %v completions. Expected %v completions.", successes, completions)
	}
}

func podNames(pods []v1.Pod) []string {
	originalPodNames := make([]string, len(pods))
	for i, p := range pods {
		originalPodNames[i] = p.ObjectMeta.Name
	}
	return originalPodNames
}

var _ = SIGDescribe("GPUDevicePluginAcrossRecreate [Feature:Recreate]", func() {
	ginkgo.BeforeEach(func() {
		e2eskipper.SkipUnlessProviderIs("gce", "gke")
	})
	f := framework.NewDefaultFramework("device-plugin-gpus-recreate")
	ginkgo.It("run Nvidia GPU Device Plugin tests with a recreation", func() {
		testNvidiaGPUsJob(f)
	})
})