From ba40bee5c1059751c45ea25a5df610715b96d9a5 Mon Sep 17 00:00:00 2001
From: Jiaying Zhang
Date: Mon, 18 Sep 2017 16:10:04 -0700
Subject: [PATCH] Modified test/e2e_node/gpu_device_plugin.go to make sure it passes.

---
 test/e2e/framework/gpu_util.go     | 14 ++++-
 test/e2e/framework/util.go         | 25 +++++---
 test/e2e/scheduling/nvidia-gpus.go | 10 +--
 test/e2e_node/gpu_device_plugin.go | 97 ++++++++++++++++--------------
 test/e2e_node/image_list.go        |  1 +
 5 files changed, 90 insertions(+), 57 deletions(-)

diff --git a/test/e2e/framework/gpu_util.go b/test/e2e/framework/gpu_util.go
index 065c0bc184b..d0ff9798f19 100644
--- a/test/e2e/framework/gpu_util.go
+++ b/test/e2e/framework/gpu_util.go
@@ -20,6 +20,8 @@ import (
     "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/util/uuid"
+
+    . "github.com/onsi/gomega"
 )
 
 const (
@@ -49,7 +51,8 @@ func NumberOfNVIDIAGPUs(node *v1.Node) int64 {
 
 // NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
 func NVIDIADevicePlugin(ns string) *v1.Pod {
-    ds := DsFromManifest(GPUDevicePluginDSYAML)
+    ds, err := DsFromManifest(GPUDevicePluginDSYAML)
+    Expect(err).NotTo(HaveOccurred())
     p := &v1.Pod{
         ObjectMeta: metav1.ObjectMeta{
             Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
@@ -58,9 +61,16 @@ func NVIDIADevicePlugin(ns string) *v1.Pod {
         Spec: ds.Spec.Template.Spec,
     }
 
-    // Remove NVIDIA drivers installation
     p.Spec.InitContainers = []v1.Container{}
 
     return p
 }
+
+func GetGPUDevicePluginImage() string {
+    ds, err := DsFromManifest(GPUDevicePluginDSYAML)
+    if err != nil || ds == nil || len(ds.Spec.Template.Spec.Containers) < 1 {
+        return ""
+    }
+    return ds.Spec.Template.Spec.Containers[0].Image
+}
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index e2b070c354e..9040517e515 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -5018,7 +5018,7 @@ func IsRetryableAPIError(err error) bool {
 }
 
 // DsFromManifest reads a .json/yaml file and returns the daemonset in it.
-func DsFromManifest(url string) *extensions.DaemonSet {
+func DsFromManifest(url string) (*extensions.DaemonSet, error) {
     var controller extensions.DaemonSet
     Logf("Parsing ds from %v", url)
 
@@ -5033,16 +5033,27 @@
         time.Sleep(time.Duration(i) * time.Second)
     }
-    Expect(err).NotTo(HaveOccurred())
-    Expect(response.StatusCode).To(Equal(200))
+    if err != nil {
+        return nil, fmt.Errorf("failed to get url: %v", err)
+    }
+    if response.StatusCode != 200 {
+        return nil, fmt.Errorf("invalid http response status: %v", response.StatusCode)
+    }
 
     defer response.Body.Close()
 
     data, err := ioutil.ReadAll(response.Body)
-    Expect(err).NotTo(HaveOccurred())
+    if err != nil {
+        return nil, fmt.Errorf("failed to read http response body: %v", err)
+    }
 
     json, err := utilyaml.ToJSON(data)
-    Expect(err).NotTo(HaveOccurred())
+    if err != nil {
+        return nil, fmt.Errorf("failed to parse data to json: %v", err)
+    }
 
-    Expect(runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)).NotTo(HaveOccurred())
-    return &controller
+    err = runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)
+    if err != nil {
+        return nil, fmt.Errorf("failed to decode DaemonSet spec: %v", err)
+    }
+    return &controller, nil
 }
diff --git a/test/e2e/scheduling/nvidia-gpus.go b/test/e2e/scheduling/nvidia-gpus.go
index 03ba201f70f..0c57ef1481a 100644
--- a/test/e2e/scheduling/nvidia-gpus.go
+++ b/test/e2e/scheduling/nvidia-gpus.go
@@ -174,9 +174,10 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
     // GPU drivers might have already been installed.
     if !areGPUsAvailableOnAllSchedulableNodes(f) {
         // Install Nvidia Drivers.
-        ds := framework.DsFromManifest(dsYamlUrl)
+        ds, err := framework.DsFromManifest(dsYamlUrl)
+        Expect(err).NotTo(HaveOccurred())
         ds.Namespace = f.Namespace.Name
-        _, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
+        _, err = f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
         framework.ExpectNoError(err, "failed to create daemonset")
         framework.Logf("Successfully created daemonset to install Nvidia drivers. Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
         // Wait for Nvidia GPUs to be available on nodes
@@ -213,9 +214,10 @@ var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
         // 2. Verifies that when the device plugin DaemonSet is removed, resource capacity drops to zero.
         By("Deleting device plugin daemonset")
-        ds := framework.DsFromManifest(dsYamlUrl)
+        ds, err := framework.DsFromManifest(dsYamlUrl)
+        Expect(err).NotTo(HaveOccurred())
         falseVar := false
-        err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Delete(ds.Name, &metav1.DeleteOptions{OrphanDependents: &falseVar})
+        err = f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Delete(ds.Name, &metav1.DeleteOptions{OrphanDependents: &falseVar})
         framework.ExpectNoError(err, "failed to delete daemonset")
         framework.Logf("Successfully deleted device plugin daemonset. Wait for resource to be removed.")
         // Wait for Nvidia GPUs to be not available on nodes
diff --git a/test/e2e_node/gpu_device_plugin.go b/test/e2e_node/gpu_device_plugin.go
index d35b92cbe8a..476c4ca5cc9 100644
--- a/test/e2e_node/gpu_device_plugin.go
+++ b/test/e2e_node/gpu_device_plugin.go
@@ -17,8 +17,8 @@ limitations under the License.
 package e2e_node
 
 import (
-    "fmt"
     "os/exec"
+    "regexp"
     "time"
 
     "k8s.io/api/core/v1"
@@ -35,30 +35,31 @@ import (
 const (
     devicePluginFeatureGate = "DevicePlugins=true"
     testPodNamePrefix       = "nvidia-gpu-"
-    sleepTimeout            = 30
 )
 
 // Serial because the test restarts Kubelet
 var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin] [Serial] [Disruptive]", func() {
     f := framework.NewDefaultFramework("device-plugin-gpus-errors")
 
-    Context("", func() {
+    Context("DevicePlugin", func() {
+        By("Enabling support for Device Plugin")
+        tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
+            initialConfig.FeatureGates += "," + devicePluginFeatureGate
+        })
+
         BeforeEach(func() {
             By("Ensuring that Nvidia GPUs exists on the node")
             if !checkIfNvidiaGPUsExistOnNode() {
                 Skip("Nvidia GPUs do not exist on the node. Skipping test.")
             }
-            By("Enabling support for Device Plugin")
-            tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
-                initialConfig.FeatureGates += "," + devicePluginFeatureGate
-            })
-
             By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
             f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
 
             By("Waiting for GPUs to become available on the local node")
-            Eventually(framework.NumberOfNVIDIAGPUs(getLocalNode(f)) != 0, time.Minute, time.Second).Should(BeTrue())
+            Eventually(func() bool {
+                return framework.NumberOfNVIDIAGPUs(getLocalNode(f)) > 0
+            }, 10*time.Second, time.Second).Should(BeTrue())
 
             if framework.NumberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
                 Skip("Not enough GPUs to execute this test (at least two needed)")
@@ -79,34 +80,26 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
         })
 
         It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
-            n := getLocalNode(f)
-
             By("Creating one GPU pod on a node with at least two GPUs")
             p1 := f.PodClient().CreateSync(makeCudaPauseImage())
-            cmd := fmt.Sprintf("exec %s %s nvidia-smi -L", n.Name, p1.Spec.Containers[0].Name)
-            uuid1, _ := framework.RunKubectl(cmd)
+            devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
+            p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
+            framework.ExpectNoError(err)
 
             By("Restarting Kubelet and waiting for the current running pod to restart")
             restartKubelet(f)
-            Eventually(func() bool {
-                p, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
-                framework.ExpectNoError(err)
-
-                return p.Status.ContainerStatuses[0].RestartCount != p1.Status.ContainerStatuses[0].RestartCount
-            }, 2*sleepTimeout)
 
             By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
-            uuid1Restart, _ := framework.RunKubectl(cmd)
-            Expect(uuid1Restart).To(Equal(uuid1))
+            devIdRestart := getDeviceId(f, p1.Name, p1.Name, 2)
+            Expect(devIdRestart).To(Equal(devId1))
 
             By("Restarting Kubelet and creating another pod")
             restartKubelet(f)
             p2 := f.PodClient().CreateSync(makeCudaPauseImage())
 
             By("Checking that pods got a different GPU")
-            cmd = fmt.Sprintf("exec %s %s nvidia-smi -L", n.Name, p2.Spec.Containers[0].Name)
-            uuid2, _ := framework.RunKubectl(cmd)
-            Expect(uuid1).To(Not(Equal(uuid2)))
+            devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
+            Expect(devId1).To(Not(Equal(devId2)))
 
             // Cleanup
             f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@@ -123,9 +116,12 @@ func makeCudaPauseImage() *v1.Pod {
         Spec: v1.PodSpec{
             RestartPolicy: v1.RestartPolicyAlways,
             Containers: []v1.Container{{
-                Name:    "cuda-pause",
-                Image:   "nvidia/cuda",
-                Command: []string{"sleep", string(sleepTimeout)},
+                Image: busyboxImage,
+                Name:  podName,
+                // Retrieves the gpu devices created in the user pod.
+                // Note the nvidia device plugin implementation doesn't do device id remapping currently.
+                // Will probably need to use nvidia-smi if that changes.
+                Command: []string{"sh", "-c", "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs"},
 
                 Resources: v1.ResourceRequirements{
                     Limits: newDecimalResourceList(framework.NVIDIAGPUResourceName, 1),
@@ -142,23 +138,36 @@ func newDecimalResourceList(name v1.ResourceName, quantity int64) v1.ResourceLis
 
 // TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
 func restartKubelet(f *framework.Framework) {
-    stdout1, err1 := exec.Command("sudo", "systemctl", "restart", "kubelet").CombinedOutput()
-    if err1 == nil {
+    stdout, err := exec.Command("sudo", "systemctl", "list-units", "kubelet*", "--state=running").CombinedOutput()
+    framework.ExpectNoError(err)
+    regex := regexp.MustCompile("(kubelet-[0-9]+)")
+    matches := regex.FindStringSubmatch(string(stdout))
+    Expect(len(matches)).NotTo(BeZero())
+    kube := matches[0]
+    framework.Logf("Get running kubelet with systemctl: %v, %v", string(stdout), kube)
+    stdout, err = exec.Command("sudo", "systemctl", "restart", kube).CombinedOutput()
+    if err == nil {
         return
     }
-
-    stdout2, err2 := exec.Command("sudo", "/etc/init.d/kubelet", "restart").CombinedOutput()
-    if err2 == nil {
-        return
-    }
-
-    stdout3, err3 := exec.Command("sudo", "service", "kubelet", "restart").CombinedOutput()
-    if err3 == nil {
-        return
-    }
-
-    framework.Failf("Failed to trigger kubelet restart with systemctl/initctl/service operations:"+
-        "\nsystemclt: %v, %v"+
-        "\ninitctl: %v, %v"+
-        "\nservice: %v, %v", err1, stdout1, err2, stdout2, err3, stdout3)
+    framework.Failf("Failed to restart kubelet with systemctl: %v, %v", err, stdout)
+}
+
+func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) string {
+    // Wait till pod has been restarted at least restartCount times.
+    Eventually(func() bool {
+        p, err := f.PodClient().Get(podName, metav1.GetOptions{})
+        framework.ExpectNoError(err)
+        return p.Status.ContainerStatuses[0].RestartCount >= restartCount
+    }, time.Minute, time.Second).Should(BeTrue())
+    logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName)
+    if err != nil {
+        framework.Failf("GetPodLogs for pod %q failed: %v", podName, err)
+    }
+    framework.Logf("got pod logs: %v", logs)
+    regex := regexp.MustCompile("gpu devices: (nvidia[0-9]+)")
+    matches := regex.FindStringSubmatch(logs)
+    if len(matches) < 2 {
+        return ""
+    }
+    return matches[1]
 }
diff --git a/test/e2e_node/image_list.go b/test/e2e_node/image_list.go
index 8d83fcef253..404193ffc19 100644
--- a/test/e2e_node/image_list.go
+++ b/test/e2e_node/image_list.go
@@ -56,6 +56,7 @@ var NodeImageWhiteList = sets.NewString(
     imageutils.GetE2EImage(imageutils.Netexec),
     "gcr.io/google_containers/nonewprivs:1.2",
     framework.GetPauseImageNameForHostArch(),
+    framework.GetGPUDevicePluginImage(),
 )
 
 func init() {