Added device plugin e2e kubelet failure test

Signed-off-by: Renaud Gaubert <renaud.gaubert@gmail.com>
Renaud Gaubert 2017-09-10 12:53:17 -07:00
parent 46ff2c44c7
commit 6993612cec
7 changed files with 268 additions and 41 deletions

View File

@@ -17,6 +17,7 @@ go_library(
"framework.go",
"get-kubemark-resource-usage.go",
"google_compute.go",
"gpu_util.go",
"ingress_utils.go",
"jobs_util.go",
"kubelet_stats.go",
@@ -121,6 +122,7 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/yaml:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/version:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/watch:go_default_library",
"//vendor/k8s.io/client-go/discovery:go_default_library",

View File

@@ -0,0 +1,66 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
)
const (
// NVIDIAGPUResourceName is the extended resource name for NVIDIA GPUs,
// exposed since v1.8 through the device plugin mechanism.
NVIDIAGPUResourceName = "nvidia.com/gpu"
// TODO: Parameterize this by making it a feature in TestFramework,
// so the DaemonSet can be overridden in other (non-COS) setups.
// GPUDevicePluginDSYAML is the official Google Device Plugin DaemonSet manifest for NVIDIA GPUs on GKE.
GPUDevicePluginDSYAML = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/device-plugin-daemonset.yaml"
)
// TODO: make this generic rather than COS-specific.
// NumberOfNVIDIAGPUs returns the number of GPUs advertised by a node.
// This relies on the device plugin mechanism and is expected to run on a
// COS-based node after the NVIDIA drivers have been installed.
func NumberOfNVIDIAGPUs(node *v1.Node) int64 {
val, ok := node.Status.Capacity[NVIDIAGPUResourceName]
if !ok {
return 0
}
return val.Value()
}
// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
func NVIDIADevicePlugin(ns string) *v1.Pod {
ds := DsFromManifest(GPUDevicePluginDSYAML)
p := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
Namespace: ns,
},
Spec: ds.Spec.Template.Spec,
}
// Remove the NVIDIA driver installation init containers.
p.Spec.InitContainers = []v1.Container{}
return p
}

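For orientation, a pod consumes GPUs advertised through this mechanism by requesting the extended resource name in its resource limits. A minimal sketch, assuming the framework package above plus an extra import of k8s.io/apimachinery/pkg/api/resource (the function, pod name, and image are illustrative, not part of this commit):

// gpuConsumerPod is a hypothetical example of a pod requesting one GPU
// through the extended resource name advertised by the device plugin.
func gpuConsumerPod() *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "gpu-consumer-" + string(uuid.NewUUID())},
		Spec: v1.PodSpec{
			Containers: []v1.Container{{
				Name:  "cuda",
				Image: "nvidia/cuda",
				Resources: v1.ResourceRequirements{
					// Extended resources only accept integer quantities.
					Limits: v1.ResourceList{
						NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
					},
				},
			}},
		},
	}
}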
View File

@@ -65,6 +65,7 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/wait"
utilyaml "k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/discovery"
"k8s.io/client-go/dynamic"
@@ -5015,3 +5016,33 @@ func DumpDebugInfo(c clientset.Interface, ns string) {
func IsRetryableAPIError(err error) bool {
return apierrs.IsTimeout(err) || apierrs.IsServerTimeout(err) || apierrs.IsTooManyRequests(err) || apierrs.IsInternalError(err)
}
// DsFromManifest fetches a .json/.yaml manifest from a URL and returns the DaemonSet in it.
func DsFromManifest(url string) *extensions.DaemonSet {
var controller extensions.DaemonSet
Logf("Parsing ds from %v", url)
var response *http.Response
var err error
// Retry the fetch a few times with linear backoff; remote manifests can be flaky.
for i := 1; i <= 5; i++ {
response, err = http.Get(url)
if err == nil && response.StatusCode == 200 {
break
}
time.Sleep(time.Duration(i) * time.Second)
}
Expect(err).NotTo(HaveOccurred())
Expect(response.StatusCode).To(Equal(200))
defer response.Body.Close()
data, err := ioutil.ReadAll(response.Body)
Expect(err).NotTo(HaveOccurred())
json, err := utilyaml.ToJSON(data)
Expect(err).NotTo(HaveOccurred())
Expect(runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)).NotTo(HaveOccurred())
return &controller
}

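A usage sketch for the new helper, mirroring how the scheduling test below consumes it (f is a *framework.Framework instance):

ds := framework.DsFromManifest(framework.GPUDevicePluginDSYAML)
ds.Namespace = f.Namespace.Name
_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
framework.ExpectNoError(err, "failed to create daemonset")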
View File

@@ -34,17 +34,14 @@ go_library(
"//vendor/github.com/onsi/gomega:go_default_library",
"//vendor/github.com/stretchr/testify/assert:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
"//vendor/k8s.io/api/scheduling/v1alpha1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/yaml:go_default_library",
"//vendor/k8s.io/client-go/kubernetes:go_default_library",
],
)

View File

@@ -17,19 +17,13 @@ limitations under the License.
package scheduling
import (
"io/ioutil"
"net/http"
"strings"
"time"
"k8s.io/api/core/v1"
extensions "k8s.io/api/extensions/v1beta1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/uuid"
utilyaml "k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/test/e2e/framework"
imageutils "k8s.io/kubernetes/test/utils/image"
@@ -168,8 +162,8 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
framework.Logf("Cluster is running on COS. Proceeding with test")
if f.BaseName == "device-plugin-gpus" {
dsYamlUrl = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/device-plugin-daemonset.yaml"
gpuResourceName = "nvidia.com/gpu"
dsYamlUrl = framework.GPUDevicePluginDSYAML
gpuResourceName = framework.NVIDIAGPUResourceName
podCreationFunc = makeCudaAdditionDevicePluginTestPod
} else {
dsYamlUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/master/cos-nvidia-gpu-installer/daemonset.yaml"
@@ -180,7 +174,7 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
// GPU drivers might have already been installed.
if !areGPUsAvailableOnAllSchedulableNodes(f) {
// Install Nvidia Drivers.
ds := dsFromManifest(dsYamlUrl)
ds := framework.DsFromManifest(dsYamlUrl)
ds.Namespace = f.Namespace.Name
_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
framework.ExpectNoError(err, "failed to create daemonset")
@@ -202,34 +196,6 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
}
}
// dsFromManifest reads a .json/yaml file and returns the daemonset in it.
func dsFromManifest(url string) *extensions.DaemonSet {
var controller extensions.DaemonSet
framework.Logf("Parsing ds from %v", url)
var response *http.Response
var err error
for i := 1; i <= 5; i++ {
response, err = http.Get(url)
if err == nil && response.StatusCode == 200 {
break
}
time.Sleep(time.Duration(i) * time.Second)
}
Expect(err).NotTo(HaveOccurred())
Expect(response.StatusCode).To(Equal(200))
defer response.Body.Close()
data, err := ioutil.ReadAll(response.Body)
Expect(err).NotTo(HaveOccurred())
json, err := utilyaml.ToJSON(data)
Expect(err).NotTo(HaveOccurred())
Expect(runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)).NotTo(HaveOccurred())
return &controller
}
var _ = SIGDescribe("[Feature:GPU]", func() {
f := framework.NewDefaultFramework("gpus")
It("run Nvidia GPU tests on Container Optimized OS only", func() {
@@ -247,7 +213,7 @@ var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
// 2. Verifies that when the device plugin DaemonSet is removed, resource capacity drops to zero.
By("Deleting device plugin daemonset")
ds := dsFromManifest(dsYamlUrl)
ds := framework.DsFromManifest(dsYamlUrl)
falseVar := false
err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Delete(ds.Name, &metav1.DeleteOptions{OrphanDependents: &falseVar})
framework.ExpectNoError(err, "failed to delete daemonset")

View File

@@ -12,6 +12,7 @@ go_library(
"container.go",
"doc.go",
"docker_util.go",
"gpu_device_plugin.go",
"gpus.go",
"image_list.go",
"simple_mount.go",

View File

@@ -0,0 +1,164 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e_node
import (
"fmt"
"os/exec"
"time"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
const (
devicePluginFeatureGate = "DevicePlugins=true"
testPodNamePrefix = "nvidia-gpu-"
sleepTimeout = 30
)
// Serial because the test restarts Kubelet
var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin] [Serial] [Disruptive]", func() {
f := framework.NewDefaultFramework("device-plugin-gpus-errors")
Context("", func() {
BeforeEach(func() {
By("Ensuring that Nvidia GPUs exists on the node")
if !checkIfNvidiaGPUsExistOnNode() {
Skip("Nvidia GPUs do not exist on the node. Skipping test.")
}
By("Enabling support for Device Plugin")
tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
initialConfig.FeatureGates += "," + devicePluginFeatureGate
})
By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
By("Waiting for GPUs to become available on the local node")
// Poll via a closure so Eventually re-evaluates the node's capacity on each tick.
Eventually(func() bool {
return framework.NumberOfNVIDIAGPUs(getLocalNode(f)) != 0
}, time.Minute, time.Second).Should(BeTrue())
if framework.NumberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
Skip("Not enough GPUs to execute this test (at least two needed)")
}
})
AfterEach(func() {
l, err := f.PodClient().List(metav1.ListOptions{})
framework.ExpectNoError(err)
for _, p := range l.Items {
if p.Namespace != f.Namespace.Name {
continue
}
f.PodClient().Delete(p.Name, &metav1.DeleteOptions{})
}
})
It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
By("Creating one GPU pod on a node with at least two GPUs")
p1 := f.PodClient().CreateSync(makeCudaPauseImage())
// Query the UUID of the GPU assigned to the pod's container.
cmd := []string{"exec", "--namespace=" + f.Namespace.Name, p1.Name, "-c", p1.Spec.Containers[0].Name, "--", "nvidia-smi", "-L"}
uuid1, _ := framework.RunKubectl(cmd...)
By("Restarting Kubelet and waiting for the current running pod to restart")
restartKubelet(f)
Eventually(func() bool {
p, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
framework.ExpectNoError(err)
return p.Status.ContainerStatuses[0].RestartCount != p1.Status.ContainerStatuses[0].RestartCount
}, 2*sleepTimeout*time.Second, time.Second).Should(BeTrue())
By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
uuid1Restart, _ := framework.RunKubectl(cmd...)
Expect(uuid1Restart).To(Equal(uuid1))
By("Restarting Kubelet and creating another pod")
restartKubelet(f)
p2 := f.PodClient().CreateSync(makeCudaPauseImage())
By("Checking that pods got a different GPU")
cmd = fmt.Sprintf("exec %s %s nvidia-smi -L", n.Name, p2.Spec.Containers[0].Name)
uuid2, _ := framework.RunKubectl(cmd)
Expect(uuid1).To(Not(Equal(uuid2)))
// Cleanup
f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
f.PodClient().DeleteSync(p2.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
})
})
})
func makeCudaPauseImage() *v1.Pod {
podName := testPodNamePrefix + string(uuid.NewUUID())
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: podName},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyAlways,
Containers: []v1.Container{{
Name: "cuda-pause",
Image: "nvidia/cuda",
Command: []string{"sleep", string(sleepTimeout)},
Resources: v1.ResourceRequirements{
Limits: newDecimalResourceList(framework.NVIDIAGPUResourceName, 1),
Requests: newDecimalResourceList(framework.NVIDIAGPUResourceName, 1),
},
}},
},
}
}
func newDecimalResourceList(name v1.ResourceName, quantity int64) v1.ResourceList {
return v1.ResourceList{name: *resource.NewQuantity(quantity, resource.DecimalSI)}
}
// TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
func restartKubelet(f *framework.Framework) {
stdout1, err1 := exec.Command("sudo", "systemctl", "restart", "kubelet").CombinedOutput()
if err1 == nil {
return
}
stdout2, err2 := exec.Command("sudo", "/etc/init.d/kubelet", "restart").CombinedOutput()
if err2 == nil {
return
}
stdout3, err3 := exec.Command("sudo", "service", "kubelet", "restart").CombinedOutput()
if err3 == nil {
return
}
framework.Failf("Failed to trigger kubelet restart with systemctl/initctl/service operations:"+
"\nsystemclt: %v, %v"+
"\ninitctl: %v, %v"+
"\nservice: %v, %v", err1, stdout1, err2, stdout2, err3, stdout3)
}