mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-25 20:53:33 +00:00
Modified test/e2e_node/gpu-device-plugin.go to make sure it passes.
This commit is contained in:
parent
6993612cec
commit
ba40bee5c1
@ -20,6 +20,8 @@ import (
|
|||||||
"k8s.io/api/core/v1"
|
"k8s.io/api/core/v1"
|
||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
"k8s.io/apimachinery/pkg/util/uuid"
|
"k8s.io/apimachinery/pkg/util/uuid"
|
||||||
|
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -49,7 +51,8 @@ func NumberOfNVIDIAGPUs(node *v1.Node) int64 {
|
|||||||
|
|
||||||
// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
|
// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
|
||||||
func NVIDIADevicePlugin(ns string) *v1.Pod {
|
func NVIDIADevicePlugin(ns string) *v1.Pod {
|
||||||
ds := DsFromManifest(GPUDevicePluginDSYAML)
|
ds, err := DsFromManifest(GPUDevicePluginDSYAML)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
p := &v1.Pod{
|
p := &v1.Pod{
|
||||||
ObjectMeta: metav1.ObjectMeta{
|
ObjectMeta: metav1.ObjectMeta{
|
||||||
Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
|
Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
|
||||||
@ -58,9 +61,16 @@ func NVIDIADevicePlugin(ns string) *v1.Pod {
|
|||||||
|
|
||||||
Spec: ds.Spec.Template.Spec,
|
Spec: ds.Spec.Template.Spec,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove NVIDIA drivers installation
|
// Remove NVIDIA drivers installation
|
||||||
p.Spec.InitContainers = []v1.Container{}
|
p.Spec.InitContainers = []v1.Container{}
|
||||||
|
|
||||||
return p
|
return p
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func GetGPUDevicePluginImage() string {
|
||||||
|
ds, err := DsFromManifest(GPUDevicePluginDSYAML)
|
||||||
|
if err != nil || ds == nil || len(ds.Spec.Template.Spec.Containers) < 1 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return ds.Spec.Template.Spec.Containers[0].Image
|
||||||
|
}
|
||||||
|
@ -5018,7 +5018,7 @@ func IsRetryableAPIError(err error) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// DsFromManifest reads a .json/yaml file and returns the daemonset in it.
|
// DsFromManifest reads a .json/yaml file and returns the daemonset in it.
|
||||||
func DsFromManifest(url string) *extensions.DaemonSet {
|
func DsFromManifest(url string) (*extensions.DaemonSet, error) {
|
||||||
var controller extensions.DaemonSet
|
var controller extensions.DaemonSet
|
||||||
Logf("Parsing ds from %v", url)
|
Logf("Parsing ds from %v", url)
|
||||||
|
|
||||||
@ -5033,16 +5033,27 @@ func DsFromManifest(url string) *extensions.DaemonSet {
|
|||||||
time.Sleep(time.Duration(i) * time.Second)
|
time.Sleep(time.Duration(i) * time.Second)
|
||||||
}
|
}
|
||||||
|
|
||||||
Expect(err).NotTo(HaveOccurred())
|
if err != nil {
|
||||||
Expect(response.StatusCode).To(Equal(200))
|
return nil, fmt.Errorf("failed to get url: %v", err)
|
||||||
|
}
|
||||||
|
if response.StatusCode != 200 {
|
||||||
|
return nil, fmt.Errorf("invalid http response status: %v", response.StatusCode)
|
||||||
|
}
|
||||||
defer response.Body.Close()
|
defer response.Body.Close()
|
||||||
|
|
||||||
data, err := ioutil.ReadAll(response.Body)
|
data, err := ioutil.ReadAll(response.Body)
|
||||||
Expect(err).NotTo(HaveOccurred())
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to read html response body: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
json, err := utilyaml.ToJSON(data)
|
json, err := utilyaml.ToJSON(data)
|
||||||
Expect(err).NotTo(HaveOccurred())
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to parse data to json: %v", err)
|
||||||
Expect(runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)).NotTo(HaveOccurred())
|
}
|
||||||
return &controller
|
|
||||||
|
err = runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to decode DaemonSet spec: %v", err)
|
||||||
|
}
|
||||||
|
return &controller, nil
|
||||||
}
|
}
|
||||||
|
@ -174,9 +174,10 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
|
|||||||
// GPU drivers might have already been installed.
|
// GPU drivers might have already been installed.
|
||||||
if !areGPUsAvailableOnAllSchedulableNodes(f) {
|
if !areGPUsAvailableOnAllSchedulableNodes(f) {
|
||||||
// Install Nvidia Drivers.
|
// Install Nvidia Drivers.
|
||||||
ds := framework.DsFromManifest(dsYamlUrl)
|
ds, err := framework.DsFromManifest(dsYamlUrl)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
ds.Namespace = f.Namespace.Name
|
ds.Namespace = f.Namespace.Name
|
||||||
_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
|
_, err = f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
|
||||||
framework.ExpectNoError(err, "failed to create daemonset")
|
framework.ExpectNoError(err, "failed to create daemonset")
|
||||||
framework.Logf("Successfully created daemonset to install Nvidia drivers. Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
|
framework.Logf("Successfully created daemonset to install Nvidia drivers. Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
|
||||||
// Wait for Nvidia GPUs to be available on nodes
|
// Wait for Nvidia GPUs to be available on nodes
|
||||||
@ -213,9 +214,10 @@ var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
|
|||||||
|
|
||||||
// 2. Verifies that when the device plugin DaemonSet is removed, resource capacity drops to zero.
|
// 2. Verifies that when the device plugin DaemonSet is removed, resource capacity drops to zero.
|
||||||
By("Deleting device plugin daemonset")
|
By("Deleting device plugin daemonset")
|
||||||
ds := framework.DsFromManifest(dsYamlUrl)
|
ds, err := framework.DsFromManifest(dsYamlUrl)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
falseVar := false
|
falseVar := false
|
||||||
err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Delete(ds.Name, &metav1.DeleteOptions{OrphanDependents: &falseVar})
|
err = f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Delete(ds.Name, &metav1.DeleteOptions{OrphanDependents: &falseVar})
|
||||||
framework.ExpectNoError(err, "failed to delete daemonset")
|
framework.ExpectNoError(err, "failed to delete daemonset")
|
||||||
framework.Logf("Successfully deleted device plugin daemonset. Wait for resource to be removed.")
|
framework.Logf("Successfully deleted device plugin daemonset. Wait for resource to be removed.")
|
||||||
// Wait for Nvidia GPUs to be not available on nodes
|
// Wait for Nvidia GPUs to be not available on nodes
|
||||||
|
@ -17,8 +17,8 @@ limitations under the License.
|
|||||||
package e2e_node
|
package e2e_node
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"regexp"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"k8s.io/api/core/v1"
|
"k8s.io/api/core/v1"
|
||||||
@ -35,30 +35,31 @@ import (
|
|||||||
const (
|
const (
|
||||||
devicePluginFeatureGate = "DevicePlugins=true"
|
devicePluginFeatureGate = "DevicePlugins=true"
|
||||||
testPodNamePrefix = "nvidia-gpu-"
|
testPodNamePrefix = "nvidia-gpu-"
|
||||||
sleepTimeout = 30
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Serial because the test restarts Kubelet
|
// Serial because the test restarts Kubelet
|
||||||
var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin] [Serial] [Disruptive]", func() {
|
var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin] [Serial] [Disruptive]", func() {
|
||||||
f := framework.NewDefaultFramework("device-plugin-gpus-errors")
|
f := framework.NewDefaultFramework("device-plugin-gpus-errors")
|
||||||
|
|
||||||
Context("", func() {
|
Context("DevicePlugin", func() {
|
||||||
|
By("Enabling support for Device Plugin")
|
||||||
|
tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
|
||||||
|
initialConfig.FeatureGates += "," + devicePluginFeatureGate
|
||||||
|
})
|
||||||
|
|
||||||
BeforeEach(func() {
|
BeforeEach(func() {
|
||||||
By("Ensuring that Nvidia GPUs exists on the node")
|
By("Ensuring that Nvidia GPUs exists on the node")
|
||||||
if !checkIfNvidiaGPUsExistOnNode() {
|
if !checkIfNvidiaGPUsExistOnNode() {
|
||||||
Skip("Nvidia GPUs do not exist on the node. Skipping test.")
|
Skip("Nvidia GPUs do not exist on the node. Skipping test.")
|
||||||
}
|
}
|
||||||
|
|
||||||
By("Enabling support for Device Plugin")
|
|
||||||
tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
|
|
||||||
initialConfig.FeatureGates += "," + devicePluginFeatureGate
|
|
||||||
})
|
|
||||||
|
|
||||||
By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
|
By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
|
||||||
f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
|
f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
|
||||||
|
|
||||||
By("Waiting for GPUs to become available on the local node")
|
By("Waiting for GPUs to become available on the local node")
|
||||||
Eventually(framework.NumberOfNVIDIAGPUs(getLocalNode(f)) != 0, time.Minute, time.Second).Should(BeTrue())
|
Eventually(func() bool {
|
||||||
|
return framework.NumberOfNVIDIAGPUs(getLocalNode(f)) > 0
|
||||||
|
}, 10*time.Second, time.Second).Should(BeTrue())
|
||||||
|
|
||||||
if framework.NumberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
|
if framework.NumberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
|
||||||
Skip("Not enough GPUs to execute this test (at least two needed)")
|
Skip("Not enough GPUs to execute this test (at least two needed)")
|
||||||
@ -79,34 +80,26 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
|
|||||||
})
|
})
|
||||||
|
|
||||||
It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
|
It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
|
||||||
n := getLocalNode(f)
|
|
||||||
|
|
||||||
By("Creating one GPU pod on a node with at least two GPUs")
|
By("Creating one GPU pod on a node with at least two GPUs")
|
||||||
p1 := f.PodClient().CreateSync(makeCudaPauseImage())
|
p1 := f.PodClient().CreateSync(makeCudaPauseImage())
|
||||||
cmd := fmt.Sprintf("exec %s %s nvidia-smi -L", n.Name, p1.Spec.Containers[0].Name)
|
devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
|
||||||
uuid1, _ := framework.RunKubectl(cmd)
|
p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
By("Restarting Kubelet and waiting for the current running pod to restart")
|
By("Restarting Kubelet and waiting for the current running pod to restart")
|
||||||
restartKubelet(f)
|
restartKubelet(f)
|
||||||
Eventually(func() bool {
|
|
||||||
p, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
|
|
||||||
framework.ExpectNoError(err)
|
|
||||||
|
|
||||||
return p.Status.ContainerStatuses[0].RestartCount != p1.Status.ContainerStatuses[0].RestartCount
|
|
||||||
}, 2*sleepTimeout)
|
|
||||||
|
|
||||||
By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
|
By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
|
||||||
uuid1Restart, _ := framework.RunKubectl(cmd)
|
devIdRestart := getDeviceId(f, p1.Name, p1.Name, 2)
|
||||||
Expect(uuid1Restart).To(Equal(uuid1))
|
Expect(devIdRestart).To(Equal(devId1))
|
||||||
|
|
||||||
By("Restarting Kubelet and creating another pod")
|
By("Restarting Kubelet and creating another pod")
|
||||||
restartKubelet(f)
|
restartKubelet(f)
|
||||||
p2 := f.PodClient().CreateSync(makeCudaPauseImage())
|
p2 := f.PodClient().CreateSync(makeCudaPauseImage())
|
||||||
|
|
||||||
By("Checking that pods got a different GPU")
|
By("Checking that pods got a different GPU")
|
||||||
cmd = fmt.Sprintf("exec %s %s nvidia-smi -L", n.Name, p2.Spec.Containers[0].Name)
|
devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
|
||||||
uuid2, _ := framework.RunKubectl(cmd)
|
Expect(devId1).To(Not(Equal(devId2)))
|
||||||
Expect(uuid1).To(Not(Equal(uuid2)))
|
|
||||||
|
|
||||||
// Cleanup
|
// Cleanup
|
||||||
f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
|
f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
|
||||||
@ -123,9 +116,12 @@ func makeCudaPauseImage() *v1.Pod {
|
|||||||
Spec: v1.PodSpec{
|
Spec: v1.PodSpec{
|
||||||
RestartPolicy: v1.RestartPolicyAlways,
|
RestartPolicy: v1.RestartPolicyAlways,
|
||||||
Containers: []v1.Container{{
|
Containers: []v1.Container{{
|
||||||
Name: "cuda-pause",
|
Image: busyboxImage,
|
||||||
Image: "nvidia/cuda",
|
Name: podName,
|
||||||
Command: []string{"sleep", string(sleepTimeout)},
|
// Retrieves the gpu devices created in the user pod.
|
||||||
|
// Note the nvidia device plugin implementation doesn't do device id remapping currently.
|
||||||
|
// Will probably need to use nvidia-smi if that changes.
|
||||||
|
Command: []string{"sh", "-c", "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs"},
|
||||||
|
|
||||||
Resources: v1.ResourceRequirements{
|
Resources: v1.ResourceRequirements{
|
||||||
Limits: newDecimalResourceList(framework.NVIDIAGPUResourceName, 1),
|
Limits: newDecimalResourceList(framework.NVIDIAGPUResourceName, 1),
|
||||||
@ -142,23 +138,36 @@ func newDecimalResourceList(name v1.ResourceName, quantity int64) v1.ResourceLis
|
|||||||
|
|
||||||
// TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
|
// TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
|
||||||
func restartKubelet(f *framework.Framework) {
|
func restartKubelet(f *framework.Framework) {
|
||||||
stdout1, err1 := exec.Command("sudo", "systemctl", "restart", "kubelet").CombinedOutput()
|
stdout, err := exec.Command("sudo", "systemctl", "list-units", "kubelet*", "--state=running").CombinedOutput()
|
||||||
if err1 == nil {
|
framework.ExpectNoError(err)
|
||||||
|
regex := regexp.MustCompile("(kubelet-[0-9]+)")
|
||||||
|
matches := regex.FindStringSubmatch(string(stdout))
|
||||||
|
Expect(len(matches)).NotTo(BeZero())
|
||||||
|
kube := matches[0]
|
||||||
|
framework.Logf("Get running kubelet with systemctl: %v, %v", string(stdout), kube)
|
||||||
|
stdout, err = exec.Command("sudo", "systemctl", "restart", kube).CombinedOutput()
|
||||||
|
if err == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
framework.Failf("Failed to restart kubelet with systemctl: %v, %v", err, stdout)
|
||||||
stdout2, err2 := exec.Command("sudo", "/etc/init.d/kubelet", "restart").CombinedOutput()
|
|
||||||
if err2 == nil {
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
stdout3, err3 := exec.Command("sudo", "service", "kubelet", "restart").CombinedOutput()
|
func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) string {
|
||||||
if err3 == nil {
|
// Wait till pod has been restarted at least restartCount times.
|
||||||
return
|
Eventually(func() bool {
|
||||||
|
p, err := f.PodClient().Get(podName, metav1.GetOptions{})
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
return p.Status.ContainerStatuses[0].RestartCount >= restartCount
|
||||||
|
}, time.Minute, time.Second).Should(BeTrue())
|
||||||
|
logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName)
|
||||||
|
if err != nil {
|
||||||
|
framework.Failf("GetPodLogs for pod %q failed: %v", podName, err)
|
||||||
}
|
}
|
||||||
|
framework.Logf("got pod logs: %v", logs)
|
||||||
framework.Failf("Failed to trigger kubelet restart with systemctl/initctl/service operations:"+
|
regex := regexp.MustCompile("gpu devices: (nvidia[0-9]+)")
|
||||||
"\nsystemclt: %v, %v"+
|
matches := regex.FindStringSubmatch(logs)
|
||||||
"\ninitctl: %v, %v"+
|
if len(matches) < 2 {
|
||||||
"\nservice: %v, %v", err1, stdout1, err2, stdout2, err3, stdout3)
|
return ""
|
||||||
|
}
|
||||||
|
return matches[1]
|
||||||
}
|
}
|
||||||
|
@ -56,6 +56,7 @@ var NodeImageWhiteList = sets.NewString(
|
|||||||
imageutils.GetE2EImage(imageutils.Netexec),
|
imageutils.GetE2EImage(imageutils.Netexec),
|
||||||
"gcr.io/google_containers/nonewprivs:1.2",
|
"gcr.io/google_containers/nonewprivs:1.2",
|
||||||
framework.GetPauseImageNameForHostArch(),
|
framework.GetPauseImageNameForHostArch(),
|
||||||
|
framework.GetGPUDevicePluginImage(),
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
Loading…
Reference in New Issue
Block a user