mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-29 14:37:00 +00:00
adding an e2e for GPUs
Signed-off-by: Vishnu kannan <vishnuk@google.com>
This commit is contained in:
parent
69acb02394
commit
318f4e102a
@ -686,3 +686,4 @@ windows-line-endings
|
|||||||
www-prefix
|
www-prefix
|
||||||
zone-id
|
zone-id
|
||||||
zone-name
|
zone-name
|
||||||
|
|
||||||
|
@ -58,6 +58,8 @@ go_library(
|
|||||||
"//pkg/kubelet/envvars:go_default_library",
|
"//pkg/kubelet/envvars:go_default_library",
|
||||||
"//pkg/kubelet/events:go_default_library",
|
"//pkg/kubelet/events:go_default_library",
|
||||||
"//pkg/kubelet/eviction:go_default_library",
|
"//pkg/kubelet/eviction:go_default_library",
|
||||||
|
"//pkg/kubelet/gpu:go_default_library",
|
||||||
|
"//pkg/kubelet/gpu/nvidia:go_default_library",
|
||||||
"//pkg/kubelet/images:go_default_library",
|
"//pkg/kubelet/images:go_default_library",
|
||||||
"//pkg/kubelet/kuberuntime:go_default_library",
|
"//pkg/kubelet/kuberuntime:go_default_library",
|
||||||
"//pkg/kubelet/lifecycle:go_default_library",
|
"//pkg/kubelet/lifecycle:go_default_library",
|
||||||
@ -169,6 +171,7 @@ go_test(
|
|||||||
"//pkg/kubelet/container:go_default_library",
|
"//pkg/kubelet/container:go_default_library",
|
||||||
"//pkg/kubelet/container/testing:go_default_library",
|
"//pkg/kubelet/container/testing:go_default_library",
|
||||||
"//pkg/kubelet/eviction:go_default_library",
|
"//pkg/kubelet/eviction:go_default_library",
|
||||||
|
"//pkg/kubelet/gpu:go_default_library",
|
||||||
"//pkg/kubelet/images:go_default_library",
|
"//pkg/kubelet/images:go_default_library",
|
||||||
"//pkg/kubelet/lifecycle:go_default_library",
|
"//pkg/kubelet/lifecycle:go_default_library",
|
||||||
"//pkg/kubelet/network:go_default_library",
|
"//pkg/kubelet/network:go_default_library",
|
||||||
@ -246,6 +249,7 @@ filegroup(
|
|||||||
"//pkg/kubelet/envvars:all-srcs",
|
"//pkg/kubelet/envvars:all-srcs",
|
||||||
"//pkg/kubelet/events:all-srcs",
|
"//pkg/kubelet/events:all-srcs",
|
||||||
"//pkg/kubelet/eviction:all-srcs",
|
"//pkg/kubelet/eviction:all-srcs",
|
||||||
|
"//pkg/kubelet/gpu:all-srcs",
|
||||||
"//pkg/kubelet/images:all-srcs",
|
"//pkg/kubelet/images:all-srcs",
|
||||||
"//pkg/kubelet/kuberuntime:all-srcs",
|
"//pkg/kubelet/kuberuntime:all-srcs",
|
||||||
"//pkg/kubelet/leaky:all-srcs",
|
"//pkg/kubelet/leaky:all-srcs",
|
||||||
|
34
pkg/kubelet/gpu/BUILD
Normal file
34
pkg/kubelet/gpu/BUILD
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
package(default_visibility = ["//visibility:public"])
|
||||||
|
|
||||||
|
licenses(["notice"])
|
||||||
|
|
||||||
|
load(
|
||||||
|
"@io_bazel_rules_go//go:def.bzl",
|
||||||
|
"go_library",
|
||||||
|
)
|
||||||
|
|
||||||
|
go_library(
|
||||||
|
name = "go_default_library",
|
||||||
|
srcs = [
|
||||||
|
"gpu_manager_stub.go",
|
||||||
|
"types.go",
|
||||||
|
],
|
||||||
|
tags = ["automanaged"],
|
||||||
|
deps = ["//pkg/api/v1:go_default_library"],
|
||||||
|
)
|
||||||
|
|
||||||
|
filegroup(
|
||||||
|
name = "package-srcs",
|
||||||
|
srcs = glob(["**"]),
|
||||||
|
tags = ["automanaged"],
|
||||||
|
visibility = ["//visibility:private"],
|
||||||
|
)
|
||||||
|
|
||||||
|
filegroup(
|
||||||
|
name = "all-srcs",
|
||||||
|
srcs = [
|
||||||
|
":package-srcs",
|
||||||
|
"//pkg/kubelet/gpu/nvidia:all-srcs",
|
||||||
|
],
|
||||||
|
tags = ["automanaged"],
|
||||||
|
)
|
38
pkg/kubelet/gpu/nvidia/BUILD
Normal file
38
pkg/kubelet/gpu/nvidia/BUILD
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
package(default_visibility = ["//visibility:public"])
|
||||||
|
|
||||||
|
licenses(["notice"])
|
||||||
|
|
||||||
|
load(
|
||||||
|
"@io_bazel_rules_go//go:def.bzl",
|
||||||
|
"go_library",
|
||||||
|
)
|
||||||
|
|
||||||
|
go_library(
|
||||||
|
name = "go_default_library",
|
||||||
|
srcs = [
|
||||||
|
"helpers.go",
|
||||||
|
"nvidia_gpu_manager.go",
|
||||||
|
],
|
||||||
|
tags = ["automanaged"],
|
||||||
|
deps = [
|
||||||
|
"//pkg/api/v1:go_default_library",
|
||||||
|
"//pkg/kubelet/dockertools:go_default_library",
|
||||||
|
"//pkg/kubelet/gpu:go_default_library",
|
||||||
|
"//vendor:github.com/golang/glog",
|
||||||
|
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||||
|
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
filegroup(
|
||||||
|
name = "package-srcs",
|
||||||
|
srcs = glob(["**"]),
|
||||||
|
tags = ["automanaged"],
|
||||||
|
visibility = ["//visibility:private"],
|
||||||
|
)
|
||||||
|
|
||||||
|
filegroup(
|
||||||
|
name = "all-srcs",
|
||||||
|
srcs = [":package-srcs"],
|
||||||
|
tags = ["automanaged"],
|
||||||
|
)
|
@ -224,7 +224,7 @@ func (c *PodClient) WaitForErrorEventOrSuccess(pod *v1.Pod) (*v1.Event, error) {
|
|||||||
return ev, err
|
return ev, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// MatchContainerOutput gest output of a container and match expected regexp in the output.
|
// MatchContainerOutput gets output of a container and match expected regexp in the output.
|
||||||
func (c *PodClient) MatchContainerOutput(name string, containerName string, expectedRegexp string) error {
|
func (c *PodClient) MatchContainerOutput(name string, containerName string, expectedRegexp string) error {
|
||||||
f := c.f
|
f := c.f
|
||||||
output, err := GetPodLogs(f.ClientSet, f.Namespace.Name, name, containerName)
|
output, err := GetPodLogs(f.ClientSet, f.Namespace.Name, name, containerName)
|
||||||
|
@ -14,6 +14,7 @@ go_library(
|
|||||||
"benchmark_util.go",
|
"benchmark_util.go",
|
||||||
"container.go",
|
"container.go",
|
||||||
"doc.go",
|
"doc.go",
|
||||||
|
"gpus.go",
|
||||||
"image_list.go",
|
"image_list.go",
|
||||||
"resource_collector.go",
|
"resource_collector.go",
|
||||||
"simple_mount.go",
|
"simple_mount.go",
|
||||||
@ -37,12 +38,14 @@ go_library(
|
|||||||
"//vendor:github.com/onsi/gomega",
|
"//vendor:github.com/onsi/gomega",
|
||||||
"//vendor:github.com/opencontainers/runc/libcontainer/cgroups",
|
"//vendor:github.com/opencontainers/runc/libcontainer/cgroups",
|
||||||
"//vendor:k8s.io/apimachinery/pkg/api/errors",
|
"//vendor:k8s.io/apimachinery/pkg/api/errors",
|
||||||
|
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||||
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
|
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
|
||||||
"//vendor:k8s.io/apimachinery/pkg/labels",
|
"//vendor:k8s.io/apimachinery/pkg/labels",
|
||||||
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
|
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
|
||||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||||
"//vendor:k8s.io/apimachinery/pkg/util/uuid",
|
"//vendor:k8s.io/apimachinery/pkg/util/uuid",
|
||||||
"//vendor:k8s.io/apimachinery/pkg/util/wait",
|
"//vendor:k8s.io/apimachinery/pkg/util/wait",
|
||||||
|
"//vendor:k8s.io/client-go/pkg/api",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
135
test/e2e_node/gpus.go
Normal file
135
test/e2e_node/gpus.go
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2017 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package e2e_node
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"k8s.io/apimachinery/pkg/api/resource"
|
||||||
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
|
"k8s.io/client-go/pkg/api"
|
||||||
|
"k8s.io/kubernetes/pkg/api/v1"
|
||||||
|
"k8s.io/kubernetes/pkg/apis/componentconfig"
|
||||||
|
"k8s.io/kubernetes/test/e2e/framework"
|
||||||
|
|
||||||
|
. "github.com/onsi/ginkgo"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
// acceleratorsFeatureGate is the feature-gate setting that the GPU test adds
// to the kubelet's FeatureGates string before creating GPU-consuming pods.
const acceleratorsFeatureGate = "Accelerators=true"
|
||||||
|
|
||||||
|
// Serial because the test updates kubelet configuration.
|
||||||
|
var _ = framework.KubeDescribe("GPU [Serial]", func() {
|
||||||
|
f := framework.NewDefaultFramework("gpu-test")
|
||||||
|
Context("attempt to use GPUs if available", func() {
|
||||||
|
It("setup the node and create pods to test gpus", func() {
|
||||||
|
By("ensuring that dynamic kubelet configuration is enabled")
|
||||||
|
enabled, err := isKubeletConfigEnabled(f)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
if !enabled {
|
||||||
|
Skip("Dynamic Kubelet configuration is not enabled. Skipping test.")
|
||||||
|
}
|
||||||
|
|
||||||
|
By("enabling support for GPUs")
|
||||||
|
var oldCfg *componentconfig.KubeletConfiguration
|
||||||
|
defer func() {
|
||||||
|
if oldCfg != nil {
|
||||||
|
framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
oldCfg, err = getCurrentKubeletConfig()
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
clone, err := api.Scheme.DeepCopy(oldCfg)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
newCfg := clone.(*componentconfig.KubeletConfiguration)
|
||||||
|
if newCfg.FeatureGates != "" {
|
||||||
|
newCfg.FeatureGates = fmt.Sprintf("%s,%s", acceleratorsFeatureGate, newCfg.FeatureGates)
|
||||||
|
} else {
|
||||||
|
newCfg.FeatureGates = acceleratorsFeatureGate
|
||||||
|
}
|
||||||
|
framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
|
||||||
|
|
||||||
|
By("Getting the local node object from the api server")
|
||||||
|
nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
|
||||||
|
framework.ExpectNoError(err, "getting node list")
|
||||||
|
Expect(len(nodeList.Items)).To(Equal(1))
|
||||||
|
node := nodeList.Items[0]
|
||||||
|
gpusAvailable := node.Status.Capacity.NvidiaGPU()
|
||||||
|
By("Skipping the test if GPUs aren't available")
|
||||||
|
if gpusAvailable.IsZero() {
|
||||||
|
Skip("No GPUs available on local node. Skipping test.")
|
||||||
|
}
|
||||||
|
|
||||||
|
By("Creating a pod that will consume all GPUs")
|
||||||
|
podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
|
||||||
|
podSuccess = f.PodClient().CreateSync(podSuccess)
|
||||||
|
|
||||||
|
By("Checking if the pod outputted Success to its logs")
|
||||||
|
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
|
||||||
|
|
||||||
|
By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet")
|
||||||
|
podFailure := makePod(1, "gpu-failure")
|
||||||
|
framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) {
|
||||||
|
if pod.Status.Phase == v1.PodFailed {
|
||||||
|
return true, nil
|
||||||
|
|
||||||
|
}
|
||||||
|
return false, nil
|
||||||
|
})
|
||||||
|
|
||||||
|
By("stopping the original Pod with GPUs")
|
||||||
|
gp := int64(0)
|
||||||
|
deleteOptions := metav1.DeleteOptions{
|
||||||
|
GracePeriodSeconds: &gp,
|
||||||
|
}
|
||||||
|
f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, 30*time.Second)
|
||||||
|
|
||||||
|
By("attempting to start the failed pod again")
|
||||||
|
f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, 10*time.Second)
|
||||||
|
podFailure = f.PodClient().CreateSync(podFailure)
|
||||||
|
|
||||||
|
By("Checking if the pod outputted Success to its logs")
|
||||||
|
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success"))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
func makePod(gpus int64, name string) *v1.Pod {
|
||||||
|
resources := v1.ResourceRequirements{
|
||||||
|
Limits: v1.ResourceList{
|
||||||
|
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
|
||||||
|
return &v1.Pod{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{
|
||||||
|
Name: name,
|
||||||
|
},
|
||||||
|
Spec: v1.PodSpec{
|
||||||
|
Containers: []v1.Container{
|
||||||
|
{
|
||||||
|
Image: "gcr.io/google_containers/busybox:1.24",
|
||||||
|
Name: name,
|
||||||
|
Command: []string{"sh", "-c", gpuverificationCmd},
|
||||||
|
Resources: resources,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user