mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-24 20:24:09 +00:00
adding an e2e for GPUs
Signed-off-by: Vishnu kannan <vishnuk@google.com>
This commit is contained in:
parent
69acb02394
commit
318f4e102a
@ -686,3 +686,4 @@ windows-line-endings
|
||||
www-prefix
|
||||
zone-id
|
||||
zone-name
|
||||
|
||||
|
@ -58,6 +58,8 @@ go_library(
|
||||
"//pkg/kubelet/envvars:go_default_library",
|
||||
"//pkg/kubelet/events:go_default_library",
|
||||
"//pkg/kubelet/eviction:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//pkg/kubelet/gpu/nvidia:go_default_library",
|
||||
"//pkg/kubelet/images:go_default_library",
|
||||
"//pkg/kubelet/kuberuntime:go_default_library",
|
||||
"//pkg/kubelet/lifecycle:go_default_library",
|
||||
@ -169,6 +171,7 @@ go_test(
|
||||
"//pkg/kubelet/container:go_default_library",
|
||||
"//pkg/kubelet/container/testing:go_default_library",
|
||||
"//pkg/kubelet/eviction:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//pkg/kubelet/images:go_default_library",
|
||||
"//pkg/kubelet/lifecycle:go_default_library",
|
||||
"//pkg/kubelet/network:go_default_library",
|
||||
@ -246,6 +249,7 @@ filegroup(
|
||||
"//pkg/kubelet/envvars:all-srcs",
|
||||
"//pkg/kubelet/events:all-srcs",
|
||||
"//pkg/kubelet/eviction:all-srcs",
|
||||
"//pkg/kubelet/gpu:all-srcs",
|
||||
"//pkg/kubelet/images:all-srcs",
|
||||
"//pkg/kubelet/kuberuntime:all-srcs",
|
||||
"//pkg/kubelet/leaky:all-srcs",
|
||||
|
34
pkg/kubelet/gpu/BUILD
Normal file
34
pkg/kubelet/gpu/BUILD
Normal file
@ -0,0 +1,34 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_go//go:def.bzl",
|
||||
"go_library",
|
||||
)
|
||||
|
||||
go_library(
|
||||
name = "go_default_library",
|
||||
srcs = [
|
||||
"gpu_manager_stub.go",
|
||||
"types.go",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
deps = ["//pkg/api/v1:go_default_library"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "package-srcs",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["automanaged"],
|
||||
visibility = ["//visibility:private"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all-srcs",
|
||||
srcs = [
|
||||
":package-srcs",
|
||||
"//pkg/kubelet/gpu/nvidia:all-srcs",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
)
|
38
pkg/kubelet/gpu/nvidia/BUILD
Normal file
38
pkg/kubelet/gpu/nvidia/BUILD
Normal file
@ -0,0 +1,38 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_go//go:def.bzl",
|
||||
"go_library",
|
||||
)
|
||||
|
||||
go_library(
|
||||
name = "go_default_library",
|
||||
srcs = [
|
||||
"helpers.go",
|
||||
"nvidia_gpu_manager.go",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
deps = [
|
||||
"//pkg/api/v1:go_default_library",
|
||||
"//pkg/kubelet/dockertools:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//vendor:github.com/golang/glog",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "package-srcs",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["automanaged"],
|
||||
visibility = ["//visibility:private"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all-srcs",
|
||||
srcs = [":package-srcs"],
|
||||
tags = ["automanaged"],
|
||||
)
|
@ -224,7 +224,7 @@ func (c *PodClient) WaitForErrorEventOrSuccess(pod *v1.Pod) (*v1.Event, error) {
|
||||
return ev, err
|
||||
}
|
||||
|
||||
// MatchContainerOutput gest output of a container and match expected regexp in the output.
|
||||
// MatchContainerOutput gets output of a container and match expected regexp in the output.
|
||||
func (c *PodClient) MatchContainerOutput(name string, containerName string, expectedRegexp string) error {
|
||||
f := c.f
|
||||
output, err := GetPodLogs(f.ClientSet, f.Namespace.Name, name, containerName)
|
||||
|
@ -14,6 +14,7 @@ go_library(
|
||||
"benchmark_util.go",
|
||||
"container.go",
|
||||
"doc.go",
|
||||
"gpus.go",
|
||||
"image_list.go",
|
||||
"resource_collector.go",
|
||||
"simple_mount.go",
|
||||
@ -37,12 +38,14 @@ go_library(
|
||||
"//vendor:github.com/onsi/gomega",
|
||||
"//vendor:github.com/opencontainers/runc/libcontainer/cgroups",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/errors",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
|
||||
"//vendor:k8s.io/apimachinery/pkg/labels",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/uuid",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/wait",
|
||||
"//vendor:k8s.io/client-go/pkg/api",
|
||||
],
|
||||
)
|
||||
|
||||
|
135
test/e2e_node/gpus.go
Normal file
135
test/e2e_node/gpus.go
Normal file
@ -0,0 +1,135 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e_node
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/pkg/api"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
"k8s.io/kubernetes/pkg/apis/componentconfig"
|
||||
"k8s.io/kubernetes/test/e2e/framework"
|
||||
|
||||
. "github.com/onsi/ginkgo"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
const acceleratorsFeatureGate = "Accelerators=true"
|
||||
|
||||
// Serial because the test updates kubelet configuration.
|
||||
var _ = framework.KubeDescribe("GPU [Serial]", func() {
|
||||
f := framework.NewDefaultFramework("gpu-test")
|
||||
Context("attempt to use GPUs if available", func() {
|
||||
It("setup the node and create pods to test gpus", func() {
|
||||
By("ensuring that dynamic kubelet configuration is enabled")
|
||||
enabled, err := isKubeletConfigEnabled(f)
|
||||
framework.ExpectNoError(err)
|
||||
if !enabled {
|
||||
Skip("Dynamic Kubelet configuration is not enabled. Skipping test.")
|
||||
}
|
||||
|
||||
By("enabling support for GPUs")
|
||||
var oldCfg *componentconfig.KubeletConfiguration
|
||||
defer func() {
|
||||
if oldCfg != nil {
|
||||
framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
|
||||
}
|
||||
}()
|
||||
|
||||
oldCfg, err = getCurrentKubeletConfig()
|
||||
framework.ExpectNoError(err)
|
||||
clone, err := api.Scheme.DeepCopy(oldCfg)
|
||||
framework.ExpectNoError(err)
|
||||
newCfg := clone.(*componentconfig.KubeletConfiguration)
|
||||
if newCfg.FeatureGates != "" {
|
||||
newCfg.FeatureGates = fmt.Sprintf("%s,%s", acceleratorsFeatureGate, newCfg.FeatureGates)
|
||||
} else {
|
||||
newCfg.FeatureGates = acceleratorsFeatureGate
|
||||
}
|
||||
framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
|
||||
|
||||
By("Getting the local node object from the api server")
|
||||
nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
|
||||
framework.ExpectNoError(err, "getting node list")
|
||||
Expect(len(nodeList.Items)).To(Equal(1))
|
||||
node := nodeList.Items[0]
|
||||
gpusAvailable := node.Status.Capacity.NvidiaGPU()
|
||||
By("Skipping the test if GPUs aren't available")
|
||||
if gpusAvailable.IsZero() {
|
||||
Skip("No GPUs available on local node. Skipping test.")
|
||||
}
|
||||
|
||||
By("Creating a pod that will consume all GPUs")
|
||||
podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
|
||||
podSuccess = f.PodClient().CreateSync(podSuccess)
|
||||
|
||||
By("Checking if the pod outputted Success to its logs")
|
||||
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
|
||||
|
||||
By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet")
|
||||
podFailure := makePod(1, "gpu-failure")
|
||||
framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) {
|
||||
if pod.Status.Phase == v1.PodFailed {
|
||||
return true, nil
|
||||
|
||||
}
|
||||
return false, nil
|
||||
})
|
||||
|
||||
By("stopping the original Pod with GPUs")
|
||||
gp := int64(0)
|
||||
deleteOptions := metav1.DeleteOptions{
|
||||
GracePeriodSeconds: &gp,
|
||||
}
|
||||
f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, 30*time.Second)
|
||||
|
||||
By("attempting to start the failed pod again")
|
||||
f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, 10*time.Second)
|
||||
podFailure = f.PodClient().CreateSync(podFailure)
|
||||
|
||||
By("Checking if the pod outputted Success to its logs")
|
||||
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success"))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
func makePod(gpus int64, name string) *v1.Pod {
|
||||
resources := v1.ResourceRequirements{
|
||||
Limits: v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
|
||||
},
|
||||
}
|
||||
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
|
||||
return &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
Containers: []v1.Container{
|
||||
{
|
||||
Image: "gcr.io/google_containers/busybox:1.24",
|
||||
Name: name,
|
||||
Command: []string{"sh", "-c", gpuverificationCmd},
|
||||
Resources: resources,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user