diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index a1d5a810df9..883e275c026 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -686,3 +686,4 @@ windows-line-endings www-prefix zone-id zone-name + diff --git a/pkg/kubelet/BUILD b/pkg/kubelet/BUILD index d412f34fcb3..848f44146c7 100644 --- a/pkg/kubelet/BUILD +++ b/pkg/kubelet/BUILD @@ -58,6 +58,8 @@ go_library( "//pkg/kubelet/envvars:go_default_library", "//pkg/kubelet/events:go_default_library", "//pkg/kubelet/eviction:go_default_library", + "//pkg/kubelet/gpu:go_default_library", + "//pkg/kubelet/gpu/nvidia:go_default_library", "//pkg/kubelet/images:go_default_library", "//pkg/kubelet/kuberuntime:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", @@ -169,6 +171,7 @@ go_test( "//pkg/kubelet/container:go_default_library", "//pkg/kubelet/container/testing:go_default_library", "//pkg/kubelet/eviction:go_default_library", + "//pkg/kubelet/gpu:go_default_library", "//pkg/kubelet/images:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/network:go_default_library", @@ -246,6 +249,7 @@ filegroup( "//pkg/kubelet/envvars:all-srcs", "//pkg/kubelet/events:all-srcs", "//pkg/kubelet/eviction:all-srcs", + "//pkg/kubelet/gpu:all-srcs", "//pkg/kubelet/images:all-srcs", "//pkg/kubelet/kuberuntime:all-srcs", "//pkg/kubelet/leaky:all-srcs", diff --git a/pkg/kubelet/gpu/BUILD b/pkg/kubelet/gpu/BUILD new file mode 100644 index 00000000000..9c0ba77ae6c --- /dev/null +++ b/pkg/kubelet/gpu/BUILD @@ -0,0 +1,34 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +load( + "@io_bazel_rules_go//go:def.bzl", + "go_library", +) + +go_library( + name = "go_default_library", + srcs = [ + "gpu_manager_stub.go", + "types.go", + ], + tags = ["automanaged"], + deps = ["//pkg/api/v1:go_default_library"], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + 
visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [ + ":package-srcs", + "//pkg/kubelet/gpu/nvidia:all-srcs", + ], + tags = ["automanaged"], +) diff --git a/pkg/kubelet/gpu/nvidia/BUILD b/pkg/kubelet/gpu/nvidia/BUILD new file mode 100644 index 00000000000..c7f03202945 --- /dev/null +++ b/pkg/kubelet/gpu/nvidia/BUILD @@ -0,0 +1,38 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +load( + "@io_bazel_rules_go//go:def.bzl", + "go_library", +) + +go_library( + name = "go_default_library", + srcs = [ + "helpers.go", + "nvidia_gpu_manager.go", + ], + tags = ["automanaged"], + deps = [ + "//pkg/api/v1:go_default_library", + "//pkg/kubelet/dockertools:go_default_library", + "//pkg/kubelet/gpu:go_default_library", + "//vendor:github.com/golang/glog", + "//vendor:k8s.io/apimachinery/pkg/api/resource", + "//vendor:k8s.io/apimachinery/pkg/util/sets", + ], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], +) diff --git a/test/e2e/framework/pods.go b/test/e2e/framework/pods.go index b9dd95ad33f..17430ddef48 100644 --- a/test/e2e/framework/pods.go +++ b/test/e2e/framework/pods.go @@ -224,7 +224,7 @@ func (c *PodClient) WaitForErrorEventOrSuccess(pod *v1.Pod) (*v1.Event, error) { return ev, err } -// MatchContainerOutput gest output of a container and match expected regexp in the output. +// MatchContainerOutput gets output of a container and match expected regexp in the output. 
func (c *PodClient) MatchContainerOutput(name string, containerName string, expectedRegexp string) error { f := c.f output, err := GetPodLogs(f.ClientSet, f.Namespace.Name, name, containerName) diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 31cf23408a5..5408bad9009 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -14,6 +14,7 @@ go_library( "benchmark_util.go", "container.go", "doc.go", + "gpus.go", "image_list.go", "resource_collector.go", "simple_mount.go", @@ -37,12 +38,14 @@ go_library( "//vendor:github.com/onsi/gomega", "//vendor:github.com/opencontainers/runc/libcontainer/cgroups", "//vendor:k8s.io/apimachinery/pkg/api/errors", + "//vendor:k8s.io/apimachinery/pkg/api/resource", "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", "//vendor:k8s.io/apimachinery/pkg/labels", "//vendor:k8s.io/apimachinery/pkg/util/runtime", "//vendor:k8s.io/apimachinery/pkg/util/sets", "//vendor:k8s.io/apimachinery/pkg/util/uuid", "//vendor:k8s.io/apimachinery/pkg/util/wait", + "//vendor:k8s.io/client-go/pkg/api", ], ) diff --git a/test/e2e_node/gpus.go b/test/e2e_node/gpus.go new file mode 100644 index 00000000000..d8c651f2e8c --- /dev/null +++ b/test/e2e_node/gpus.go @@ -0,0 +1,135 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package e2e_node
+
+import (
+	"fmt"
+	"time"
+
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/pkg/api"
+	"k8s.io/kubernetes/pkg/api/v1"
+	"k8s.io/kubernetes/pkg/apis/componentconfig"
+	"k8s.io/kubernetes/test/e2e/framework"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+// acceleratorsFeatureGate is the feature-gate setting this test adds to the
+// kubelet configuration before it looks for GPUs on the node.
+const acceleratorsFeatureGate = "Accelerators=true"
+
+// Serial because the test updates kubelet configuration.
+//
+// The spec enables the Accelerators feature gate, starts a pod that requests
+// every GPU reported in the node's capacity, checks that a second pod asking
+// for one more GPU ends up Failed, and finally checks that the second pod can
+// run once the first one has been deleted.
+var _ = framework.KubeDescribe("GPU [Serial]", func() {
+	f := framework.NewDefaultFramework("gpu-test")
+	Context("attempt to use GPUs if available", func() {
+		It("setup the node and create pods to test gpus", func() {
+			By("ensuring that dynamic kubelet configuration is enabled")
+			enabled, err := isKubeletConfigEnabled(f)
+			framework.ExpectNoError(err)
+			if !enabled {
+				Skip("Dynamic Kubelet configuration is not enabled. Skipping test.")
+			}
+
+			By("enabling support for GPUs")
+			// Put the original kubelet configuration back when the spec exits.
+			// oldCfg is still nil, and nothing is restored, if it was never fetched.
+			var oldCfg *componentconfig.KubeletConfiguration
+			defer func() {
+				if oldCfg != nil {
+					framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
+				}
+			}()
+
+			oldCfg, err = getCurrentKubeletConfig()
+			framework.ExpectNoError(err)
+			// Work on a deep copy so oldCfg keeps the values to restore.
+			clone, err := api.Scheme.DeepCopy(oldCfg)
+			framework.ExpectNoError(err)
+			newCfg := clone.(*componentconfig.KubeletConfiguration)
+			if newCfg.FeatureGates != "" {
+				newCfg.FeatureGates = fmt.Sprintf("%s,%s", acceleratorsFeatureGate, newCfg.FeatureGates)
+			} else {
+				newCfg.FeatureGates = acceleratorsFeatureGate
+			}
+			framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
+
+			By("Getting the local node object from the api server")
+			nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+			framework.ExpectNoError(err, "getting node list")
+			Expect(len(nodeList.Items)).To(Equal(1))
+			node := nodeList.Items[0]
+			gpusAvailable := node.Status.Capacity.NvidiaGPU()
+			By("Skipping the test if GPUs aren't available")
+			if gpusAvailable.IsZero() {
+				Skip("No GPUs available on local node. Skipping test.")
+			}
+
+			By("Creating a pod that will consume all GPUs")
+			podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
+			podSuccess = f.PodClient().CreateSync(podSuccess)
+
+			By("Checking if the pod outputted Success to its logs")
+			framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
+
+			By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet")
+			podFailure := makePod(1, "gpu-failure")
+			// Submit the pod without waiting for it to run: podSuccess already
+			// requested every GPU in the node's capacity, so this pod is expected
+			// to reach the Failed phase. The object returned by Create is dropped
+			// on purpose so that podFailure can be submitted again further down.
+			f.PodClient().Create(podFailure)
+			framework.ExpectNoError(framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) {
+				return pod.Status.Phase == v1.PodFailed, nil
+			}))
+
+			By("stopping the original Pod with GPUs")
+			// A zero grace period asks for immediate deletion.
+			gp := int64(0)
+			deleteOptions := metav1.DeleteOptions{
+				GracePeriodSeconds: &gp,
+			}
+			f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, 30*time.Second)
+
+			By("attempting to start the failed pod again")
+			f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, 10*time.Second)
+			podFailure = f.PodClient().CreateSync(podFailure)
+
+			By("Checking if the pod outputted Success to its logs")
+			framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success"))
+		})
+	})
+})
+
+// makePod returns a pod called name whose single container, also called name,
+// has a limit of gpus NVIDIA GPUs. The container command compares gpus with the
+// number of /dev/nvidia<N> entries it can see, exits with status 1 when they
+// differ, and otherwise prints "Success" and then sleeps.
+func makePod(gpus int64, name string) *v1.Pod {
+	resources := v1.ResourceRequirements{
+		Limits: v1.ResourceList{
+			v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
+		},
+	}
+	// `wc -l` turns the list of matching device names into a count, which is
+	// what the numeric -ne comparison needs.
+	gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$' | wc -l) ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+		},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{
+				{
+					Image:     "gcr.io/google_containers/busybox:1.24",
+					Name:      name,
+					Command:   []string{"sh", "-c", gpuverificationCmd},
+					Resources: resources,
+				},
+			},
+		},
+	}
+}