Merge pull request #86062 from haosdent/clean-e2e-framework-gpu

e2e: move funs of framework/gpu to e2e_node
Kubernetes Prow Robot 2019-12-28 21:23:39 -08:00 committed by GitHub
commit a097243cba
3 changed files with 34 additions and 46 deletions
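
The mechanical effect of the move, summarized as a before/after sketch of a call site (identifiers as they appear in the diff below; the node variable is illustrative):

    // Before: the helper was exported from the shared framework package.
    count := gpu.NumberOfNVIDIAGPUs(node)

    // After: the helper is a private copy inside test/e2e_node; only the
    // resource-name constant still comes from the gpu package.
    count := numberOfNVIDIAGPUs(node) // reads node.Status.Capacity[gpu.NVIDIAGPUResourceName]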

test/e2e/framework/gpu/BUILD

@@ -5,12 +5,6 @@ go_library(
     srcs = ["gpu_util.go"],
     importpath = "k8s.io/kubernetes/test/e2e/framework/gpu",
     visibility = ["//visibility:public"],
-    deps = [
-        "//staging/src/k8s.io/api/core/v1:go_default_library",
-        "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
-        "//staging/src/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
-        "//test/e2e/framework:go_default_library",
-    ],
 )
 filegroup(

test/e2e/framework/gpu/gpu_util.go

@@ -16,13 +16,6 @@ limitations under the License.
 package gpu
-
-import (
-    v1 "k8s.io/api/core/v1"
-    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/apimachinery/pkg/util/uuid"
-    "k8s.io/kubernetes/test/e2e/framework"
-)
 const (
     // NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8
     // this uses the device plugin mechanism
@@ -33,31 +26,3 @@ const (
     // so we can override the daemonset in other setups (non COS).
     GPUDevicePluginDSYAML = "https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
 )
-
-// NumberOfNVIDIAGPUs returns the number of GPUs advertised by a node
-// This is based on the Device Plugin system and expected to run on a COS based node
-// After the NVIDIA drivers were installed
-// TODO make this generic and not linked to COS only
-func NumberOfNVIDIAGPUs(node *v1.Node) int64 {
-    val, ok := node.Status.Capacity[NVIDIAGPUResourceName]
-    if !ok {
-        return 0
-    }
-    return val.Value()
-}
-
-// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
-func NVIDIADevicePlugin() *v1.Pod {
-    ds, err := framework.DsFromManifest(GPUDevicePluginDSYAML)
-    framework.ExpectNoError(err)
-    p := &v1.Pod{
-        ObjectMeta: metav1.ObjectMeta{
-            Name:      "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
-            Namespace: metav1.NamespaceSystem,
-        },
-        Spec: ds.Spec.Template.Spec,
-    }
-    // Remove node affinity
-    p.Spec.Affinity = nil
-    return p
-}
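
After this change the gpu package keeps only its constants: the extended resource name and the device-plugin daemonset URL. For context, a minimal sketch of how a test pod consumes a GPU through that extended resource (the pod name, image, command, and helper function here are illustrative, not part of this commit):

    import (
        v1 "k8s.io/api/core/v1"
        "k8s.io/apimachinery/pkg/api/resource"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/kubernetes/test/e2e/framework/gpu"
    )

    // gpuPod builds a pod that asks the device plugin for one NVIDIA GPU
    // by setting a limit on the nvidia.com/gpu extended resource.
    func gpuPod() *v1.Pod {
        return &v1.Pod{
            ObjectMeta: metav1.ObjectMeta{Name: "gpu-test-pod"},
            Spec: v1.PodSpec{
                RestartPolicy: v1.RestartPolicyNever,
                Containers: []v1.Container{{
                    Name:    "cuda-container",
                    Image:   "busybox",
                    Command: []string{"sh", "-c", "env && sleep 3600"},
                    Resources: v1.ResourceRequirements{
                        Limits: v1.ResourceList{
                            gpu.NVIDIAGPUResourceName: resource.MustParse("1"),
                        },
                    },
                }},
            },
        }
    }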

test/e2e_node/gpu_device_plugin_test.go

@@ -23,6 +23,7 @@ import (
     v1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/util/uuid"
     kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
     "k8s.io/kubernetes/test/e2e/framework"
     "k8s.io/kubernetes/test/e2e/framework/gpu"
@@ -33,6 +34,34 @@ import (
     "github.com/prometheus/common/model"
 )
+
+// numberOfNVIDIAGPUs returns the number of GPUs advertised by a node
+// This is based on the Device Plugin system and expected to run on a COS based node
+// After the NVIDIA drivers were installed
+// TODO make this generic and not linked to COS only
+func numberOfNVIDIAGPUs(node *v1.Node) int64 {
+    val, ok := node.Status.Capacity[gpu.NVIDIAGPUResourceName]
+    if !ok {
+        return 0
+    }
+    return val.Value()
+}
+
+// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
+func NVIDIADevicePlugin() *v1.Pod {
+    ds, err := framework.DsFromManifest(gpu.GPUDevicePluginDSYAML)
+    framework.ExpectNoError(err)
+    p := &v1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
+            Namespace: metav1.NamespaceSystem,
+        },
+        Spec: ds.Spec.Template.Spec,
+    }
+    // Remove node affinity
+    p.Spec.Affinity = nil
+    return p
+}
 // Serial because the test restarts Kubelet
 var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeature:GPUDevicePlugin][Serial] [Disruptive]", func() {
     f := framework.NewDefaultFramework("device-plugin-gpus-errors")
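
The TODO carried over with numberOfNVIDIAGPUs points at decoupling the counter from NVIDIA and COS specifics. A minimal sketch of that generalization (hypothetical helper, not part of this commit):

    // numberOfDevices reports how many units of an arbitrary extended
    // resource a node advertises in its capacity.
    func numberOfDevices(node *v1.Node, resourceName v1.ResourceName) int64 {
        val, ok := node.Status.Capacity[resourceName]
        if !ok {
            return 0
        }
        return val.Value()
    }

    // numberOfNVIDIAGPUs would then reduce to:
    //   return numberOfDevices(node, gpu.NVIDIAGPUResourceName)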
@@ -47,15 +76,15 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
         }
         ginkgo.By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
-        devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(gpu.NVIDIADevicePlugin())
+        devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(NVIDIADevicePlugin())
         framework.ExpectNoError(err)
         ginkgo.By("Waiting for GPUs to become available on the local node")
         gomega.Eventually(func() bool {
-            return gpu.NumberOfNVIDIAGPUs(getLocalNode(f)) > 0
+            return numberOfNVIDIAGPUs(getLocalNode(f)) > 0
         }, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
-        if gpu.NumberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
+        if numberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
             ginkgo.Skip("Not enough GPUs to execute this test (at least two needed)")
         }
     })
@@ -95,7 +124,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
         restartKubelet()
         framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout)
         gomega.Eventually(func() bool {
-            return gpu.NumberOfNVIDIAGPUs(getLocalNode(f)) > 0
+            return numberOfNVIDIAGPUs(getLocalNode(f)) > 0
         }, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
         p2 := f.PodClient().CreateSync(makeBusyboxPod(gpu.NVIDIAGPUResourceName, podRECMD))
@@ -110,7 +139,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
         gomega.Eventually(func() bool {
             node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
             framework.ExpectNoError(err)
-            return gpu.NumberOfNVIDIAGPUs(node) <= 0
+            return numberOfNVIDIAGPUs(node) <= 0
         }, 10*time.Minute, framework.Poll).Should(gomega.BeTrue())
         ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin.")
         ensurePodContainerRestart(f, p1.Name, p1.Name)