mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-28 05:57:25 +00:00
Merge pull request #86062 from haosdent/clean-e2e-framework-gpu
e2e: move funs of framework/gpu to e2e_node
This commit is contained in:
commit
a097243cba
@ -5,12 +5,6 @@ go_library(
|
|||||||
srcs = ["gpu_util.go"],
|
srcs = ["gpu_util.go"],
|
||||||
importpath = "k8s.io/kubernetes/test/e2e/framework/gpu",
|
importpath = "k8s.io/kubernetes/test/e2e/framework/gpu",
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
|
||||||
"//staging/src/k8s.io/api/core/v1:go_default_library",
|
|
||||||
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
|
|
||||||
"//staging/src/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
|
|
||||||
"//test/e2e/framework:go_default_library",
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
filegroup(
|
filegroup(
|
||||||
|
@ -16,13 +16,6 @@ limitations under the License.
|
|||||||
|
|
||||||
package gpu
|
package gpu
|
||||||
|
|
||||||
import (
|
|
||||||
v1 "k8s.io/api/core/v1"
|
|
||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
||||||
"k8s.io/apimachinery/pkg/util/uuid"
|
|
||||||
"k8s.io/kubernetes/test/e2e/framework"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8
|
// NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8
|
||||||
// this uses the device plugin mechanism
|
// this uses the device plugin mechanism
|
||||||
@ -33,31 +26,3 @@ const (
|
|||||||
// so we can override the daemonset in other setups (non COS).
|
// so we can override the daemonset in other setups (non COS).
|
||||||
GPUDevicePluginDSYAML = "https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
|
GPUDevicePluginDSYAML = "https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
|
||||||
)
|
)
|
||||||
|
|
||||||
// NumberOfNVIDIAGPUs returns the number of GPUs advertised by a node
|
|
||||||
// This is based on the Device Plugin system and expected to run on a COS based node
|
|
||||||
// After the NVIDIA drivers were installed
|
|
||||||
// TODO make this generic and not linked to COS only
|
|
||||||
func NumberOfNVIDIAGPUs(node *v1.Node) int64 {
|
|
||||||
val, ok := node.Status.Capacity[NVIDIAGPUResourceName]
|
|
||||||
if !ok {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
return val.Value()
|
|
||||||
}
|
|
||||||
|
|
||||||
// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
|
|
||||||
func NVIDIADevicePlugin() *v1.Pod {
|
|
||||||
ds, err := framework.DsFromManifest(GPUDevicePluginDSYAML)
|
|
||||||
framework.ExpectNoError(err)
|
|
||||||
p := &v1.Pod{
|
|
||||||
ObjectMeta: metav1.ObjectMeta{
|
|
||||||
Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
|
|
||||||
Namespace: metav1.NamespaceSystem,
|
|
||||||
},
|
|
||||||
Spec: ds.Spec.Template.Spec,
|
|
||||||
}
|
|
||||||
// Remove node affinity
|
|
||||||
p.Spec.Affinity = nil
|
|
||||||
return p
|
|
||||||
}
|
|
||||||
|
@ -23,6 +23,7 @@ import (
|
|||||||
|
|
||||||
v1 "k8s.io/api/core/v1"
|
v1 "k8s.io/api/core/v1"
|
||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/util/uuid"
|
||||||
kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
|
kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||||
"k8s.io/kubernetes/test/e2e/framework"
|
"k8s.io/kubernetes/test/e2e/framework"
|
||||||
"k8s.io/kubernetes/test/e2e/framework/gpu"
|
"k8s.io/kubernetes/test/e2e/framework/gpu"
|
||||||
@ -33,6 +34,34 @@ import (
|
|||||||
"github.com/prometheus/common/model"
|
"github.com/prometheus/common/model"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// numberOfNVIDIAGPUs returns the number of GPUs advertised by a node
|
||||||
|
// This is based on the Device Plugin system and expected to run on a COS based node
|
||||||
|
// After the NVIDIA drivers were installed
|
||||||
|
// TODO make this generic and not linked to COS only
|
||||||
|
func numberOfNVIDIAGPUs(node *v1.Node) int64 {
|
||||||
|
val, ok := node.Status.Capacity[gpu.NVIDIAGPUResourceName]
|
||||||
|
if !ok {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return val.Value()
|
||||||
|
}
|
||||||
|
|
||||||
|
// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
|
||||||
|
func NVIDIADevicePlugin() *v1.Pod {
|
||||||
|
ds, err := framework.DsFromManifest(gpu.GPUDevicePluginDSYAML)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
p := &v1.Pod{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{
|
||||||
|
Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
|
||||||
|
Namespace: metav1.NamespaceSystem,
|
||||||
|
},
|
||||||
|
Spec: ds.Spec.Template.Spec,
|
||||||
|
}
|
||||||
|
// Remove node affinity
|
||||||
|
p.Spec.Affinity = nil
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
|
||||||
// Serial because the test restarts Kubelet
|
// Serial because the test restarts Kubelet
|
||||||
var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeature:GPUDevicePlugin][Serial] [Disruptive]", func() {
|
var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeature:GPUDevicePlugin][Serial] [Disruptive]", func() {
|
||||||
f := framework.NewDefaultFramework("device-plugin-gpus-errors")
|
f := framework.NewDefaultFramework("device-plugin-gpus-errors")
|
||||||
@ -47,15 +76,15 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
|
|||||||
}
|
}
|
||||||
|
|
||||||
ginkgo.By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
|
ginkgo.By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
|
||||||
devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(gpu.NVIDIADevicePlugin())
|
devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(NVIDIADevicePlugin())
|
||||||
framework.ExpectNoError(err)
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
ginkgo.By("Waiting for GPUs to become available on the local node")
|
ginkgo.By("Waiting for GPUs to become available on the local node")
|
||||||
gomega.Eventually(func() bool {
|
gomega.Eventually(func() bool {
|
||||||
return gpu.NumberOfNVIDIAGPUs(getLocalNode(f)) > 0
|
return numberOfNVIDIAGPUs(getLocalNode(f)) > 0
|
||||||
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
||||||
|
|
||||||
if gpu.NumberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
|
if numberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
|
||||||
ginkgo.Skip("Not enough GPUs to execute this test (at least two needed)")
|
ginkgo.Skip("Not enough GPUs to execute this test (at least two needed)")
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@ -95,7 +124,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
|
|||||||
restartKubelet()
|
restartKubelet()
|
||||||
framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout)
|
framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout)
|
||||||
gomega.Eventually(func() bool {
|
gomega.Eventually(func() bool {
|
||||||
return gpu.NumberOfNVIDIAGPUs(getLocalNode(f)) > 0
|
return numberOfNVIDIAGPUs(getLocalNode(f)) > 0
|
||||||
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
||||||
p2 := f.PodClient().CreateSync(makeBusyboxPod(gpu.NVIDIAGPUResourceName, podRECMD))
|
p2 := f.PodClient().CreateSync(makeBusyboxPod(gpu.NVIDIAGPUResourceName, podRECMD))
|
||||||
|
|
||||||
@ -110,7 +139,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
|
|||||||
gomega.Eventually(func() bool {
|
gomega.Eventually(func() bool {
|
||||||
node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
|
node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
|
||||||
framework.ExpectNoError(err)
|
framework.ExpectNoError(err)
|
||||||
return gpu.NumberOfNVIDIAGPUs(node) <= 0
|
return numberOfNVIDIAGPUs(node) <= 0
|
||||||
}, 10*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
}, 10*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
||||||
ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin.")
|
ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin.")
|
||||||
ensurePodContainerRestart(f, p1.Name, p1.Name)
|
ensurePodContainerRestart(f, p1.Name, p1.Name)
|
||||||
|
Loading…
Reference in New Issue
Block a user