Merge pull request #47467 from mindprince/issue-47388-e2e-gke-gpu

Automatic merge from submit-queue

Update GPU e2e tests.

* Use nvidia driver installer from external repo.
    
    That installer decouples itself from COS image version (as long as the
    image version is newer than cos-stable-59-9460-60-0).
    
    A separate commit in the test-infra repo will update the cos version
    used for this test to cos-stable-59-9460-60-0.

* Use cos-stable-59-9460-60-0 and newer installer for GPU node e2e tests.

This is to enable #47388.

This supersedes #47091.

**Release note**:
```release-note
NONE
```

/sig node
This commit is contained in:
Kubernetes Submit Queue 2017-06-13 21:11:08 -07:00 committed by GitHub
commit 9fff13b72a
3 changed files with 24 additions and 9 deletions

View File

@ -17,6 +17,8 @@ limitations under the License.
package e2e
import (
"io/ioutil"
"net/http"
"strings"
"time"
@ -29,7 +31,6 @@ import (
"k8s.io/kubernetes/pkg/api/v1"
extensions "k8s.io/kubernetes/pkg/apis/extensions/v1beta1"
"k8s.io/kubernetes/test/e2e/framework"
"k8s.io/kubernetes/test/e2e/generated"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
@ -42,7 +43,7 @@ const (
// Nvidia driver installation can take upwards of 5 minutes.
driverInstallTimeout = 10 * time.Minute
// Nvidia COS driver installer daemonset.
cosNvidiaDriverInstallerPath = "cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml"
cosNvidiaDriverInstallerUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/stable/cos-nvidia-gpu-installer/daemonset.yaml"
)
func makeCudaAdditionTestPod() *v1.Pod {
@ -135,7 +136,7 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
// GPU drivers might have already been installed.
if !areGPUsAvailableOnAllSchedulableNodes(f) {
// Install Nvidia Drivers.
ds := dsFromManifest(cosNvidiaDriverInstallerPath)
ds := dsFromManifest(cosNvidiaDriverInstallerUrl)
ds.Namespace = f.Namespace.Name
_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
framework.ExpectNoError(err, "failed to create daemonset")
@ -158,10 +159,25 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
}
// dsFromManifest reads a .json/yaml file and returns the daemonset in it.
func dsFromManifest(fileName string) *extensions.DaemonSet {
func dsFromManifest(url string) *extensions.DaemonSet {
var controller extensions.DaemonSet
framework.Logf("Parsing ds from %v", fileName)
data := generated.ReadOrDie(fileName)
framework.Logf("Parsing ds from %v", url)
var response *http.Response
var err error
for i := 1; i <= 5; i++ {
response, err = http.Get(url)
if err == nil && response.StatusCode == 200 {
break
}
time.Sleep(time.Duration(i) * time.Second)
}
Expect(err).NotTo(HaveOccurred())
Expect(response.StatusCode).To(Equal(200))
defer response.Body.Close()
data, err := ioutil.ReadAll(response.Body)
Expect(err).NotTo(HaveOccurred())
json, err := utilyaml.ToJSON(data)
Expect(err).NotTo(HaveOccurred())

View File

@ -2,7 +2,7 @@
runcmd:
- modprobe configs
- docker run -v /dev:/dev -v /home/kubernetes/bin/nvidia:/rootfs/nvidia -v /etc/os-release:/rootfs/etc/os-release -v /proc/sysrq-trigger:/sysrq -e LAKITU_KERNEL_SHA1=26481563cb3788ad254c2bf2126b843c161c7e48 -e BASE_DIR=/rootfs/nvidia --privileged gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
- docker run -v /dev:/dev -v /home/kubernetes/bin/nvidia:/rootfs/nvidia -v /etc/os-release:/rootfs/etc/os-release -v /proc/sysrq-trigger:/sysrq -e BASE_DIR=/rootfs/nvidia --privileged gcr.io/google_containers/cos-nvidia-driver-install@sha256:cb55c7971c337fece62f2bfe858662522a01e43ac9984a2dd1dd5c71487d225c
- mount /tmp /tmp -o remount,exec,suid
- usermod -a -G docker jenkins
- mkdir -p /var/lib/kubelet

View File

@ -16,11 +16,10 @@ images:
image: e2e-node-containervm-v20161208-image # docker 1.11.2
project: kubernetes-node-e2e-images
gci:
image_regex: cos-beta-59-9460-20-0 # docker 1.11.2
image_regex: cos-stable-59-9460-60-0 # docker 1.11.2
project: cos-cloud
metadata: "user-data<test/e2e_node/jenkins/gci-init-gpu.yaml,gci-update-strategy=update_disabled"
resources:
accelerators:
- type: nvidia-tesla-k80
count: 2