diff --git a/cluster/gce/BUILD b/cluster/gce/BUILD index 66079dc2fcd..ccc7be8ae25 100644 --- a/cluster/gce/BUILD +++ b/cluster/gce/BUILD @@ -32,10 +32,7 @@ filegroup( filegroup( name = "all-srcs", - srcs = [ - ":package-srcs", - "//cluster/gce/gci/nvidia-gpus:all-srcs", - ], + srcs = [":package-srcs"], tags = ["automanaged"], ) diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 7e0d0780281..14a565c46ff 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -68,9 +68,7 @@ fi # variable. Also please update corresponding image for node e2e at: # https://github.com/kubernetes/kubernetes/blob/master/test/e2e_node/jenkins/image-config.yaml CVM_VERSION=${CVM_VERSION:-container-vm-v20170214} -# NOTE: Update the kernel commit SHA in cluster/addons/nvidia-gpus/cos-installer-daemonset.yaml -# while updating the COS version here. -GCI_VERSION=${KUBE_GCI_VERSION:-cos-beta-59-9460-20-0} +GCI_VERSION=${KUBE_GCI_VERSION:-cos-stable-59-9460-64-0} MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-} MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-cos-cloud} NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${CVM_VERSION}} diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index bf442df0b21..2cbc85a7d9c 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -67,7 +67,7 @@ fi # variable. Also please update corresponding image for node e2e at: # https://github.com/kubernetes/kubernetes/blob/master/test/e2e_node/jenkins/image-config.yaml CVM_VERSION=${CVM_VERSION:-container-vm-v20170214} -GCI_VERSION=${KUBE_GCI_VERSION:-cos-beta-59-9460-20-0} +GCI_VERSION=${KUBE_GCI_VERSION:-cos-stable-59-9460-64-0} MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-} MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-cos-cloud} NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${CVM_VERSION}} diff --git a/cluster/gce/gci/nvidia-gpus/BUILD b/cluster/gce/gci/nvidia-gpus/BUILD deleted file mode 100644 index 0f8fa04948e..00000000000 --- a/cluster/gce/gci/nvidia-gpus/BUILD +++ /dev/null @@ -1,24 +0,0 @@ -package(default_visibility = ["//visibility:public"]) - -load("@io_bazel//tools/build_defs/pkg:pkg.bzl", "pkg_tar") -load("@io_kubernetes_build//defs:build.bzl", "release_filegroup") - -filegroup( - name = "sources", - srcs = glob([ - "**/*", - ]), -) - -filegroup( - name = "package-srcs", - srcs = glob(["**"]), - tags = ["automanaged"], - visibility = ["//visibility:private"], -) - -filegroup( - name = "all-srcs", - srcs = [":package-srcs"], - tags = ["automanaged"], -) diff --git a/cluster/gce/gci/nvidia-gpus/Dockerfile b/cluster/gce/gci/nvidia-gpus/Dockerfile deleted file mode 100644 index 9cec8ab365d..00000000000 --- a/cluster/gce/gci/nvidia-gpus/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2017 The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM ubuntu:16.04 - -# Disable prompts from apt -ENV DEBIAN_FRONTEND noninteractive - -RUN apt-get -qq update -RUN apt-get install -qq pciutils gcc g++ git make dpkg-dev bc module-init-tools curl - -RUN mkdir /lakitu-kernel -RUN git clone https://chromium.googlesource.com/chromiumos/third_party/kernel /lakitu-kernel - -ADD installer.sh /usr/bin/nvidia-installer.sh -RUN chmod a+x /usr/bin/nvidia-installer.sh -CMD ["/usr/bin/nvidia-installer.sh"] \ No newline at end of file diff --git a/cluster/gce/gci/nvidia-gpus/Makefile b/cluster/gce/gci/nvidia-gpus/Makefile deleted file mode 100644 index 49a0dfc2c84..00000000000 --- a/cluster/gce/gci/nvidia-gpus/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2017 The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -TAG?=v0.1 -REGISTRY?=gcr.io/google_containers -IMAGE=cos-nvidia-driver-install - -all: container - -container: - docker build --pull -t ${REGISTRY}/${IMAGE}:${TAG} . - -push: - gcloud docker -- push ${REGISTRY}/${IMAGE}:${TAG} - -.PHONY: all container push diff --git a/cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml b/cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml deleted file mode 100644 index a8ecbf24962..00000000000 --- a/cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml +++ /dev/null @@ -1,57 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: DaemonSet -metadata: - name: cos-nvidia-installer - namespace: kube-system -spec: - template: - metadata: - labels: - name: cos-nvidia-installer - # Update the version tag here and `LAKITU_KERNEL_SHA1` while using against new COS releases. - cos-version: cos-beta-59-9460-20-0 - spec: - hostNetwork: true - hostPID: true - volumes: - - name: dev - hostPath: - path: /dev - - name: nvidia-overlay - hostPath: - path: /home/kubernetes/bin/nvidia - - name: os-release - hostPath: - path: /etc/os-release - - name: sysrq - hostPath: - path: /proc/sysrq-trigger - containers: - - image: gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8 - command: ["/bin/sh", "-c"] - args: ["usr/bin/nvidia-installer.sh && sleep infinity"] - env: - - name: BASE_DIR - value: "/rootfs/nvidia" - name: nvidia-driver-installer - resources: - requests: - cpu: 0.15 - securityContext: - privileged: true - env: - # The kernel SHA1 here should correspond to the GCI_VERSION specified by default under cluster/gce/config-default.sh - - name: LAKITU_KERNEL_SHA1 - value: 26481563cb3788ad254c2bf2126b843c161c7e48 - - name: BASE_DIR - value: "/rootfs/nvidia" - volumeMounts: - - name: nvidia-overlay - mountPath: /rootfs/nvidia - - name: dev - mountPath: /dev - - name: os-release - mountPath: /rootfs/etc/os-release - - name: sysrq - mountPath: /sysrq - diff --git a/cluster/gce/gci/nvidia-gpus/installer.sh b/cluster/gce/gci/nvidia-gpus/installer.sh deleted file mode 100644 index a950d426e86..00000000000 --- a/cluster/gce/gci/nvidia-gpus/installer.sh +++ /dev/null @@ -1,207 +0,0 @@ -#!/bin/bash - -# Copyright 2017 The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script is for dynamically installing nvidia kernel drivers in Container Optimized OS - -set -o errexit -set -o pipefail -set -x - -# The script must be run as a root. -# Prerequisites: -# -# LAKITU_KERNEL_SHA1 - The env variable is expected to be set to HEAD of the kernel version used on the host. -# BASE_DIR - Directory that is mapped to a stateful partition on host. Defaults to `/rootfs/nvidia`. -# -# The script will output the following artifacts: -# ${BASE_DIR}/lib* --> Nvidia CUDA libraries -# ${BASE_DIR}/bin/* --> Nvidia debug utilities -# ${BASE_DIR}/.cache/* --> Nvidia driver artifacts cached for idempotency. -# - -BASE_DIR=${BASE_DIR:-"/rootfs/nvidia"} -CACHE_DIR="${BASE_DIR}/.cache" -USR_WORK_DIR="${CACHE_DIR}/usr-work" -USR_WRITABLE_DIR="${CACHE_DIR}/usr-writable" -LIB_WORK_DIR="${CACHE_DIR}/lib-work" -LIB_WRITABLE_DIR="${CACHE_DIR}/lib-writable" - -LIB_OUTPUT_DIR="${BASE_DIR}/lib" -BIN_OUTPUT_DIR="${BASE_DIR}/bin" - -KERNEL_SRC_DIR="/lakitu-kernel" -NVIDIA_DRIVER_DIR="/nvidia" -NVIDIA_DRIVER_VERSION="375.26" - -# Source: https://developer.nvidia.com/cuda-downloads -NVIDIA_CUDA_URL="https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run" -NVIDIA_CUDA_MD5SUM="33e1bd980e91af4e55f3ef835c103f9b" -NVIDIA_CUDA_PKG_NAME="cuda_8.0.61_375.26_linux.run" -NVIDIA_DRIVER_PKG_NAME="NVIDIA-Linux-x86_64-375.26.run" - -check_nvidia_device() { - lspci - if ! lspci | grep -i -q NVIDIA; then - echo "No NVIDIA devices attached to this instance." - exit 0 - fi - echo "Found NVIDIA device on this instance." -} - -prepare_kernel_source() { - local kernel_git_repo="https://chromium.googlesource.com/chromiumos/third_party/kernel" - local kernel_version="$(uname -r)" - local kernel_version_stripped="$(echo ${kernel_version} | sed 's/\+//')" - - # Checkout the correct tag. - echo "Downloading kernel source at tag ${kernel_version_stripped} ..." - pushd "${KERNEL_SRC_DIR}" - # TODO: Consume KERNEL SHA1 from COS image directly. - # git checkout "tags/v${kernel_version_stripped}" - git checkout ${LAKITU_KERNEL_SHA1} - - # Prepare kernel configu and source for modules. - echo "Preparing kernel sources ..." - zcat "/proc/config.gz" > ".config" - make olddefconfig - make modules_prepare - # Done. - popd -} - -download_install_nvidia() { - local pkg_name="${NVIDIA_CUDA_PKG_NAME}" - local url="${NVIDIA_CUDA_URL}" - local log_file_name="${NVIDIA_DRIVER_DIR}/nvidia-installer.log" - - mkdir -p "${NVIDIA_DRIVER_DIR}" - pushd "${NVIDIA_DRIVER_DIR}" - - echo "Downloading Nvidia CUDA package from ${url} ..." - curl -L -s "${url}" -o "${pkg_name}" - echo "${NVIDIA_CUDA_MD5SUM} ${pkg_name}" | md5sum --check - - echo "Extracting Nvidia CUDA package ..." - sh ${pkg_name} --extract="$(pwd)" - - echo "Running the Nvidia driver installer ..." - if ! sh "${NVIDIA_DRIVER_PKG_NAME}" --kernel-source-path="${KERNEL_SRC_DIR}" --silent --accept-license --keep --log-file-name="${log_file_name}"; then - echo "Nvidia installer failed, log below:" - echo "===================================" - tail -50 "${log_file_name}" - echo "===================================" - exit 1 - fi - # Create unified memory device file. - nvidia-modprobe -c0 -u - popd -} - -unlock_loadpin_and_reboot_if_needed() { - kernel_cmdline="$(cat /proc/cmdline)" - if echo "${kernel_cmdline}" | grep -q -v "lsm.module_locking=0"; then - local -r esp_partition="/dev/sda12" - local -r mount_path="/tmp/esp" - local -r grub_cfg="efi/boot/grub.cfg" - - mkdir -p "${mount_path}" - mount "${esp_partition}" "${mount_path}" - - pushd "${mount_path}" - cp "${grub_cfg}" "${grub_cfg}.orig" - sed 's/cros_efi/cros_efi lsm.module_locking=0/g' -i "efi/boot/grub.cfg" - cat "${grub_cfg}" - popd - sync - umount "${mount_path}" - # Restart the node for loadpin to be disabled. - echo b > /sysrq - fi -} - -create_uvm_device() { - # Create unified memory device file. - nvidia-modprobe -c0 -u -} - -verify_base_image() { - mount --bind /rootfs/etc/os-release /etc/os-release - local id="$(grep "^ID=" /etc/os-release)" - if [[ "${id#*=}" != "cos" ]]; then - echo "This installer is designed to run on Container-Optimized OS only" - exit 1 - fi -} - -setup_overlay_mounts() { - mkdir -p ${USR_WRITABLE_DIR} ${USR_WORK_DIR} ${LIB_WRITABLE_DIR} ${LIB_WORK_DIR} - mount -t overlay -o lowerdir=/usr,upperdir=${USR_WRITABLE_DIR},workdir=${USR_WORK_DIR} none /usr - mount -t overlay -o lowerdir=/lib,upperdir=${LIB_WRITABLE_DIR},workdir=${LIB_WORK_DIR} none /lib -} - -exit_if_install_not_needed() { - if nvidia-smi; then - echo "nvidia drivers already installed. Skipping installation" - post_installation_sequence - exit 0 - fi -} - -restart_kubelet() { - echo "Sending SIGTERM to kubelet" - pkill -SIGTERM kubelet -} - -# Copy user space libraries and debug utilities to a special output directory on the host. -# Make these artifacts world readable and executable. -copy_files_to_host() { - mkdir -p ${LIB_OUTPUT_DIR} ${BIN_OUTPUT_DIR} - cp -r ${USR_WRITABLE_DIR}/lib/x86_64-linux-gnu/* ${LIB_OUTPUT_DIR}/ - cp -r ${USR_WRITABLE_DIR}/bin/* ${BIN_OUTPUT_DIR}/ - chmod -R a+rx ${LIB_OUTPUT_DIR} - chmod -R a+rx ${BIN_OUTPUT_DIR} -} - -post_installation_sequence() { - create_uvm_device - # Copy nvidia user space libraries and debug tools to the host for use from other containers. - copy_files_to_host - # Restart the kubelet for it to pick up the GPU devices. - restart_kubelet -} - -main() { - # Do not run the installer unless the base image is Container Optimized OS (COS) - verify_base_image - # Do not run the installer unless a Nvidia device is found on the PCI bus - check_nvidia_device - # Setup overlay mounts to capture nvidia driver artificats in a more permanent storage on the host. - setup_overlay_mounts - # Disable a critical security feature in COS that will allow for dynamically loading Nvidia drivers - unlock_loadpin_and_reboot_if_needed - # Exit if installation is not required (for idempotency) - exit_if_install_not_needed - # Checkout kernel sources appropriate for the base image. - prepare_kernel_source - # Download, compile and install nvidia drivers. - download_install_nvidia - # Verify that the Nvidia drivers have been successfully installed. - nvidia-smi - # Perform post installation steps - copying artifacts, restarting kubelet, etc. - post_installation_sequence -} - -main "$@" diff --git a/cluster/kubemark/gce/config-default.sh b/cluster/kubemark/gce/config-default.sh index 3acd3f38815..2f71696cc41 100644 --- a/cluster/kubemark/gce/config-default.sh +++ b/cluster/kubemark/gce/config-default.sh @@ -36,7 +36,7 @@ PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false} MASTER_OS_DISTRIBUTION=${KUBE_MASTER_OS_DISTRIBUTION:-gci} NODE_OS_DISTRIBUTION=${KUBE_NODE_OS_DISTRIBUTION:-debian} -MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-cos-beta-59-9460-20-0} +MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-cos-stable-59-9460-64-0} MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-cos-cloud} NETWORK=${KUBE_GCE_NETWORK:-default} diff --git a/hack/generate-bindata.sh b/hack/generate-bindata.sh index 99f3ec5eb26..4c843c5cdc8 100755 --- a/hack/generate-bindata.sh +++ b/hack/generate-bindata.sh @@ -48,8 +48,7 @@ go-bindata -nometadata -o "${BINDATA_OUTPUT}.tmp" -pkg generated \ "examples/..." \ "test/e2e/testing-manifests/..." \ "test/images/..." \ - "test/fixtures/..." \ - "cluster/gce/gci/nvidia-gpus/..." + "test/fixtures/..." gofmt -s -w "${BINDATA_OUTPUT}.tmp" diff --git a/test/e2e/generated/BUILD b/test/e2e/generated/BUILD index c8acf6747eb..dc3bd386141 100644 --- a/test/e2e/generated/BUILD +++ b/test/e2e/generated/BUILD @@ -23,7 +23,6 @@ genrule( name = "bindata", srcs = [ "//examples:sources", - "//cluster/gce/gci/nvidia-gpus:sources", "//test/images:sources", "//test/fixtures:sources", "//test/e2e/testing-manifests:sources", diff --git a/test/e2e_node/jenkins/benchmark/benchmark-config.yaml b/test/e2e_node/jenkins/benchmark/benchmark-config.yaml index 643d8255b9c..c59bcbae22b 100644 --- a/test/e2e_node/jenkins/benchmark/benchmark-config.yaml +++ b/test/e2e_node/jenkins/benchmark/benchmark-config.yaml @@ -49,21 +49,21 @@ images: tests: - 'resource tracking for 105 pods per node \[Benchmark\]' gci-resource1: - image: cos-beta-59-9460-20-0 + image: cos-stable-59-9460-64-0 project: cos-cloud machine: n1-standard-1 metadata: "user-data