From e74d4a0d682653dd4f5c2087f8aae8a5ac0dda10 Mon Sep 17 00:00:00 2001 From: Vishnu kannan Date: Fri, 28 Apr 2017 15:57:39 -0700 Subject: [PATCH] Adding support for Accelerators to GCE clusters. Signed-off-by: Vishnu kannan --- cluster/gce/config-default.sh | 12 ++++++++++++ cluster/gce/config-test.sh | 10 ++++++++++ cluster/gce/util.sh | 9 ++++++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index b81febddb21..468469d8060 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -32,6 +32,9 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB} NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard} NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB} NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0} +# Accelerators to be attached to each node. Format "type=<accelerator-type>,count=<accelerator-count>" +# More information on available GPUs here - https://cloud.google.com/compute/docs/gpus/ +NODE_ACCELERATORS=${NODE_ACCELERATORS:-""} REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true} PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false} PREEMPTIBLE_MASTER=${PREEMPTIBLE_MASTER:-false} @@ -55,6 +58,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then NODE_OS_DISTRIBUTION="gci" fi +# GPUs supported in GCE do not have compatible drivers in Debian 7. +if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then + NODE_ACCELERATORS="" +fi + # By default a cluster will be started with the master on GCI and nodes on # containervm. If you are updating the containervm version, update this # variable. Also please update corresponding image for node e2e at: @@ -135,6 +143,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}" # Optional: set feature gates FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}" +if [[ ! -z "${NODE_ACCELERATORS}" ]]; then + FEATURE_GATES="${FEATURE_GATES},Accelerators=true" +fi + # Optional: Install cluster DNS. 
ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}" DNS_SERVER_IP="${KUBE_DNS_SERVER_IP:-10.0.0.10}" diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index 273ff5a0cd4..4c82a048c6a 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -32,6 +32,7 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB} NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard} NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB} NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0} +NODE_ACCELERATORS=${NODE_ACCELERATORS:-""} REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true} KUBE_APISERVER_REQUEST_TIMEOUT=300 PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false} @@ -56,6 +57,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then NODE_OS_DISTRIBUTION="gci" fi +# GPUs supported in GCE do not have compatible drivers in Debian 7. +if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then + NODE_ACCELERATORS="" +fi + # By default a cluster will be started with the master on GCI and nodes on # containervm. If you are updating the containervm version, update this # variable. Also please update corresponding image for node e2e at: @@ -91,6 +97,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}" # Optional: set feature gates FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}" +if [[ ! -z "${NODE_ACCELERATORS}" ]]; then + FEATURE_GATES="${FEATURE_GATES},Accelerators=true" +fi + TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100} # Extra docker options for nodes. diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 27d47e925cc..5a3f672dc52 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -527,10 +527,16 @@ function create-node-template() { "${IP_ALIAS_SUBNETWORK:-}" \ "${IP_ALIAS_SIZE:-}") + local accelerator_args="" + # VMs with Accelerators cannot be live migrated. + # More details here - https://cloud.google.com/compute/docs/gpus/add-gpus#create-new-gpu-instance + if [[ ! 
-z "${NODE_ACCELERATORS}" ]]; then + accelerator_args="--maintenance-policy TERMINATE --restart-on-failure --accelerator ${NODE_ACCELERATORS}" + fi local attempt=1 while true; do echo "Attempt ${attempt} to create ${1}" >&2 - if ! ${gcloud} compute instance-templates create \ + if ! ${gcloud} beta compute instance-templates create \ "$template_name" \ --project "${PROJECT}" \ --machine-type "${NODE_SIZE}" \ @@ -539,6 +545,7 @@ function create-node-template() { --image-project="${NODE_IMAGE_PROJECT}" \ --image "${NODE_IMAGE}" \ --tags "${NODE_TAG}" \ + ${accelerator_args} \ ${local_ssds} \ --region "${REGION}" \ ${network} \