mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-25 20:53:33 +00:00
Merge pull request #45130 from vishh/gpu-cluster-setup
Automatic merge from submit-queue (batch tested with PRs 44830, 45130)

Adding support for Accelerators to GCE clusters.

```release-note
Create clusters with GPUs in GKE by specifying "type=<gpu-type>,count=<gpu-count>" to NODE_ACCELERATORS env var.
List of available GPUs - https://cloud.google.com/compute/docs/gpus/#introduction
```
This commit is contained in:
commit
d4f92711a1
@ -32,6 +32,9 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB}
|
|||||||
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
|
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
|
||||||
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
|
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
|
||||||
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
|
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
|
||||||
|
# Accelerators to be attached to each node. Format "type=<accelerator-type>,count=<accelerator-count>"
|
||||||
|
# More information on available GPUs here - https://cloud.google.com/compute/docs/gpus/
|
||||||
|
NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
|
||||||
REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
|
REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
|
||||||
PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
|
PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
|
||||||
PREEMPTIBLE_MASTER=${PREEMPTIBLE_MASTER:-false}
|
PREEMPTIBLE_MASTER=${PREEMPTIBLE_MASTER:-false}
|
||||||
@ -55,6 +58,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then
|
|||||||
NODE_OS_DISTRIBUTION="gci"
|
NODE_OS_DISTRIBUTION="gci"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# GPUs supported in GCE do not have compatible drivers in Debian 7.
|
||||||
|
if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then
|
||||||
|
NODE_ACCELERATORS=""
|
||||||
|
fi
|
||||||
|
|
||||||
# By default a cluster will be started with the master on GCI and nodes on
|
# By default a cluster will be started with the master on GCI and nodes on
|
||||||
# containervm. If you are updating the containervm version, update this
|
# containervm. If you are updating the containervm version, update this
|
||||||
# variable. Also please update corresponding image for node e2e at:
|
# variable. Also please update corresponding image for node e2e at:
|
||||||
@ -135,6 +143,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
|
|||||||
# Optional: set feature gates
|
# Optional: set feature gates
|
||||||
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
|
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
|
||||||
|
|
||||||
|
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
|
||||||
|
FEATURE_GATES="${FEATURE_GATES},Accelerators=true"
|
||||||
|
fi
|
||||||
|
|
||||||
# Optional: Install cluster DNS.
|
# Optional: Install cluster DNS.
|
||||||
ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"
|
ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"
|
||||||
DNS_SERVER_IP="${KUBE_DNS_SERVER_IP:-10.0.0.10}"
|
DNS_SERVER_IP="${KUBE_DNS_SERVER_IP:-10.0.0.10}"
|
||||||
|
@ -32,6 +32,7 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB}
|
|||||||
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
|
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
|
||||||
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
|
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
|
||||||
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
|
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
|
||||||
|
NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
|
||||||
REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
|
REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
|
||||||
KUBE_APISERVER_REQUEST_TIMEOUT=300
|
KUBE_APISERVER_REQUEST_TIMEOUT=300
|
||||||
PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
|
PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
|
||||||
@ -56,6 +57,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then
|
|||||||
NODE_OS_DISTRIBUTION="gci"
|
NODE_OS_DISTRIBUTION="gci"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# GPUs supported in GCE do not have compatible drivers in Debian 7.
|
||||||
|
if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then
|
||||||
|
NODE_ACCELERATORS=""
|
||||||
|
fi
|
||||||
|
|
||||||
# By default a cluster will be started with the master on GCI and nodes on
|
# By default a cluster will be started with the master on GCI and nodes on
|
||||||
# containervm. If you are updating the containervm version, update this
|
# containervm. If you are updating the containervm version, update this
|
||||||
# variable. Also please update corresponding image for node e2e at:
|
# variable. Also please update corresponding image for node e2e at:
|
||||||
@ -91,6 +97,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
|
|||||||
# Optional: set feature gates
|
# Optional: set feature gates
|
||||||
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
|
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
|
||||||
|
|
||||||
|
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
|
||||||
|
FEATURE_GATES="${FEATURE_GATES},Accelerators=true"
|
||||||
|
fi
|
||||||
|
|
||||||
TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}
|
TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}
|
||||||
|
|
||||||
# Extra docker options for nodes.
|
# Extra docker options for nodes.
|
||||||
|
@ -527,10 +527,16 @@ function create-node-template() {
|
|||||||
"${IP_ALIAS_SUBNETWORK:-}" \
|
"${IP_ALIAS_SUBNETWORK:-}" \
|
||||||
"${IP_ALIAS_SIZE:-}")
|
"${IP_ALIAS_SIZE:-}")
|
||||||
|
|
||||||
|
local accelerator_args=""
|
||||||
|
# VMs with Accelerators cannot be live migrated.
|
||||||
|
# More details here - https://cloud.google.com/compute/docs/gpus/add-gpus#create-new-gpu-instance
|
||||||
|
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
|
||||||
|
accelerator_args="--maintenance-policy TERMINATE --restart-on-failure --accelerator ${NODE_ACCELERATORS}"
|
||||||
|
fi
|
||||||
local attempt=1
|
local attempt=1
|
||||||
while true; do
|
while true; do
|
||||||
echo "Attempt ${attempt} to create ${1}" >&2
|
echo "Attempt ${attempt} to create ${1}" >&2
|
||||||
if ! ${gcloud} compute instance-templates create \
|
if ! ${gcloud} beta compute instance-templates create \
|
||||||
"$template_name" \
|
"$template_name" \
|
||||||
--project "${PROJECT}" \
|
--project "${PROJECT}" \
|
||||||
--machine-type "${NODE_SIZE}" \
|
--machine-type "${NODE_SIZE}" \
|
||||||
@ -539,6 +545,7 @@ function create-node-template() {
|
|||||||
--image-project="${NODE_IMAGE_PROJECT}" \
|
--image-project="${NODE_IMAGE_PROJECT}" \
|
||||||
--image "${NODE_IMAGE}" \
|
--image "${NODE_IMAGE}" \
|
||||||
--tags "${NODE_TAG}" \
|
--tags "${NODE_TAG}" \
|
||||||
|
${accelerator_args} \
|
||||||
${local_ssds} \
|
${local_ssds} \
|
||||||
--region "${REGION}" \
|
--region "${REGION}" \
|
||||||
${network} \
|
${network} \
|
||||||
|
Loading…
Reference in New Issue
Block a user