diff --git a/cluster/gce/addons/node-termination-handler/README b/cluster/gce/addons/node-termination-handler/README new file mode 100644 index 00000000000..bca8ade8952 --- /dev/null +++ b/cluster/gce/addons/node-termination-handler/README @@ -0,0 +1,4 @@ +# GCE Node Termination Handler + +This addon deploys [GCE Node Termination Handler](https://github.com/GoogleCloudPlatform/k8s-node-termination-handler) onto Kubernetes clusters on GCP. +It is meant to help translate GCE VM termination notifications into Kubernetes graceful terminations. diff --git a/cluster/gce/addons/node-termination-handler/daemonset.yaml b/cluster/gce/addons/node-termination-handler/daemonset.yaml new file mode 100644 index 00000000000..4430e929572 --- /dev/null +++ b/cluster/gce/addons/node-termination-handler/daemonset.yaml @@ -0,0 +1,76 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + addonmanager.kubernetes.io/mode: Reconcile + k8s-app: node-termination-handler + namespace: kube-system + name: node-termination-handler +spec: + selector: + matchLabels: + k8s-app: node-termination-handler + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + k8s-app: node-termination-handler + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' + spec: + priorityClassName: system-node-critical + # Necessary to reboot node + hostPID: true + affinity: + nodeAffinity: + # Restrict to GPU nodes or preemptible nodes + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + - matchExpressions: + - key: cloud.google.com/gke-preemptible + operator: Exists + volumes: + - name: klet-service-account + hostPath: + path: /var/lib/kubelet + - name: klet-ca-crt + hostPath: + path: /etc/srv/kubernetes + tolerations: + # Run regardless of any existing taints.
+ - effect: NoSchedule + operator: Exists + - effect: NoExecute + operator: Exists + containers: + - image: k8s.gcr.io/gke-node-termination-handler@sha256:e08ca863a547754fa7b75064bdad04f04cbef86c7b0a181ecc7304e747623181 + name: node-termination-handler + command: ["./node-termination-handler"] + args: ["--logtostderr", "--exclude-pods=$(POD_NAME):$(POD_NAMESPACE)", "-v=10", "--kubeconfig=/var/lib/kubelet/kubeconfig", "--annotation=cloud.google.com/impending-node-termination"] + securityContext: + capabilities: + # Necessary to reboot node + add: ["SYS_BOOT"] + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + limits: + cpu: 50m + memory: 30Mi + volumeMounts: + - name: klet-service-account + mountPath: /var/lib/kubelet + - name: klet-ca-crt + mountPath: /etc/srv/kubernetes diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index b39ebbae2d9..d8828c60455 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -37,6 +37,14 @@ MASTER_ROOT_DISK_SIZE=${MASTER_ROOT_DISK_SIZE:-$(get-master-root-disk-size)} NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard} NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB} NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0} + +# Historically fluentd was a manifest pod and then was migrated to DaemonSet. +# To avoid situation during cluster upgrade when there are two instances +# of fluentd running on a node, kubelet need to mark node on which +# fluentd is not running as a manifest pod with appropriate label. +# TODO(piosz): remove this in 1.8 +NODE_LABELS="${KUBE_NODE_LABELS:-beta.kubernetes.io/fluentd-ds-ready=true}" + # An extension to local SSDs allowing users to specify block/fs and SCSI/NVMe devices # Format of this variable will be "#,scsi/nvme,block/fs" you can specify multiple # configurations by separating them by a semi-colon ex. 
"2,scsi,fs;1,nvme,block" @@ -47,6 +55,9 @@ NODE_LOCAL_SSDS_EXT=${NODE_LOCAL_SSDS_EXT:-} NODE_ACCELERATORS=${NODE_ACCELERATORS:-""} REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true} PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false} +if [[ "${PREEMPTIBLE_NODE}" == "true" ]]; then + NODE_LABELS="${NODE_LABELS},cloud.google.com/gke-preemptible=true" +fi PREEMPTIBLE_MASTER=${PREEMPTIBLE_MASTER:-false} KUBE_DELETE_NODES=${KUBE_DELETE_NODES:-true} KUBE_DELETE_NETWORK=${KUBE_DELETE_NETWORK:-} # default value calculated below @@ -163,13 +174,6 @@ ENABLE_METADATA_AGENT="${KUBE_ENABLE_METADATA_AGENT:-none}" # Useful for scheduling heapster in large clusters with nodes of small size. HEAPSTER_MACHINE_TYPE="${HEAPSTER_MACHINE_TYPE:-}" -# Historically fluentd was a manifest pod and then was migrated to DaemonSet. -# To avoid situation during cluster upgrade when there are two instances -# of fluentd running on a node, kubelet need to mark node on which -# fluentd is not running as a manifest pod with appropriate label. -# TODO(piosz): remove this in 1.8 -NODE_LABELS="${KUBE_NODE_LABELS:-beta.kubernetes.io/fluentd-ds-ready=true}" - # NON_MASTER_NODE_LABELS are labels will only be applied on non-master nodes. NON_MASTER_NODE_LABELS="${KUBE_NON_MASTER_NODE_LABELS:-}" @@ -457,3 +461,11 @@ if [[ "${ENABLE_TOKENREQUEST:-}" == "true" ]]; then SERVICEACCOUNT_ISSUER="https://kubernetes.io/${CLUSTER_NAME}" SERVICEACCOUNT_API_AUDIENCES="https://kubernetes.default.svc" fi + +# Optional: Enable Node termination Handler for Preemptible and GPU VMs. 
+# https://github.com/GoogleCloudPlatform/k8s-node-termination-handler +ENABLE_NODE_TERMINATION_HANDLER="${ENABLE_NODE_TERMINATION_HANDLER:-false}" +# Override default Node Termination Handler Image +if [[ "${NODE_TERMINATION_HANDLER_IMAGE:-}" ]]; then + PROVIDER_VARS="${PROVIDER_VARS:-} NODE_TERMINATION_HANDLER_IMAGE" +fi diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index 62eb1eab30a..57a49458b24 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -37,6 +37,14 @@ MASTER_ROOT_DISK_SIZE=${MASTER_ROOT_DISK_SIZE:-$(get-master-root-disk-size)} NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard} NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB} NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0} + +# Historically fluentd was a manifest pod and then was migrated to DaemonSet. +# To avoid situation during cluster upgrade when there are two instances +# of fluentd running on a node, kubelet need to mark node on which +# fluentd is not running as a manifest pod with appropriate label. +# TODO(piosz): remove this in 1.8 +NODE_LABELS="${KUBE_NODE_LABELS:-beta.kubernetes.io/fluentd-ds-ready=true}" + # An extension to local SSDs allowing users to specify block/fs and SCSI/NVMe devices # Format of this variable will be "#,scsi/nvme,block/fs" you can specify multiple # configurations by separating them by a semi-colon ex. 
"2,scsi,fs;1,nvme,block" @@ -47,6 +55,9 @@ REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true} KUBE_APISERVER_REQUEST_TIMEOUT=300 PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false} PREEMPTIBLE_MASTER=${PREEMPTIBLE_MASTER:-false} +if [[ "${PREEMPTIBLE_NODE}" == "true" ]]; then + NODE_LABELS="${NODE_LABELS},cloud.google.com/gke-preemptible=true" +fi KUBE_DELETE_NODES=${KUBE_DELETE_NODES:-true} KUBE_DELETE_NETWORK=${KUBE_DELETE_NETWORK:-true} CREATE_CUSTOM_NETWORK=${CREATE_CUSTOM_NETWORK:-false} @@ -201,13 +212,6 @@ CONTROLLER_MANAGER_TEST_ARGS="${CONTROLLER_MANAGER_TEST_ARGS:-} ${TEST_CLUSTER_R SCHEDULER_TEST_ARGS="${SCHEDULER_TEST_ARGS:-} ${TEST_CLUSTER_API_CONTENT_TYPE}" KUBEPROXY_TEST_ARGS="${KUBEPROXY_TEST_ARGS:-} ${TEST_CLUSTER_API_CONTENT_TYPE}" -# Historically fluentd was a manifest pod and then was migrated to DaemonSet. -# To avoid situation during cluster upgrade when there are two instances -# of fluentd running on a node, kubelet need to mark node on which -# fluentd is not running as a manifest pod with appropriate label. -# TODO(piosz): remove this in 1.8 -NODE_LABELS="${KUBE_NODE_LABELS:-beta.kubernetes.io/fluentd-ds-ready=true}" - # NON_MASTER_NODE_LABELS are labels will only be applied on non-master nodes. NON_MASTER_NODE_LABELS="${KUBE_NON_MASTER_NODE_LABELS:-}" @@ -476,3 +480,11 @@ if [[ "${ENABLE_TOKENREQUEST:-}" == "true" ]]; then SERVICEACCOUNT_ISSUER="https://kubernetes.io/${CLUSTER_NAME}" SERVICEACCOUNT_API_AUDIENCES="https://kubernetes.default.svc" fi + +# Optional: Enable Node termination Handler for Preemptible and GPU VMs. 
+# https://github.com/GoogleCloudPlatform/k8s-node-termination-handler
+ENABLE_NODE_TERMINATION_HANDLER="${ENABLE_NODE_TERMINATION_HANDLER:-false}"
+# Override default Node Termination Handler Image
+if [[ "${NODE_TERMINATION_HANDLER_IMAGE:-}" ]]; then
+  PROVIDER_VARS="${PROVIDER_VARS:-} NODE_TERMINATION_HANDLER_IMAGE"
+fi
diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh
index b6b5205c12e..a1734d2bd13 100644
--- a/cluster/gce/gci/configure-helper.sh
+++ b/cluster/gce/gci/configure-helper.sh
@@ -2038,6 +2038,12 @@ function setup-addon-manifests {
       copy-manifests "${psp_dir}" "${dst_dir}"
     fi
   fi
+  if [[ "${ENABLE_NODE_TERMINATION_HANDLER:-}" == "true" ]]; then
+    local -r nth_dir="${src_dir}/${3:-$2}/node-termination-handler"
+    if [[ -d "${nth_dir}" ]]; then
+      copy-manifests "${nth_dir}" "${dst_dir}"
+    fi
+  fi
 }
 
 # A function that downloads extra addons from a URL and puts them in the GCI
@@ -2432,6 +2438,10 @@ EOF
   if [[ "${ENABLE_NVIDIA_GPU_DEVICE_PLUGIN:-}" == "true" ]]; then
     setup-addon-manifests "addons" "device-plugins/nvidia-gpu"
   fi
+  if [[ "${ENABLE_NODE_TERMINATION_HANDLER:-}" == "true" ]]; then
+    setup-addon-manifests "addons" "node-termination-handler"
+    setup-node-termination-handler-manifest "addons" "node-termination-handler"
+  fi
   if [[ "${ENABLE_CLUSTER_DNS:-}" == "true" ]]; then
     if [[ "${CLUSTER_DNS_CORE_DNS:-}" == "true" ]]; then
       setup-addon-manifests "addons" "dns/coredns"
@@ -2511,6 +2521,18 @@ EOF
   cp "${src_dir}/kube-addon-manager.yaml" /etc/kubernetes/manifests
 }
 
+# Points the node-termination-handler DaemonSet manifest at a custom image.
+# Does nothing unless NODE_TERMINATION_HANDLER_IMAGE is set and non-empty.
+#   $1: manifest root under /etc/kubernetes (default: addons)
+#   $2: addon directory under $1 (default: node-termination-handler)
+function setup-node-termination-handler-manifest {
+  local -r nth_manifest="/etc/kubernetes/${1:-addons}/${2:-node-termination-handler}/daemonset.yaml"
+  if [[ -n "${NODE_TERMINATION_HANDLER_IMAGE:-}" ]]; then
+    # No sed address: each line containing "image:" is rewritten from that token to end of line.
+    sed -i "s|image:.*|image: ${NODE_TERMINATION_HANDLER_IMAGE}|" "${nth_manifest}"
+  fi
+}
+
 # Starts an image-puller - used in test clusters.
function start-image-puller { echo "Start image-puller" diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 6882c04fbe3..4003fc3dcb4 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -911,6 +911,7 @@ VOLUME_PLUGIN_DIR: $(yaml-quote ${VOLUME_PLUGIN_DIR}) KUBELET_ARGS: $(yaml-quote ${KUBELET_ARGS}) REQUIRE_METADATA_KUBELET_CONFIG_FILE: $(yaml-quote true) ENABLE_NETD: $(yaml-quote ${ENABLE_NETD:-false}) +ENABLE_NODE_TERMINATION_HANDLER: $(yaml-quote ${ENABLE_NODE_TERMINATION_HANDLER:-false}) CUSTOM_NETD_YAML: | $(echo "${CUSTOM_NETD_YAML:-}" | sed -e "s/'/''/g") CUSTOM_CALICO_NODE_DAEMONSET_YAML: |