From 2ae4eeb3ea666c65ef0344b848cf4bb317503914 Mon Sep 17 00:00:00 2001 From: Aldo Culquicondor Date: Fri, 24 Jul 2020 10:40:26 -0400 Subject: [PATCH] Mount kubelet and container runtime rootdir on LSSD When environment variable NODE_LOCAL_SSDS_EPHEMERAL=true, create a RAID 0 array on all attached SSDs to mount: - kubelet root dir - container runtime root dir - pod logs dir Those directories account for all ephemeral storage. An array is not created when there is only one SSD. Change-Id: I22137f1d83fc19e9ef58a556d7461da43e4ab9bd Signed-off-by: Aldo Culquicondor --- cluster/gce/config-default.sh | 1 + cluster/gce/config-test.sh | 1 + cluster/gce/gci/configure-helper.sh | 62 ++++++++++++++++++++++++++++- cluster/gce/util.sh | 1 + 4 files changed, 64 insertions(+), 1 deletion(-) diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 1147ad6b076..c72ac806822 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -40,6 +40,7 @@ NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB} NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0} NODE_LABELS="${KUBE_NODE_LABELS:-}" WINDOWS_NODE_LABELS="${WINDOWS_NODE_LABELS:-}" +NODE_LOCAL_SSDS_EPHEMERAL=${NODE_LOCAL_SSDS_EPHEMERAL:-} # KUBE_CREATE_NODES can be used to avoid creating nodes, while master will be sized for NUM_NODES nodes. # Firewalls and node templates are still created. diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index cd2c37013bc..05eccfe1ecc 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -39,6 +39,7 @@ NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB} NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0} NODE_LABELS=${KUBE_NODE_LABELS:-} WINDOWS_NODE_LABELS=${WINDOWS_NODE_LABELS:-} +NODE_LOCAL_SSDS_EPHEMERAL=${NODE_LOCAL_SSDS_EPHEMERAL:-} # KUBE_CREATE_NODES can be used to avoid creating nodes, while master will be sized for NUM_NODES nodes. # Firewalls and node templates are still created. 
diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index eb470432ac8..8e97035f184 100644 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -385,6 +385,10 @@ function mount-ext(){ # Local ssds, if present, are mounted or symlinked to their appropriate # locations function ensure-local-ssds() { + if [ "${NODE_LOCAL_SSDS_EPHEMERAL:-false}" == "true" ]; then + ensure-local-ssds-ephemeral-storage + return + fi get-local-disk-num "scsi" "block" local scsiblocknum="${localdisknum}" local i=0 @@ -436,6 +440,62 @@ function ensure-local-ssds() { done } +# Local SSDs, if present, are used in a single RAID 0 array and directories that +# back ephemeral storage are mounted on them (kubelet root, container runtime +# root and pod logs). +function ensure-local-ssds-ephemeral-storage() { + local devices=() + # Get nvme devices + for ssd in /dev/nvme*n*; do + if [ -e "${ssd}" ]; then + # This workaround to find if the NVMe device is a local SSD is required + # because the existing Google images do not expose them in /dev/disk/by-id + if [[ "$(lsblk -o MODEL -dn "${ssd}")" == "nvme_card" ]]; then + devices+=("${ssd}") + fi + fi + done + if [ "${#devices[@]}" -eq 0 ]; then + echo "No local NVMe SSD disks found." + return + fi + + local device="${devices[0]}" + if [ "${#devices[@]}" -ne 1 ]; then + local seen_arrays=(/dev/md/*) + device=${seen_arrays[0]} + echo "Setting RAID array with local SSDs on device ${device}" + if [ ! -e "$device" ]; then + device="/dev/md/0" + echo "y" | mdadm --create "${device}" --level=0 --raid-devices=${#devices[@]} "${devices[@]}" + fi + fi + + local ephemeral_mountpoint="/mnt/stateful_partition/kube-ephemeral-ssd" + safe-format-and-mount "${device}" "${ephemeral_mountpoint}" + + # mount container runtime root dir on SSD + local container_runtime="${CONTAINER_RUNTIME:-docker}" + systemctl stop "$container_runtime" + # Some images remount the container runtime root dir. 
+ umount "/var/lib/${container_runtime}" || true + # Move the container runtime's directory to the new location to preserve + # preloaded images. + if [ ! -d "${ephemeral_mountpoint}/${container_runtime}" ]; then + mv "/var/lib/${container_runtime}" "${ephemeral_mountpoint}/${container_runtime}" + fi + safe-bind-mount "${ephemeral_mountpoint}/${container_runtime}" "/var/lib/${container_runtime}" + systemctl start "$container_runtime" + + # mount kubelet root dir on SSD + mkdir -p "${ephemeral_mountpoint}/kubelet" + safe-bind-mount "${ephemeral_mountpoint}/kubelet" "/var/lib/kubelet" + + # mount pod logs root dir on SSD + mkdir -p "${ephemeral_mountpoint}/log_pods" + safe-bind-mount "${ephemeral_mountpoint}/log_pods" "/var/log/pods" +} + # Installs logrotate configuration files function setup-logrotate() { mkdir -p /etc/logrotate.d/ @@ -2950,8 +3010,8 @@ function main() { setup-os-params config-ip-firewall create-dirs - setup-kubelet-dir ensure-local-ssds + setup-kubelet-dir setup-logrotate if [[ "${KUBERNETES_MASTER:-}" == "true" ]]; then mount-master-pd diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index b9ee82a2c05..2fa4421ae88 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -1215,6 +1215,7 @@ CONTAINER_RUNTIME_TEST_HANDLER: $(yaml-quote "${CONTAINER_RUNTIME_TEST_HANDLER:- UBUNTU_INSTALL_CONTAINERD_VERSION: $(yaml-quote "${UBUNTU_INSTALL_CONTAINERD_VERSION:-}") UBUNTU_INSTALL_RUNC_VERSION: $(yaml-quote "${UBUNTU_INSTALL_RUNC_VERSION:-}") NODE_LOCAL_SSDS_EXT: $(yaml-quote "${NODE_LOCAL_SSDS_EXT:-}") +NODE_LOCAL_SSDS_EPHEMERAL: $(yaml-quote "${NODE_LOCAL_SSDS_EPHEMERAL:-}") LOAD_IMAGE_COMMAND: $(yaml-quote "${LOAD_IMAGE_COMMAND:-}") ZONE: $(yaml-quote "${ZONE}") REGION: $(yaml-quote "${REGION}")