Add a DOCKER_STORAGE option for AWS clusters.

DOCKER_STORAGE (default: aufs) selects the docker storage driver and how
cluster/aws/templates/format-disks.sh prepares the ephemeral disks:

  btrfs        - btrfs across the ephemeral disks (RAID-0 when more than one)
  aufs         - LVM volume group over all ephemeral disks, ext4 on top
  aufs-nolvm   - ext4 on the first ephemeral disk only, no LVM
  devicemapper - LVM thin pool that docker drives directly

When no ephemeral block devices are found the script falls back to aufs on
the root disk.  The value reaches the instances through the generated
user-data (cluster/aws/util.sh and cluster/aws/ubuntu/util.sh).

Note on the LVM path: the pvcreate loop iterates "${block_devices[@]}".
A bare ${block_devices} expands to the first array element only, which would
leave every other disk mounted and uninitialised before vgcreate runs.

diff --git a/cluster/aws/config-default.sh b/cluster/aws/config-default.sh
index 263c4b56617..b63ec1628c8 100644
--- a/cluster/aws/config-default.sh
+++ b/cluster/aws/config-default.sh
@@ -26,6 +26,9 @@ NUM_MINIONS=${NUM_MINIONS:-4}
 # Because regions are globally named, we want to create in a single region; default to us-east-1
 AWS_S3_REGION=${AWS_S3_REGION:-us-east-1}
 
+# Which docker storage mechanism to use.
+DOCKER_STORAGE=${DOCKER_STORAGE:-aufs}
+
 INSTANCE_PREFIX="${KUBE_AWS_INSTANCE_PREFIX:-kubernetes}"
 CLUSTER_ID=${INSTANCE_PREFIX}
 AWS_SSH_KEY=${AWS_SSH_KEY:-$HOME/.ssh/kube_aws_rsa}
diff --git a/cluster/aws/config-test.sh b/cluster/aws/config-test.sh
index fedae2affc8..3ddf633cf8f 100755
--- a/cluster/aws/config-test.sh
+++ b/cluster/aws/config-test.sh
@@ -22,6 +22,9 @@ NUM_MINIONS=${NUM_MINIONS:-2}
 # Because regions are globally named, we want to create in a single region; default to us-east-1
 AWS_S3_REGION=${AWS_S3_REGION:-us-east-1}
 
+# Which docker storage mechanism to use.
+DOCKER_STORAGE=${DOCKER_STORAGE:-aufs}
+
 INSTANCE_PREFIX="${KUBE_AWS_INSTANCE_PREFIX:-e2e-test-${USER}}"
 CLUSTER_ID=${INSTANCE_PREFIX}
 AWS_SSH_KEY=${AWS_SSH_KEY:-$HOME/.ssh/kube_aws_rsa}
diff --git a/cluster/aws/options.md b/cluster/aws/options.md
index 86642deb00e..20401494119 100644
--- a/cluster/aws/options.md
+++ b/cluster/aws/options.md
@@ -50,5 +50,24 @@ Please note: Do not set this to "false" unless you...
 
 - ... already configured a route for "YOUR_IP/32" to an AWS internet gateway (for the master instance to reach your client directly during setup)
 
+## DOCKER_STORAGE
+
+Choose the docker storage driver to use. This is an advanced option; most people should leave it as the default aufs
+for parity with GCE.
+
+Supported values: btrfs, aufs, devicemapper, aufs-nolvm
+
+This will also configure your ephemeral storage in a compatible way, and your Docker containers
+will run on this storage if available, as typically the root disk is comparatively small.
+
+* `btrfs` will combine your ephemeral disks into a btrfs volume. This is a good option if you have a recent kernel
+  with a reliable btrfs.
+* `aufs` uses the aufs driver, but also installs LVM to combine your disks. `aufs-nolvm` will not use LVM,
+  meaning that only your first ephemeral disk will be used.
+* `devicemapper` sets up LVM across all your ephemeral disks and sets Docker to drive it directly. This is a
+  similar option to btrfs, but without relying on the btrfs filesystem. Sadly, it does not work with most
+  configurations - see [this docker bug](https://github.com/docker/docker/issues/4036)
+
+If your machines don't have any ephemeral disks, this will default to the aufs driver on your root disk (with no LVM).
 
 [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/aws/options.md?pixel)]()
diff --git a/cluster/aws/templates/format-disks.sh b/cluster/aws/templates/format-disks.sh
index 32590ad8840..3aeb09c7e53 100644
--- a/cluster/aws/templates/format-disks.sh
+++ b/cluster/aws/templates/format-disks.sh
@@ -42,39 +42,139 @@ for ephemeral_device in $ephemeral_devices; do
   fi
 done
 
+# These are set if we should move where docker/kubelet store data
+# Note this gets set to the parent directory
+move_docker=""
+move_kubelet=""
+
+apt-get update
+
+docker_storage=${DOCKER_STORAGE:-aufs}
+
 # Format the ephemeral disks
 if [[ ${#block_devices[@]} == 0 ]]; then
-  echo "No ephemeral block devices found"
+  echo "No ephemeral block devices found; will use aufs on root"
+  docker_storage="aufs"
 else
-  echo "Block devices: ${block_devices}"
+  echo "Block devices: ${block_devices[*]}"
 
-  apt-get install --yes btrfs-tools
+  if [[ ${docker_storage} == "btrfs" ]]; then
+    apt-get install --yes btrfs-tools
 
-  if [[ ${#block_devices[@]} == 1 ]]; then
-    echo "One ephemeral block device found; formatting with btrfs"
-    mkfs.btrfs -f ${block_devices[0]}
+    if [[ ${#block_devices[@]} == 1 ]]; then
+      echo "One ephemeral block device found; formatting with btrfs"
+      mkfs.btrfs -f "${block_devices[0]}"
+    else
+      echo "Found multiple ephemeral block devices, formatting with btrfs as RAID-0"
+      mkfs.btrfs -f --data raid0 "${block_devices[@]}"
+    fi
+    mount -t btrfs "${block_devices[0]}" /mnt
+
+    mkdir -p /mnt/kubernetes
+
+    move_docker="/mnt"
+    move_kubelet="/mnt/kubernetes"
+  elif [[ ${docker_storage} == "aufs-nolvm" ]]; then
+    if [[ ${#block_devices[@]} != 1 ]]; then
+      echo "aufs-nolvm selected, but multiple ephemeral devices were found; only the first will be available"
+    fi
+
+    /bin/umount "${block_devices[0]}"
+    mkfs -t ext4 "${block_devices[0]}"
+    mount -t ext4 "${block_devices[0]}" /mnt
+
+    mkdir -p /mnt/kubernetes
+
+    move_docker="/mnt"
+    move_kubelet="/mnt/kubernetes"
+  elif [[ ${docker_storage} == "devicemapper" || ${docker_storage} == "aufs" ]]; then
+    # We always use LVM, even with one device
+    # In devicemapper mode, Docker can use LVM directly
+    # Also, fewer code paths are good
+    echo "Using LVM2 and ext4"
+    apt-get install --yes lvm2
+
+    # Don't output spurious "File descriptor X leaked on vgcreate invocation."
+    # Known bug: e.g. Ubuntu #591823
+    export LVM_SUPPRESS_FD_WARNINGS=1
+
+    for block_device in "${block_devices[@]}"; do
+      /bin/umount "${block_device}"
+      pvcreate "${block_device}"
+    done
+    vgcreate vg-ephemeral "${block_devices[@]}"
+
+    if [[ ${docker_storage} == "devicemapper" ]]; then
+      # devicemapper thin provisioning, managed by docker
+      # This is the best option, but it is sadly broken on most distros
+      # Bug: https://github.com/docker/docker/issues/4036
+
+      # 95% goes to the docker thin-pool
+      lvcreate -l 95%VG --thinpool docker-thinpool vg-ephemeral
+
+      DOCKER_OPTS="${DOCKER_OPTS} --storage-opt dm.thinpooldev=/dev/mapper/vg--ephemeral-docker--thinpool"
+      # Note that we don't move docker; docker goes direct to the thinpool
+    else
+      # aufs
+
+      # Create a docker lv, use docker on it
+      # 95% goes to the docker thin-pool
+      lvcreate -l 95%VG --thinpool docker-thinpool vg-ephemeral
+
+      THINPOOL_SIZE=$(lvs vg-ephemeral/docker-thinpool -o LV_SIZE --noheadings --units M --nosuffix)
+      lvcreate -V${THINPOOL_SIZE}M -T vg-ephemeral/docker-thinpool -n docker
+
+      mkfs -t ext4 /dev/vg-ephemeral/docker
+      mkdir -p /mnt/docker
+      mount -t ext4 /dev/vg-ephemeral/docker /mnt/docker
+      move_docker="/mnt"
+    fi
+
+    # Remaining 5% is for kubernetes data
+    # TODO: Should this be a thin pool? e.g. would we ever want to snapshot this data?
+    lvcreate -l 100%FREE -n kubernetes vg-ephemeral
+    mkfs -t ext4 /dev/vg-ephemeral/kubernetes
+    mkdir -p /mnt/kubernetes
+    mount -t ext4 /dev/vg-ephemeral/kubernetes /mnt/kubernetes
+    move_kubelet="/mnt/kubernetes"
   else
-    echo "Found multiple ephemeral block devices, formatting with btrfs as RAID-0"
-    mkfs.btrfs -f --data raid0 ${block_devices[@]}
+    echo "Ignoring unknown DOCKER_STORAGE: ${docker_storage}"
   fi
-  mount -t btrfs ${block_devices[0]} /mnt
-
-  # Move docker to /mnt if we have it
-  if [[ -d /var/lib/docker ]]; then
-    mv /var/lib/docker /mnt/
-  fi
-  mkdir -p /mnt/docker
-  ln -s /mnt/docker /var/lib/docker
-  DOCKER_ROOT="/mnt/docker"
-  DOCKER_OPTS="${DOCKER_OPTS} -g /mnt/docker"
-
-  # Move /var/lib/kubelet to /mnt if we have it
-  # (the backing for empty-dir volumes can use a lot of space!)
-  if [[ -d /var/lib/kubelet ]]; then
-    mv /var/lib/kubelet /mnt/
-  fi
-  mkdir -p /mnt/kubelet
-  ln -s /mnt/kubelet /var/lib/kubelet
-  KUBELET_ROOT="/mnt/kubelet"
+fi
+
+
+if [[ ${docker_storage} == "btrfs" ]]; then
+  DOCKER_OPTS="${DOCKER_OPTS} -s btrfs"
+elif [[ ${docker_storage} == "aufs-nolvm" || ${docker_storage} == "aufs" ]]; then
+  # Install aufs kernel module
+  apt-get install --yes "linux-image-extra-$(uname -r)"
+
+  DOCKER_OPTS="${DOCKER_OPTS} -s aufs"
+elif [[ ${docker_storage} == "devicemapper" ]]; then
+  DOCKER_OPTS="${DOCKER_OPTS} -s devicemapper"
+else
+  echo "Ignoring unknown DOCKER_STORAGE: ${docker_storage}"
+fi
+
+if [[ -n "${move_docker}" ]]; then
+  # Move docker to e.g. /mnt
+  if [[ -d /var/lib/docker ]]; then
+    mv /var/lib/docker "${move_docker}/"
+  fi
+  mkdir -p "${move_docker}/docker"
+  ln -s "${move_docker}/docker" /var/lib/docker
+  DOCKER_ROOT="${move_docker}/docker"
+  DOCKER_OPTS="${DOCKER_OPTS} -g ${DOCKER_ROOT}"
+fi
+
+if [[ -n "${move_kubelet}" ]]; then
+  # Move /var/lib/kubelet to e.g. /mnt
+  # (the backing for empty-dir volumes can use a lot of space!)
+  if [[ -d /var/lib/kubelet ]]; then
+    mv /var/lib/kubelet "${move_kubelet}/"
+  fi
+  mkdir -p "${move_kubelet}/kubelet"
+  ln -s "${move_kubelet}/kubelet" /var/lib/kubelet
+  KUBELET_ROOT="${move_kubelet}/kubelet"
 fi
 
diff --git a/cluster/aws/ubuntu/util.sh b/cluster/aws/ubuntu/util.sh
index 35b923274f1..33281c66f6a 100644
--- a/cluster/aws/ubuntu/util.sh
+++ b/cluster/aws/ubuntu/util.sh
@@ -31,6 +31,7 @@ function generate-minion-user-data {
   echo "SALT_MASTER='${MASTER_INTERNAL_IP}'"
   echo "MINION_IP_RANGE='${MINION_IP_RANGES[$i]}'"
   echo "DOCKER_OPTS='${EXTRA_DOCKER_OPTS:-}'"
+  echo "readonly DOCKER_STORAGE='${DOCKER_STORAGE:-}'"
   grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/common.sh"
   grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/format-disks.sh"
   grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/salt-minion.sh"
diff --git a/cluster/aws/util.sh b/cluster/aws/util.sh
index 116014004cc..eae3c5dadc6 100644
--- a/cluster/aws/util.sh
+++ b/cluster/aws/util.sh
@@ -674,6 +674,7 @@ function kube-up {
     echo "readonly MASTER_IP_RANGE='${MASTER_IP_RANGE:-}'"
     echo "readonly KUBELET_TOKEN='${KUBELET_TOKEN}'"
     echo "readonly KUBE_PROXY_TOKEN='${KUBE_PROXY_TOKEN}'"
+    echo "readonly DOCKER_STORAGE='${DOCKER_STORAGE:-}'"
     grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/common.sh"
     grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/format-disks.sh"
     grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/create-dynamic-salt-files.sh"