Merge pull request #22434 from justinsb/aws_docker_babysitter

Auto commit by PR queue bot
This commit is contained in:
k8s-merge-robot 2016-03-04 21:52:03 -08:00
commit 57c944caa2
7 changed files with 164 additions and 11 deletions

View File

@ -0,0 +1,44 @@
#!/bin/bash
# Copyright 2015 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is intended to be run periodically, to check the health
# of docker. If it detects a failure, it will restart docker using systemctl.
if timeout 10 docker version > /dev/null; then
exit 0
fi
echo "docker failed"
echo "Giving docker 30 seconds grace before restarting"
sleep 30
if timeout 10 docker version > /dev/null; then
echo "docker recovered"
exit 0
fi
echo "docker still down; triggering docker restart"
systemctl restart docker
echo "Waiting 60 seconds to give docker time to start"
sleep 60
if timeout 10 docker version > /dev/null; then
echo "docker recovered"
exit 0
fi
echo "docker still failing"

View File

@ -0,0 +1,9 @@
[Unit]
Description=Run docker-healthcheck once
[Service]
Type=oneshot
ExecStart=/opt/kubernetes/helpers/docker-healthcheck
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,9 @@
[Unit]
Description=Trigger docker-healthcheck periodically
[Timer]
OnUnitInactiveSec=10s
Unit=docker-healthcheck.service
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,22 @@
#!/bin/bash
# Copyright 2015 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is intended to be run before we start Docker.
# cleanup docker network checkpoint to avoid running into known issue
# of docker (https://github.com/docker/docker/issues/18283)
rm -rf /var/lib/docker/network

View File

@ -1,10 +1,11 @@
[Unit]
Description=Docker Application Container Engine
Documentation=http://docs.docker.com
Documentation=https://docs.docker.com
After=network.target docker.socket
Requires=docker.socket
[Service]
Type=notify
EnvironmentFile={{ environment_file }}
ExecStart=/usr/bin/docker daemon -H fd:// "$DOCKER_OPTS"
MountFlags=slave
@ -14,7 +15,7 @@ LimitCORE=infinity
Restart=always
RestartSec=2s
StartLimitInterval=0
ExecStartPre=/opt/kubernetes/helpers/docker-prestart
[Install]
WantedBy=multi-user.target

View File

@ -51,6 +51,13 @@ docker:
{% if pillar.get('is_systemd') %}
/opt/kubernetes/helpers/docker-prestart:
file.managed:
- source: salt://docker/docker-prestart
- user: root
- group: root
- mode: 755
{{ pillar.get('systemd_system_path') }}/docker.service:
file.managed:
- source: salt://docker/docker.service
@ -60,6 +67,8 @@ docker:
- mode: 644
- defaults:
environment_file: {{ environment_file }}
- require:
- file: /opt/kubernetes/helpers/docker-prestart
# The docker service.running block below doesn't work reliably
# Instead we run our script which e.g. does a systemd daemon-reload
@ -297,9 +306,16 @@ docker-upgrade:
- file: /var/cache/docker-install/{{ override_deb }}
{% endif %} # end override_docker_ver != ''
# Default docker systemd unit file doesn't use an EnvironmentFile; replace it with one that does.
{% if pillar.get('is_systemd') %}
/opt/kubernetes/helpers/docker-prestart:
file.managed:
- source: salt://docker/docker-prestart
- user: root
- group: root
- mode: 755
# Default docker systemd unit file doesn't use an EnvironmentFile; replace it with one that does.
{{ pillar.get('systemd_system_path') }}/docker.service:
file.managed:
- source: salt://docker/docker.service
@ -309,6 +325,8 @@ docker-upgrade:
- mode: 644
- defaults:
environment_file: {{ environment_file }}
- require:
- file: /opt/kubernetes/helpers/docker-prestart
# The docker service.running block below doesn't work reliably
# Instead we run our script which e.g. does a systemd daemon-reload
@ -316,7 +334,7 @@ docker-upgrade:
# TODO: Fix this
fix-service-docker:
cmd.wait:
- name: /opt/kubernetes/helpers/services bounce docker
- name: /opt/kubernetes/helpers/services enable docker
- watch:
- file: {{ pillar.get('systemd_system_path') }}/docker.service
- file: {{ environment_file }}
@ -325,30 +343,77 @@ fix-service-docker:
- cmd: docker-upgrade
{% endif %}
/opt/kubernetes/helpers/docker-healthcheck:
file.managed:
- source: salt://docker/docker-healthcheck
- user: root
- group: root
- mode: 755
{{ pillar.get('systemd_system_path') }}/docker-healthcheck.service:
file.managed:
- source: salt://docker/docker-healthcheck.service
- template: jinja
- user: root
- group: root
- mode: 644
{{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer:
file.managed:
- source: salt://docker/docker-healthcheck.timer
- template: jinja
- user: root
- group: root
- mode: 644
# Tell systemd to load the timer
fix-systemd-docker-healthcheck-timer:
cmd.wait:
- name: /opt/kubernetes/helpers/services bounce docker-healthcheck.timer
- watch:
- file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer
# Trigger a first run of docker-healthcheck; needed because the timer fires 10s after the previous run.
fix-systemd-docker-healthcheck-service:
cmd.wait:
- name: /opt/kubernetes/helpers/services bounce docker-healthcheck.service
- watch:
- file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.service
- require:
- cmd: fix-service-docker
{% endif %}
docker:
service.running:
# Starting Docker is racy on aws for some reason. To be honest, since Monit
# is managing Docker restart we should probably just delete this whole thing
# but the kubernetes components use salt 'require' to set up a dag, and that
# complicated and scary to unwind.
# On AWS, we use a trick now... we don't start the docker service through Salt.
# Kubelet or our health checker will start it. But we use service.enabled,
# so we still have a `service: docker` node for our DAG.
{% if grains.cloud is defined and grains.cloud == 'aws' %}
- enable: False
service.enabled:
{% else %}
service.running:
- enable: True
{% endif %}
# If we put a watch on this, salt will try to start the service.
# We put the watch on the fixer instead
{% if not pillar.get('is_systemd') %}
- watch:
- file: {{ environment_file }}
{% if override_docker_ver != '' %}
- cmd: docker-upgrade
{% endif %}
{% endif %}
- require:
- file: {{ environment_file }}
{% if override_docker_ver != '' %}
- cmd: docker-upgrade
{% endif %}
{% if pillar.get('is_systemd') %}
- file: {{ pillar.get('systemd_system_path') }}/docker.service
{% endif %}
{% if override_docker_ver != '' %}
- require:
- cmd: docker-upgrade
- cmd: fix-service-docker
{% endif %}
{% endif %} # end grains.os_family != 'RedHat'

View File

@ -63,6 +63,9 @@ elif [[ "${ACTION}" == "down" ]]; then
reload_state
disable_service
stop_service
elif [[ "${ACTION}" == "enable" ]]; then
reload_state
enable_service
else
echo "Unknown action: ${ACTION}"
exit 1