diff --git a/cluster/saltbase/salt/docker/docker-healthcheck b/cluster/saltbase/salt/docker/docker-healthcheck
new file mode 100755
index 00000000000..9167567bb04
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Periodic health probe for docker. When docker stops answering and does not
+# recover within a grace period, it is restarted through systemctl.
+
+# Succeeds when `docker version` exits successfully within 10 seconds.
+docker_responds() {
+  timeout 10 docker version > /dev/null
+}
+
+# Announces the recovery and ends the script with status 0.
+report_recovered() {
+  echo "docker recovered"
+  exit 0
+}
+
+# Healthy on the first probe: exit quietly.
+docker_responds && exit 0
+
+echo "docker failed"
+echo "Giving docker 30 seconds grace before restarting"
+sleep 30
+docker_responds && report_recovered
+
+echo "docker still down; triggering docker restart"
+systemctl restart docker
+
+echo "Waiting 60 seconds to give docker time to start"
+sleep 60
+docker_responds && report_recovered
+
+# Still unhealthy after the restart; only report it. The script's exit
+# status is that of this final echo.
+echo "docker still failing"
diff --git a/cluster/saltbase/salt/docker/docker-healthcheck.service b/cluster/saltbase/salt/docker/docker-healthcheck.service
new file mode 100644
index 00000000000..b4ecfaf8f72
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck.service
@@ -0,0 +1,9 @@
+[Unit]
+Description=Run docker-healthcheck once
+
+[Service]
+Type=oneshot
+ExecStart=/opt/kubernetes/helpers/docker-healthcheck
+
+[Install]
+WantedBy=multi-user.target
diff --git a/cluster/saltbase/salt/docker/docker-healthcheck.timer b/cluster/saltbase/salt/docker/docker-healthcheck.timer
new file mode 100644
index 00000000000..3d252e4464e
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-healthcheck.timer
@@ -0,0 +1,9 @@
+[Unit]
+Description=Trigger docker-healthcheck periodically
+
+[Timer]
+OnUnitInactiveSec=10s
+Unit=docker-healthcheck.service
+
+[Install]
+WantedBy=multi-user.target
diff --git a/cluster/saltbase/salt/docker/docker-prestart b/cluster/saltbase/salt/docker/docker-prestart
new file mode 100755
index 00000000000..ea23d6d7237
--- /dev/null
+++ b/cluster/saltbase/salt/docker/docker-prestart
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script is intended to be run before we start Docker (docker.service ExecStartPre). + +# Remove docker's network checkpoint (/var/lib/docker/network) to avoid running into a known +# docker issue (https://github.com/docker/docker/issues/18283). +rm -rf /var/lib/docker/network + diff --git a/cluster/saltbase/salt/docker/docker.service b/cluster/saltbase/salt/docker/docker.service index aa535375ec8..a5e02c9f6e5 100644 --- a/cluster/saltbase/salt/docker/docker.service +++ b/cluster/saltbase/salt/docker/docker.service @@ -1,10 +1,11 @@ [Unit] Description=Docker Application Container Engine -Documentation=http://docs.docker.com +Documentation=https://docs.docker.com After=network.target docker.socket Requires=docker.socket [Service] +Type=notify EnvironmentFile={{ environment_file }} ExecStart=/usr/bin/docker daemon -H fd:// "$DOCKER_OPTS" MountFlags=slave @@ -14,7 +15,7 @@ LimitCORE=infinity Restart=always RestartSec=2s StartLimitInterval=0 +ExecStartPre=/opt/kubernetes/helpers/docker-prestart [Install] WantedBy=multi-user.target - diff --git a/cluster/saltbase/salt/docker/init.sls b/cluster/saltbase/salt/docker/init.sls index 7352bc6a5b7..6e1f5a85739 100644 --- a/cluster/saltbase/salt/docker/init.sls +++ b/cluster/saltbase/salt/docker/init.sls @@ -51,6 +51,13 @@ docker: {% if pillar.get('is_systemd') %} +/opt/kubernetes/helpers/docker-prestart: + file.managed: + - source: salt://docker/docker-prestart + - user: root + - group: root + - mode: 755 + {{ pillar.get('systemd_system_path') }}/docker.service: file.managed: - source: salt://docker/docker.service @@ -60,6 +67,8 @@ docker: - mode: 644 - defaults: 
environment_file: {{ environment_file }} + - require: + - file: /opt/kubernetes/helpers/docker-prestart # The docker service.running block below doesn't work reliably # Instead we run our script which e.g. does a systemd daemon-reload @@ -297,9 +306,16 @@ docker-upgrade: - file: /var/cache/docker-install/{{ override_deb }} {% endif %} # end override_docker_ver != '' -# Default docker systemd unit file doesn't use an EnvironmentFile; replace it with one that does. {% if pillar.get('is_systemd') %} +/opt/kubernetes/helpers/docker-prestart: + file.managed: + - source: salt://docker/docker-prestart + - user: root + - group: root + - mode: 755 + +# Default docker systemd unit file doesn't use an EnvironmentFile; replace it with one that does. {{ pillar.get('systemd_system_path') }}/docker.service: file.managed: - source: salt://docker/docker.service @@ -309,6 +325,8 @@ docker-upgrade: - mode: 644 - defaults: environment_file: {{ environment_file }} + - require: + - file: /opt/kubernetes/helpers/docker-prestart # The docker service.running block below doesn't work reliably # Instead we run our script which e.g. 
does a systemd daemon-reload @@ -316,7 +334,7 @@ docker-upgrade: # TODO: Fix this fix-service-docker: cmd.wait: - - name: /opt/kubernetes/helpers/services bounce docker + - name: /opt/kubernetes/helpers/services enable docker - watch: - file: {{ pillar.get('systemd_system_path') }}/docker.service - file: {{ environment_file }} @@ -325,30 +343,77 @@ fix-service-docker: - cmd: docker-upgrade {% endif %} +/opt/kubernetes/helpers/docker-healthcheck: + file.managed: + - source: salt://docker/docker-healthcheck + - user: root + - group: root + - mode: 755 + +{{ pillar.get('systemd_system_path') }}/docker-healthcheck.service: + file.managed: + - source: salt://docker/docker-healthcheck.service + - template: jinja + - user: root + - group: root + - mode: 644 + +{{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer: + file.managed: + - source: salt://docker/docker-healthcheck.timer + - template: jinja + - user: root + - group: root + - mode: 644 + +# Tell systemd to load the timer +fix-systemd-docker-healthcheck-timer: + cmd.wait: + - name: /opt/kubernetes/helpers/services bounce docker-healthcheck.timer + - watch: + - file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer + +# Trigger a first run of docker-healthcheck; needed because the timer (OnUnitInactiveSec=10s) is armed relative to the end of the previous run, so it cannot fire until the service has run once. +fix-systemd-docker-healthcheck-service: + cmd.wait: + - name: /opt/kubernetes/helpers/services bounce docker-healthcheck.service + - watch: + - file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.service + - require: + - cmd: fix-service-docker + {% endif %} docker: - service.running: # Starting Docker is racy on aws for some reason. To be honest, since Monit # is managing Docker restart we should probably just delete this whole thing # but the kubernetes components use salt 'require' to set up a dag, and that # complicated and scary to unwind. +# On AWS, we use a trick now... we don't start the docker service through Salt. 
+# Kubelet or our health checker will start it. But we use service.enabled, +# so we still have a `service: docker` node for our DAG. {% if grains.cloud is defined and grains.cloud == 'aws' %} - - enable: False + service.enabled: {% else %} + service.running: - enable: True {% endif %} +# If we put a watch on this, salt will try to start the service. +# We put the watch on the fixer (fix-service-docker) instead. +{% if not pillar.get('is_systemd') %} - watch: - file: {{ environment_file }} {% if override_docker_ver != '' %} - cmd: docker-upgrade {% endif %} +{% endif %} + - require: + - file: {{ environment_file }} +{% if override_docker_ver != '' %} + - cmd: docker-upgrade +{% endif %} {% if pillar.get('is_systemd') %} - - file: {{ pillar.get('systemd_system_path') }}/docker.service -{% endif %} -{% if override_docker_ver != '' %} - - require: - - cmd: docker-upgrade + - cmd: fix-service-docker {% endif %} {% endif %} # end grains.os_family != 'RedHat' diff --git a/cluster/saltbase/salt/salt-helpers/services b/cluster/saltbase/salt/salt-helpers/services index f55b9b39c77..bc8db58f326 100644 --- a/cluster/saltbase/salt/salt-helpers/services +++ b/cluster/saltbase/salt/salt-helpers/services @@ -63,6 +63,9 @@ elif [[ "${ACTION}" == "down" ]]; then reload_state disable_service stop_service +elif [[ "${ACTION}" == "enable" ]]; then + reload_state + enable_service else echo "Unknown action: ${ACTION}" exit 1