From d074b5ba13f5b0da1817b4a7e220d00dbf4bf6ce Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Wed, 2 Mar 2016 10:21:59 -0500 Subject: [PATCH 1/4] docker systemd file: type->notify, docs->https This minimizes the changes we make to the official Docker systemd file. --- cluster/saltbase/salt/docker/docker.service | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster/saltbase/salt/docker/docker.service b/cluster/saltbase/salt/docker/docker.service index aa535375ec8..23a7453b249 100644 --- a/cluster/saltbase/salt/docker/docker.service +++ b/cluster/saltbase/salt/docker/docker.service @@ -1,10 +1,11 @@ [Unit] Description=Docker Application Container Engine -Documentation=http://docs.docker.com +Documentation=https://docs.docker.com After=network.target docker.socket Requires=docker.socket [Service] +Type=notify EnvironmentFile={{ environment_file }} ExecStart=/usr/bin/docker daemon -H fd:// "$DOCKER_OPTS" MountFlags=slave @@ -17,4 +18,3 @@ StartLimitInterval=0 [Install] WantedBy=multi-user.target - From 64f1cbaddd7d59f72b1972e0f86c3575bfdc1134 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Wed, 2 Mar 2016 09:20:26 -0500 Subject: [PATCH 2/4] Systemd/non-Redhat: Add docker prestart file We do the equivalent of #21727 for systemd systems. Issue #21731 --- cluster/saltbase/salt/docker/docker-prestart | 22 ++++++++++++++++++++ cluster/saltbase/salt/docker/docker.service | 1 + cluster/saltbase/salt/docker/init.sls | 20 +++++++++++++++++- 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100755 cluster/saltbase/salt/docker/docker-prestart diff --git a/cluster/saltbase/salt/docker/docker-prestart b/cluster/saltbase/salt/docker/docker-prestart new file mode 100755 index 00000000000..ea23d6d7237 --- /dev/null +++ b/cluster/saltbase/salt/docker/docker-prestart @@ -0,0 +1,22 @@ +#!/bin/bash + +# Copyright 2015 The Kubernetes Authors All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script is intended to be run before we start Docker. + +# cleanup docker network checkpoint to avoid running into known issue +# of docker (https://github.com/docker/docker/issues/18283) +rm -rf /var/lib/docker/network + diff --git a/cluster/saltbase/salt/docker/docker.service b/cluster/saltbase/salt/docker/docker.service index 23a7453b249..a5e02c9f6e5 100644 --- a/cluster/saltbase/salt/docker/docker.service +++ b/cluster/saltbase/salt/docker/docker.service @@ -15,6 +15,7 @@ LimitCORE=infinity Restart=always RestartSec=2s StartLimitInterval=0 +ExecStartPre=/opt/kubernetes/helpers/docker-prestart [Install] WantedBy=multi-user.target diff --git a/cluster/saltbase/salt/docker/init.sls b/cluster/saltbase/salt/docker/init.sls index 86a8ea19962..2de683a42a0 100644 --- a/cluster/saltbase/salt/docker/init.sls +++ b/cluster/saltbase/salt/docker/init.sls @@ -51,6 +51,13 @@ docker: {% if pillar.get('is_systemd') %} +/opt/kubernetes/helpers/docker-prestart: + file.managed: + - source: salt://docker/docker-prestart + - user: root + - group: root + - mode: 755 + {{ pillar.get('systemd_system_path') }}/docker.service: file.managed: - source: salt://docker/docker.service @@ -60,6 +67,8 @@ docker: - mode: 644 - defaults: environment_file: {{ environment_file }} + - require: + - file: /opt/kubernetes/helpers/docker-prestart # The docker service.running block below doesn't work reliably # Instead we run our script which e.g. does a systemd daemon-reload @@ -292,9 +301,16 @@ docker-upgrade: - file: /var/cache/docker-install/{{ override_deb }} {% endif %} # end override_docker_ver != '' -# Default docker systemd unit file doesn't use an EnvironmentFile; replace it with one that does. {% if pillar.get('is_systemd') %} +/opt/kubernetes/helpers/docker-prestart: + file.managed: + - source: salt://docker/docker-prestart + - user: root + - group: root + - mode: 755 + +# Default docker systemd unit file doesn't use an EnvironmentFile; replace it with one that does. {{ pillar.get('systemd_system_path') }}/docker.service: file.managed: - source: salt://docker/docker.service @@ -304,6 +320,8 @@ docker-upgrade: - mode: 644 - defaults: environment_file: {{ environment_file }} + - require: + - file: /opt/kubernetes/helpers/docker-prestart # The docker service.running block below doesn't work reliably # Instead we run our script which e.g. does a systemd daemon-reload From dbff0ef67b5b7df715b4e30256ef20e643407e44 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Thu, 3 Mar 2016 09:24:21 -0500 Subject: [PATCH 3/4] Systemd/non-Redhat: Add docker healthcheck script We do the equivalent of #21727 for systemd systems. Issue #21731 --- .../saltbase/salt/docker/docker-healthcheck | 44 +++++++++++++++++++ .../salt/docker/docker-healthcheck.service | 9 ++++ .../salt/docker/docker-healthcheck.timer | 9 ++++ cluster/saltbase/salt/docker/init.sls | 39 ++++++++++++++++ 4 files changed, 101 insertions(+) create mode 100755 cluster/saltbase/salt/docker/docker-healthcheck create mode 100644 cluster/saltbase/salt/docker/docker-healthcheck.service create mode 100644 cluster/saltbase/salt/docker/docker-healthcheck.timer diff --git a/cluster/saltbase/salt/docker/docker-healthcheck b/cluster/saltbase/salt/docker/docker-healthcheck new file mode 100755 index 00000000000..9167567bb04 --- /dev/null +++ b/cluster/saltbase/salt/docker/docker-healthcheck @@ -0,0 +1,44 @@ +#!/bin/bash + +# Copyright 2015 The Kubernetes Authors All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script is intended to be run periodically, to check the health +# of docker. If it detects a failure, it will restart docker using systemctl. + +if timeout 10 docker version > /dev/null; then + exit 0 +fi + +echo "docker failed" +echo "Giving docker 30 seconds grace before restarting" +sleep 30 + +if timeout 10 docker version > /dev/null; then + echo "docker recovered" + exit 0 +fi + +echo "docker still down; triggering docker restart" +systemctl restart docker + +echo "Waiting 60 seconds to give docker time to start" +sleep 60 + +if timeout 10 docker version > /dev/null; then + echo "docker recovered" + exit 0 +fi + +echo "docker still failing" diff --git a/cluster/saltbase/salt/docker/docker-healthcheck.service b/cluster/saltbase/salt/docker/docker-healthcheck.service new file mode 100644 index 00000000000..b4ecfaf8f72 --- /dev/null +++ b/cluster/saltbase/salt/docker/docker-healthcheck.service @@ -0,0 +1,9 @@ +[Unit] +Description=Run docker-healthcheck once + +[Service] +Type=oneshot +ExecStart=/opt/kubernetes/helpers/docker-healthcheck + +[Install] +WantedBy=multi-user.target diff --git a/cluster/saltbase/salt/docker/docker-healthcheck.timer b/cluster/saltbase/salt/docker/docker-healthcheck.timer new file mode 100644 index 00000000000..3d252e4464e --- /dev/null +++ b/cluster/saltbase/salt/docker/docker-healthcheck.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Trigger docker-healthcheck periodically + +[Timer] +OnUnitInactiveSec=10s +Unit=docker-healthcheck.service + +[Install] +WantedBy=multi-user.target diff --git a/cluster/saltbase/salt/docker/init.sls b/cluster/saltbase/salt/docker/init.sls index 2de683a42a0..9abc97e8693 100644 --- a/cluster/saltbase/salt/docker/init.sls +++ b/cluster/saltbase/salt/docker/init.sls @@ -338,6 +338,45 @@ fix-service-docker: - cmd: docker-upgrade {% endif %} +/opt/kubernetes/helpers/docker-healthcheck: + file.managed: + - source: salt://docker/docker-healthcheck + - user: root + - group: root + - mode: 755 + +{{ pillar.get('systemd_system_path') }}/docker-healthcheck.service: + file.managed: + - source: salt://docker/docker-healthcheck.service + - template: jinja + - user: root + - group: root + - mode: 644 + +{{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer: + file.managed: + - source: salt://docker/docker-healthcheck.timer + - template: jinja + - user: root + - group: root + - mode: 644 + +# Tell systemd to load the timer +fix-systemd-docker-healthcheck-timer: + cmd.wait: + - name: /opt/kubernetes/helpers/services bounce docker-healthcheck.timer + - watch: + - file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.timer + +# Trigger a first run of docker-healthcheck; needed because the timer fires 10s after the previous run. +fix-systemd-docker-healthcheck-service: + cmd.wait: + - name: /opt/kubernetes/helpers/services bounce docker-healthcheck.service + - watch: + - file: {{ pillar.get('systemd_system_path') }}/docker-healthcheck.service + - require: + - cmd: fix-service-docker + {% endif %} docker: From 6bdab05d11a1cbcd8e3be790b11a391561712c3e Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Thu, 3 Mar 2016 09:30:52 -0500 Subject: [PATCH 4/4] Salt: Don't use Salt to start Docker Starting docker through Salt has always been problematic. Kubelet or the babysitter process should start it. We've kept it around primarily so we have a `service: docker` node for the Salt DAG. Instead, we enable (but do not start) the Docker service in Salt. This lets us keep the DAG node, but won't start it. There's another bug in Salt, where watches will start the service even on `service.enabled`. So we remove the watches, and move them to our existing Salt bug-fix script. --- cluster/saltbase/salt/docker/init.sls | 20 ++++++++++++++------ cluster/saltbase/salt/salt-helpers/services | 3 +++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cluster/saltbase/salt/docker/init.sls b/cluster/saltbase/salt/docker/init.sls index 9abc97e8693..99ac1754eca 100644 --- a/cluster/saltbase/salt/docker/init.sls +++ b/cluster/saltbase/salt/docker/init.sls @@ -329,7 +329,7 @@ docker-upgrade: # TODO: Fix this fix-service-docker: cmd.wait: - - name: /opt/kubernetes/helpers/services bounce docker + - name: /opt/kubernetes/helpers/services enable docker - watch: - file: {{ pillar.get('systemd_system_path') }}/docker.service - file: {{ environment_file }} @@ -380,27 +380,35 @@ fix-systemd-docker-healthcheck-service: {% endif %} docker: - service.running: # Starting Docker is racy on aws for some reason. To be honest, since Monit # is managing Docker restart we should probably just delete this whole thing # but the kubernetes components use salt 'require' to set up a dag, and that # complicated and scary to unwind. +# On AWS, we use a trick now... we don't start the docker service through Salt. +# Kubelet or our health checker will start it. But we use service.enabled, +# so we still have a `service: docker` node for our DAG. {% if grains.cloud is defined and grains.cloud == 'aws' %} - - enable: False + service.enabled: {% else %} + service.running: - enable: True {% endif %} +# If we put a watch on this, salt will try to start the service. +# We put the watch on the fixer instead +{% if not pillar.get('is_systemd') %} - watch: - file: {{ environment_file }} {% if override_docker_ver != '' %} - cmd: docker-upgrade {% endif %} -{% if pillar.get('is_systemd') %} - - file: {{ pillar.get('systemd_system_path') }}/docker.service {% endif %} -{% if override_docker_ver != '' %} - require: + - file: {{ environment_file }} +{% if override_docker_ver != '' %} - cmd: docker-upgrade {% endif %} +{% if pillar.get('is_systemd') %} + - cmd: fix-service-docker +{% endif %} {% endif %} # end grains.os_family != 'RedHat' diff --git a/cluster/saltbase/salt/salt-helpers/services b/cluster/saltbase/salt/salt-helpers/services index f55b9b39c77..bc8db58f326 100644 --- a/cluster/saltbase/salt/salt-helpers/services +++ b/cluster/saltbase/salt/salt-helpers/services @@ -63,6 +63,9 @@ elif [[ "${ACTION}" == "down" ]]; then reload_state disable_service stop_service +elif [[ "${ACTION}" == "enable" ]]; then + reload_state + enable_service else echo "Unknown action: ${ACTION}" exit 1