# Docker health supervisor: post-patch body of
# cluster/saltbase/salt/supervisor/docker-checker.sh.
#
# Restarts docker only when it is not already answering `docker ps`, then
# polls until it detects a failure. It then exits, and supervisord restarts
# this script, which in turn restarts docker.
#
# NOTE(review): recovered from a whitespace-collapsed git patch whose hunk
# starts at file line 18; the shebang and license header that precede the
# hunk are not visible there and are not reproduced here. The `[[ ]]`,
# `(( ))` and `local` constructs below require bash.

#######################################
# Entry point. Leaves a healthy docker alone; otherwise stops and starts it
# and waits for it to come up. Then polls every 10 seconds until a health
# check fails.
# Arguments: none used.
# Outputs:   progress messages on stdout.
# Returns:   never returns; exits with status 2 once docker is unhealthy.
#######################################
main() {
  if ! healthy 60; then
    stop_docker
    start_docker
    echo "waiting 30s for startup"
    sleep 30
    # Status deliberately ignored: the polling loop below performs its own
    # check and is what decides whether to give up.
    healthy 60 || true
  fi

  while healthy; do
    sleep 10
  done

  echo "Docker failed!"
  exit 2
}

#######################################
# Performs a health check on docker by running `docker ps` (each attempt is
# capped at 60 seconds by `timeout`).
# Arguments: $1 - optional; number of seconds to keep retrying for a healthy
#                 result. If omitted or empty, only one attempt is made.
# Outputs:   progress messages on stdout.
# Returns:   0 if `docker ps` succeeded, 2 otherwise.
#######################################
healthy() {
  # ${1:-} instead of "$1" + shift: main calls this with no arguments, and
  # `shift` with nothing to shift fails.
  local max_retry_sec="${1:-}"
  local start_time
  start_time=$(date +%s)

  while ! timeout 60 docker ps > /dev/null; do
    if [[ -z "${max_retry_sec}" ]] \
        || (( $(date +%s) - start_time > max_retry_sec )); then
      echo "docker ps did not succeed"
      return 2
    fi
    echo "waiting 5s before retry"
    sleep 5
  done
  echo "docker is healthy"
  return 0
}

#######################################
# Stops docker through its init script and blocks until no docker process
# is left. After 60 seconds of waiting it escalates to SIGKILL.
# Arguments: none.
# Outputs:   progress messages on stdout.
#######################################
stop_docker() {
  local start_time elapsed

  /etc/init.d/docker stop
  # Make sure docker gracefully terminated before start again
  start_time=$(date +%s)
  while pidof docker > /dev/null; do
    elapsed=$(( $(date +%s) - start_time ))
    # after 60 seconds, forcefully terminate docker process
    if (( elapsed > 60 )); then
      echo "attempting to kill docker process with sigkill signal"
      # Unquoted on purpose: pidof may print several PIDs and each one must
      # reach kill as a separate argument.
      # shellcheck disable=SC2046
      if kill -9 $(pidof docker); then
        # Give the kernel a moment to reap the process; without this pause
        # the loop spins and floods the log while the PID is still visible.
        sleep 1
      else
        sleep 10
      fi
    else
      echo "waiting clean shutdown"
      sleep 10
    fi
  done
}

#######################################
# Starts docker through its init script after clearing its network
# checkpoint directory.
# Arguments: none.
# Outputs:   a progress message on stdout.
#######################################
start_docker() {
  echo "docker is not running. starting docker"

  # cleanup docker network checkpoint to avoid running into known issue
  # of docker (https://github.com/docker/docker/issues/18283)
  rm -rf /var/lib/docker/network

  /etc/init.d/docker start
}

main "$@"