From f952b093a7faa62323617a684e7f6a58d863f42f Mon Sep 17 00:00:00 2001
From: Lantao Liu <lantaol@google.com>
Date: Thu, 3 May 2018 17:18:21 -0700
Subject: [PATCH] Still use `docker ps` for docker health monitoring.

Signed-off-by: Lantao Liu <lantaol@google.com>
---
 cluster/gce/gci/health-monitor.sh | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/cluster/gce/gci/health-monitor.sh b/cluster/gce/gci/health-monitor.sh
index f342fc1ec43..f5517cf6451 100644
--- a/cluster/gce/gci/health-monitor.sh
+++ b/cluster/gce/gci/health-monitor.sh
@@ -25,24 +25,32 @@ set -o pipefail
 # We simply kill the process when there is a failure. Another systemd service will
 # automatically restart the process.
 function container_runtime_monitoring {
-  # Container runtime startup takes time. Make initial attempts before starting
-  # killing the container runtime.
   local -r max_attempts=5
   local attempt=1
   local -r crictl="${KUBE_HOME}/bin/crictl"
-  local -r container_runtime="${CONTAINER_RUNTIME_NAME:-docker}"
-  until timeout 60 "${crictl}" pods > /dev/null; do
+  local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
+  # We still need to use `docker ps` when container runtime is "docker". This is because
+  # dockershim is still part of kubelet today. When kubelet is down, crictl pods
+  # will also fail, and docker will be killed. This is undesirable especially when
+  # docker live restore is disabled.
+  local healthcheck_command="docker ps"
+  if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
+    healthcheck_command="${crictl} pods"
+  fi
+  # Container runtime startup takes time. Make initial attempts before starting
+  # killing the container runtime.
+  until timeout 60 ${healthcheck_command} > /dev/null; do
     if (( attempt == max_attempts )); then
       echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
       break
     fi
-    echo "$attempt initial attempt \"${crictl} pods\"! Trying again in $attempt seconds..."
-    sleep "$(( attempt++ ))"
+    echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..."
+    sleep "$(( 2 ** attempt++ ))"
   done
   while true; do
-    if ! timeout 60 "${crictl}" pods > /dev/null; then
-      echo "Container runtime ${container_runtime} failed!"
-      systemctl kill --kill-who=main "${container_runtime}"
+    if ! timeout 60 ${healthcheck_command} > /dev/null; then
+      echo "Container runtime ${container_runtime_name} failed!"
+      systemctl kill --kill-who=main "${container_runtime_name}"
       # Wait for a while, as we don't want to kill it again before it is really up.
       sleep 120
     else