From 7427387938f3cd8acfc241f46493747fd7b57f31 Mon Sep 17 00:00:00 2001
From: Andy Zheng <qzheng@google.com>
Date: Mon, 14 Sep 2015 14:14:56 -0700
Subject: [PATCH] Add trusty node health monitoring

Upstart monitors the processes of docker, kubelet, and kube-proxy.
This change adds an upstart job that runs as a daemon to perform
non-PID health monitoring.
---
 cluster/gce/trusty/node.yaml | 40 ++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/cluster/gce/trusty/node.yaml b/cluster/gce/trusty/node.yaml
index f386adde215..a68dffa9ff1 100644
--- a/cluster/gce/trusty/node.yaml
+++ b/cluster/gce/trusty/node.yaml
@@ -372,5 +372,45 @@ script
 	fi
 end script
 
+--===============6024533374511606659==
+MIME-Version: 1.0
+Content-Type: text/upstart-job; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="kube-node-health-monitoring.conf"
+
+description "Kubernetes node health monitoring"
+
+start on stopped kube-docker and started kube-proxy
+
+respawn
+
+script
+	set -o nounset
+
+	# Give docker, kubelet, and kube-proxy a minute to finish initializing before probing them.
+	# TODO(andyzheng0831): replace it with a more reliable method if possible.
+	sleep 60
+
+	sleep_seconds=10
+	max_seconds=10
+	# Kill a daemon whose probe fails; another upstart job will automatically restart it.
+	# Upstart runs this script with "sh -e", so a pkill that matches nothing must not end the loop.
+	while true; do
+		if ! timeout "${max_seconds}" docker version > /dev/null; then
+			echo "Docker daemon failed!"
+			pkill docker || true
+		fi
+		if ! curl -m "${max_seconds}" -f -s http://127.0.0.1:10255/healthz > /dev/null; then
+			echo "Kubelet is unhealthy!"
+			pkill kubelet || true
+		fi
+		if ! curl -m "${max_seconds}" -f -s http://127.0.0.1:10249/healthz > /dev/null; then
+			echo "Kube-proxy is unhealthy!"
+			pkill kube-proxy || true
+		fi
+		sleep "${sleep_seconds}"
+	done
+end script
+
 --===============6024533374511606659==--