From 7427387938f3cd8acfc241f46493747fd7b57f31 Mon Sep 17 00:00:00 2001
From: Andy Zheng
Date: Mon, 14 Sep 2015 14:14:56 -0700
Subject: [PATCH] Add trusty node health monitoring

Upstart monitors the process of docker, kubelet, and kube-proxy. This
change adds an upstart job running as daemon to conduct non-PID health
monitoring.
---
 cluster/gce/trusty/node.yaml | 40 ++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/cluster/gce/trusty/node.yaml b/cluster/gce/trusty/node.yaml
index f386adde215..a68dffa9ff1 100644
--- a/cluster/gce/trusty/node.yaml
+++ b/cluster/gce/trusty/node.yaml
@@ -372,5 +372,45 @@ script
 	fi
 end script
 
+--===============6024533374511606659==
+MIME-Version: 1.0
+Content-Type: text/upstart-job; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="kube-node-health-monitoring.conf"
+
+description "Kubernetes node health monitoring"
+
+start on stopped kube-docker and started kube-proxy
+
+respawn
+
+script
+	set -o nounset
+
+	# Wait for a minute to let docker, kubelet, and kube-proxy processes finish initialization.
+	# TODO(andyzheng0831): replace it with a more reliable method if possible.
+	sleep 60
+
+	sleep_seconds=10  # pause between two monitoring rounds
+	max_seconds=10    # upper bound for every individual health check
+	# Kill a process that fails its check; another upstart job will automatically restart it.
+	# "|| true": upstart runs this script with "sh -e", and pkill exits 1 when nothing matched.
+	while true; do
+		if ! timeout "${max_seconds}" docker version > /dev/null; then
+			echo "Docker daemon failed!"
+			pkill docker || true
+		fi
+		if ! curl -m "${max_seconds}" -f -s http://127.0.0.1:10255/healthz > /dev/null; then
+			echo "Kubelet is unhealthy!"
+			pkill kubelet || true
+		fi
+		if ! curl -m "${max_seconds}" -f -s http://127.0.0.1:10249/healthz > /dev/null; then
+			echo "Kube-proxy is unhealthy!"
+			pkill kube-proxy || true
+		fi
+		sleep "${sleep_seconds}"
+	done
+end script
+
 --===============6024533374511606659==--
 