diff --git a/cluster/gce/trusty/node.yaml b/cluster/gce/trusty/node.yaml
index f386adde215..a68dffa9ff1 100644
--- a/cluster/gce/trusty/node.yaml
+++ b/cluster/gce/trusty/node.yaml
@@ -372,5 +372,51 @@ script
 	fi
 end script
 
+--===============6024533374511606659==
+MIME-Version: 1.0
+Content-Type: text/upstart-job; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="kube-node-health-monitoring.conf"
+
+# Upstart job that periodically probes docker, kubelet and kube-proxy and
+# kills whichever of them fails its probe.
+description "Kubernetes node health monitoring"
+
+start on stopped kube-docker and started kube-proxy
+
+respawn
+
+script
+	set -o nounset
+
+	# Wait for a minute to let docker, kubelet, and kube-proxy processes finish initialization.
+	# TODO(andyzheng0831): replace it with a more reliable method if possible.
+	sleep 60
+
+	# Pause between two rounds of probes, in seconds.
+	sleep_seconds=10
+	# Upper bound for every individual probe, in seconds.
+	max_seconds=10
+	# We simply kill the process when there is a failure. Another upstart job will automatically
+	# restart the process.
+	while true; do
+		# The docker probe shares max_seconds with the curl probes below.
+		if ! timeout "${max_seconds}" docker version > /dev/null; then
+			echo "Docker daemon failed!"
+			pkill docker
+		fi
+		# curl flags: -m caps the total time, -f turns HTTP errors into a non-zero exit, -s is silent.
+		if ! curl -m "${max_seconds}" -f -s http://127.0.0.1:10255/healthz > /dev/null; then
+			echo "Kubelet is unhealthy!"
+			pkill kubelet
+		fi
+		if ! curl -m "${max_seconds}" -f -s http://127.0.0.1:10249/healthz > /dev/null; then
+			echo "Kube-proxy is unhealthy!"
+			pkill kube-proxy
+		fi
+		sleep "${sleep_seconds}"
+	done
+end script
+
 --===============6024533374511606659==--
 