diff --git a/cluster/saltbase/salt/supervisor/docker-checker.sh b/cluster/saltbase/salt/supervisor/docker-checker.sh
new file mode 100755
index 00000000000..05492e75589
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/docker-checker.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is intended to start the docker and then loop until
+# it detects a failure. It then exits, and supervisord restarts it
+# which in turn restarts docker.
+
+/etc/init.d/docker stop
+/etc/init.d/docker start
+
+echo "waiting a minute for startup"
+sleep 60
+
+while true; do
+  if ! sudo timeout 10 docker version > /dev/null; then
+    echo "Docker failed!"
+    exit 2
+  fi
+  sleep 10
+done
+
diff --git a/cluster/saltbase/salt/supervisor/docker.conf b/cluster/saltbase/salt/supervisor/docker.conf
new file mode 100644
index 00000000000..3fd61f3f973
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/docker.conf
@@ -0,0 +1,6 @@
+[program:docker]
+command=/usr/sbin/docker-checker.sh
+stderr_logfile=/var/log/supervisor/docker-stderr.log
+stdout_logfile=/var/log/supervisor/docker-stdout.log
+autorestart=true
+startretries=1000000
diff --git a/cluster/saltbase/salt/supervisor/init.sls b/cluster/saltbase/salt/supervisor/init.sls
new file mode 100644
index 00000000000..6d13ed6ebdb
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/init.sls
@@ -0,0 +1,125 @@
+{% if not pillar.get('is_systemd') %}
+
+supervisor:
+  pkg:
+    - installed
+
+monit:
+  pkg:
+    - purged
+
+/etc/supervisor/conf.d/docker.conf:
+  file:
+    - managed
+    - source: salt://supervisor/docker.conf
+    - user: root
+    - group: root
+    - mode: 644
+    - makedirs: True
+    - require_in:
+      - pkg: supervisor
+    - require:
+      - file: /usr/sbin/docker-checker.sh
+
+/usr/sbin/docker-checker.sh:
+  file:
+    - managed
+    - source: salt://supervisor/docker-checker.sh
+    - user: root
+    - group: root
+    - mode: 755
+    - makedirs: True
+
+/etc/supervisor/conf.d/kubelet.conf:
+  file:
+    - managed
+    - source: salt://supervisor/kubelet.conf
+    - user: root
+    - group: root
+    - mode: 644
+    - makedirs: True
+    - require_in:
+      - pkg: supervisor
+    - require:
+      - file: /usr/sbin/kubelet-checker.sh
+
+/usr/sbin/kubelet-checker.sh:
+  file:
+    - managed
+    - source: salt://supervisor/kubelet-checker.sh
+    - user: root
+    - group: root
+    - mode: 755
+    - makedirs: True
+
+{% if "kubernetes-pool" in grains.get('roles', []) %}
+/etc/supervisor/conf.d/kube-proxy.conf:
+  file:
+    - managed
+    - source: salt://supervisor/kube-proxy.conf
+    - user: root
+    - group: root
+    - mode: 644
+    - makedirs: True
+    - require_in:
+      - pkg: supervisor
+    - require:
+      - file: /usr/sbin/kube-proxy-checker.sh
+
+/usr/sbin/kube-proxy-checker.sh:
+  file:
+    - managed
+    - source: salt://supervisor/kube-proxy-checker.sh
+    - user: root
+    - group: root
+    - mode: 755
+    - makedirs: True
+{% endif %}
+
+{% if grains['roles'][0] == 'kubernetes-master' -%}
+/etc/supervisor/conf.d/kube-addons.conf:
+  file:
+    - managed
+    - source: salt://supervisor/kube-addons.conf
+    - user: root
+    - group: root
+    - mode: 644
+    - makedirs: True
+    - require_in:
+      - pkg: supervisor
+    - require:
+      - file: /usr/sbin/kube-addons-checker.sh
+
+/usr/sbin/kube-addons-checker.sh:
+  file:
+    - managed
+    - source: salt://supervisor/kube-addons-checker.sh
+    - user: root
+    - group: root
+    - mode: 755
+    - makedirs: True
+{% endif %}
+
+/etc/supervisor/supervisor_watcher.sh:
+  file.managed:
+    - source: salt://supervisor/supervisor_watcher.sh
+    - user: root
+    - group: root
+    - mode: 755
+    - makedirs: True
+
+crontab -l | { cat; echo "* * * * * /etc/supervisor/supervisor_watcher.sh 2>&1 | logger"; } | crontab -:
+  cmd.run:
+    - unless: crontab -l | grep -F "* * * * * /etc/supervisor/supervisor_watcher.sh 2>&1 | logger"
+
+supervisor-service:
+  service:
+    - running
+    - name: supervisor
+    - watch:
+      - pkg: supervisor
+      - file: /etc/supervisor/conf.d/*
+    - require:
+      - pkg: supervisor
+
+{% endif %}
diff --git a/cluster/saltbase/salt/supervisor/kube-addons-checker.sh b/cluster/saltbase/salt/supervisor/kube-addons-checker.sh
new file mode 100644
index 00000000000..d97497368d7
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/kube-addons-checker.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is intended to start the kube-addons and then loop until
+# it detects a failure. It then exits, and supervisord restarts it
+# which in turn restarts the kube-addons.
+
+/etc/init.d/kube-addons stop
+/etc/init.d/kube-addons start
+
+echo "waiting a minute for startup"
+sleep 60
+
+while true; do
+  if ! /etc/init.d/kube-addons status > /dev/null; then
+    echo "kube-addons failed!"
+    exit 2
+  fi
+  sleep 10
+done
+
diff --git a/cluster/saltbase/salt/supervisor/kube-addons.conf b/cluster/saltbase/salt/supervisor/kube-addons.conf
new file mode 100644
index 00000000000..db977aed123
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/kube-addons.conf
@@ -0,0 +1,6 @@
+[program:kube-addons]
+command=/usr/sbin/kube-addons-checker.sh
+stderr_logfile=/var/log/supervisor/kube-addons-stderr.log
+stdout_logfile=/var/log/supervisor/kube-addons-stdout.log
+autorestart=true
+startretries=1000000
diff --git a/cluster/saltbase/salt/supervisor/kube-proxy-checker.sh b/cluster/saltbase/salt/supervisor/kube-proxy-checker.sh
new file mode 100755
index 00000000000..1fb8230c4da
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/kube-proxy-checker.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is intended to start the kube-proxy and then loop until
+# it detects a failure. It then exits, and supervisord restarts it
+# which in turn restarts the kube-proxy.
+
+/etc/init.d/kube-proxy stop
+/etc/init.d/kube-proxy start
+
+echo "waiting a minute for startup"
+sleep 60
+
+max_seconds=10
+
+while true; do
+  if ! curl -m ${max_seconds} -f -s http://127.0.0.1:10249/healthz > /dev/null; then
+    echo "kube-proxy failed!"
+    exit 2
+  fi
+  sleep 10
+done
+
diff --git a/cluster/saltbase/salt/supervisor/kube-proxy.conf b/cluster/saltbase/salt/supervisor/kube-proxy.conf
new file mode 100644
index 00000000000..7320be49993
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/kube-proxy.conf
@@ -0,0 +1,6 @@
+[program:kube-proxy]
+command=/usr/sbin/kube-proxy-checker.sh
+stderr_logfile=/var/log/supervisor/kube-proxy-stderr.log
+stdout_logfile=/var/log/supervisor/kube-proxy-stdout.log
+autorestart=true
+startretries=1000000
diff --git a/cluster/saltbase/salt/supervisor/kubelet-checker.sh b/cluster/saltbase/salt/supervisor/kubelet-checker.sh
new file mode 100755
index 00000000000..e933ace1d51
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/kubelet-checker.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is intended to start the kubelet and then loop until
+# it detects a failure. It then exits, and supervisord restarts it
+# which in turn restarts the kubelet.
+
+/etc/init.d/kubelet stop
+/etc/init.d/kubelet start
+
+echo "waiting a minute for startup"
+sleep 60
+
+max_seconds=10
+
+while true; do
+  if ! curl -m ${max_seconds} -f -s http://127.0.0.1:10255/healthz > /dev/null; then
+    echo "kubelet failed!"
+    exit 2
+  fi
+  sleep 10
+done
+
diff --git a/cluster/saltbase/salt/supervisor/kubelet.conf b/cluster/saltbase/salt/supervisor/kubelet.conf
new file mode 100644
index 00000000000..25f8274d375
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/kubelet.conf
@@ -0,0 +1,6 @@
+[program:kubelet]
+command=/usr/sbin/kubelet-checker.sh
+stderr_logfile=/var/log/supervisor/kubelet-stderr.log
+stdout_logfile=/var/log/supervisor/kubelet-stdout.log
+autorestart=true
+startretries=1000000
diff --git a/cluster/saltbase/salt/supervisor/supervisor_watcher.sh b/cluster/saltbase/salt/supervisor/supervisor_watcher.sh
new file mode 100644
index 00000000000..da214527c98
--- /dev/null
+++ b/cluster/saltbase/salt/supervisor/supervisor_watcher.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is invoked by crond every minute to check if supervisord is
+# up and oom protected. If down it restarts supervisord; otherwise, it exits
+# after applying oom_score_adj
+
+PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+
+if ! /etc/init.d/supervisor status > /dev/null; then
+  service supervisor start
+  sleep 10
+fi
+
+# Apply oom_score_adj: -901 to processes
+pidfile=/var/run/supervisord.pid
+if [[ ! -r "${pidfile}" ]]; then
+  # supervisord did not come up. Report it (cron pipes our output to logger)
+  # instead of writing to a bogus /proc//oom_score_adj path.
+  echo "supervisord pid file ${pidfile} is missing" >&2
+  exit 1
+fi
+
+# One pid per line; the trailing test keeps a final line with no newline.
+while read -r pid || [[ -n "${pid}" ]]; do
+  [[ -n "${pid}" ]] || continue
+  echo -901 > "/proc/${pid}/oom_score_adj"
+done < "${pidfile}"
diff --git a/cluster/saltbase/salt/top.sls b/cluster/saltbase/salt/top.sls
index feb31ae33e8..4ffec5e8eb7 100644
--- a/cluster/saltbase/salt/top.sls
+++ b/cluster/saltbase/salt/top.sls
@@ -26,7 +26,11 @@ base:
 {% endif %}
 {% endif %}
     - logrotate
+{% if grains['cloud'] is defined and grains.cloud == 'gce' %}
+    - supervisor
+{% else %}
     - monit
+{% endif %}
 
   'roles:kubernetes-master':
     - match: grain
@@ -35,7 +39,11 @@ base:
     - kube-apiserver
     - kube-controller-manager
     - kube-scheduler
+{% if grains['cloud'] is defined and grains.cloud == 'gce' %}
+    - supervisor
+{% else %}
     - monit
+{% endif %}
 {% if grains['cloud'] is defined and not grains.cloud in [ 'aws', 'gce', 'vagrant' ] %}
     - nginx
 {% endif %}