Update status to show failing services.

This commit is contained in:
Konstantinos Tsakalozos 2017-07-20 14:57:02 +03:00
parent acc19cafa4
commit 685dff99ab
2 changed files with 52 additions and 5 deletions

View File

@ -30,6 +30,7 @@ from shlex import split
from subprocess import check_call from subprocess import check_call
from subprocess import check_output from subprocess import check_output
from subprocess import CalledProcessError from subprocess import CalledProcessError
from time import sleep
from charms import layer from charms import layer
from charms.layer import snap from charms.layer import snap
@ -323,7 +324,36 @@ def idle_status(kube_api, kube_control):
msg = 'WARN: cannot change service-cidr, still using ' + service_cidr() msg = 'WARN: cannot change service-cidr, still using ' + service_cidr()
hookenv.status_set('active', msg) hookenv.status_set('active', msg)
else: else:
# All services should be up and running at this point. Double-check...
failing_services = master_services_down()
if len(failing_services) == 0:
hookenv.status_set('active', 'Kubernetes master running.') hookenv.status_set('active', 'Kubernetes master running.')
else:
msg = 'Stopped services: {}'.format(','.join(failing_services))
hookenv.status_set('waiting', msg)
def master_services_down():
    """Ensure the master services are running and report any that are not.

    Each service whose snap daemon is not running is started once. If at
    least one start was attempted, a single grace period is observed before
    the services are checked again, instead of sleeping once per restarted
    service (which could block the hook for up to 30 seconds).

    Returns:
        list: names of the services (e.g. 'kube-apiserver') whose daemons
        are still not running after the restart attempt. Empty when all
        services are up.
    """
    services = ['kube-apiserver',
                'kube-controller-manager',
                'kube-scheduler']
    # Build each daemon unit name once; it is needed for both passes.
    daemons = {service: 'snap.{}.daemon'.format(service)
               for service in services}

    restarted = False
    for service in services:
        daemon = daemons[service]
        if not host.service_running(daemon):
            hookenv.log("Service {} was down. Starting it.".format(daemon))
            host.service_start(daemon)
            restarted = True

    if restarted:
        # Give the freshly started daemons time to come up before
        # re-checking; skipped entirely when nothing had to be started.
        sleep(10)

    return [service for service in services
            if not host.service_running(daemons[service])]
@when('etcd.available', 'tls_client.server.certificate.saved', @when('etcd.available', 'tls_client.server.certificate.saved',

View File

@ -22,6 +22,7 @@ from shlex import split
from subprocess import check_call, check_output from subprocess import check_call, check_output
from subprocess import CalledProcessError from subprocess import CalledProcessError
from socket import gethostname from socket import gethostname
from time import sleep
from charms import layer from charms import layer
from charms.layer import snap from charms.layer import snap
@ -257,11 +258,27 @@ def update_kubelet_status():
''' There are different states that the kubelet can be in, where we are ''' There are different states that the kubelet can be in, where we are
waiting for dns, waiting for cluster turnup, or ready to serve waiting for dns, waiting for cluster turnup, or ready to serve
applications.''' applications.'''
if (_systemctl_is_active('snap.kubelet.daemon')): services = [
'kubelet',
'kube-proxy'
]
for service in services:
daemon = 'snap.{}.daemon'.format(service)
if not _systemctl_is_active(daemon):
hookenv.log("Service {} id down. Starting it.".format(daemon))
sleep(10)
failing_services = []
for service in services:
daemon = 'snap.{}.daemon'.format(service)
if not _systemctl_is_active(daemon):
failing_services.append(service)
if len(failing_services) == 0:
hookenv.status_set('active', 'Kubernetes worker running.') hookenv.status_set('active', 'Kubernetes worker running.')
# if kubelet is not running, we're waiting on something else to converge else:
elif (not _systemctl_is_active('snap.kubelet.daemon')): msg = 'Waiting for {} to start.'.format(','.join(failing_services))
hookenv.status_set('waiting', 'Waiting for kubelet to start.') hookenv.status_set('waiting', msg)
@when('certificates.available') @when('certificates.available')