diff --git a/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py b/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
index a51add3d170..bdbcc57754c 100644
--- a/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
+++ b/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
@@ -323,7 +323,28 @@ def idle_status(kube_api, kube_control):
         msg = 'WARN: cannot change service-cidr, still using ' + service_cidr()
         hookenv.status_set('active', msg)
     else:
-        hookenv.status_set('active', 'Kubernetes master running.')
+        # All services should be up and running at this point. Double-check...
+        failing_services = master_services_down()
+        if not failing_services:
+            hookenv.status_set('active', 'Kubernetes master running.')
+        else:
+            msg = 'Stopped services: {}'.format(','.join(failing_services))
+            hookenv.status_set('waiting', msg)
+
+
+def master_services_down():
+    """Check which master services are not running.
+
+    Return: list of failing services"""
+    services = ['kube-apiserver',
+                'kube-controller-manager',
+                'kube-scheduler']
+    failing_services = []
+    for service in services:
+        daemon = 'snap.{}.daemon'.format(service)
+        if not host.service_running(daemon):
+            failing_services.append(service)
+    return failing_services
 
 
 @when('etcd.available', 'tls_client.server.certificate.saved',
diff --git a/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py b/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
index 401e3a70480..57e9fae5bf2 100644
--- a/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
+++ b/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
@@ -257,11 +257,21 @@ def update_kubelet_status():
     ''' There are different states that the kubelet can be in, where we are
     waiting for dns, waiting for cluster turnup, or ready to serve
     applications.'''
-    if (_systemctl_is_active('snap.kubelet.daemon')):
+    services = [
+        'kubelet',
+        'kube-proxy'
+    ]
+    failing_services = []
+    for service in services:
+        daemon = 'snap.{}.daemon'.format(service)
+        if not _systemctl_is_active(daemon):
+            failing_services.append(service)
+
+    if not failing_services:
         hookenv.status_set('active', 'Kubernetes worker running.')
-    # if kubelet is not running, we're waiting on something else to converge
-    elif (not _systemctl_is_active('snap.kubelet.daemon')):
-        hookenv.status_set('waiting', 'Waiting for kubelet to start.')
+    else:
+        msg = 'Waiting for {} to start.'.format(','.join(failing_services))
+        hookenv.status_set('waiting', msg)
 
 
 @when('certificates.available')