From 685dff99ab9c622b3c7a48b665c2f30facdc6b12 Mon Sep 17 00:00:00 2001
From: Konstantinos Tsakalozos
Date: Thu, 20 Jul 2017 14:57:02 +0300
Subject: [PATCH 1/2] Update status to show failing services.

---
 .../reactive/kubernetes_master.py | 32 ++++++++++++++++++-
 .../reactive/kubernetes_worker.py | 25 ++++++++++++---
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py b/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
index 9c3a0b387d2..1780795bff8 100644
--- a/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
+++ b/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
@@ -30,6 +30,7 @@ from shlex import split
 from subprocess import check_call
 from subprocess import check_output
 from subprocess import CalledProcessError
+from time import sleep
 
 from charms import layer
 from charms.layer import snap
@@ -323,7 +324,36 @@ def idle_status(kube_api, kube_control):
         msg = 'WARN: cannot change service-cidr, still using ' + service_cidr()
         hookenv.status_set('active', msg)
     else:
-        hookenv.status_set('active', 'Kubernetes master running.')
+        # All services should be up and running at this point. Double-check...
+        failing_services = master_services_down()
+        if len(failing_services) == 0:
+            hookenv.status_set('active', 'Kubernetes master running.')
+        else:
+            msg = 'Stopped services: {}'.format(','.join(failing_services))
+            hookenv.status_set('waiting', msg)
+
+
+def master_services_down():
+    """Ensure master services are up and running.
+    Try to restart any failing services once.
+
+    Return: list of failing services"""
+    services = ['kube-apiserver',
+                'kube-controller-manager',
+                'kube-scheduler']
+    for service in services:
+        daemon = 'snap.{}.daemon'.format(service)
+        if not host.service_running(daemon):
+            hookenv.log("Service {} was down. Starting it.".format(daemon))
+            host.service_start(daemon)
+            sleep(10)
+
+    failing_services = []
+    for service in services:
+        daemon = 'snap.{}.daemon'.format(service)
+        if not host.service_running(daemon):
+            failing_services.append(service)
+    return failing_services
 
 
 @when('etcd.available', 'tls_client.server.certificate.saved',
diff --git a/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py b/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
index 401e3a70480..282aa70cbc6 100644
--- a/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
+++ b/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
@@ -22,6 +22,7 @@ from shlex import split
 from subprocess import check_call, check_output
 from subprocess import CalledProcessError
 from socket import gethostname
+from time import sleep
 
 from charms import layer
 from charms.layer import snap
@@ -257,11 +258,27 @@ def update_kubelet_status():
     ''' There are different states that the kubelet can be in, where we are
     waiting for dns, waiting for cluster turnup, or ready to serve
     applications.'''
-    if (_systemctl_is_active('snap.kubelet.daemon')):
+    services = [
+        'kubelet',
+        'kube-proxy'
+    ]
+    for service in services:
+        daemon = 'snap.{}.daemon'.format(service)
+        if not _systemctl_is_active(daemon):
+            hookenv.log("Service {} is down. Starting it.".format(daemon))
+            sleep(10)
+
+    failing_services = []
+    for service in services:
+        daemon = 'snap.{}.daemon'.format(service)
+        if not _systemctl_is_active(daemon):
+            failing_services.append(service)
+
+    if len(failing_services) == 0:
         hookenv.status_set('active', 'Kubernetes worker running.')
-    # if kubelet is not running, we're waiting on something else to converge
-    elif (not _systemctl_is_active('snap.kubelet.daemon')):
-        hookenv.status_set('waiting', 'Waiting for kubelet to start.')
+    else:
+        msg = 'Waiting for {} to start.'.format(','.join(failing_services))
+        hookenv.status_set('waiting', msg)
 
 
 @when('certificates.available')

From 53f00722e29bbddbb514c505902c870a76d80e33 Mon Sep 17 00:00:00 2001
From: Konstantinos Tsakalozos
Date: Fri, 21 Jul 2017 12:39:42 +0300
Subject: [PATCH 2/2] Do not try to restart services

---
 .../kubernetes-master/reactive/kubernetes_master.py | 9 ---------
 .../kubernetes-worker/reactive/kubernetes_worker.py | 7 -------
 2 files changed, 16 deletions(-)

diff --git a/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py b/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
index 1780795bff8..8149c9ab288 100644
--- a/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
+++ b/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
@@ -30,7 +30,6 @@ from shlex import split
 from subprocess import check_call
 from subprocess import check_output
 from subprocess import CalledProcessError
-from time import sleep
 
 from charms import layer
 from charms.layer import snap
@@ -335,19 +334,11 @@ def idle_status(kube_api, kube_control):
 
 def master_services_down():
     """Ensure master services are up and running.
-    Try to restart any failing services once.
 
     Return: list of failing services"""
     services = ['kube-apiserver',
                 'kube-controller-manager',
                 'kube-scheduler']
-    for service in services:
-        daemon = 'snap.{}.daemon'.format(service)
-        if not host.service_running(daemon):
-            hookenv.log("Service {} was down. Starting it.".format(daemon))
-            host.service_start(daemon)
-            sleep(10)
-
     failing_services = []
     for service in services:
         daemon = 'snap.{}.daemon'.format(service)
diff --git a/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py b/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
index 282aa70cbc6..57e9fae5bf2 100644
--- a/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
+++ b/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
@@ -22,7 +22,6 @@ from shlex import split
 from subprocess import check_call, check_output
 from subprocess import CalledProcessError
 from socket import gethostname
-from time import sleep
 
 from charms import layer
 from charms.layer import snap
@@ -262,12 +261,6 @@ def update_kubelet_status():
         'kubelet',
         'kube-proxy'
     ]
-    for service in services:
-        daemon = 'snap.{}.daemon'.format(service)
-        if not _systemctl_is_active(daemon):
-            hookenv.log("Service {} is down. Starting it.".format(daemon))
-            sleep(10)
-
     failing_services = []
     for service in services:
         daemon = 'snap.{}.daemon'.format(service)