diff --git a/cluster/juju/layers/kubernetes-master/layer.yaml b/cluster/juju/layers/kubernetes-master/layer.yaml
index 9a53451dce5..247f8c20720 100644
--- a/cluster/juju/layers/kubernetes-master/layer.yaml
+++ b/cluster/juju/layers/kubernetes-master/layer.yaml
@@ -1,7 +1,6 @@
 repo: https://github.com/kubernetes/kubernetes.git
 includes:
   - 'layer:basic'
-  - 'layer:status'
   - 'layer:snap'
   - 'layer:tls-client'
   - 'layer:leadership'
@@ -17,6 +16,7 @@ includes:
   - 'interface:kube-control'
   - 'interface:public-address'
   - 'interface:aws'
+  - 'interface:gcp'
 options:
   basic:
     packages:
diff --git a/cluster/juju/layers/kubernetes-master/metadata.yaml b/cluster/juju/layers/kubernetes-master/metadata.yaml
index dfba03c99aa..52aab0afa4d 100644
--- a/cluster/juju/layers/kubernetes-master/metadata.yaml
+++ b/cluster/juju/layers/kubernetes-master/metadata.yaml
@@ -42,6 +42,8 @@ requires:
     interface: ceph-admin
   aws:
     interface: aws
+  gcp:
+    interface: gcp
 resources:
   kubectl:
     type: file
diff --git a/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py b/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
index 36207665d1a..f866471f934 100644
--- a/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
+++ b/cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py
@@ -28,10 +28,12 @@ from charms.leadership import leader_get, leader_set
 
 from shutil import move
 
+from pathlib import Path
 from shlex import split
 from subprocess import check_call
 from subprocess import check_output
 from subprocess import CalledProcessError
+from urllib.request import Request, urlopen
 
 from charms import layer
 from charms.layer import snap
@@ -40,7 +42,7 @@ from charms.reactive import remove_state
 from charms.reactive import set_state
 from charms.reactive import is_state
 from charms.reactive import endpoint_from_flag
-from charms.reactive import when, when_any, when_not
+from charms.reactive import when, when_any, when_not, when_none
 from charms.reactive.helpers import data_changed, any_file_changed
 from charms.kubernetes.common import get_version
 from charms.kubernetes.common import retry
@@ -61,6 +63,8 @@ from charmhelpers.contrib.charmsupport import nrpe
 # default regex in charmhelpers doesn't allow periods, but nagios itself does.
 nrpe.Check.shortname_re = '[\.A-Za-z0-9-_]+$'
 
+gcp_creds_env_key = 'GOOGLE_APPLICATION_CREDENTIALS'
+
 os.environ['PATH'] += os.pathsep + os.path.join(os.sep, 'snap', 'bin')
 
 
@@ -427,6 +431,12 @@ def set_final_status():
         hookenv.status_set('waiting', 'Waiting to retry addon deployment')
         return
 
+    cloud_ready = (is_state('endpoint.aws.ready') or
+                   is_state('endpoint.gcp.ready'))
+    if is_state('kubernetes-master.cloud-request-sent') and not cloud_ready:
+        hookenv.status_set('waiting', 'waiting for cloud integration')
+        return
+
     if addons_configured and not all_kube_system_pods_running():
         hookenv.status_set('waiting', 'Waiting for kube-system pods to start')
         return
@@ -1227,6 +1237,10 @@ def configure_apiserver(etcd_connection_string, leader_etcd_version):
 
     if is_state('endpoint.aws.ready'):
         api_opts['cloud-provider'] = 'aws'
+    elif is_state('endpoint.gcp.ready'):
+        cloud_config_path = _cloud_config_path('kube-apiserver')
+        api_opts['cloud-provider'] = 'gce'
+        api_opts['cloud-config'] = str(cloud_config_path)
 
     configure_kubernetes_service('kube-apiserver', api_opts, 'api-extra-args')
     restart_apiserver()
@@ -1251,6 +1265,10 @@ def configure_controller_manager():
 
     if is_state('endpoint.aws.ready'):
         controller_opts['cloud-provider'] = 'aws'
+    elif is_state('endpoint.gcp.ready'):
+        cloud_config_path = _cloud_config_path('kube-controller-manager')
+        controller_opts['cloud-provider'] = 'gce'
+        controller_opts['cloud-config'] = str(cloud_config_path)
 
     configure_kubernetes_service('kube-controller-manager', controller_opts,
                                  'controller-manager-extra-args')
@@ -1351,19 +1369,88 @@ def all_kube_system_pods_running():
 
     try:
         output = check_output(cmd).decode('utf-8')
+        result = json.loads(output)
     except CalledProcessError:
         hookenv.log('failed to get kube-system pod status')
         return False
+    hookenv.log('Checking system pods status: {}'.format(', '.join(
+        '='.join([pod['metadata']['name'], pod['status']['phase']])
+        for pod in result['items'])))
 
-    result = json.loads(output)
-    for pod in result['items']:
-        status = pod['status']['phase']
-        # Evicted nodes should re-spawn
-        if status != 'Running' and \
-           pod['status'].get('reason', '') != 'Evicted':
-            return False
+    all_pending = all(pod['status']['phase'] == 'Pending'
+                      for pod in result['items'])
+    if is_state('endpoint.gcp.ready') and all_pending:
+        poke_network_unavailable()
+        return False
 
-    return True
+    # All pods must be Running or Evicted (which should re-spawn)
+    all_running = all(pod['status']['phase'] == 'Running' or
+                      pod['status'].get('reason', '') == 'Evicted'
+                      for pod in result['items'])
+    return all_running
+
+
+def poke_network_unavailable():
+    """
+    Work around https://github.com/kubernetes/kubernetes/issues/44254 by
+    manually poking the status into the API server to tell the nodes they have
+    a network route.
+
+    This is needed because kubelet sets the NetworkUnavailable flag and expects
+    the network plugin to clear it, which only kubenet does. There is some
+    discussion about refactoring the affected code but nothing has happened
+    in a while.
+    """
+    cmd = ['kubectl', 'get', 'nodes', '-o', 'json']
+
+    try:
+        output = check_output(cmd).decode('utf-8')
+        nodes = json.loads(output)['items']
+    except CalledProcessError:
+        hookenv.log('failed to get kube-system nodes')
+        return
+    except (KeyError, json.JSONDecodeError) as e:
+        hookenv.log('failed to parse kube-system node status '
+                    '({}): {}'.format(e, output), hookenv.ERROR)
+        return
+
+    for node in nodes:
+        node_name = node['metadata']['name']
+        url = 'http://localhost:8080/api/v1/nodes/{}/status'.format(node_name)
+        with urlopen(url) as response:
+            code = response.getcode()
+            body = response.read().decode('utf8')
+        if code != 200:
+            hookenv.log('failed to get node status from {} [{}]: {}'.format(
+                url, code, body), hookenv.ERROR)
+            return
+        try:
+            node_info = json.loads(body)
+            conditions = node_info['status']['conditions']
+            i = [c['type'] for c in conditions].index('NetworkUnavailable')
+            if conditions[i]['status'] == 'True':
+                hookenv.log('Clearing NetworkUnavailable from {}'.format(
+                    node_name))
+                conditions[i] = {
+                    "type": "NetworkUnavailable",
+                    "status": "False",
+                    "reason": "RouteCreated",
+                    "message": "Manually set through k8s api",
+                }
+                req = Request(url, method='PUT',
+                              data=json.dumps(node_info).encode('utf8'),
+                              headers={'Content-Type': 'application/json'})
+                with urlopen(req) as response:
+                    code = response.getcode()
+                    body = response.read().decode('utf8')
+                if code not in (200, 201, 202):
+                    hookenv.log('failed to update node status [{}]: {}'.format(
+                        code, body), hookenv.ERROR)
+                    return
+        except (KeyError, ValueError):  # bad JSON or condition missing
+            hookenv.log('failed to parse node status: {}'.format(body),
+                        hookenv.ERROR)
+            return
 
 
 def apiserverVersion():
@@ -1389,7 +1476,7 @@ def getStorageBackend():
 @when('leadership.is_leader')
 @when_not('leadership.set.cluster_tag')
 def create_cluster_tag():
-    cluster_tag = 'kubernetes-{}'.format(token_generator())
+    cluster_tag = 'kubernetes-{}'.format(token_generator().lower())
     leader_set(cluster_tag=cluster_tag)
 
 
@@ -1408,37 +1495,100 @@ def clear_cluster_tag_sent():
     remove_state('kubernetes-master.cluster-tag-sent')
 
 
-@when('endpoint.aws.joined',
-      'leadership.set.cluster_tag')
-@when_not('kubernetes-master.aws-request-sent')
+@when_any('endpoint.aws.joined',
+          'endpoint.gcp.joined')
+@when('leadership.set.cluster_tag')
+@when_not('kubernetes-master.cloud-request-sent')
 def request_integration():
-    hookenv.status_set('maintenance', 'requesting aws integration')
-    aws = endpoint_from_flag('endpoint.aws.joined')
+    hookenv.status_set('maintenance', 'requesting cloud integration')
     cluster_tag = leader_get('cluster_tag')
-    aws.tag_instance({
-        'KubernetesCluster': cluster_tag,
-        'k8s.io/role/master': 'true',
-    })
-    aws.tag_instance_security_group({
-        'KubernetesCluster': cluster_tag,
-    })
-    aws.enable_instance_inspection()
-    aws.enable_network_management()
-    aws.enable_dns_management()
-    aws.enable_load_balancer_management()
-    aws.enable_block_storage_management()
-    aws.enable_object_storage_management(['kubernetes-*'])
-    set_state('kubernetes-master.aws-request-sent')
-    hookenv.status_set('waiting', 'waiting for aws integration')
+    if is_state('endpoint.aws.joined'):
+        cloud = endpoint_from_flag('endpoint.aws.joined')
+        cloud.tag_instance({
+            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
+            'k8s.io/role/master': 'true',
+        })
+        cloud.tag_instance_security_group({
+            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
+        })
+        cloud.tag_instance_subnet({
+            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
+        })
+        cloud.enable_object_storage_management(['kubernetes-*'])
+        cloud.enable_load_balancer_management()
+    elif is_state('endpoint.gcp.joined'):
+        cloud = endpoint_from_flag('endpoint.gcp.joined')
+        cloud.label_instance({
+            'k8s-io-cluster-name': cluster_tag,
+            'k8s-io-role-master': 'master',
+        })
+        cloud.enable_object_storage_management()
+        cloud.enable_security_management()
+    cloud.enable_instance_inspection()
+    cloud.enable_network_management()
+    cloud.enable_dns_management()
+    cloud.enable_block_storage_management()
+    set_state('kubernetes-master.cloud-request-sent')
 
 
-@when_not('endpoint.aws.joined')
+@when_none('endpoint.aws.joined',
+           'endpoint.gcp.joined')
+@when('kubernetes-master.cloud-request-sent')
 def clear_requested_integration():
-    remove_state('kubernetes-master.aws-request-sent')
+    remove_state('kubernetes-master.cloud-request-sent')
 
 
-@when('endpoint.aws.ready')
-@when_not('kubernetes-master.restarted-for-aws')
-def restart_for_aws():
-    set_state('kubernetes-master.restarted-for-aws')
+@when_any('endpoint.aws.ready',
+          'endpoint.gcp.ready')
+@when_not('kubernetes-master.restarted-for-cloud')
+def restart_for_cloud():
+    if is_state('endpoint.gcp.ready'):
+        _write_gcp_snap_config('kube-apiserver')
+        _write_gcp_snap_config('kube-controller-manager')
+    set_state('kubernetes-master.restarted-for-cloud')
     remove_state('kubernetes-master.components.started')  # force restart
+
+
+def _snap_common_path(component):
+    return Path('/var/snap/{}/common'.format(component))
+
+
+def _cloud_config_path(component):
+    return _snap_common_path(component) / 'cloud-config.conf'
+
+
+def _gcp_creds_path(component):
+    return _snap_common_path(component) / 'gcp-creds.json'
+
+
+def _daemon_env_path(component):
+    return _snap_common_path(component) / 'environment'
+
+
+def _write_gcp_snap_config(component):
+    # gcp requires additional credentials setup
+    gcp = endpoint_from_flag('endpoint.gcp.ready')
+    creds_path = _gcp_creds_path(component)
+    creds_path.parent.mkdir(parents=True, exist_ok=True)  # before any write
+    with creds_path.open('w') as fp:
+        os.fchmod(fp.fileno(), 0o600)
+        fp.write(gcp.credentials)
+
+    # create a cloud-config file that sets token-url to nil to make the
+    # services use the creds env var instead of the metadata server, as
+    # well as making the cluster multizone
+    cloud_config_path = _cloud_config_path(component)
+    cloud_config_path.write_text('[Global]\n'
+                                 'token-url = nil\n'
+                                 'multizone = true\n')
+
+    daemon_env_path = _daemon_env_path(component)
+    if daemon_env_path.exists():
+        daemon_env = daemon_env_path.read_text()
+        if not daemon_env.endswith('\n'):
+            daemon_env += '\n'
+    else:
+        daemon_env = ''
+    if gcp_creds_env_key not in daemon_env:
+        daemon_env += '{}={}\n'.format(gcp_creds_env_key, creds_path)
+        daemon_env_path.write_text(daemon_env)
diff --git a/cluster/juju/layers/kubernetes-worker/layer.yaml b/cluster/juju/layers/kubernetes-worker/layer.yaml
index 26aaa4d40ae..79fbdf117a3 100644
--- a/cluster/juju/layers/kubernetes-worker/layer.yaml
+++ b/cluster/juju/layers/kubernetes-worker/layer.yaml
@@ -1,7 +1,6 @@
 repo: https://github.com/kubernetes/kubernetes.git
 includes:
   - 'layer:basic'
-  - 'layer:status'
   - 'layer:debug'
   - 'layer:snap'
   - 'layer:docker'
@@ -14,6 +13,7 @@ includes:
   - 'interface:kube-dns'
   - 'interface:kube-control'
   - 'interface:aws'
+  - 'interface:gcp'
 config:
   deletes:
     - install_from_upstream
diff --git a/cluster/juju/layers/kubernetes-worker/metadata.yaml b/cluster/juju/layers/kubernetes-worker/metadata.yaml
index cc09efb8a4d..187106ce608 100644
--- a/cluster/juju/layers/kubernetes-worker/metadata.yaml
+++ b/cluster/juju/layers/kubernetes-worker/metadata.yaml
@@ -30,6 +30,8 @@ requires:
     interface: kube-control
   aws:
     interface: aws
+  gcp:
+    interface: gcp
 provides:
   cni:
     interface: kubernetes-cni
diff --git a/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py b/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
index 6540b6250bc..798c2ad7bce 100644
--- a/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
+++ b/cluster/juju/layers/kubernetes-worker/reactive/kubernetes_worker.py
@@ -21,6 +21,7 @@ import shutil
 import subprocess
 import time
 
+from pathlib import Path
 from shlex import split
 from subprocess import check_call, check_output
 from subprocess import CalledProcessError
@@ -31,7 +32,7 @@ from charms.layer import snap
 from charms.reactive import hook
 from charms.reactive import endpoint_from_flag
 from charms.reactive import set_state, remove_state, is_state
-from charms.reactive import when, when_any, when_not
+from charms.reactive import when, when_any, when_not, when_none
 
 from charms.kubernetes.common import get_version
 
@@ -50,6 +51,7 @@ nrpe.Check.shortname_re = '[\.A-Za-z0-9-_]+$'
 kubeconfig_path = '/root/cdk/kubeconfig'
 kubeproxyconfig_path = '/root/cdk/kubeproxyconfig'
 kubeclientconfig_path = '/root/.kube/config'
+gcp_creds_env_key = 'GOOGLE_APPLICATION_CREDENTIALS'
 
 os.environ['PATH'] += os.pathsep + os.path.join(os.sep, 'snap', 'bin')
 db = unitdata.kv()
@@ -626,6 +628,10 @@ def configure_kubelet(dns, ingress_ip):
 
     if is_state('endpoint.aws.ready'):
         kubelet_opts['cloud-provider'] = 'aws'
+    elif is_state('endpoint.gcp.ready'):
+        cloud_config_path = _cloud_config_path('kubelet')
+        kubelet_opts['cloud-provider'] = 'gce'
+        kubelet_opts['cloud-config'] = str(cloud_config_path)
 
     configure_kubernetes_service('kubelet', kubelet_opts, 'kubelet-extra-args')
 
@@ -1031,6 +1037,10 @@ def _systemctl_is_active(application):
 def get_node_name():
     kubelet_extra_args = parse_extra_args('kubelet-extra-args')
     cloud_provider = kubelet_extra_args.get('cloud-provider', '')
+    if is_state('endpoint.aws.ready'):
+        cloud_provider = 'aws'
+    elif is_state('endpoint.gcp.ready'):
+        cloud_provider = 'gce'
     if cloud_provider == 'aws':
         return getfqdn()
     else:
@@ -1073,37 +1083,95 @@ def remove_label(label):
         raise ApplyNodeLabelFailed(retry)
 
 
-@when('endpoint.aws.joined',
-      'kube-control.cluster_tag.available')
-@when_not('kubernetes-worker.aws-request-sent')
+@when_any('endpoint.aws.joined',
+          'endpoint.gcp.joined')
+@when('kube-control.cluster_tag.available')
+@when_not('kubernetes-worker.cloud-request-sent')
 def request_integration():
+    hookenv.status_set('maintenance', 'requesting cloud integration')
     kube_control = endpoint_from_flag('kube-control.cluster_tag.available')
-    hookenv.status_set('maintenance', 'requesting aws integration')
-    aws = endpoint_from_flag('endpoint.aws.joined')
     cluster_tag = kube_control.get_cluster_tag()
-    aws.tag_instance({
-        'KubernetesCluster': cluster_tag,
-    })
-    aws.tag_instance_security_group({
-        'KubernetesCluster': cluster_tag,
-    })
-    aws.tag_instance_subnet({
-        'KubernetesCluster': cluster_tag,
-    })
-    aws.enable_instance_inspection()
-    aws.enable_dns_management()
-    aws.enable_object_storage_management(['kubernetes-*'])
-    set_state('kubernetes-worker.aws-request-sent')
-    hookenv.status_set('waiting', 'waiting for aws integration')
+    if is_state('endpoint.aws.joined'):
+        cloud = endpoint_from_flag('endpoint.aws.joined')
+        cloud.tag_instance({
+            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
+        })
+        cloud.tag_instance_security_group({
+            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
+        })
+        cloud.tag_instance_subnet({
+            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
+        })
+        cloud.enable_object_storage_management(['kubernetes-*'])
+    elif is_state('endpoint.gcp.joined'):
+        cloud = endpoint_from_flag('endpoint.gcp.joined')
+        cloud.label_instance({
+            'k8s-io-cluster-name': cluster_tag,
+        })
+        cloud.enable_object_storage_management()
+    cloud.enable_instance_inspection()
+    cloud.enable_dns_management()
+    set_state('kubernetes-worker.cloud-request-sent')
+    hookenv.status_set('waiting', 'waiting for cloud integration')
 
 
-@when_not('endpoint.aws.joined')
+@when_none('endpoint.aws.joined',
+           'endpoint.gcp.joined')
+@when('kubernetes-worker.cloud-request-sent')
 def clear_requested_integration():
-    remove_state('kubernetes-worker.aws-request-sent')
+    remove_state('kubernetes-worker.cloud-request-sent')
 
 
-@when('endpoint.aws.ready')
-@when_not('kubernetes-worker.restarted-for-aws')
-def restart_for_aws():
-    set_state('kubernetes-worker.restarted-for-aws')
+@when_any('endpoint.aws.ready',
+          'endpoint.gcp.ready')
+@when_not('kubernetes-worker.restarted-for-cloud')
+def restart_for_cloud():
+    if is_state('endpoint.gcp.ready'):
+        _write_gcp_snap_config('kubelet')
+    set_state('kubernetes-worker.restarted-for-cloud')
     set_state('kubernetes-worker.restart-needed')
+
+
+def _snap_common_path(component):
+    return Path('/var/snap/{}/common'.format(component))
+
+
+def _cloud_config_path(component):
+    return _snap_common_path(component) / 'cloud-config.conf'
+
+
+def _gcp_creds_path(component):
+    return _snap_common_path(component) / 'gcp-creds.json'
+
+
+def _daemon_env_path(component):
+    return _snap_common_path(component) / 'environment'
+
+
+def _write_gcp_snap_config(component):
+    # gcp requires additional credentials setup
+    gcp = endpoint_from_flag('endpoint.gcp.ready')
+    creds_path = _gcp_creds_path(component)
+    creds_path.parent.mkdir(parents=True, exist_ok=True)  # before any write
+    with creds_path.open('w') as fp:
+        os.fchmod(fp.fileno(), 0o600)
+        fp.write(gcp.credentials)
+
+    # create a cloud-config file that sets token-url to nil to make the
+    # services use the creds env var instead of the metadata server, as
+    # well as making the cluster multizone
+    cloud_config_path = _cloud_config_path(component)
+    cloud_config_path.write_text('[Global]\n'
+                                 'token-url = nil\n'
+                                 'multizone = true\n')
+
+    daemon_env_path = _daemon_env_path(component)
+    if daemon_env_path.exists():
+        daemon_env = daemon_env_path.read_text()
+        if not daemon_env.endswith('\n'):
+            daemon_env += '\n'
+    else:
+        daemon_env = ''
+    if gcp_creds_env_key not in daemon_env:
+        daemon_env += '{}={}\n'.format(gcp_creds_env_key, creds_path)
+        daemon_env_path.write_text(daemon_env)