Merge pull request #61542 from juju-solutions/feature/new-gpu

Automatic merge from submit-queue (batch tested with PRs 61096, 61955, 61542, 60597). If you want to cherry-pick this change to another branch, please follow the cherry-pick instructions: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md

Work on master and worker to accommodate the new kind of gpu support

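**What this PR does / why we need it**: This PR updates the kubernetes-master and kubernetes-worker Juju charms to support the new NVIDIA GPU model, in which GPUs are exposed through the nvidia device plugin on Kubernetes 1.9 and later.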

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes: https://github.com/juju-solutions/bundle-canonical-kubernetes/issues/459

**Special notes for your reviewer**: This PR should go in with https://github.com/juju-solutions/layer-docker/pull/118

**Release note**:

```release-note
NONE
```
This commit is contained in:
Kubernetes Submit Queue 2018-04-02 15:35:09 -07:00 committed by GitHub
commit 7d1146cccb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 60 additions and 31 deletions

View File

@ -31,6 +31,14 @@ options:
privileged mode. If "auto", kube-apiserver will not run in privileged
mode by default, but will switch to privileged mode if gpu hardware is
detected on a worker node.
enable-nvidia-plugin:
type: string
default: "auto"
description: |
Load the nvidia device plugin daemonset. Supported values are
"auto" and "false". When "auto", the daemonset will be loaded
only if GPUs are detected. When "false" the nvidia device plugin
will not be loaded.
channel:
type: string
default: "1.9/stable"

View File

@ -638,6 +638,10 @@ def kick_api_server(tls):
def configure_cdk_addons():
''' Configure CDK addons '''
remove_state('cdk-addons.configured')
load_gpu_plugin = hookenv.config('enable-nvidia-plugin').lower()
gpuEnable = (get_version('kube-apiserver') >= (1, 9) and
load_gpu_plugin == "auto" and
is_state('kubernetes-master.gpu.enabled'))
dbEnabled = str(hookenv.config('enable-dashboard-addons')).lower()
dnsEnabled = str(hookenv.config('enable-kube-dns')).lower()
metricsEnabled = str(hookenv.config('enable-metrics')).lower()
@ -647,7 +651,8 @@ def configure_cdk_addons():
'dns-domain=' + hookenv.config('dns_domain'),
'enable-dashboard=' + dbEnabled,
'enable-kube-dns=' + dnsEnabled,
'enable-metrics=' + metricsEnabled
'enable-metrics=' + metricsEnabled,
'enable-gpu=' + str(gpuEnable).lower()
]
check_call(['snap', 'set', 'cdk-addons'] + args)
if not addons_ready():
@ -887,8 +892,10 @@ def on_gpu_available(kube_control):
We need to run in privileged mode.
"""
kube_version = get_version('kube-apiserver')
config = hookenv.config()
if config['allow-privileged'].lower() == "false":
if (config['allow-privileged'].lower() == "false" and
kube_version < (1, 9)):
hookenv.status_set(
'active',
'GPUs available. Set allow-privileged="auto" to enable.'
@ -900,11 +907,25 @@ def on_gpu_available(kube_control):
@when('kubernetes-master.gpu.enabled')
@when('kubernetes-master.components.started')
@when_not('kubernetes-master.privileged')
def disable_gpu_mode():
def gpu_with_no_privileged():
"""We were in gpu mode, but the operator has set allow-privileged="false",
so we can't run in gpu mode anymore.
"""
if get_version('kube-apiserver') < (1, 9):
remove_state('kubernetes-master.gpu.enabled')
@when('kube-control.connected')
@when_not('kube-control.gpu.available')
@when('kubernetes-master.gpu.enabled')
@when('kubernetes-master.components.started')
def gpu_departed(kube_control):
"""Leave gpu mode on the master when the workers no longer report GPUs.

Gated by the decorators above: 'kube-control.connected',
'kubernetes-master.gpu.enabled' and 'kubernetes-master.components.started'
must be set, and 'kube-control.gpu.available' must not be set.

The only action is to remove the 'kubernetes-master.gpu.enabled' state.
configure_cdk_addons() reads that state when computing the 'enable-gpu'
setting passed to the cdk-addons snap.

:param kube_control: not used by the body. Presumably the kube-control
    relation object handed in for the 'kube-control.connected' state --
    TODO confirm against the reactive framework.
"""
remove_state('kubernetes-master.gpu.enabled')
@ -1185,7 +1206,7 @@ def configure_apiserver(etcd_connection_string, leader_etcd_version):
else:
api_opts['admission-control'] = ','.join(admission_control)
if get_version('kube-apiserver') > (1, 6) and \
if kube_version > (1, 6) and \
hookenv.config('enable-metrics'):
api_opts['requestheader-client-ca-file'] = ca_cert_path
api_opts['requestheader-allowed-names'] = 'client'

View File

@ -7,7 +7,6 @@ includes:
- 'layer:metrics'
- 'layer:nagios'
- 'layer:tls-client'
- 'layer:nvidia-cuda'
- 'layer:cdk-service-kicker'
- 'interface:http'
- 'interface:kubernetes-cni'

View File

@ -70,6 +70,7 @@ def upgrade_charm():
# Remove gpu.enabled state so we can reconfigure gpu-related kubelet flags,
# since they can differ between k8s versions
remove_state('kubernetes-worker.gpu.enabled')
disable_gpu()
remove_state('kubernetes-worker.cni-plugins.installed')
remove_state('kubernetes-worker.config.created')
@ -629,12 +630,10 @@ def configure_kubelet(dns, ingress_ip):
kubelet_opts['allow-privileged'] = 'true' if privileged else 'false'
if is_state('kubernetes-worker.gpu.enabled'):
if get_version('kubelet') < (1, 6):
hookenv.log('Adding --experimental-nvidia-gpus=1 to kubelet')
kubelet_opts['experimental-nvidia-gpus'] = '1'
else:
hookenv.log('Adding --feature-gates=Accelerators=true to kubelet')
kubelet_opts['feature-gates'] = 'Accelerators=true'
hookenv.log('Adding '
'--feature-gates=Accelerators=true,DevicePlugins=true '
'to kubelet')
kubelet_opts['feature-gates'] = 'Accelerators=true,DevicePlugins=true'
configure_kubernetes_service('kubelet', kubelet_opts, 'kubelet-extra-args')
@ -870,14 +869,17 @@ def set_privileged():
"""
privileged = hookenv.config('allow-privileged').lower()
if privileged == 'auto':
gpu_enabled = is_state('kubernetes-worker.gpu.enabled')
privileged = 'true' if gpu_enabled else 'false'
gpu_needs_privileged = (is_state('kubernetes-worker.gpu.enabled') and
get_version('kubelet') < (1, 9))
if privileged == 'true':
set_state('kubernetes-worker.privileged')
else:
remove_state('kubernetes-worker.privileged')
if privileged == 'auto':
privileged = 'true' if gpu_needs_privileged else 'false'
if privileged == 'false' and gpu_needs_privileged:
disable_gpu()
remove_state('kubernetes-worker.gpu.enabled')
# No need to restart kubernetes (set the restart-needed state)
# because set-privileged is already in the restart path
@when('config.changed.allow-privileged')
@ -890,18 +892,17 @@ def on_config_allow_privileged_change():
remove_state('config.changed.allow-privileged')
@when('cuda.installed')
@when('nvidia-docker.installed')
@when('kubernetes-worker.config.created')
@when_not('kubernetes-worker.gpu.enabled')
def enable_gpu():
"""Enable GPU usage on this node.
"""
config = hookenv.config()
if config['allow-privileged'] == "false":
if get_version('kubelet') < (1, 9):
hookenv.status_set(
'active',
'GPUs available. Set allow-privileged="auto" to enable.'
'Upgrade to snap channel >= 1.9/stable to enable GPU suppport.'
)
return
@ -916,7 +917,6 @@ def enable_gpu():
hookenv.log(cpe)
return
# Apply node labels
set_label('gpu', 'true')
set_label('cuda', 'true')
@ -925,15 +925,19 @@ def enable_gpu():
@when('kubernetes-worker.gpu.enabled')
@when_not('kubernetes-worker.privileged')
@when_not('nvidia-docker.installed')
@when_not('kubernetes-worker.restart-needed')
def nvidia_departed():
"""Cuda departed, probably due to the docker layer switching to a
non nvidia-docker."""
disable_gpu()
remove_state('kubernetes-worker.gpu.enabled')
set_state('kubernetes-worker.restart-needed')
def disable_gpu():
"""Disable GPU usage on this node.
This handler fires when we're running in gpu mode, and then the operator
sets allow-privileged="false". Since we can no longer run privileged
containers, we need to disable gpu mode.
"""
hookenv.log('Disabling gpu mode')
@ -941,9 +945,6 @@ def disable_gpu():
remove_label('gpu')
remove_label('cuda')
remove_state('kubernetes-worker.gpu.enabled')
set_state('kubernetes-worker.restart-needed')
@when('kubernetes-worker.gpu.enabled')
@when('kube-control.connected')