mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-21 10:51:29 +00:00
Merge pull request #61542 from juju-solutions/feature/new-gpu
Automatic merge from submit-queue (batch tested with PRs 61096, 61955, 61542, 60597). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

Work on master and worker to accommodate the new kind of gpu support

**What this PR does / why we need it**: This PR adds support for the new kind of GPU/nvidia in Juju charms.

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*: Fixes: https://github.com/juju-solutions/bundle-canonical-kubernetes/issues/459

**Special notes for your reviewer**: This PR should go in with https://github.com/juju-solutions/layer-docker/pull/118

**Release note**:
```release-note
NONE
```
This commit is contained in:
commit
7d1146cccb
@ -31,6 +31,14 @@ options:
|
||||
privileged mode. If "auto", kube-apiserver will not run in privileged
|
||||
mode by default, but will switch to privileged mode if gpu hardware is
|
||||
detected on a worker node.
|
||||
enable-nvidia-plugin:
|
||||
type: string
|
||||
default: "auto"
|
||||
description: |
|
||||
Load the nvidia device plugin daemonset. Supported values are
|
||||
"auto" and "false". When "auto", the daemonset will be loaded
|
||||
only if GPUs are detected. When "false" the nvidia device plugin
|
||||
will not be loaded.
|
||||
channel:
|
||||
type: string
|
||||
default: "1.9/stable"
|
||||
|
@ -638,6 +638,10 @@ def kick_api_server(tls):
|
||||
def configure_cdk_addons():
|
||||
''' Configure CDK addons '''
|
||||
remove_state('cdk-addons.configured')
|
||||
load_gpu_plugin = hookenv.config('enable-nvidia-plugin').lower()
|
||||
gpuEnable = (get_version('kube-apiserver') >= (1, 9) and
|
||||
load_gpu_plugin == "auto" and
|
||||
is_state('kubernetes-master.gpu.enabled'))
|
||||
dbEnabled = str(hookenv.config('enable-dashboard-addons')).lower()
|
||||
dnsEnabled = str(hookenv.config('enable-kube-dns')).lower()
|
||||
metricsEnabled = str(hookenv.config('enable-metrics')).lower()
|
||||
@ -647,7 +651,8 @@ def configure_cdk_addons():
|
||||
'dns-domain=' + hookenv.config('dns_domain'),
|
||||
'enable-dashboard=' + dbEnabled,
|
||||
'enable-kube-dns=' + dnsEnabled,
|
||||
'enable-metrics=' + metricsEnabled
|
||||
'enable-metrics=' + metricsEnabled,
|
||||
'enable-gpu=' + str(gpuEnable).lower()
|
||||
]
|
||||
check_call(['snap', 'set', 'cdk-addons'] + args)
|
||||
if not addons_ready():
|
||||
@ -887,8 +892,10 @@ def on_gpu_available(kube_control):
|
||||
We need to run in privileged mode.
|
||||
|
||||
"""
|
||||
kube_version = get_version('kube-apiserver')
|
||||
config = hookenv.config()
|
||||
if config['allow-privileged'].lower() == "false":
|
||||
if (config['allow-privileged'].lower() == "false" and
|
||||
kube_version < (1, 9)):
|
||||
hookenv.status_set(
|
||||
'active',
|
||||
'GPUs available. Set allow-privileged="auto" to enable.'
|
||||
@ -900,11 +907,25 @@ def on_gpu_available(kube_control):
|
||||
|
||||
|
||||
@when('kubernetes-master.gpu.enabled')
|
||||
@when('kubernetes-master.components.started')
|
||||
@when_not('kubernetes-master.privileged')
|
||||
def disable_gpu_mode():
|
||||
def gpu_with_no_privileged():
|
||||
"""We were in gpu mode, but the operator has set allow-privileged="false",
|
||||
so we can't run in gpu mode anymore.
|
||||
|
||||
"""
|
||||
if get_version('kube-apiserver') < (1, 9):
|
||||
remove_state('kubernetes-master.gpu.enabled')
|
||||
|
||||
|
||||
@when('kube-control.connected')
|
||||
@when_not('kube-control.gpu.available')
|
||||
@when('kubernetes-master.gpu.enabled')
|
||||
@when('kubernetes-master.components.started')
|
||||
def gpu_departed(kube_control):
|
||||
"""We were in gpu mode, but the workers informed us there is
|
||||
no gpu support anymore.
|
||||
|
||||
"""
|
||||
remove_state('kubernetes-master.gpu.enabled')
|
||||
|
||||
@ -1185,7 +1206,7 @@ def configure_apiserver(etcd_connection_string, leader_etcd_version):
|
||||
else:
|
||||
api_opts['admission-control'] = ','.join(admission_control)
|
||||
|
||||
if get_version('kube-apiserver') > (1, 6) and \
|
||||
if kube_version > (1, 6) and \
|
||||
hookenv.config('enable-metrics'):
|
||||
api_opts['requestheader-client-ca-file'] = ca_cert_path
|
||||
api_opts['requestheader-allowed-names'] = 'client'
|
||||
|
@ -7,7 +7,6 @@ includes:
|
||||
- 'layer:metrics'
|
||||
- 'layer:nagios'
|
||||
- 'layer:tls-client'
|
||||
- 'layer:nvidia-cuda'
|
||||
- 'layer:cdk-service-kicker'
|
||||
- 'interface:http'
|
||||
- 'interface:kubernetes-cni'
|
||||
|
@ -70,6 +70,7 @@ def upgrade_charm():
|
||||
# Remove gpu.enabled state so we can reconfigure gpu-related kubelet flags,
|
||||
# since they can differ between k8s versions
|
||||
remove_state('kubernetes-worker.gpu.enabled')
|
||||
disable_gpu()
|
||||
|
||||
remove_state('kubernetes-worker.cni-plugins.installed')
|
||||
remove_state('kubernetes-worker.config.created')
|
||||
@ -629,12 +630,10 @@ def configure_kubelet(dns, ingress_ip):
|
||||
kubelet_opts['allow-privileged'] = 'true' if privileged else 'false'
|
||||
|
||||
if is_state('kubernetes-worker.gpu.enabled'):
|
||||
if get_version('kubelet') < (1, 6):
|
||||
hookenv.log('Adding --experimental-nvidia-gpus=1 to kubelet')
|
||||
kubelet_opts['experimental-nvidia-gpus'] = '1'
|
||||
else:
|
||||
hookenv.log('Adding --feature-gates=Accelerators=true to kubelet')
|
||||
kubelet_opts['feature-gates'] = 'Accelerators=true'
|
||||
hookenv.log('Adding '
|
||||
'--feature-gates=Accelerators=true,DevicePlugins=true '
|
||||
'to kubelet')
|
||||
kubelet_opts['feature-gates'] = 'Accelerators=true,DevicePlugins=true'
|
||||
|
||||
configure_kubernetes_service('kubelet', kubelet_opts, 'kubelet-extra-args')
|
||||
|
||||
@ -870,14 +869,17 @@ def set_privileged():
|
||||
|
||||
"""
|
||||
privileged = hookenv.config('allow-privileged').lower()
|
||||
if privileged == 'auto':
|
||||
gpu_enabled = is_state('kubernetes-worker.gpu.enabled')
|
||||
privileged = 'true' if gpu_enabled else 'false'
|
||||
gpu_needs_privileged = (is_state('kubernetes-worker.gpu.enabled') and
|
||||
get_version('kubelet') < (1, 9))
|
||||
|
||||
if privileged == 'true':
|
||||
set_state('kubernetes-worker.privileged')
|
||||
else:
|
||||
remove_state('kubernetes-worker.privileged')
|
||||
if privileged == 'auto':
|
||||
privileged = 'true' if gpu_needs_privileged else 'false'
|
||||
|
||||
if privileged == 'false' and gpu_needs_privileged:
|
||||
disable_gpu()
|
||||
remove_state('kubernetes-worker.gpu.enabled')
|
||||
# No need to restart kubernetes (set the restart-needed state)
|
||||
# because set-privileged is already in the restart path
|
||||
|
||||
|
||||
@when('config.changed.allow-privileged')
|
||||
@ -890,18 +892,17 @@ def on_config_allow_privileged_change():
|
||||
remove_state('config.changed.allow-privileged')
|
||||
|
||||
|
||||
@when('cuda.installed')
|
||||
@when('nvidia-docker.installed')
|
||||
@when('kubernetes-worker.config.created')
|
||||
@when_not('kubernetes-worker.gpu.enabled')
|
||||
def enable_gpu():
|
||||
"""Enable GPU usage on this node.
|
||||
|
||||
"""
|
||||
config = hookenv.config()
|
||||
if config['allow-privileged'] == "false":
|
||||
if get_version('kubelet') < (1, 9):
|
||||
hookenv.status_set(
|
||||
'active',
|
||||
'GPUs available. Set allow-privileged="auto" to enable.'
|
||||
'Upgrade to snap channel >= 1.9/stable to enable GPU suppport.'
|
||||
)
|
||||
return
|
||||
|
||||
@ -916,7 +917,6 @@ def enable_gpu():
|
||||
hookenv.log(cpe)
|
||||
return
|
||||
|
||||
# Apply node labels
|
||||
set_label('gpu', 'true')
|
||||
set_label('cuda', 'true')
|
||||
|
||||
@ -925,15 +925,19 @@ def enable_gpu():
|
||||
|
||||
|
||||
@when('kubernetes-worker.gpu.enabled')
|
||||
@when_not('kubernetes-worker.privileged')
|
||||
@when_not('nvidia-docker.installed')
|
||||
@when_not('kubernetes-worker.restart-needed')
|
||||
def nvidia_departed():
|
||||
"""Cuda departed, probably due to the docker layer switching to a
|
||||
non nvidia-docker."""
|
||||
disable_gpu()
|
||||
remove_state('kubernetes-worker.gpu.enabled')
|
||||
set_state('kubernetes-worker.restart-needed')
|
||||
|
||||
|
||||
def disable_gpu():
|
||||
"""Disable GPU usage on this node.
|
||||
|
||||
This handler fires when we're running in gpu mode, and then the operator
|
||||
sets allow-privileged="false". Since we can no longer run privileged
|
||||
containers, we need to disable gpu mode.
|
||||
|
||||
"""
|
||||
hookenv.log('Disabling gpu mode')
|
||||
|
||||
@ -941,9 +945,6 @@ def disable_gpu():
|
||||
remove_label('gpu')
|
||||
remove_label('cuda')
|
||||
|
||||
remove_state('kubernetes-worker.gpu.enabled')
|
||||
set_state('kubernetes-worker.restart-needed')
|
||||
|
||||
|
||||
@when('kubernetes-worker.gpu.enabled')
|
||||
@when('kube-control.connected')
|
||||
|
Loading…
Reference in New Issue
Block a user