Put the nvidia-smi command in a try/except to avoid unhandled errors.

Matt Bruzek 2017-04-05 16:09:02 -05:00 committed by George Kraft
parent d50cf1e499
commit 258ee22858


@@ -669,6 +669,15 @@ def enable_gpu():
         return
     hookenv.log('Enabling gpu mode')
+    try:
+        # Not sure why this is necessary, but if you don't run this, k8s will
+        # think that the node has 0 gpus (as shown by the output of
+        # `kubectl get nodes -o yaml`)
+        check_call(['nvidia-smi'])
+    except CalledProcessError as cpe:
+        hookenv.log('Unable to communicate with the NVIDIA driver.')
+        hookenv.log(cpe)
+        return
     kubelet_opts = FlagManager('kubelet')
     if get_version('kubelet') < (1, 6):
@@ -682,11 +691,6 @@ def enable_gpu():
     _apply_node_label('gpu=true', overwrite=True)
     _apply_node_label('cuda=true', overwrite=True)
 
-    # Not sure why this is necessary, but if you don't run this, k8s will
-    # think that the node has 0 gpus (as shown by the output of
-    # `kubectl get nodes -o yaml`)
-    check_call(['nvidia-smi'])
-
     set_state('kubernetes-worker.gpu.enabled')
     set_state('kubernetes-worker.restart-needed')
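
For context, the guard pattern this commit introduces can be sketched in isolation. The snippet below is illustrative only and is not part of the charm; the gpu_available helper name is hypothetical.

    # Standalone sketch of the guard pattern used above: probe the NVIDIA
    # driver with nvidia-smi and report failure instead of crashing the caller.
    # Note: a missing nvidia-smi binary would raise FileNotFoundError instead,
    # which this commit does not handle.
    from subprocess import check_call, CalledProcessError


    def gpu_available():
        try:
            # nvidia-smi exits non-zero when it cannot talk to the driver.
            check_call(['nvidia-smi'])
        except CalledProcessError:
            return False
        return True


    if __name__ == '__main__':
        print('GPU driver reachable:', gpu_available())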