Add support for multiple NVIDIA GPUs

Hui-Zhi 2016-12-03 15:12:38 +08:00 committed by Vishnu kannan
parent 81d01a84e0
commit 57c77ffbdd
8 changed files with 228 additions and 21 deletions

View File

@@ -206,7 +206,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.")
fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.")
fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.")
fs.Int32Var(&s.NvidiaGPUs, "experimental-nvidia-gpus", s.NvidiaGPUs, "Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.")
fs.BoolVar(&s.EnableExperimentalNvidiaGPU, "experimental-enable-nvidia-gpu", s.EnableExperimentalNvidiaGPU, "Enable experimental Nvidia GPU support.")
// TODO(#40229): Remove the docker-exec-handler flag.
fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.")
fs.MarkDeprecated("docker-exec-handler", "this flag will be removed and only the 'native' handler will be supported in the future.")

View File

@@ -362,8 +362,8 @@ type KubeletConfiguration struct {
BabysitDaemons bool
// maxPods is the number of pods that can run on this Kubelet.
MaxPods int32
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
NvidiaGPUs int32
// enableExperimentalNvidiaGPU enables experimental Nvidia GPU support on this node.
EnableExperimentalNvidiaGPU bool
// dockerExecHandlerName is the handler to use when executing a command
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
// 'native'.

View File

@@ -407,8 +407,8 @@ type KubeletConfiguration struct {
BabysitDaemons bool `json:"babysitDaemons"`
// maxPods is the number of pods that can run on this Kubelet.
MaxPods int32 `json:"maxPods"`
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
NvidiaGPUs int32 `json:"nvidiaGPUs"`
// enableExperimentalNvidiaGPU enables experimental Nvidia GPU support on this node.
EnableExperimentalNvidiaGPU bool `json:"enableExperimentalNvidiaGPU"`
// dockerExecHandlerName is the handler to use when executing a command
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
// 'native'.
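
For reference, a tiny self-contained sketch of how the new field serializes under its enableExperimentalNvidiaGPU JSON key. The struct here is a local stand-in that mirrors only this field, not the real KubeletConfiguration type.

package main

import (
	"encoding/json"
	"fmt"
)

// gpuConfig mirrors only the field added above; the real type has many more fields.
type gpuConfig struct {
	EnableExperimentalNvidiaGPU bool `json:"enableExperimentalNvidiaGPU"`
}

func main() {
	out, _ := json.Marshal(gpuConfig{EnableExperimentalNvidiaGPU: true})
	fmt.Println(string(out)) // {"enableExperimentalNvidiaGPU":true}
}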

View File

@@ -0,0 +1,181 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nvidia

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sync"

	"k8s.io/kubernetes/pkg/kubelet/dockertools"
)

// TODO: If NVML is adopted in the future, the implementation could be more
// complex, but also more powerful.

const (
	// Every NVIDIA GPU card must be mounted together with nvidiactl and nvidia-uvm.
	// If the driver is installed correctly, these two devices are always present.
	NvidiaCtlDevice string = "/dev/nvidiactl"
	NvidiaUVMDevice string = "/dev/nvidia-uvm"
)

// NvidiaGPUManager manages the NVIDIA GPU devices on this node.
type NvidiaGPUManager struct {
	gpuPaths []string
	gpuMutex sync.Mutex

	// The interface used to inspect the GPU device mappings of all containers.
	// TODO: Should make this independent of Docker in the future.
	dockerClient dockertools.DockerInterface
}

// discovery collects the paths of all NVIDIA GPU cards under /dev/.
// TODO: Without NVML support we can only check whether GPU devices exist; we
// cannot perform health checks or obtain details such as GPU cores, memory, or
// family name. NVML is only needed once we want such features, e.g. scheduling
// containers according to GPU family name.
func (ngm *NvidiaGPUManager) discovery() (err error) {
	if ngm.gpuPaths == nil {
		reg := regexp.MustCompile(`^nvidia[0-9]*$`)
		err = filepath.Walk("/dev", func(path string, f os.FileInfo, err error) error {
			if err != nil || f == nil {
				// Skip entries that could not be read instead of aborting the walk.
				return nil
			}
			gpupath := reg.FindAllString(f.Name(), -1)
			if gpupath != nil && gpupath[0] != "" {
				ngm.gpuPaths = append(ngm.gpuPaths, "/dev/"+gpupath[0])
			}
			return nil
		})
		if err != nil {
			return err
		}
	}

	return nil
}

// Valid reports whether the given path is an NVIDIA GPU device node (/dev/nvidiaN).
func Valid(path string) bool {
	reg := regexp.MustCompile(`^/dev/nvidia[0-9]*$`)
	check := reg.FindAllString(path, -1)
	return check != nil && check[0] != ""
}

// Init initializes the GPU devices; so far it only needs to discover the GPU paths.
func (ngm *NvidiaGPUManager) Init(dc dockertools.DockerInterface) error {
	if _, err := os.Stat(NvidiaCtlDevice); err != nil {
		return err
	}

	if _, err := os.Stat(NvidiaUVMDevice); err != nil {
		return err
	}

	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	err := ngm.discovery()

	ngm.dockerClient = dc

	return err
}

// Shutdown clears the discovered GPU paths.
func (ngm *NvidiaGPUManager) Shutdown() {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	ngm.gpuPaths = nil
}

// Capacity returns how many GPU cards this node has.
func (ngm *NvidiaGPUManager) Capacity() int {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	return len(ngm.gpuPaths)
}

// isAvailable checks whether the GPU device can still be assigned to a container,
// i.e. whether no existing container already uses it.
func (ngm *NvidiaGPUManager) isAvailable(path string) bool {
	containers, err := dockertools.GetKubeletDockerContainers(ngm.dockerClient, false)
	if err != nil {
		return true
	}

	for i := range containers {
		containerJSON, err := ngm.dockerClient.InspectContainer(containers[i].ID)
		if err != nil {
			continue
		}

		devices := containerJSON.HostConfig.Devices
		if devices == nil {
			continue
		}

		for _, device := range devices {
			if Valid(device.PathOnHost) && path == device.PathOnHost {
				return false
			}
		}
	}

	return true
}

// AllocateGPUs returns the requested number of free GPU paths, or an error if
// not enough GPUs are available.
func (ngm *NvidiaGPUManager) AllocateGPUs(num int) (paths []string, err error) {
	if num <= 0 {
		return
	}

	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	for _, path := range ngm.gpuPaths {
		if ngm.isAvailable(path) {
			paths = append(paths, path)
			if len(paths) == num {
				return
			}
		}
	}

	err = fmt.Errorf("not enough GPUs available")
	return
}

// AvailableGPUs returns the number of GPUs that are not yet assigned to a container.
func (ngm *NvidiaGPUManager) AvailableGPUs() (num int) {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	for _, path := range ngm.gpuPaths {
		if ngm.isAvailable(path) {
			num++
		}
	}

	return
}
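
The discovery step above is simply a walk of /dev for nvidiaN device nodes. The following standalone sketch performs the same scan outside the kubelet (no Docker client needed), which can be handy for checking what the manager would find on a given node.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
)

func main() {
	// Same pattern as discovery(): plain GPU nodes only, not nvidiactl or nvidia-uvm.
	re := regexp.MustCompile(`^nvidia[0-9]*$`)
	var gpus []string
	err := filepath.Walk("/dev", func(path string, f os.FileInfo, err error) error {
		if err != nil || f == nil {
			return nil // skip unreadable entries rather than aborting the walk
		}
		if re.MatchString(f.Name()) {
			gpus = append(gpus, "/dev/"+f.Name())
		}
		return nil
	})
	if err != nil {
		fmt.Fprintln(os.Stderr, "walk failed:", err)
		os.Exit(1)
	}
	fmt.Printf("found %d GPU device nodes: %v\n", len(gpus), gpus)
}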

View File

@@ -67,6 +67,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/dockertools"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
"k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
@@ -449,8 +450,8 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
writer: kubeDeps.Writer,
nonMasqueradeCIDR: kubeCfg.NonMasqueradeCIDR,
maxPods: int(kubeCfg.MaxPods),
enableExperimentalNvidiaGPU: kubeCfg.EnableExperimentalNvidiaGPU,
podsPerCore: int(kubeCfg.PodsPerCore),
nvidiaGPUs: int(kubeCfg.NvidiaGPUs),
syncLoopMonitor: atomic.Value{},
resolverConfig: kubeCfg.ResolverConfig,
cpuCFSQuota: kubeCfg.CPUCFSQuota,
@@ -981,8 +982,8 @@ type Kubelet struct {
// Maximum Number of Pods which can be run by this Kubelet
maxPods int
// Number of NVIDIA GPUs on this node
nvidiaGPUs int
// Whether experimental Nvidia GPU support is enabled on this node
enableExperimentalNvidiaGPU bool
// Monitor Kubelet's sync loop
syncLoopMonitor atomic.Value
@@ -1089,6 +1090,9 @@ type Kubelet struct {
// This should only be enabled when the container runtime is performing user remapping AND if the
// experimental behavior is desired.
experimentalHostUserNamespaceDefaulting bool
// NVIDIA GPU Manager
nvidiaGPUManager nvidia.NvidiaGPUManager
}
// setupDataDirs creates:
@@ -1182,7 +1186,13 @@ func (kl *Kubelet) initializeModules() error {
return fmt.Errorf("Failed to start OOM watcher %v", err)
}
// Step 7: Start resource analyzer
// Step 7: Initialize the Nvidia GPU manager. There is no need to return an error here until NVML is used instead.
// This only runs when EnableExperimentalNvidiaGPU is set to true.
if kl.enableExperimentalNvidiaGPU {
kl.nvidiaGPUManager.Init(kl.dockerClient)
}
// Step 8: Start resource analyzer
kl.resourceAnalyzer.Start()
return nil
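
An initialization failure from the GPU manager is deliberately not fatal here, since without NVML there is little to act on. The small self-contained sketch below (with hypothetical names, not the kubelet's own types) illustrates that gating pattern: initialize only when the experimental flag is on, and log a failure instead of aborting startup.

package main

import (
	"errors"
	"log"
)

// gpuInitializer stands in for the NVIDIA GPU manager's Init step.
type gpuInitializer interface {
	Init() error
}

type fakeManager struct{ fail bool }

func (f fakeManager) Init() error {
	if f.fail {
		return errors.New("nvidiactl not found")
	}
	return nil
}

// initGPUSupport only touches the manager when the experimental flag is on,
// and treats an init failure as a warning rather than a startup error.
func initGPUSupport(enabled bool, mgr gpuInitializer) {
	if !enabled {
		return
	}
	if err := mgr.Init(); err != nil {
		log.Printf("NVIDIA GPU support disabled: %v", err)
	}
}

func main() {
	initGPUSupport(true, fakeManager{fail: true}) // logs and continues
	initGPUSupport(false, fakeManager{})          // flag off, nothing happens
}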

View File

@@ -482,6 +482,11 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity = v1.ResourceList{}
}
nvidiaGPUCapacity := 0
if kl.enableExperimentalNvidiaGPU {
nvidiaGPUCapacity = kl.nvidiaGPUManager.Capacity()
}
// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
info, err := kl.GetCachedMachineInfo()
@@ -491,7 +496,7 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI)
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(kl.nvidiaGPUs), resource.DecimalSI)
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(nvidiaGPUCapacity), resource.DecimalSI)
glog.Errorf("Error getting machine info: %v", err)
} else {
@@ -510,7 +515,7 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
int64(kl.maxPods), resource.DecimalSI)
}
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(
int64(kl.nvidiaGPUs), resource.DecimalSI)
int64(nvidiaGPUCapacity), resource.DecimalSI)
if node.Status.NodeInfo.BootID != "" &&
node.Status.NodeInfo.BootID != info.BootID {
// TODO: This requires a transaction, either both node status is updated

View File

@@ -28,6 +28,7 @@ import (
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"
@@ -48,6 +49,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/envvars"
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
"k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/server/portforward"
@@ -84,16 +86,25 @@ func (kl *Kubelet) getActivePods() []*v1.Pod {
}
// makeDevices determines the devices for the given container.
// Experimental. For now, we hardcode /dev/nvidia0 no matter what the user asks for
// (we only support one device per node).
// TODO: add support for more than 1 GPU after #28216.
func makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
// Experimental. Devices are only returned when experimental Nvidia GPU support is enabled.
func (kl *Kubelet) makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
if !kl.enableExperimentalNvidiaGPU {
return nil
}
nvidiaGPULimit := container.Resources.Limits.NvidiaGPU()
if nvidiaGPULimit.Value() != 0 {
return []kubecontainer.DeviceInfo{
{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
if nvidiaGPUPaths, err := kl.nvidiaGPUManager.AllocateGPUs(int(nvidiaGPULimit.Value())); err == nil {
devices := []kubecontainer.DeviceInfo{{PathOnHost: nvidia.NvidiaCtlDevice, PathInContainer: nvidia.NvidiaCtlDevice, Permissions: "mrw"},
{PathOnHost: nvidia.NvidiaUVMDevice, PathInContainer: nvidia.NvidiaUVMDevice, Permissions: "mrw"}}
for i, path := range nvidiaGPUPaths {
devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: "/dev/nvidia" + strconv.Itoa(i), Permissions: "mrw"})
}
return devices
}
}
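
Allocated devices keep their host paths, but inside the container they are renumbered densely from /dev/nvidia0 upward. The self-contained sketch below shows that remapping; DeviceInfo here is a local stand-in for the kubecontainer type.

package main

import (
	"fmt"
	"strconv"
)

type DeviceInfo struct {
	PathOnHost      string
	PathInContainer string
	Permissions     string
}

// makeGPUDevices mirrors the mapping above: the control devices are always added,
// and each allocated host GPU becomes the i-th GPU inside the container.
func makeGPUDevices(hostPaths []string) []DeviceInfo {
	devices := []DeviceInfo{
		{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
		{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
	}
	for i, p := range hostPaths {
		devices = append(devices, DeviceInfo{
			PathOnHost:      p,
			PathInContainer: "/dev/nvidia" + strconv.Itoa(i),
			Permissions:     "mrw",
		})
	}
	return devices
}

func main() {
	// e.g. the manager handed back the 3rd and 6th cards on the host.
	for _, d := range makeGPUDevices([]string{"/dev/nvidia2", "/dev/nvidia5"}) {
		fmt.Printf("%s -> %s (%s)\n", d.PathOnHost, d.PathInContainer, d.Permissions)
	}
}
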
@@ -285,7 +296,7 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai
opts.PortMappings = kubecontainer.MakePortMappings(container)
// TODO(random-liu): Move following convert functions into pkg/kubelet/container
opts.Devices = makeDevices(container)
opts.Devices = kl.makeDevices(container)
opts.Mounts, err = makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes)
if err != nil {

View File

@@ -150,7 +150,7 @@ func GetHollowKubeletConfig(
c.MaxContainerCount = 100
c.MaxOpenFiles = 1024
c.MaxPerPodContainerCount = 2
c.NvidiaGPUs = 0
c.EnableExperimentalNvidiaGPU = false
c.RegisterNode = true
c.RegisterSchedulable = true
c.RegistryBurst = 10