diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go
index 4ef0d35f9e8..3504d39d6c9 100644
--- a/cmd/kubelet/app/options/options.go
+++ b/cmd/kubelet/app/options/options.go
@@ -206,7 +206,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
 	fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.")
 	fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.")
 	fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.")
-	fs.Int32Var(&s.NvidiaGPUs, "experimental-nvidia-gpus", s.NvidiaGPUs, "Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.")
+	fs.BoolVar(&s.EnableExperimentalNvidiaGPU, "experimental-enable-nvidia-gpu", s.EnableExperimentalNvidiaGPU, "Enable experimental Nvidia GPU support.")
 	// TODO(#40229): Remove the docker-exec-handler flag.
 	fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.")
 	fs.MarkDeprecated("docker-exec-handler", "this flag will be removed and only the 'native' handler will be supported in the future.")
diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go
index 88ea80e8f2d..4b9ed20c8bf 100644
--- a/pkg/apis/componentconfig/types.go
+++ b/pkg/apis/componentconfig/types.go
@@ -362,8 +362,8 @@ type KubeletConfiguration struct {
 	BabysitDaemons bool
 	// maxPods is the number of pods that can run on this Kubelet.
 	MaxPods int32
-	// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
-	NvidiaGPUs int32
+	// enableExperimentalNvidiaGPU enables experimental NVIDIA GPU support on this node.
+	EnableExperimentalNvidiaGPU bool
 	// dockerExecHandlerName is the handler to use when executing a command
 	// in a container. Valid values are 'native' and 'nsenter'. Defaults to
 	// 'native'.
diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go
index 28b7499a3ef..ad74db26fd9 100644
--- a/pkg/apis/componentconfig/v1alpha1/types.go
+++ b/pkg/apis/componentconfig/v1alpha1/types.go
@@ -407,8 +407,8 @@ type KubeletConfiguration struct {
 	BabysitDaemons bool `json:"babysitDaemons"`
 	// maxPods is the number of pods that can run on this Kubelet.
 	MaxPods int32 `json:"maxPods"`
-	// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
-	NvidiaGPUs int32 `json:"nvidiaGPUs"`
+	// enableExperimentalNvidiaGPU enables experimental NVIDIA GPU support on this node.
+	EnableExperimentalNvidiaGPU bool `json:"enableExperimentalNvidiaGPU"`
 	// dockerExecHandlerName is the handler to use when executing a command
 	// in a container. Valid values are 'native' and 'nsenter'. Defaults to
 	// 'native'.
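The change above replaces the integer --experimental-nvidia-gpus count with a single boolean flag. A minimal sketch of how the renamed flag parses, not part of this diff; kubeletServer here is a hypothetical stand-in for the real KubeletServer struct:

package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

// kubeletServer is a reduced stand-in for KubeletServer, carrying only the new field.
type kubeletServer struct {
	EnableExperimentalNvidiaGPU bool
}

func main() {
	s := kubeletServer{}
	fs := pflag.NewFlagSet("kubelet", pflag.ContinueOnError)
	// Same registration call as in AddFlags above.
	fs.BoolVar(&s.EnableExperimentalNvidiaGPU, "experimental-enable-nvidia-gpu", s.EnableExperimentalNvidiaGPU, "Enable experimental Nvidia GPU support.")
	if err := fs.Parse([]string{"--experimental-enable-nvidia-gpu=true"}); err != nil {
		panic(err)
	}
	fmt.Println(s.EnableExperimentalNvidiaGPU) // true
}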
diff --git a/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go b/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go
new file mode 100644
index 00000000000..2cce0bd2bb5
--- /dev/null
+++ b/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go
@@ -0,0 +1,181 @@
+/*
+Copyright 2016 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nvidia
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sync"
+
+	"k8s.io/kubernetes/pkg/kubelet/dockertools"
+)
+
+// TODO: If NVML is used in the future, the implementation could be more complex,
+// but also more powerful.
+
+const (
+	// Every NVIDIA GPU card is exposed alongside nvidiactl and nvidia-uvm.
+	// If the driver is installed correctly, these two devices must be present.
+	NvidiaCtlDevice string = "/dev/nvidiactl"
+	NvidiaUVMDevice string = "/dev/nvidia-uvm"
+)
+
+// NvidiaGPUManager manages NVIDIA GPU devices on the node.
+type NvidiaGPUManager struct {
+	gpuPaths []string
+	gpuMutex sync.Mutex
+
+	// The interface used to look up GPU device mappings from all containers.
+	// TODO: Should make this independent of Docker in the future.
+	dockerClient dockertools.DockerInterface
+}
+
+// discovery collects the paths of all NVIDIA GPU cards under /dev/.
+// TODO: Without NVML support we can only check whether GPU devices exist, but
+// cannot run a health check or report details such as GPU cores, memory, or
+// family name. NVML support is not needed until we want more features, such as
+// scheduling containers according to GPU family name.
+func (ngm *NvidiaGPUManager) discovery() (err error) {
+	if ngm.gpuPaths == nil {
+		err = filepath.Walk("/dev", func(path string, f os.FileInfo, err error) error {
+			reg := regexp.MustCompile(`^nvidia[0-9]*$`)
+			gpupath := reg.FindAllString(f.Name(), -1)
+			if gpupath != nil && gpupath[0] != "" {
+				ngm.gpuPaths = append(ngm.gpuPaths, "/dev/"+gpupath[0])
+			}
+
+			return nil
+		})
+
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func Valid(path string) bool {
+	reg := regexp.MustCompile(`^/dev/nvidia[0-9]*$`)
+	check := reg.FindAllString(path, -1)
+
+	return check != nil && check[0] != ""
+}
+
+// Init initializes the GPU devices; so far this only needs to discover the GPU paths.
+func (ngm *NvidiaGPUManager) Init(dc dockertools.DockerInterface) error {
+	if _, err := os.Stat(NvidiaCtlDevice); err != nil {
+		return err
+	}
+
+	if _, err := os.Stat(NvidiaUVMDevice); err != nil {
+		return err
+	}
+
+	ngm.gpuMutex.Lock()
+	defer ngm.gpuMutex.Unlock()
+
+	err := ngm.discovery()
+
+	ngm.dockerClient = dc
+
+	return err
+}
+
+func (ngm *NvidiaGPUManager) Shutdown() {
+	ngm.gpuMutex.Lock()
+	defer ngm.gpuMutex.Unlock()
+
+	ngm.gpuPaths = nil
+}
+
+// Capacity returns how many GPU cards the node has.
+func (ngm *NvidiaGPUManager) Capacity() int {
+	ngm.gpuMutex.Lock()
+	defer ngm.gpuMutex.Unlock()
+
+	return len(ngm.gpuPaths)
+}
+
+// isAvailable checks whether the GPU device could still be assigned to a container.
+func (ngm *NvidiaGPUManager) isAvailable(path string) bool {
+	containers, err := dockertools.GetKubeletDockerContainers(ngm.dockerClient, false)
+
+	if err != nil {
+		return true
+	}
+
+	for i := range containers {
+		containerJSON, err := ngm.dockerClient.InspectContainer(containers[i].ID)
+		if err != nil {
+			continue
+		}
+
+		devices := containerJSON.HostConfig.Devices
+		if devices == nil {
+			continue
+		}
+
+		for _, device := range devices {
+			if Valid(device.PathOnHost) && path == device.PathOnHost {
+				return false
+			}
+		}
+	}
+
+	return true
+}
+
+// AllocateGPUs returns the requested number of free GPU paths; otherwise it returns an error.
+func (ngm *NvidiaGPUManager) AllocateGPUs(num int) (paths []string, err error) {
+	if num <= 0 {
+		return
+	}
+
+	ngm.gpuMutex.Lock()
+	defer ngm.gpuMutex.Unlock()
+
+	for _, path := range ngm.gpuPaths {
+		if ngm.isAvailable(path) {
+			paths = append(paths, path)
+			if len(paths) == num {
+				return
+			}
+		}
+	}
+
+	err = fmt.Errorf("not enough free GPUs")
+
+	return
+}
+
+// AvailableGPUs returns the count of GPUs which are free.
+func (ngm *NvidiaGPUManager) AvailableGPUs() (num int) {
+	ngm.gpuMutex.Lock()
+	defer ngm.gpuMutex.Unlock()
+
+	for _, path := range ngm.gpuPaths {
+		if ngm.isAvailable(path) {
+			num++
+		}
+	}
+
+	return
+}
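A usage sketch of the manager added above (not part of this change). It assumes the caller already holds a dockertools.DockerInterface; construction of that client is elided, and the package and function names here are illustrative only:

package gpuexample

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/dockertools"
	"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
)

// allocateOne shows the intended call sequence: Init checks /dev/nvidiactl and
// /dev/nvidia-uvm and discovers /dev/nvidia*, Capacity and AvailableGPUs report
// totals, and AllocateGPUs hands out host paths that are not already attached
// to a kubelet-managed container.
func allocateOne(dc dockertools.DockerInterface) ([]string, error) {
	ngm := nvidia.NvidiaGPUManager{}
	if err := ngm.Init(dc); err != nil {
		return nil, err // driver devices missing or /dev walk failed
	}
	fmt.Printf("capacity=%d available=%d\n", ngm.Capacity(), ngm.AvailableGPUs())
	return ngm.AllocateGPUs(1)
}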
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 878c9556506..6c52da1ec5b 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -67,6 +67,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/dockertools"
 	"k8s.io/kubernetes/pkg/kubelet/events"
 	"k8s.io/kubernetes/pkg/kubelet/eviction"
+	"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
 	"k8s.io/kubernetes/pkg/kubelet/images"
 	"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
@@ -449,8 +450,8 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
 		writer:                      kubeDeps.Writer,
 		nonMasqueradeCIDR:           kubeCfg.NonMasqueradeCIDR,
 		maxPods:                     int(kubeCfg.MaxPods),
+		enableExperimentalNvidiaGPU: kubeCfg.EnableExperimentalNvidiaGPU,
 		podsPerCore:                 int(kubeCfg.PodsPerCore),
-		nvidiaGPUs:                  int(kubeCfg.NvidiaGPUs),
 		syncLoopMonitor:             atomic.Value{},
 		resolverConfig:              kubeCfg.ResolverConfig,
 		cpuCFSQuota:                 kubeCfg.CPUCFSQuota,
@@ -981,8 +982,8 @@ type Kubelet struct {
 	// Maximum Number of Pods which can be run by this Kubelet
 	maxPods int
 
-	// Number of NVIDIA GPUs on this node
-	nvidiaGPUs int
+	// Whether experimental Nvidia GPU support is enabled
+	enableExperimentalNvidiaGPU bool
 
 	// Monitor Kubelet's sync loop
 	syncLoopMonitor atomic.Value
@@ -1089,6 +1090,9 @@ type Kubelet struct {
 	// This should only be enabled when the container runtime is performing user remapping AND if the
 	// experimental behavior is desired.
 	experimentalHostUserNamespaceDefaulting bool
+
+	// NVIDIA GPU Manager
+	nvidiaGPUManager nvidia.NvidiaGPUManager
 }
 
 // setupDataDirs creates:
@@ -1182,7 +1186,13 @@ func (kl *Kubelet) initializeModules() error {
 		return fmt.Errorf("Failed to start OOM watcher %v", err)
 	}
 
-	// Step 7: Start resource analyzer
+	// Step 7: Initialize the Nvidia GPU manager. There is no need to return an error here until NVML is used.
+	// This only runs when EnableExperimentalNvidiaGPU is set to true.
+	if kl.enableExperimentalNvidiaGPU {
+		kl.nvidiaGPUManager.Init(kl.dockerClient)
+	}
+
+	// Step 8: Start resource analyzer
 	kl.resourceAnalyzer.Start()
 
 	return nil
diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go
index 50b1aeffc80..8ac9594fde7 100644
--- a/pkg/kubelet/kubelet_node_status.go
+++ b/pkg/kubelet/kubelet_node_status.go
@@ -482,6 +482,11 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 		node.Status.Capacity = v1.ResourceList{}
 	}
 
+	nvidiaGPUCapacity := 0
+	if kl.enableExperimentalNvidiaGPU {
+		nvidiaGPUCapacity = kl.nvidiaGPUManager.Capacity()
+	}
+
 	// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
 	// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
 	info, err := kl.GetCachedMachineInfo()
@@ -491,7 +496,7 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 		node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
 		node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
 		node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI)
-		node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(kl.nvidiaGPUs), resource.DecimalSI)
+		node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(nvidiaGPUCapacity), resource.DecimalSI)
 
 		glog.Errorf("Error getting machine info: %v", err)
 	} else {
@@ -510,7 +515,7 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 				int64(kl.maxPods), resource.DecimalSI)
 		}
 		node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(
-			int64(kl.nvidiaGPUs), resource.DecimalSI)
+			int64(nvidiaGPUCapacity), resource.DecimalSI)
 		if node.Status.NodeInfo.BootID != "" &&
 			node.Status.NodeInfo.BootID != info.BootID {
 			// TODO: This requires a transaction, either both node status is updated
diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 453d1d57bc9..46f89209ea7 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -28,6 +28,7 @@ import (
 	"path/filepath"
 	"runtime"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -48,6 +49,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/cm"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	"k8s.io/kubernetes/pkg/kubelet/envvars"
+	"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
 	"k8s.io/kubernetes/pkg/kubelet/images"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	"k8s.io/kubernetes/pkg/kubelet/server/portforward"
@@ -84,16 +86,25 @@ func (kl *Kubelet) getActivePods() []*v1.Pod {
 }
 
 // makeDevices determines the devices for the given container.
-// Experimental. For now, we hardcode /dev/nvidia0 no matter what the user asks for
-// (we only support one device per node).
-// TODO: add support for more than 1 GPU after #28216.
-func makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
+// Experimental.
+func (kl *Kubelet) makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
+	if !kl.enableExperimentalNvidiaGPU {
+		return nil
+	}
+
 	nvidiaGPULimit := container.Resources.Limits.NvidiaGPU()
+
 	if nvidiaGPULimit.Value() != 0 {
-		return []kubecontainer.DeviceInfo{
-			{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
-			{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
-			{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
+		if nvidiaGPUPaths, err := kl.nvidiaGPUManager.AllocateGPUs(int(nvidiaGPULimit.Value())); err == nil {
+			devices := []kubecontainer.DeviceInfo{{PathOnHost: nvidia.NvidiaCtlDevice, PathInContainer: nvidia.NvidiaCtlDevice, Permissions: "mrw"},
+				{PathOnHost: nvidia.NvidiaUVMDevice, PathInContainer: nvidia.NvidiaUVMDevice, Permissions: "mrw"}}
+
+			for i, path := range nvidiaGPUPaths {
+				devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: "/dev/nvidia" + strconv.Itoa(i), Permissions: "mrw"})
+			}
+
+			return devices
+		}
 	}
 
@@ -285,7 +296,7 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai
 	opts.PortMappings = kubecontainer.MakePortMappings(container)
 	// TODO(random-liu): Move following convert functions into pkg/kubelet/container
-	opts.Devices = makeDevices(container)
+	opts.Devices = kl.makeDevices(container)
 	opts.Mounts, err = makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes)
 	if err != nil {
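To make the new device mapping concrete, here is a small self-contained sketch (illustration only; deviceInfo is a stand-in for kubecontainer.DeviceInfo) of the host-to-container list that makeDevices builds once AllocateGPUs has returned host paths:

package main

import (
	"fmt"
	"strconv"
)

// deviceInfo mirrors kubecontainer.DeviceInfo for illustration only.
type deviceInfo struct {
	PathOnHost, PathInContainer, Permissions string
}

// mapGPUDevices reproduces the mapping makeDevices performs: the control devices
// are passed through unchanged, and the i-th allocated card is renumbered to
// /dev/nvidia<i> inside the container.
func mapGPUDevices(allocated []string) []deviceInfo {
	devices := []deviceInfo{
		{"/dev/nvidiactl", "/dev/nvidiactl", "mrw"},
		{"/dev/nvidia-uvm", "/dev/nvidia-uvm", "mrw"},
	}
	for i, path := range allocated {
		devices = append(devices, deviceInfo{path, "/dev/nvidia" + strconv.Itoa(i), "mrw"})
	}
	return devices
}

func main() {
	// e.g. the manager handed out cards 2 and 5 on the host
	for _, d := range mapGPUDevices([]string{"/dev/nvidia2", "/dev/nvidia5"}) {
		fmt.Printf("%s -> %s (%s)\n", d.PathOnHost, d.PathInContainer, d.Permissions)
	}
}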
diff --git a/pkg/kubemark/hollow_kubelet.go b/pkg/kubemark/hollow_kubelet.go
index 92f3e329baf..5e449127d95 100644
--- a/pkg/kubemark/hollow_kubelet.go
+++ b/pkg/kubemark/hollow_kubelet.go
@@ -150,7 +150,7 @@ func GetHollowKubeletConfig(
 	c.MaxContainerCount = 100
 	c.MaxOpenFiles = 1024
 	c.MaxPerPodContainerCount = 2
-	c.NvidiaGPUs = 0
+	c.EnableExperimentalNvidiaGPU = false
 	c.RegisterNode = true
 	c.RegisterSchedulable = true
 	c.RegistryBurst = 10
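Finally, a consumer-side sketch (not part of this diff; import paths are assumed to match this branch's layout) showing how a container requests GPUs through the same v1.ResourceNvidiaGPU resource that the node now advertises and that makeDevices reads back via Resources.Limits.NvidiaGPU():

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

func main() {
	// A container asking for two NVIDIA GPUs; with --experimental-enable-nvidia-gpu
	// set on the kubelet, makeDevices maps nvidiactl, nvidia-uvm and two cards into it.
	c := v1.Container{
		Name:  "cuda-job",
		Image: "nvidia/cuda",
		Resources: v1.ResourceRequirements{
			Limits: v1.ResourceList{
				v1.ResourceNvidiaGPU: resource.MustParse("2"),
			},
		},
	}
	fmt.Println(c.Resources.Limits.NvidiaGPU().Value()) // 2
}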