Add support for multiple NVIDIA GPUs

Hui-Zhi 2016-12-03 15:12:38 +08:00 committed by Vishnu kannan
parent 81d01a84e0
commit 57c77ffbdd
8 changed files with 228 additions and 21 deletions

View File

@@ -206,7 +206,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.")
fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.")
fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.")
fs.Int32Var(&s.NvidiaGPUs, "experimental-nvidia-gpus", s.NvidiaGPUs, "Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.")
fs.BoolVar(&s.EnableExperimentalNvidiaGPU, "experimental-enable-nvidia-gpu", s.EnableExperimentalNvidiaGPU, "Enable experimental Nvidia GPU support.")
// TODO(#40229): Remove the docker-exec-handler flag.
fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.")
fs.MarkDeprecated("docker-exec-handler", "this flag will be removed and only the 'native' handler will be supported in the future.")

View File

@@ -362,8 +362,8 @@ type KubeletConfiguration struct {
BabysitDaemons bool
// maxPods is the number of pods that can run on this Kubelet.
MaxPods int32
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
NvidiaGPUs int32
// enableExperimentalNvidiaGPU enables experimental Nvidia GPU support on this node.
EnableExperimentalNvidiaGPU bool
// dockerExecHandlerName is the handler to use when executing a command
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
// 'native'.

View File

@@ -407,8 +407,8 @@ type KubeletConfiguration struct {
BabysitDaemons bool `json:"babysitDaemons"`
// maxPods is the number of pods that can run on this Kubelet.
MaxPods int32 `json:"maxPods"`
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
NvidiaGPUs int32 `json:"nvidiaGPUs"`
// enableExperimentalNvidiaGPU enables experimental Nvidia GPU support on this node.
EnableExperimentalNvidiaGPU bool `json:"enableExperimentalNvidiaGPU"`
// dockerExecHandlerName is the handler to use when executing a command
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
// 'native'.
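
For reference, a tiny self-contained sketch of how the new field serializes under its enableExperimentalNvidiaGPU JSON key. The struct here is a local stand-in that mirrors only this field, not the real KubeletConfiguration type.

package main

import (
	"encoding/json"
	"fmt"
)

// gpuConfig mirrors only the field added above; the real type has many more fields.
type gpuConfig struct {
	EnableExperimentalNvidiaGPU bool `json:"enableExperimentalNvidiaGPU"`
}

func main() {
	out, _ := json.Marshal(gpuConfig{EnableExperimentalNvidiaGPU: true})
	fmt.Println(string(out)) // {"enableExperimentalNvidiaGPU":true}
}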

View File

@@ -0,0 +1,181 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nvidia

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sync"

	"k8s.io/kubernetes/pkg/kubelet/dockertools"
)

// TODO: If NVML is adopted in the future, the implementation could be more
// complex, but also more powerful.

const (
	// Every NVIDIA GPU card must be mounted together with nvidiactl and nvidia-uvm.
	// If the driver is installed correctly, these two devices are always present.
	NvidiaCtlDevice string = "/dev/nvidiactl"
	NvidiaUVMDevice string = "/dev/nvidia-uvm"
)

// NvidiaGPUManager manages the NVIDIA GPU devices on this node.
type NvidiaGPUManager struct {
	gpuPaths []string
	gpuMutex sync.Mutex

	// The interface used to inspect the GPU device mappings of all containers.
	// TODO: Should make this independent of Docker in the future.
	dockerClient dockertools.DockerInterface
}

// discovery collects the paths of all NVIDIA GPU cards under /dev/.
// TODO: Without NVML support we can only check whether GPU devices exist; we
// cannot perform health checks or obtain details such as GPU cores, memory, or
// family name. NVML is only needed once we want such features, e.g. scheduling
// containers according to GPU family name.
func (ngm *NvidiaGPUManager) discovery() (err error) {
	if ngm.gpuPaths == nil {
		reg := regexp.MustCompile(`^nvidia[0-9]*$`)
		err = filepath.Walk("/dev", func(path string, f os.FileInfo, err error) error {
			if err != nil || f == nil {
				// Skip entries that could not be read instead of aborting the walk.
				return nil
			}
			gpupath := reg.FindAllString(f.Name(), -1)
			if gpupath != nil && gpupath[0] != "" {
				ngm.gpuPaths = append(ngm.gpuPaths, "/dev/"+gpupath[0])
			}
			return nil
		})
		if err != nil {
			return err
		}
	}

	return nil
}

// Valid reports whether the given path is an NVIDIA GPU device node (/dev/nvidiaN).
func Valid(path string) bool {
	reg := regexp.MustCompile(`^/dev/nvidia[0-9]*$`)
	check := reg.FindAllString(path, -1)
	return check != nil && check[0] != ""
}

// Init initializes the GPU devices; so far it only needs to discover the GPU paths.
func (ngm *NvidiaGPUManager) Init(dc dockertools.DockerInterface) error {
	if _, err := os.Stat(NvidiaCtlDevice); err != nil {
		return err
	}

	if _, err := os.Stat(NvidiaUVMDevice); err != nil {
		return err
	}

	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	err := ngm.discovery()

	ngm.dockerClient = dc

	return err
}

// Shutdown clears the discovered GPU paths.
func (ngm *NvidiaGPUManager) Shutdown() {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	ngm.gpuPaths = nil
}

// Capacity returns how many GPU cards this node has.
func (ngm *NvidiaGPUManager) Capacity() int {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	return len(ngm.gpuPaths)
}

// isAvailable checks whether the GPU device can still be assigned to a container,
// i.e. whether no existing container already uses it.
func (ngm *NvidiaGPUManager) isAvailable(path string) bool {
	containers, err := dockertools.GetKubeletDockerContainers(ngm.dockerClient, false)
	if err != nil {
		return true
	}

	for i := range containers {
		containerJSON, err := ngm.dockerClient.InspectContainer(containers[i].ID)
		if err != nil {
			continue
		}

		devices := containerJSON.HostConfig.Devices
		if devices == nil {
			continue
		}

		for _, device := range devices {
			if Valid(device.PathOnHost) && path == device.PathOnHost {
				return false
			}
		}
	}

	return true
}

// AllocateGPUs returns the requested number of free GPU paths, or an error if
// not enough GPUs are available.
func (ngm *NvidiaGPUManager) AllocateGPUs(num int) (paths []string, err error) {
	if num <= 0 {
		return
	}

	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	for _, path := range ngm.gpuPaths {
		if ngm.isAvailable(path) {
			paths = append(paths, path)
			if len(paths) == num {
				return
			}
		}
	}

	err = fmt.Errorf("not enough GPUs available")
	return
}

// AvailableGPUs returns the number of GPUs that are not yet assigned to a container.
func (ngm *NvidiaGPUManager) AvailableGPUs() (num int) {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	for _, path := range ngm.gpuPaths {
		if ngm.isAvailable(path) {
			num++
		}
	}

	return
}
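
The discovery step above is simply a walk of /dev for nvidiaN device nodes. The following standalone sketch performs the same scan outside the kubelet (no Docker client needed), which can be handy for checking what the manager would find on a given node.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
)

func main() {
	// Same pattern as discovery(): plain GPU nodes only, not nvidiactl or nvidia-uvm.
	re := regexp.MustCompile(`^nvidia[0-9]*$`)
	var gpus []string
	err := filepath.Walk("/dev", func(path string, f os.FileInfo, err error) error {
		if err != nil || f == nil {
			return nil // skip unreadable entries rather than aborting the walk
		}
		if re.MatchString(f.Name()) {
			gpus = append(gpus, "/dev/"+f.Name())
		}
		return nil
	})
	if err != nil {
		fmt.Fprintln(os.Stderr, "walk failed:", err)
		os.Exit(1)
	}
	fmt.Printf("found %d GPU device nodes: %v\n", len(gpus), gpus)
}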

View File

@@ -67,6 +67,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/dockertools"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
"k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
@@ -449,8 +450,8 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
writer: kubeDeps.Writer,
nonMasqueradeCIDR: kubeCfg.NonMasqueradeCIDR,
maxPods: int(kubeCfg.MaxPods),
enableExperimentalNvidiaGPU: kubeCfg.EnableExperimentalNvidiaGPU,
podsPerCore: int(kubeCfg.PodsPerCore),
nvidiaGPUs: int(kubeCfg.NvidiaGPUs),
syncLoopMonitor: atomic.Value{},
resolverConfig: kubeCfg.ResolverConfig,
cpuCFSQuota: kubeCfg.CPUCFSQuota,
@@ -981,8 +982,8 @@ type Kubelet struct {
// Maximum Number of Pods which can be run by this Kubelet
maxPods int
// Number of NVIDIA GPUs on this node
nvidiaGPUs int
// Whether experimental Nvidia GPU support is enabled on this node
enableExperimentalNvidiaGPU bool
// Monitor Kubelet's sync loop
syncLoopMonitor atomic.Value
@@ -1089,6 +1090,9 @@ type Kubelet struct {
// This should only be enabled when the container runtime is performing user remapping AND if the
// experimental behavior is desired.
experimentalHostUserNamespaceDefaulting bool
// NVIDIA GPU Manager
nvidiaGPUManager nvidia.NvidiaGPUManager
}
// setupDataDirs creates:
@@ -1182,7 +1186,13 @@ func (kl *Kubelet) initializeModules() error {
return fmt.Errorf("Failed to start OOM watcher %v", err)
}
// Step 7: Start resource analyzer
// Step 7: Initialize the Nvidia GPU manager. There is no need to return an error here until NVML is used instead.
// This only runs when EnableExperimentalNvidiaGPU is set to true.
if kl.enableExperimentalNvidiaGPU {
kl.nvidiaGPUManager.Init(kl.dockerClient)
}
// Step 8: Start resource analyzer
kl.resourceAnalyzer.Start()
return nil
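
An initialization failure from the GPU manager is deliberately not fatal here, since without NVML there is little to act on. The small self-contained sketch below (with hypothetical names, not the kubelet's own types) illustrates that gating pattern: initialize only when the experimental flag is on, and log a failure instead of aborting startup.

package main

import (
	"errors"
	"log"
)

// gpuInitializer stands in for the NVIDIA GPU manager's Init step.
type gpuInitializer interface {
	Init() error
}

type fakeManager struct{ fail bool }

func (f fakeManager) Init() error {
	if f.fail {
		return errors.New("nvidiactl not found")
	}
	return nil
}

// initGPUSupport only touches the manager when the experimental flag is on,
// and treats an init failure as a warning rather than a startup error.
func initGPUSupport(enabled bool, mgr gpuInitializer) {
	if !enabled {
		return
	}
	if err := mgr.Init(); err != nil {
		log.Printf("NVIDIA GPU support disabled: %v", err)
	}
}

func main() {
	initGPUSupport(true, fakeManager{fail: true}) // logs and continues
	initGPUSupport(false, fakeManager{})          // flag off, nothing happens
}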

View File

@@ -482,6 +482,11 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity = v1.ResourceList{}
}
nvidiaGPUCapacity := 0
if kl.enableExperimentalNvidiaGPU {
nvidiaGPUCapacity = kl.nvidiaGPUManager.Capacity()
}
// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
info, err := kl.GetCachedMachineInfo()
@@ -491,7 +496,7 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI)
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(kl.nvidiaGPUs), resource.DecimalSI)
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(nvidiaGPUCapacity), resource.DecimalSI)
glog.Errorf("Error getting machine info: %v", err)
} else {
@@ -510,7 +515,7 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
int64(kl.maxPods), resource.DecimalSI)
}
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(
int64(kl.nvidiaGPUs), resource.DecimalSI)
int64(nvidiaGPUCapacity), resource.DecimalSI)
if node.Status.NodeInfo.BootID != "" &&
node.Status.NodeInfo.BootID != info.BootID {
// TODO: This requires a transaction, either both node status is updated

View File

@@ -28,6 +28,7 @@ import (
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"
@@ -48,6 +49,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/envvars"
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
"k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/server/portforward"
@@ -84,16 +86,25 @@ func (kl *Kubelet) getActivePods() []*v1.Pod {
}
// makeDevices determines the devices for the given container.
// Experimental. For now, we hardcode /dev/nvidia0 no matter what the user asks for
// (we only support one device per node).
// TODO: add support for more than 1 GPU after #28216.
func makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
// Experimental. Devices are only returned when experimental Nvidia GPU support is enabled.
func (kl *Kubelet) makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
if !kl.enableExperimentalNvidiaGPU {
return nil
}
nvidiaGPULimit := container.Resources.Limits.NvidiaGPU()
if nvidiaGPULimit.Value() != 0 {
return []kubecontainer.DeviceInfo{
{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
if nvidiaGPUPaths, err := kl.nvidiaGPUManager.AllocateGPUs(int(nvidiaGPULimit.Value())); err == nil {
devices := []kubecontainer.DeviceInfo{{PathOnHost: nvidia.NvidiaCtlDevice, PathInContainer: nvidia.NvidiaCtlDevice, Permissions: "mrw"},
{PathOnHost: nvidia.NvidiaUVMDevice, PathInContainer: nvidia.NvidiaUVMDevice, Permissions: "mrw"}}
for i, path := range nvidiaGPUPaths {
devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: "/dev/nvidia" + strconv.Itoa(i), Permissions: "mrw"})
}
return devices
}
}
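
Allocated devices keep their host paths, but inside the container they are renumbered densely from /dev/nvidia0 upward. The self-contained sketch below shows that remapping; DeviceInfo here is a local stand-in for the kubecontainer type.

package main

import (
	"fmt"
	"strconv"
)

type DeviceInfo struct {
	PathOnHost      string
	PathInContainer string
	Permissions     string
}

// makeGPUDevices mirrors the mapping above: the control devices are always added,
// and each allocated host GPU becomes the i-th GPU inside the container.
func makeGPUDevices(hostPaths []string) []DeviceInfo {
	devices := []DeviceInfo{
		{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
		{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
	}
	for i, p := range hostPaths {
		devices = append(devices, DeviceInfo{
			PathOnHost:      p,
			PathInContainer: "/dev/nvidia" + strconv.Itoa(i),
			Permissions:     "mrw",
		})
	}
	return devices
}

func main() {
	// e.g. the manager handed back the 3rd and 6th cards on the host.
	for _, d := range makeGPUDevices([]string{"/dev/nvidia2", "/dev/nvidia5"}) {
		fmt.Printf("%s -> %s (%s)\n", d.PathOnHost, d.PathInContainer, d.Permissions)
	}
}
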
@@ -285,7 +296,7 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai
opts.PortMappings = kubecontainer.MakePortMappings(container)
// TODO(random-liu): Move following convert functions into pkg/kubelet/container
opts.Devices = makeDevices(container)
opts.Devices = kl.makeDevices(container)
opts.Mounts, err = makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes)
if err != nil {

View File

@@ -150,7 +150,7 @@ func GetHollowKubeletConfig(
c.MaxContainerCount = 100
c.MaxOpenFiles = 1024
c.MaxPerPodContainerCount = 2
c.NvidiaGPUs = 0
c.EnableExperimentalNvidiaGPU = false
c.RegisterNode = true
c.RegisterSchedulable = true
c.RegistryBurst = 10