Mirror of https://github.com/k3s-io/kubernetes.git
Fix a bug in NVIDIA GPU allocation and add a unit test
Signed-off-by: Vishnu kannan <vishnuk@google.com>
Commit: 13582a65aa (parent: 2554b95994)
pkg/kubelet/gpu/nvidia/BUILD
@@ -5,6 +5,7 @@ licenses(["notice"])
 load(
     "@io_bazel_rules_go//go:def.bzl",
     "go_library",
+    "go_test",
 )

 go_library(
@@ -36,3 +37,18 @@ filegroup(
     srcs = [":package-srcs"],
     tags = ["automanaged"],
 )
+
+go_test(
+    name = "go_default_test",
+    srcs = ["nvidia_gpu_manager_test.go"],
+    library = ":go_default_library",
+    tags = ["automanaged"],
+    deps = [
+        "//pkg/api/v1:go_default_library",
+        "//vendor:github.com/stretchr/testify/assert",
+        "//vendor:k8s.io/apimachinery/pkg/api/resource",
+        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
+        "//vendor:k8s.io/apimachinery/pkg/util/sets",
+        "//vendor:k8s.io/apimachinery/pkg/util/uuid",
+    ],
+)
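With the go_test rule in place, the new test should be invocable through Bazel; a usage sketch, with the package path assumed from the test file's location rather than stated in this diff (plain go test over the same directory works as well):

    bazel test //pkg/kubelet/gpu/nvidia:go_default_test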
pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go
@@ -53,7 +53,7 @@ func (pgpu *podGPUs) delete(pods []string) {
 func (pgpu *podGPUs) devices() sets.String {
     ret := sets.NewString()
     for _, devices := range pgpu.podGPUMapping {
-        ret.Union(devices)
+        ret = ret.Union(devices)
     }
     return ret
 }
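This hunk is the core allocation bug: sets.String.Union returns a new set and does not modify its receiver, so the old loop discarded every union and devices() always returned an empty set, allowing GPUs to be handed out twice. A minimal standalone sketch of the semantics, using the same apimachinery sets package the diff depends on:

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/util/sets"
)

func main() {
    ret := sets.NewString()
    devices := sets.NewString("/dev/nvidia0", "/dev/nvidia1")

    ret.Union(devices)      // result discarded; ret is still empty
    fmt.Println(ret.List()) // []

    ret = ret.Union(devices) // the fix: keep the returned set
    fmt.Println(ret.List())  // [/dev/nvidia0 /dev/nvidia1]
}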
@@ -80,7 +80,7 @@ func NewNvidiaGPUManager(activePodsLister activePodsLister, dockerClient dockert
 // Initialize the GPU devices, so far only needed to discover the GPU paths.
 func (ngm *nvidiaGPUManager) Start() error {
     if ngm.dockerClient == nil {
-        return fmt.Errorf("invalid docker client specified")
+        return fmt.Errorf("Invalid docker client specified in GPU Manager")
     }
     ngm.Lock()
     defer ngm.Unlock()
@@ -94,7 +94,7 @@ func (ngm *nvidiaGPUManager) Start() error {
     }
     ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice}
     _, err := os.Stat(nvidiaUVMToolsDevice)
-    if os.IsNotExist(err) {
+    if !os.IsNotExist(err) {
         ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice)
     }

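The second fix inverts an existence check: os.IsNotExist(err) is true when Stat failed because the file is absent, so the old code registered the UVM tools device only when it was missing. A hedged sketch of the corrected logic; the device paths here are assumptions for illustration, not taken from this diff:

package main

import (
    "fmt"
    "os"
)

func main() {
    // Assumed values; the real constants live in the nvidia package.
    const nvidiaUVMToolsDevice = "/dev/nvidia-uvm-tools"
    defaultDevices := []string{"/dev/nvidiactl", "/dev/nvidia-uvm"}

    _, err := os.Stat(nvidiaUVMToolsDevice)
    if !os.IsNotExist(err) {
        // Reached when the device exists (err == nil) or Stat failed
        // for some reason other than absence.
        defaultDevices = append(defaultDevices, nvidiaUVMToolsDevice)
    }
    fmt.Println(defaultDevices)
}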
@@ -154,12 +154,14 @@ func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) (
     }
     // Get GPU devices in use.
     devicesInUse := ngm.allocated.devices()
+    glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
     // Get a list of available GPUs.
     available := ngm.allGPUs.Difference(devicesInUse)
+    glog.V(5).Infof("gpus available: %v", available.List())
     if int64(available.Len()) < gpusNeeded {
         return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
     }
-    ret := available.List()[:gpusNeeded]
+    ret := available.UnsortedList()[:gpusNeeded]
     for _, device := range ret {
         // Update internal allocated GPU cache.
         ngm.allocated.insert(string(pod.UID), device)
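Besides the new V(5) logging, List() is swapped for UnsortedList(): both copy the set's keys into a slice, but List() also sorts them, which is wasted work when any N free devices will do. A quick sketch of the difference:

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/util/sets"
)

func main() {
    s := sets.NewString("/dev/nvidia1", "/dev/nvidia0")
    fmt.Println(s.List())         // [/dev/nvidia0 /dev/nvidia1], always sorted
    fmt.Println(s.UnsortedList()) // same elements, map-iteration order
}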
@@ -184,6 +186,7 @@ func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
     }
     allocatedPodUids := ngm.allocated.pods()
     podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
+    glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
     ngm.allocated.delete(podsToBeRemoved.List())
     return nil
 }
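All three new log lines are at glog verbosity level 5, so they are emitted only when the kubelet runs with a log level of at least 5; for example (flag shown as typical usage, not taken from this diff):

    kubelet --v=5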
pkg/kubelet/gpu/nvidia/nvidia_gpu_manager_test.go (new file, 144 lines)
@@ -0,0 +1,144 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nvidia
+
+import (
+    "testing"
+
+    "github.com/stretchr/testify/assert"
+
+    "k8s.io/apimachinery/pkg/api/resource"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/util/sets"
+    "k8s.io/apimachinery/pkg/util/uuid"
+    "k8s.io/kubernetes/pkg/api/v1"
+)
+
+type testActivePodsLister struct {
+    activePods []*v1.Pod
+}
+
+func (tapl *testActivePodsLister) GetRunningPods() ([]*v1.Pod, error) {
+    return tapl.activePods, nil
+}
+
+func makeTestPod(numContainers int) *v1.Pod {
+    quantity := resource.NewQuantity(1, resource.DecimalSI)
+    resources := v1.ResourceRequirements{
+        Limits: v1.ResourceList{
+            v1.ResourceNvidiaGPU: *quantity,
+        },
+    }
+    pod := &v1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            UID: uuid.NewUUID(),
+        },
+        Spec: v1.PodSpec{
+            Containers: []v1.Container{},
+        },
+    }
+    for ; numContainers > 0; numContainers-- {
+        pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
+            Resources: resources,
+        })
+    }
+    return pod
+}
+
+func TestMultiContainerPodGPUAllocation(t *testing.T) {
+    podLister := &testActivePodsLister{}
+
+    testGpuManager := &nvidiaGPUManager{
+        activePodsLister: podLister,
+        allGPUs:          sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
+        allocated:        newPodGPUs(),
+    }
+
+    // Expect that no devices are in use.
+    gpusInUse, err := testGpuManager.gpusInUse()
+    as := assert.New(t)
+    as.Nil(err)
+    as.Equal(len(gpusInUse.devices()), 0)
+
+    // Allocate GPUs for a pod with two containers.
+    pod := makeTestPod(2)
+    // Allocate for the first container.
+    devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0])
+    as.Nil(err)
+    as.Equal(len(devices1), 1)
+
+    podLister.activePods = append(podLister.activePods, pod)
+    // Allocate for the second container.
+    devices2, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[1])
+    as.Nil(err)
+    as.Equal(len(devices2), 1)
+
+    as.NotEqual(devices1, devices2, "expected containers to get different devices")
+
+    // Further allocations should fail.
+    newPod := makeTestPod(2)
+    devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
+    as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1)
+
+    // Now terminate the original pod and observe that GPU allocation for the new pod succeeds.
+    podLister.activePods = podLister.activePods[:0]
+
+    devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
+    as.Nil(err)
+    as.Equal(len(devices1), 1)
+
+    podLister.activePods = append(podLister.activePods, newPod)
+
+    devices2, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[1])
+    as.Nil(err)
+    as.Equal(len(devices2), 1)
+
+    as.NotEqual(devices1, devices2, "expected containers to get different devices")
+}
+
+func TestMultiPodGPUAllocation(t *testing.T) {
+    podLister := &testActivePodsLister{}
+
+    testGpuManager := &nvidiaGPUManager{
+        activePodsLister: podLister,
+        allGPUs:          sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
+        allocated:        newPodGPUs(),
+    }
+
+    // Expect that no devices are in use.
+    gpusInUse, err := testGpuManager.gpusInUse()
+    as := assert.New(t)
+    as.Nil(err)
+    as.Equal(len(gpusInUse.devices()), 0)
+
+    // Allocate a GPU for a pod with one container.
+    podA := makeTestPod(1)
+    // Allocate for the first container.
+    devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
+    as.Nil(err)
+    as.Equal(len(devicesA), 1)
+
+    podLister.activePods = append(podLister.activePods, podA)
+
+    // A second pod should be granted the remaining device.
+    podB := makeTestPod(1)
+    // Allocate for the first container.
+    devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
+    as.Nil(err)
+    as.Equal(len(devicesB), 1)
+    as.NotEqual(devicesA, devicesB, "expected pods to get different devices")
+}
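Both tests exercise the devices() fix directly: without the reassignment to ret, devicesInUse would always come back empty, and the second allocation in each test could be handed an already-assigned device instead of the remaining free one, failing the NotEqual assertions.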
@@ -400,10 +400,9 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
             v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
         },
         Allocatable: v1.ResourceList{
             v1.ResourceCPU:    *resource.NewMilliQuantity(2800, resource.DecimalSI),
             v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
             v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
-            v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
         },
     },
 }