Mirror of https://github.com/k3s-io/kubernetes.git
handle container restarts for GPUs
Signed-off-by: Vishnu Kannan <vishnuk@google.com>
commit 8ed9bff073 (parent ab9b299c30)
@@ -18,14 +18,16 @@ package nvidia
 
 import "k8s.io/apimachinery/pkg/util/sets"
 
+type containerToGPU map[string]sets.String
+
 // podGPUs represents a list of pod to GPU mappings.
 type podGPUs struct {
-	podGPUMapping map[string]sets.String
+	podGPUMapping map[string]containerToGPU
 }
 
 func newPodGPUs() *podGPUs {
 	return &podGPUs{
-		podGPUMapping: map[string]sets.String{},
+		podGPUMapping: make(map[string]containerToGPU),
 	}
 }
 func (pgpu *podGPUs) pods() sets.String {
@@ -36,12 +38,26 @@ func (pgpu *podGPUs) pods() sets.String {
 	return ret
 }
 
-func (pgpu *podGPUs) insert(podUID string, device string) {
+func (pgpu *podGPUs) insert(podUID, contName string, device string) {
 	if _, exists := pgpu.podGPUMapping[podUID]; !exists {
-		pgpu.podGPUMapping[podUID] = sets.NewString(device)
-	} else {
-		pgpu.podGPUMapping[podUID].Insert(device)
+		pgpu.podGPUMapping[podUID] = make(containerToGPU)
 	}
+	if _, exists := pgpu.podGPUMapping[podUID][contName]; !exists {
+		pgpu.podGPUMapping[podUID][contName] = sets.NewString()
+	}
+	pgpu.podGPUMapping[podUID][contName].Insert(device)
+}
+
+func (pgpu *podGPUs) getGPUs(podUID, contName string) sets.String {
+	containers, exists := pgpu.podGPUMapping[podUID]
+	if !exists {
+		return nil
+	}
+	devices, exists := containers[contName]
+	if !exists {
+		return nil
+	}
+	return devices
 }
 
 func (pgpu *podGPUs) delete(pods []string) {
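The two hunks above change the allocation cache from a flat pod-to-devices map into a pod-to-container-to-devices map and add a getGPUs lookup. Below is a dependency-free sketch of the resulting shape, not the kubelet's actual code: plain maps stand in for the apimachinery sets.String type, and all names and values are illustrative.

package main

import "fmt"

// podGPUCache: pod UID -> container name -> set of device paths.
type podGPUCache map[string]map[string]map[string]bool

func (c podGPUCache) insert(podUID, contName, device string) {
	if c[podUID] == nil {
		c[podUID] = map[string]map[string]bool{}
	}
	if c[podUID][contName] == nil {
		c[podUID][contName] = map[string]bool{}
	}
	c[podUID][contName][device] = true
}

func (c podGPUCache) getGPUs(podUID, contName string) map[string]bool {
	return c[podUID][contName]
}

func main() {
	cache := podGPUCache{}
	cache.insert("pod-1", "trainer", "/dev/nvidia0")

	// Keyed by pod UID and container name, so a restarted container finds its
	// earlier devices instead of triggering a fresh allocation.
	fmt.Println(cache.getGPUs("pod-1", "trainer")) // map[/dev/nvidia0:true]
	fmt.Println(cache.getGPUs("pod-1", "sidecar")) // map[] (nil: nothing allocated)
}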
@@ -52,8 +68,10 @@ func (pgpu *podGPUs) delete(pods []string) {
 
 func (pgpu *podGPUs) devices() sets.String {
 	ret := sets.NewString()
-	for _, devices := range pgpu.podGPUMapping {
-		ret = ret.Union(devices)
+	for _, containerToGPU := range pgpu.podGPUMapping {
+		for _, deviceSet := range containerToGPU {
+			ret = ret.Union(deviceSet)
+		}
 	}
 	return ret
 }
@@ -152,6 +152,12 @@ func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) (
 			return nil, fmt.Errorf("Failed to allocate GPUs because of issues with updating GPUs in use: %v", err)
 		}
 	}
+	// Check if GPUs have already been allocated. If so return them right away.
+	// This can happen if a container restarts for example.
+	if devices := ngm.allocated.getGPUs(string(pod.UID), container.Name); devices != nil {
+		glog.V(2).Infof("Found pre-allocated GPUs for container %q in Pod %q: %v", container.Name, pod.UID, devices.List())
+		return append(devices.List(), ngm.defaultDevices...), nil
+	}
 	// Get GPU devices in use.
 	devicesInUse := ngm.allocated.devices()
 	glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
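The hunk above makes AllocateGPU idempotent per container: before searching for free devices it asks the allocated cache whether this pod/container pair already holds GPUs and, if so, returns them (plus the default devices) immediately. A minimal, dependency-free sketch of that pattern follows, with illustrative names rather than the kubelet's actual types.

package main

import (
	"errors"
	"fmt"
)

// gpuCache: pod UID -> container name -> allocated device paths.
type gpuCache map[string]map[string][]string

// allocate reuses a prior allocation for the same pod/container (a restart)
// before handing out anything new, mirroring the check added above.
func allocate(cache gpuCache, podUID, contName string, free, defaults []string) ([]string, error) {
	if prev, ok := cache[podUID][contName]; ok {
		return append(append([]string{}, prev...), defaults...), nil
	}
	if len(free) == 0 {
		return nil, errors.New("no free GPUs")
	}
	if cache[podUID] == nil {
		cache[podUID] = map[string][]string{}
	}
	cache[podUID][contName] = []string{free[0]}
	return append([]string{free[0]}, defaults...), nil
}

func main() {
	cache := gpuCache{}
	first, _ := allocate(cache, "pod-1", "trainer", []string{"/dev/nvidia0"}, []string{"/dev/nvidia-smi"})
	// A second call for the same pod/container, with no free GPUs left, still
	// returns the original device because the cache is consulted first.
	again, _ := allocate(cache, "pod-1", "trainer", nil, []string{"/dev/nvidia-smi"})
	fmt.Println(first) // [/dev/nvidia0 /dev/nvidia-smi]
	fmt.Println(again) // [/dev/nvidia0 /dev/nvidia-smi]
}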
@@ -164,7 +170,7 @@ func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) (
 	ret := available.UnsortedList()[:gpusNeeded]
 	for _, device := range ret {
 		// Update internal allocated GPU cache.
-		ngm.allocated.insert(string(pod.UID), device)
+		ngm.allocated.insert(string(pod.UID), container.Name, device)
 	}
 	// Add standard devices files that needs to be exposed.
 	ret = append(ret, ngm.defaultDevices...)
@@ -222,9 +228,13 @@ func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
 	if err != nil {
 		return nil, err
 	}
+	type containerIdentifier struct {
+		id   string
+		name string
+	}
 	type podContainers struct {
 		uid string
-		containerIDs sets.String
+		containers []containerIdentifier
 	}
 	// List of containers to inspect.
 	podContainersToInspect := []podContainers{}
@@ -240,21 +250,23 @@ func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
 		if containers.Len() == 0 {
 			continue
 		}
-		containerIDs := sets.NewString()
+		// TODO: If kubelet restarts right after allocating a GPU to a pod, the container might not have started yet and so container status might not be available yet.
+		// Use an internal checkpoint instead or try using the CRI if its checkpoint is reliable.
+		var containersToInspect []containerIdentifier
 		for _, container := range pod.Status.ContainerStatuses {
 			if containers.Has(container.Name) {
-				containerIDs.Insert(container.ContainerID)
+				containersToInspect = append(containersToInspect, containerIdentifier{container.ContainerID, container.Name})
 			}
 		}
 		// add the pod and its containers that need to be inspected.
-		podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containerIDs})
+		podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containersToInspect})
 	}
 	ret := newPodGPUs()
 	for _, podContainer := range podContainersToInspect {
-		for _, containerId := range podContainer.containerIDs.List() {
-			containerJSON, err := ngm.dockerClient.InspectContainer(containerId)
+		for _, containerIdentifier := range podContainer.containers {
+			containerJSON, err := ngm.dockerClient.InspectContainer(containerIdentifier.id)
 			if err != nil {
-				glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerId, podContainer.uid)
+				glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerIdentifier.id, podContainer.uid)
 				continue
 			}
 
@@ -266,7 +278,7 @@ func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
 			for _, device := range devices {
 				if isValidPath(device.PathOnHost) {
 					glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID)
-					ret.insert(podContainer.uid, device.PathOnHost)
+					ret.insert(podContainer.uid, containerIdentifier.name, device.PathOnHost)
 				}
 			}
 		}
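gpusInUse() feeds the allocation path shown earlier: the union returned by the reworked devices() is what AllocateGPU subtracts from the full device list to find free GPUs. The diff does not show that subtraction, but with the same sets package it is presumably along these lines; the values here are made up for illustration.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

func main() {
	// Everything the manager knows about vs. the union reported in use;
	// the difference is what remains free to allocate.
	allGPUs := sets.NewString("/dev/nvidia0", "/dev/nvidia1")
	devicesInUse := sets.NewString("/dev/nvidia0")
	available := allGPUs.Difference(devicesInUse)
	fmt.Println(available.List()) // [/dev/nvidia1]
}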
@@ -36,8 +36,8 @@ func (tapl *testActivePodsLister) GetRunningPods() ([]*v1.Pod, error) {
 	return tapl.activePods, nil
 }
 
-func makeTestPod(numContainers int) *v1.Pod {
-	quantity := resource.NewQuantity(1, resource.DecimalSI)
+func makeTestPod(numContainers, gpusPerContainer int) *v1.Pod {
+	quantity := resource.NewQuantity(int64(gpusPerContainer), resource.DecimalSI)
 	resources := v1.ResourceRequirements{
 		Limits: v1.ResourceList{
 			v1.ResourceNvidiaGPU: *quantity,
@@ -53,6 +53,7 @@ func makeTestPod(numContainers int) *v1.Pod {
 	}
 	for ; numContainers > 0; numContainers-- {
 		pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
+			Name:      string(uuid.NewUUID()),
 			Resources: resources,
 		})
 	}
@@ -75,7 +76,7 @@ func TestMultiContainerPodGPUAllocation(t *testing.T) {
 	as.Equal(len(gpusInUse.devices()), 0)
 
 	// Allocated GPUs for a pod with two containers.
-	pod := makeTestPod(2)
+	pod := makeTestPod(2, 1)
 	// Allocate for the first container.
 	devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0])
 	as.Nil(err)
@@ -90,7 +91,7 @@ func TestMultiContainerPodGPUAllocation(t *testing.T) {
 	as.NotEqual(devices1, devices2, "expected containers to get different devices")
 
 	// further allocations should fail.
-	newPod := makeTestPod(2)
+	newPod := makeTestPod(2, 1)
 	devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
 	as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1)
 
@@ -126,7 +127,7 @@ func TestMultiPodGPUAllocation(t *testing.T) {
 	as.Equal(len(gpusInUse.devices()), 0)
 
 	// Allocated GPUs for a pod with two containers.
-	podA := makeTestPod(1)
+	podA := makeTestPod(1, 1)
 	// Allocate for the first container.
 	devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
 	as.Nil(err)
@@ -135,10 +136,48 @@ func TestMultiPodGPUAllocation(t *testing.T) {
 	podLister.activePods = append(podLister.activePods, podA)
 
 	// further allocations should fail.
-	podB := makeTestPod(1)
+	podB := makeTestPod(1, 1)
 	// Allocate for the first container.
 	devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
 	as.Nil(err)
 	as.Equal(len(devicesB), 1)
 	as.NotEqual(devicesA, devicesB, "expected pods to get different devices")
 }
+
+func TestPodContainerRestart(t *testing.T) {
+	podLister := &testActivePodsLister{}
+
+	testGpuManager := &nvidiaGPUManager{
+		activePodsLister: podLister,
+		allGPUs:          sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
+		allocated:        newPodGPUs(),
+		defaultDevices:   []string{"/dev/nvidia-smi"},
+	}
+
+	// Expect that no devices are in use.
+	gpusInUse, err := testGpuManager.gpusInUse()
+	as := assert.New(t)
+	as.Nil(err)
+	as.Equal(len(gpusInUse.devices()), 0)
+
+	// Make a pod with one container that requests two GPUs.
+	podA := makeTestPod(1, 2)
+	// Allocate GPUs
+	devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
+	as.Nil(err)
+	as.Equal(len(devicesA), 3)
+
+	podLister.activePods = append(podLister.activePods, podA)
+
+	// further allocations should fail.
+	podB := makeTestPod(1, 1)
+	_, err = testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
+	as.NotNil(err)
+
+	// Allocate GPUs for existing Pod A.
+	// The same gpus must be returned.
+	devicesAretry, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
+	as.Nil(err)
+	as.Equal(len(devicesA), 3)
+	as.True(sets.NewString(devicesA...).Equal(sets.NewString(devicesAretry...)))
+}
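A note on the assertion as.Equal(len(devicesA), 3) in the new test: the container requests two GPUs, and AllocateGPU appends the single defaultDevices entry (/dev/nvidia-smi) configured above, so three device paths come back. A trivial recreation of that arithmetic, with made-up values:

package main

import "fmt"

func main() {
	allocated := []string{"/dev/nvidia0", "/dev/nvidia1"} // the two GPUs the container requested
	defaultDevices := []string{"/dev/nvidia-smi"}         // appended to every AllocateGPU response
	response := append(allocated, defaultDevices...)
	fmt.Println(len(response)) // 3
}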
@@ -18,6 +18,7 @@ package e2e_node
 
 import (
 	"fmt"
+	"time"
 
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -79,6 +80,20 @@ var _ = framework.KubeDescribe("GPU [Serial]", func() {
 		podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
 		podSuccess = f.PodClient().CreateSync(podSuccess)
 
+		By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused")
+		const minContainerRestartCount = 2
+		Eventually(func() bool {
+			p, err := f.ClientSet.Core().Pods(f.Namespace.Name).Get(podSuccess.Name, metav1.GetOptions{})
+			if err != nil {
+				framework.Logf("failed to get pod status: %v", err)
+				return false
+			}
+			if p.Status.ContainerStatuses[0].RestartCount < minContainerRestartCount {
+				return false
+			}
+			return true
+		}, time.Minute, time.Second).Should(BeTrue())
+
 		By("Checking if the pod outputted Success to its logs")
 		framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
 
|
|||||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
|
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
|
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success", gpus)
|
||||||
return &v1.Pod{
|
return &v1.Pod{
|
||||||
ObjectMeta: metav1.ObjectMeta{
|
ObjectMeta: metav1.ObjectMeta{
|
||||||
Name: name,
|
Name: name,
|
||||||
},
|
},
|
||||||
Spec: v1.PodSpec{
|
Spec: v1.PodSpec{
|
||||||
|
RestartPolicy: v1.RestartPolicyAlways,
|
||||||
Containers: []v1.Container{
|
Containers: []v1.Container{
|
||||||
{
|
{
|
||||||
Image: "gcr.io/google_containers/busybox:1.24",
|
Image: "gcr.io/google_containers/busybox:1.24",
|
||||||
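The e2e changes exercise the restart path end to end: dropping the trailing sleep lets the busybox container exit right after echoing Success, RestartPolicyAlways makes the kubelet restart it, and the new Eventually block waits until the container has restarted at least twice, which (per the test's own description) is what demonstrates that GPUs are re-exposed across restarts. For readers unfamiliar with the Gomega helper, here is a framework-free sketch of that polling pattern; the function and names are hypothetical, not part of the test framework.

package main

import (
	"fmt"
	"time"
)

// waitForRestarts polls a restart counter until it reaches min or the timeout
// expires, roughly what the Eventually block above does via the framework.
func waitForRestarts(getCount func() int, min int, timeout, interval time.Duration) bool {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if getCount() >= min {
			return true
		}
		time.Sleep(interval)
	}
	return false
}

func main() {
	count := 0
	ok := waitForRestarts(func() int { count++; return count }, 2, time.Second, 10*time.Millisecond)
	fmt.Println(ok) // true once the simulated counter reaches 2
}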