diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index 4c9d296c30f..b367aa30781 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -997,6 +997,18 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 	return nil
 }
 
+// checkPodActive checks if the given pod is still in activePods list
+func (m *ManagerImpl) checkPodActive(pod *v1.Pod) bool {
+	activePods := m.activePods()
+	for _, activePod := range activePods {
+		if activePod.UID == pod.UID {
+			return true
+		}
+	}
+
+	return false
+}
+
 // GetDeviceRunContainerOptions checks whether we have cached containerDevices
 // for the passed-in <pod, container> and returns its DeviceRunContainerOptions
 // for the found one. An empty struct is returned in case no cached state is found.
@@ -1013,6 +1025,12 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
 		if err != nil {
 			return nil, err
 		}
+
+		if !m.checkPodActive(pod) {
+			klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)
+			continue
+		}
+
 		// This is a device plugin resource yet we don't have cached
 		// resource state. This is likely due to a race during node
 		// restart. We re-issue allocate request to cover this race.
diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go
index b96b88baf6c..0345a67f8c8 100644
--- a/pkg/kubelet/cm/devicemanager/manager_test.go
+++ b/pkg/kubelet/cm/devicemanager/manager_test.go
@@ -959,6 +959,69 @@ func TestPodContainerDeviceAllocation(t *testing.T) {
 
 }
 
+func TestGetDeviceRunContainerOptions(t *testing.T) {
+	res1 := TestResource{
+		resourceName:     "domain1.com/resource1",
+		resourceQuantity: *resource.NewQuantity(int64(2), resource.DecimalSI),
+		devs:             checkpoint.DevicesPerNUMA{0: []string{"dev1", "dev2"}},
+		topology:         true,
+	}
+	res2 := TestResource{
+		resourceName:     "domain2.com/resource2",
+		resourceQuantity: *resource.NewQuantity(int64(1), resource.DecimalSI),
+		devs:             checkpoint.DevicesPerNUMA{0: []string{"dev3", "dev4"}},
+		topology:         false,
+	}
+
+	testResources := make([]TestResource, 0, 2)
+	testResources = append(testResources, res1)
+	testResources = append(testResources, res2)
+
+	podsStub := activePodsStub{
+		activePods: []*v1.Pod{},
+	}
+	as := require.New(t)
+
+	tmpDir, err := ioutil.TempDir("", "checkpoint")
+	as.Nil(err)
+	defer os.RemoveAll(tmpDir)
+
+	testManager, err := getTestManager(tmpDir, podsStub.getActivePods, testResources)
+	as.Nil(err)
+
+	pod1 := makePod(v1.ResourceList{
+		v1.ResourceName(res1.resourceName): res1.resourceQuantity,
+		v1.ResourceName(res2.resourceName): res2.resourceQuantity,
+	})
+	pod2 := makePod(v1.ResourceList{
+		v1.ResourceName(res2.resourceName): res2.resourceQuantity,
+	})
+
+	activePods := []*v1.Pod{pod1, pod2}
+	podsStub.updateActivePods(activePods)
+
+	err = testManager.Allocate(pod1, &pod1.Spec.Containers[0])
+	as.Nil(err)
+	err = testManager.Allocate(pod2, &pod2.Spec.Containers[0])
+	as.Nil(err)
+
+	// when pod is in activePods, GetDeviceRunContainerOptions should return the allocated options
+	runContainerOpts, err := testManager.GetDeviceRunContainerOptions(pod1, &pod1.Spec.Containers[0])
+	as.Nil(err)
+	as.Equal(len(runContainerOpts.Devices), 3)
+	as.Equal(len(runContainerOpts.Mounts), 2)
+	as.Equal(len(runContainerOpts.Envs), 2)
+
+	activePods = []*v1.Pod{pod2}
+	podsStub.updateActivePods(activePods)
+	testManager.UpdateAllocatedDevices()
+
+	// when pod is removed from activePods, GetDeviceRunContainerOptions should skip reallocation and return nil options
+	runContainerOpts, err = testManager.GetDeviceRunContainerOptions(pod1, &pod1.Spec.Containers[0])
+	as.Nil(err)
+	as.Nil(runContainerOpts)
+}
+
 func TestInitContainerDeviceAllocation(t *testing.T) {
 	// Requesting to create a pod that requests resourceName1 in init containers and normal containers
 	// should succeed with devices allocated to init containers reallocated to normal containers.
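Note (editor's sketch, not part of the patch): the new checkPodActive helper is just a UID membership test against the kubelet's active-pods list, which is what lets GetDeviceRunContainerOptions skip re-allocation for pods that have already been removed. Below is a minimal standalone Go illustration of that check under stated assumptions; the isPodActive name and the sample UIDs are hypothetical and only the upstream v1.Pod and metav1.ObjectMeta types are used.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// isPodActive mirrors the logic of checkPodActive in the patch above:
// a pod counts as active only while its UID still appears in the
// active-pods list.
func isPodActive(pod *v1.Pod, activePods []*v1.Pod) bool {
	for _, p := range activePods {
		if p.UID == pod.UID {
			return true
		}
	}
	return false
}

func main() {
	pod1 := &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: "uid-1"}}
	pod2 := &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: "uid-2"}}

	// pod1 has been dropped from the active list, so a device manager
	// doing this check would skip re-allocating devices for it.
	active := []*v1.Pod{pod2}
	fmt.Println(isPodActive(pod1, active)) // false
	fmt.Println(isPodActive(pod2, active)) // true
}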