skip deleted activePods and return nil

This commit is contained in:
waynepeking348 2022-03-27 20:35:00 +08:00 committed by shaowei.wayne
parent 35a456b0c6
commit 6157d3cc4a
2 changed files with 40 additions and 18 deletions

View File

@ -1014,11 +1014,6 @@ func (m *ManagerImpl) checkPodActive(pod *v1.Pod) bool {
// for the found one. An empty struct is returned in case no cached state is found.
func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) {
podUID := string(pod.UID)
if !m.checkPodActive(pod) {
klog.Warningf("pod %s has been deleted from activePods, skip getting device run options", podUID)
return nil, fmt.Errorf("pod %v is removed from activePods list", podUID)
}
contName := container.Name
needsReAllocate := false
for k, v := range container.Resources.Limits {
@ -1030,6 +1025,12 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
if err != nil {
return nil, err
}
if !m.checkPodActive(pod) {
klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)
continue
}
// This is a device plugin resource yet we don't have cached
// resource state. This is likely due to a race during node
// restart. We re-issue allocate request to cover this race.

View File

@ -960,13 +960,23 @@ func TestPodContainerDeviceAllocation(t *testing.T) {
}
func TestGetDeviceRunContainerOptions(t *testing.T) {
res := TestResource{
res1 := TestResource{
resourceName: "domain1.com/resource1",
resourceQuantity: *resource.NewQuantity(int64(2), resource.DecimalSI),
devs: checkpoint.DevicesPerNUMA{0: []string{"dev1", "dev2"}},
topology: true,
}
testResources := []TestResource{res}
res2 := TestResource{
resourceName: "domain2.com/resource2",
resourceQuantity: *resource.NewQuantity(int64(1), resource.DecimalSI),
devs: checkpoint.DevicesPerNUMA{0: []string{"dev3", "dev4"}},
topology: false,
}
testResources := make([]TestResource, 2)
testResources = append(testResources, res1)
testResources = append(testResources, res2)
podsStub := activePodsStub{
activePods: []*v1.Pod{},
}
@ -979,26 +989,37 @@ func TestGetDeviceRunContainerOptions(t *testing.T) {
testManager, err := getTestManager(tmpDir, podsStub.getActivePods, testResources)
as.Nil(err)
pod := makePod(v1.ResourceList{v1.ResourceName(res.resourceName): res.resourceQuantity})
activePods := []*v1.Pod{pod}
pod1 := makePod(v1.ResourceList{
v1.ResourceName(res1.resourceName): res1.resourceQuantity,
v1.ResourceName(res2.resourceName): res2.resourceQuantity,
})
pod2 := makePod(v1.ResourceList{
v1.ResourceName(res2.resourceName): res2.resourceQuantity,
})
activePods := []*v1.Pod{pod1, pod2}
podsStub.updateActivePods(activePods)
err = testManager.Allocate(pod, &pod.Spec.Containers[0])
err = testManager.Allocate(pod1, &pod1.Spec.Containers[0])
as.Nil(err)
err = testManager.Allocate(pod2, &pod2.Spec.Containers[0])
as.Nil(err)
// when pod is in activePods, GetDeviceRunContainerOptions should return the allocated device run options
_, err = testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0])
runContainerOpts, err := testManager.GetDeviceRunContainerOptions(pod1, &pod1.Spec.Containers[0])
as.Nil(err)
as.Equal(len(runContainerOpts.Devices), 3)
as.Equal(len(runContainerOpts.Mounts), 2)
as.Equal(len(runContainerOpts.Envs), 2)
activePods = []*v1.Pod{}
activePods = []*v1.Pod{pod2}
podsStub.updateActivePods(activePods)
testManager.UpdateAllocatedDevices()
// when pod is removed from activePods, GetDeviceRunContainerOptions should skip reAllocate and return nil
_, err = testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0])
expectedErr := fmt.Errorf("pod %v is removed from activePods list", pod.UID)
as.NotNil(err)
if !reflect.DeepEqual(err, expectedErr) {
t.Errorf("GetDeviceRunContainerOptions. expected error: %v but got: %v", expectedErr, err)
}
runContainerOpts, err = testManager.GetDeviceRunContainerOptions(pod1, &pod1.Spec.Containers[0])
as.Nil(err)
as.Nil(runContainerOpts)
}
func TestInitContainerDeviceAllocation(t *testing.T) {