From 35a456b0c62c7cfcff6ce6e03e88c38ac54a6062 Mon Sep 17 00:00:00 2001 From: waynepeking348 Date: Sun, 20 Mar 2022 20:45:41 +0800 Subject: [PATCH 1/2] skip reallocate logic if pod is already removed --- pkg/kubelet/cm/devicemanager/manager.go | 17 ++++++++ pkg/kubelet/cm/devicemanager/manager_test.go | 42 ++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index 4c9d296c30f..e44561a1257 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -997,11 +997,28 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont return nil } +// checkPodActive checks if the given pod is still in activePods list +func (m *ManagerImpl) checkPodActive(pod *v1.Pod) bool { + activePods := m.activePods() + for _, activePod := range activePods { + if activePod.UID == pod.UID { + return true + } + } + + return false +} + // GetDeviceRunContainerOptions checks whether we have cached containerDevices // for the passed-in and returns its DeviceRunContainerOptions // for the found one. An empty struct is returned in case no cached state is found. func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) { podUID := string(pod.UID) + if !m.checkPodActive(pod) { + klog.Warningf("pod %s has been deleted from activePods, skip getting device run options", podUID) + return nil, fmt.Errorf("pod %v is removed from activePods list", podUID) + } + contName := container.Name needsReAllocate := false for k, v := range container.Resources.Limits { diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go index b96b88baf6c..4a9bad72daf 100644 --- a/pkg/kubelet/cm/devicemanager/manager_test.go +++ b/pkg/kubelet/cm/devicemanager/manager_test.go @@ -959,6 +959,48 @@ func TestPodContainerDeviceAllocation(t *testing.T) { } +func TestGetDeviceRunContainerOptions(t *testing.T) { + res := TestResource{ + resourceName: "domain1.com/resource1", + resourceQuantity: *resource.NewQuantity(int64(2), resource.DecimalSI), + devs: checkpoint.DevicesPerNUMA{0: []string{"dev1", "dev2"}}, + topology: true, + } + testResources := []TestResource{res} + podsStub := activePodsStub{ + activePods: []*v1.Pod{}, + } + as := require.New(t) + + tmpDir, err := ioutil.TempDir("", "checkpoint") + as.Nil(err) + defer os.RemoveAll(tmpDir) + + testManager, err := getTestManager(tmpDir, podsStub.getActivePods, testResources) + as.Nil(err) + + pod := makePod(v1.ResourceList{v1.ResourceName(res.resourceName): res.resourceQuantity}) + activePods := []*v1.Pod{pod} + podsStub.updateActivePods(activePods) + + err = testManager.Allocate(pod, &pod.Spec.Containers[0]) + as.Nil(err) + + // when pod is in activePods, GetDeviceRunContainerOptions should return + _, err = testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0]) + as.Nil(err) + + activePods = []*v1.Pod{} + podsStub.updateActivePods(activePods) + // when pod is removed from activePods,G etDeviceRunContainerOptions should return error + _, err = testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0]) + expectedErr := fmt.Errorf("pod %v is removed from activePods list", pod.UID) + as.NotNil(err) + if !reflect.DeepEqual(err, expectedErr) { + t.Errorf("GetDeviceRunContainerOptions. expected error: %v but got: %v", expectedErr, err) + } +} + func TestInitContainerDeviceAllocation(t *testing.T) { // Requesting to create a pod that requests resourceName1 in init containers and normal containers // should succeed with devices allocated to init containers reallocated to normal containers. From 6157d3cc4a11d3e5bf587c320cf1c39b2dcdd429 Mon Sep 17 00:00:00 2001 From: waynepeking348 Date: Sun, 27 Mar 2022 20:35:00 +0800 Subject: [PATCH 2/2] skip deleted activePods and return nil --- pkg/kubelet/cm/devicemanager/manager.go | 11 ++--- pkg/kubelet/cm/devicemanager/manager_test.go | 47 ++++++++++++++------ 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index e44561a1257..b367aa30781 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -1014,11 +1014,6 @@ func (m *ManagerImpl) checkPodActive(pod *v1.Pod) bool { // for the found one. An empty struct is returned in case no cached state is found. func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) { podUID := string(pod.UID) - if !m.checkPodActive(pod) { - klog.Warningf("pod %s has been deleted from activePods, skip getting device run options", podUID) - return nil, fmt.Errorf("pod %v is removed from activePods list", podUID) - } - contName := container.Name needsReAllocate := false for k, v := range container.Resources.Limits { @@ -1030,6 +1025,12 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co if err != nil { return nil, err } + + if !m.checkPodActive(pod) { + klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID) + continue + } + // This is a device plugin resource yet we don't have cached // resource state. This is likely due to a race during node // restart. We re-issue allocate request to cover this race. diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go index 4a9bad72daf..0345a67f8c8 100644 --- a/pkg/kubelet/cm/devicemanager/manager_test.go +++ b/pkg/kubelet/cm/devicemanager/manager_test.go @@ -960,13 +960,23 @@ func TestPodContainerDeviceAllocation(t *testing.T) { } func TestGetDeviceRunContainerOptions(t *testing.T) { - res := TestResource{ + res1 := TestResource{ resourceName: "domain1.com/resource1", resourceQuantity: *resource.NewQuantity(int64(2), resource.DecimalSI), devs: checkpoint.DevicesPerNUMA{0: []string{"dev1", "dev2"}}, topology: true, } - testResources := []TestResource{res} + res2 := TestResource{ + resourceName: "domain2.com/resource2", + resourceQuantity: *resource.NewQuantity(int64(1), resource.DecimalSI), + devs: checkpoint.DevicesPerNUMA{0: []string{"dev3", "dev4"}}, + topology: false, + } + + testResources := make([]TestResource, 2) + testResources = append(testResources, res1) + testResources = append(testResources, res2) + podsStub := activePodsStub{ activePods: []*v1.Pod{}, } @@ -979,26 +989,37 @@ func TestGetDeviceRunContainerOptions(t *testing.T) { testManager, err := getTestManager(tmpDir, podsStub.getActivePods, testResources) as.Nil(err) - pod := makePod(v1.ResourceList{v1.ResourceName(res.resourceName): res.resourceQuantity}) - activePods := []*v1.Pod{pod} + pod1 := makePod(v1.ResourceList{ + v1.ResourceName(res1.resourceName): res1.resourceQuantity, + v1.ResourceName(res2.resourceName): res2.resourceQuantity, + }) + pod2 := makePod(v1.ResourceList{ + v1.ResourceName(res2.resourceName): res2.resourceQuantity, + }) + + activePods := []*v1.Pod{pod1, pod2} podsStub.updateActivePods(activePods) - err = testManager.Allocate(pod, &pod.Spec.Containers[0]) + err = testManager.Allocate(pod1, &pod1.Spec.Containers[0]) + as.Nil(err) + err = testManager.Allocate(pod2, &pod2.Spec.Containers[0]) as.Nil(err) // when pod is in activePods, GetDeviceRunContainerOptions should return - _, err = testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0]) + runContainerOpts, err := testManager.GetDeviceRunContainerOptions(pod1, &pod1.Spec.Containers[0]) as.Nil(err) + as.Equal(len(runContainerOpts.Devices), 3) + as.Equal(len(runContainerOpts.Mounts), 2) + as.Equal(len(runContainerOpts.Envs), 2) - activePods = []*v1.Pod{} + activePods = []*v1.Pod{pod2} podsStub.updateActivePods(activePods) + testManager.UpdateAllocatedDevices() + // when pod is removed from activePods,G etDeviceRunContainerOptions should return error - _, err = testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0]) - expectedErr := fmt.Errorf("pod %v is removed from activePods list", pod.UID) - as.NotNil(err) - if !reflect.DeepEqual(err, expectedErr) { - t.Errorf("GetDeviceRunContainerOptions. expected error: %v but got: %v", expectedErr, err) - } + runContainerOpts, err = testManager.GetDeviceRunContainerOptions(pod1, &pod1.Spec.Containers[0]) + as.Nil(err) + as.Nil(runContainerOpts) } func TestInitContainerDeviceAllocation(t *testing.T) {