[FG:InPlacePodVerticalScaling] surface pod resize actuation errors in pod resize conditions (#130902)

* surface pod resize actuation errors in pod resize conditions

* update kubelet unit tests

* fix all other kubelet unit tests

* fix linter error

* address feedback

* fix test failure
Natasha Sarkar 2025-03-19 22:12:54 -05:00 committed by GitHub
parent d8607b91a7
commit 16abcd78bd
8 changed files with 217 additions and 43 deletions
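
For context on what "surface pod resize actuation errors in pod resize conditions" means in practice: when in-place resize actuation fails, the kubelet now records the runtime's error on the pod's PodResizeInProgress condition instead of only logging it. Below is a minimal sketch of the resulting condition shape, mirroring the expectations encoded in the new TestSyncPodWithErrorsDuringInPlacePodResize further down; the package name and variable are illustrative only, not part of the change.

package example

import v1 "k8s.io/api/core/v1"

// failedResizeCondition shows the condition the kubelet caches when a
// ResizePodInPlace sync action fails; the message comes from the runtime.
var failedResizeCondition = v1.PodCondition{
	Type:    v1.PodResizeInProgress,
	Status:  v1.ConditionTrue,
	Reason:  v1.PodReasonError,
	Message: "could not resize pod",
}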

@@ -45,6 +45,8 @@ var (
ErrConfigPodSandbox = errors.New("ConfigPodSandboxError")
// ErrKillPodSandbox returned when runtime failed to stop pod's sandbox.
ErrKillPodSandbox = errors.New("KillPodSandboxError")
// ErrResizePodInPlace returned when runtime failed to resize a pod.
ErrResizePodInPlace = errors.New("ResizePodInPlaceError")
)
// SyncAction indicates different kind of actions in SyncPod() and KillPod(). Now there are only actions
@@ -68,6 +70,8 @@ const (
ConfigPodSandbox SyncAction = "ConfigPodSandbox"
// KillPodSandbox action
KillPodSandbox SyncAction = "KillPodSandbox"
// ResizePodInPlace action is included whenever any containers in the pod are resized without restart
ResizePodInPlace SyncAction = "ResizePodInPlace"
)
// SyncResult is the result of sync action.
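
The new ErrResizePodInPlace sentinel and the ResizePodInPlace action are combined into a SyncResult by the runtime manager (see doPodResizeAction below). A minimal sketch of that pattern, assuming a hypothetical helper resizeResultFor; the real code uses format.Pod(pod) as the target:

package example

import kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"

// resizeResultFor builds the sync result SyncPod inspects: on failure it
// carries both the sentinel error and a human-readable message, which the
// kubelet later copies into the PodResizeInProgress condition.
func resizeResultFor(podName string, actuationErr error) *kubecontainer.SyncResult {
	result := kubecontainer.NewSyncResult(kubecontainer.ResizePodInPlace, podName)
	if actuationErr != nil {
		result.Fail(kubecontainer.ErrResizePodInPlace, actuationErr.Error())
	}
	return result
}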

@@ -61,6 +61,7 @@ type FakeRuntime struct {
VersionInfo string
APIVersionInfo string
RuntimeType string
SyncResults *kubecontainer.PodSyncResult
Err error
InspectErr error
StatusErr error
@@ -238,6 +239,9 @@ func (f *FakeRuntime) SyncPod(_ context.Context, pod *v1.Pod, _ *kubecontainer.P
for _, c := range pod.Spec.Containers {
f.StartedContainers = append(f.StartedContainers, c.Name)
}
if f.SyncResults != nil {
return *f.SyncResults
}
// TODO(random-liu): Add SyncResult for starting and killing containers
if f.Err != nil {
result.Fail(f.Err)
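
The new SyncResults field lets unit tests make the fake runtime return a canned PodSyncResult from SyncPod. A usage sketch, assuming a hypothetical injectFailedResize helper (the real usage is in TestSyncPodWithErrorsDuringInPlacePodResize below):

package example

import (
	"k8s.io/apimachinery/pkg/types"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
)

// injectFailedResize makes the fake runtime report a failed in-place resize,
// so SyncPod exercises the error-surfacing path.
func injectFailedResize(fr *containertest.FakeRuntime, uid types.UID) {
	fr.SyncResults = &kubecontainer.PodSyncResult{
		SyncResults: []*kubecontainer.SyncResult{{
			Action:  kubecontainer.ResizePodInPlace,
			Target:  uid,
			Error:   kubecontainer.ErrResizePodInPlace,
			Message: "could not resize pod",
		}},
	}
}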

@@ -2066,20 +2066,19 @@ func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType
sctx := context.WithoutCancel(ctx)
result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.crashLoopBackOff)
kl.reasonCache.Update(pod.UID, result)
if err := result.Error(); err != nil {
// Do not return error if the only failures were pods in backoff
for _, r := range result.SyncResults {
if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff {
// Do not record an event here, as we keep all event logging for sync pod failures
// local to container runtime, so we get better errors.
return false, err
for _, r := range result.SyncResults {
if r.Action == kubecontainer.ResizePodInPlace {
if r.Error == nil {
// The pod was resized successfully, clear any pod resize errors in the PodResizeInProgress condition.
kl.statusManager.SetPodResizeInProgressCondition(pod.UID, "", "", true)
} else {
kl.statusManager.SetPodResizeInProgressCondition(pod.UID, v1.PodReasonError, r.Message, false)
}
}
return false, nil
}
return false, nil
return false, result.Error()
}
// SyncTerminatingPod is expected to terminate all running containers in a pod. Once this method
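
In short, the new handling of resize results in SyncPod can be summarized by the following sketch (surfaceResizeResult is a hypothetical helper, not code from the change): a successful ResizePodInPlace clears any previously surfaced error, a failed one records the runtime's message under the Error reason, and all other errors still propagate through result.Error() as before.

package example

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/status"
)

// surfaceResizeResult maps a single ResizePodInPlace sync result onto the
// cached PodResizeInProgress condition.
func surfaceResizeResult(sm status.Manager, uid types.UID, r *kubecontainer.SyncResult) {
	if r.Action != kubecontainer.ResizePodInPlace {
		return
	}
	if r.Error == nil {
		// Success: keep the condition but drop any stale error reason/message.
		sm.SetPodResizeInProgressCondition(uid, "", "", true)
		return
	}
	// Failure: surface the actuation error to the user via the condition.
	sm.SetPodResizeInProgressCondition(uid, v1.PodReasonError, r.Message, false)
}
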
@@ -2932,7 +2931,7 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
}
if kl.isPodResizeInProgress(allocatedPod, podStatus) {
// If a resize is in progress, make sure the cache has the correct state in case the Kubelet restarted.
kl.statusManager.SetPodResizeInProgressCondition(pod.UID, "", "")
kl.statusManager.SetPodResizeInProgressCondition(pod.UID, "", "", false)
} else {
// (Allocated == Actual) => clear the resize in-progress status.
kl.statusManager.ClearPodResizeInProgressCondition(pod.UID)
@@ -2961,6 +2960,11 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
return nil, err
}
kl.statusManager.ClearPodResizePendingCondition(pod.UID)
// Clear any errors that may have been surfaced from a previous resize. The condition will be
// added back as needed in the defer block, but this prevents old errors from being preserved.
kl.statusManager.ClearPodResizeInProgressCondition(pod.UID)
for i, container := range pod.Spec.Containers {
if !apiequality.Semantic.DeepEqual(container.Resources, podFromAllocation.Spec.Containers[i].Resources) {
key := kuberuntime.GetStableKey(pod, &container)

@@ -4138,5 +4138,112 @@ func TestCrashLoopBackOffConfiguration(t *testing.T) {
assert.Equalf(t, tc.expectedInitial, resultInitial, "wrong base calculated, want: %v, got %v", tc.expectedInitial, resultInitial)
})
}
}
func TestSyncPodWithErrorsDuringInPlacePodResize(t *testing.T) {
testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
defer testKubelet.Cleanup()
kubelet := testKubelet.kubelet
pod := podWithUIDNameNsSpec("12345678", "foo", "new", v1.PodSpec{
Containers: []v1.Container{
{Name: "bar"},
},
})
testCases := []struct {
name string
syncResults *kubecontainer.PodSyncResult
expectedErr string
expectedResizeConditions []*v1.PodCondition
}{
{
name: "pod resize error returned from the runtime",
syncResults: &kubecontainer.PodSyncResult{
SyncResults: []*kubecontainer.SyncResult{{
Action: kubecontainer.ResizePodInPlace,
Target: pod.UID,
Error: kubecontainer.ErrResizePodInPlace,
Message: "could not resize pod",
}},
},
expectedErr: "failed to \"ResizePodInPlace\" for \"12345678\" with ResizePodInPlaceError: \"could not resize pod\"",
expectedResizeConditions: []*v1.PodCondition{{
Type: v1.PodResizeInProgress,
Status: v1.ConditionTrue,
Reason: v1.PodReasonError,
Message: "could not resize pod",
}},
},
{
name: "pod resize error cleared upon successful run",
syncResults: &kubecontainer.PodSyncResult{
SyncResults: []*kubecontainer.SyncResult{{
Action: kubecontainer.ResizePodInPlace,
Target: pod.UID,
}},
},
expectedResizeConditions: []*v1.PodCondition{{
Type: v1.PodResizeInProgress,
Status: v1.ConditionTrue,
}},
},
{
name: "sync results have a non-resize error",
syncResults: &kubecontainer.PodSyncResult{
SyncResults: []*kubecontainer.SyncResult{{
Action: kubecontainer.CreatePodSandbox,
Target: pod.UID,
Error: kubecontainer.ErrCreatePodSandbox,
Message: "could not create pod sandbox",
}},
},
expectedErr: "failed to \"CreatePodSandbox\" for \"12345678\" with CreatePodSandboxError: \"could not create pod sandbox\"",
expectedResizeConditions: nil,
},
{
name: "sync results have a non-resize error and a successful pod resize action",
syncResults: &kubecontainer.PodSyncResult{
SyncResults: []*kubecontainer.SyncResult{
{
Action: kubecontainer.CreatePodSandbox,
Target: pod.UID,
Error: kubecontainer.ErrCreatePodSandbox,
Message: "could not create pod sandbox",
},
{
Action: kubecontainer.ResizePodInPlace,
Target: pod.UID,
},
},
},
expectedErr: "failed to \"CreatePodSandbox\" for \"12345678\" with CreatePodSandboxError: \"could not create pod sandbox\"",
expectedResizeConditions: []*v1.PodCondition{{
Type: v1.PodResizeInProgress,
Status: v1.ConditionTrue,
}},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
testKubelet.fakeRuntime.SyncResults = tc.syncResults
kubelet.podManager.SetPods([]*v1.Pod{pod})
isTerminal, err := kubelet.SyncPod(context.Background(), kubetypes.SyncPodUpdate, pod, nil, &kubecontainer.PodStatus{})
require.False(t, isTerminal)
if tc.expectedErr == "" {
require.NoError(t, err)
} else {
require.Error(t, err)
require.Equal(t, tc.expectedErr, err.Error())
}
gotResizeConditions := kubelet.statusManager.GetPodResizeConditions(pod.UID)
for _, c := range gotResizeConditions {
// ignore last probe and transition times for comparison
c.LastProbeTime = metav1.Time{}
c.LastTransitionTime = metav1.Time{}
}
require.Equal(t, tc.expectedResizeConditions, gotResizeConditions)
})
}
}

@@ -693,7 +693,8 @@ func (m *kubeGenericRuntimeManager) computePodResizeAction(pod *v1.Pod, containe
return true
}
func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerChanges podActions, result *kubecontainer.PodSyncResult) {
func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerChanges podActions) *kubecontainer.SyncResult {
resizeResult := kubecontainer.NewSyncResult(kubecontainer.ResizePodInPlace, format.Pod(pod))
pcm := m.containerManager.NewPodContainerManager()
//TODO(vinaykul,InPlacePodVerticalScaling): Figure out best way to get enforceMemoryQoS value (parameter #4 below) in platform-agnostic way
enforceCPULimits := m.cpuCFSQuota
@@ -704,20 +705,20 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerC
podResources := cm.ResourceConfigForPod(pod, enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false)
if podResources == nil {
klog.ErrorS(nil, "Unable to get resource configuration", "pod", klog.KObj(pod))
result.Fail(fmt.Errorf("unable to get resource configuration processing resize for pod %s", pod.Name))
return
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("unable to get resource configuration processing resize for pod %s", pod.Name))
return resizeResult
}
currentPodMemoryConfig, err := pcm.GetPodCgroupConfig(pod, v1.ResourceMemory)
if err != nil {
klog.ErrorS(nil, "Unable to get pod cgroup memory config", "pod", klog.KObj(pod))
result.Fail(fmt.Errorf("unable to get pod cgroup memory config for pod %s", pod.Name))
return
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("unable to get pod cgroup memory config for pod %s", pod.Name))
return resizeResult
}
currentPodCPUConfig, err := pcm.GetPodCgroupConfig(pod, v1.ResourceCPU)
if err != nil {
klog.ErrorS(nil, "Unable to get pod cgroup cpu config", "pod", klog.KObj(pod))
result.Fail(fmt.Errorf("unable to get pod cgroup cpu config for pod %s", pod.Name))
return
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("unable to get pod cgroup cpu config for pod %s", pod.Name))
return resizeResult
}
currentPodResources := podResources
@@ -800,13 +801,13 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerC
currentPodMemoryUsage, err := pcm.GetPodCgroupMemoryUsage(pod)
if err != nil {
klog.ErrorS(err, "GetPodCgroupMemoryUsage failed", "pod", pod.Name)
result.Fail(err)
return
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, err.Error())
return resizeResult
}
if currentPodMemoryUsage >= uint64(*podResources.Memory) {
klog.ErrorS(nil, "Aborting attempt to set pod memory limit less than current memory usage", "pod", pod.Name)
result.Fail(fmt.Errorf("aborting attempt to set pod memory limit less than current memory usage for pod %s", pod.Name))
return
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("aborting attempt to set pod memory limit less than current memory usage for pod %s", pod.Name))
return resizeResult
}
} else {
// Default pod memory limit to the current memory limit if unset to prevent it from updating.
@@ -814,16 +815,16 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerC
podResources.Memory = currentPodMemoryConfig.Memory
}
if errResize := resizeContainers(v1.ResourceMemory, int64(*currentPodMemoryConfig.Memory), *podResources.Memory, 0, 0); errResize != nil {
result.Fail(errResize)
return
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, errResize.Error())
return resizeResult
}
}
if len(podContainerChanges.ContainersToUpdate[v1.ResourceCPU]) > 0 || podContainerChanges.UpdatePodResources {
if podResources.CPUShares == nil {
// This shouldn't happen: ResourceConfigForPod always returns a non-nil value for CPUShares.
klog.ErrorS(nil, "podResources.CPUShares is nil", "pod", pod.Name)
result.Fail(fmt.Errorf("podResources.CPUShares is nil for pod %s", pod.Name))
return
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("podResources.CPUShares is nil for pod %s", pod.Name))
return resizeResult
}
// Default pod CPUQuota to the current CPUQuota if no limit is set to prevent the pod limit
@@ -834,10 +835,11 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerC
}
if errResize := resizeContainers(v1.ResourceCPU, *currentPodCPUConfig.CPUQuota, *podResources.CPUQuota,
int64(*currentPodCPUConfig.CPUShares), int64(*podResources.CPUShares)); errResize != nil {
result.Fail(errResize)
return
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, errResize.Error())
return resizeResult
}
}
return resizeResult
}
func (m *kubeGenericRuntimeManager) updatePodContainerResources(pod *v1.Pod, resourceName v1.ResourceName, containersToUpdate []containerToUpdateInfo) error {
@@ -1401,7 +1403,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po
// Step 7: For containers in podContainerChanges.ContainersToUpdate[CPU,Memory] list, invoke UpdateContainerResources
if resizable, _ := IsInPlacePodVerticalScalingAllowed(pod); resizable {
if len(podContainerChanges.ContainersToUpdate) > 0 || podContainerChanges.UpdatePodResources {
m.doPodResizeAction(pod, podContainerChanges, &result)
result.SyncResults = append(result.SyncResults, m.doPodResizeAction(pod, podContainerChanges))
}
}

@@ -67,7 +67,14 @@ var (
)
func createTestRuntimeManager() (*apitest.FakeRuntimeService, *apitest.FakeImageService, *kubeGenericRuntimeManager, error) {
return createTestRuntimeManagerWithErrors(nil)
}
func createTestRuntimeManagerWithErrors(errors map[string][]error) (*apitest.FakeRuntimeService, *apitest.FakeImageService, *kubeGenericRuntimeManager, error) {
fakeRuntimeService := apitest.NewFakeRuntimeService()
if errors != nil {
fakeRuntimeService.Errors = errors
}
fakeImageService := apitest.NewFakeImageService()
// Only an empty machineInfo is needed here, because in unit test all containers are besteffort,
// data in machineInfo is not used. If burstable containers are used in unit test in the future,
@@ -3186,9 +3193,6 @@ func TestDoPodResizeAction(t *testing.T) {
}
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)
_, _, m, err := createTestRuntimeManager()
require.NoError(t, err)
m.cpuCFSQuota = true // Enforce CPU Limits
for _, tc := range []struct {
testName string
@@ -3196,7 +3200,9 @@ func TestDoPodResizeAction(t *testing.T) {
desiredResources containerResources
updatedResources []v1.ResourceName
otherContainersHaveLimits bool
runtimeErrors map[string][]error
expectedError string
expectedErrorMessage string
expectPodCgroupUpdates int
}{
{
@@ -3213,6 +3219,23 @@ func TestDoPodResizeAction(t *testing.T) {
updatedResources: []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory},
expectPodCgroupUpdates: 3, // cpu req, cpu lim, mem lim
},
{
testName: "Increase cpu and memory requests and limits, with computed pod limits and set a runtime error",
currentResources: containerResources{
cpuRequest: 100, cpuLimit: 100,
memoryRequest: 100, memoryLimit: 100,
},
desiredResources: containerResources{
cpuRequest: 200, cpuLimit: 200,
memoryRequest: 200, memoryLimit: 200,
},
otherContainersHaveLimits: true,
updatedResources: []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory},
expectPodCgroupUpdates: 1,
runtimeErrors: map[string][]error{"UpdateContainerResources": {fmt.Errorf("error updating container resources")}},
expectedError: "ResizePodInPlaceError",
expectedErrorMessage: "error updating container resources",
},
{
testName: "Increase cpu and memory requests and limits, without computed pod limits",
currentResources: containerResources{
@@ -3296,6 +3319,10 @@ func TestDoPodResizeAction(t *testing.T) {
},
} {
t.Run(tc.testName, func(t *testing.T) {
_, _, m, err := createTestRuntimeManagerWithErrors(tc.runtimeErrors)
require.NoError(t, err)
m.cpuCFSQuota = true // Enforce CPU Limits
mockCM := cmtesting.NewMockContainerManager(t)
mockCM.EXPECT().PodHasExclusiveCPUs(mock.Anything).Return(false).Maybe()
mockCM.EXPECT().ContainerHasExclusiveCPUs(mock.Anything, mock.Anything).Return(false).Maybe()
@@ -3358,17 +3385,17 @@ func TestDoPodResizeAction(t *testing.T) {
containersToUpdate[r] = []containerToUpdateInfo{updateInfo}
}
syncResult := &kubecontainer.PodSyncResult{}
actions := podActions{
ContainersToUpdate: containersToUpdate,
}
m.doPodResizeAction(pod, actions, syncResult)
resizeResult := m.doPodResizeAction(pod, actions)
if tc.expectedError != "" {
require.Error(t, syncResult.Error())
require.EqualError(t, syncResult.Error(), tc.expectedError)
require.Error(t, resizeResult.Error)
require.EqualError(t, resizeResult.Error, tc.expectedError)
require.Equal(t, tc.expectedErrorMessage, resizeResult.Message)
} else {
require.NoError(t, syncResult.Error())
require.NoError(t, resizeResult.Error)
}
mock.AssertExpectationsForObjects(t, mockPCM)

@@ -159,7 +159,7 @@ type Manager interface {
SetPodResizePendingCondition(podUID types.UID, reason, message string)
// SetPodResizeInProgressCondition caches the last PodResizeInProgress condition for the pod.
SetPodResizeInProgressCondition(podUID types.UID, reason, message string)
SetPodResizeInProgressCondition(podUID types.UID, reason, message string, allowReasonToBeCleared bool)
// ClearPodResizePendingCondition clears the PodResizePending condition for the pod from the cache.
ClearPodResizePendingCondition(podUID types.UID)
@@ -266,10 +266,22 @@ func (m *manager) SetPodResizePendingCondition(podUID types.UID, reason, message
}
// SetPodResizeInProgressCondition caches the last PodResizeInProgress condition for the pod.
func (m *manager) SetPodResizeInProgressCondition(podUID types.UID, reason, message string) {
func (m *manager) SetPodResizeInProgressCondition(podUID types.UID, reason, message string, allowReasonToBeCleared bool) {
oldConditions := m.GetPodResizeConditions(podUID)
m.podStatusesLock.Lock()
defer m.podStatusesLock.Unlock()
if !allowReasonToBeCleared && reason == "" && message == "" {
// Preserve the old reason and message if there is one.
for _, c := range oldConditions {
if c.Type == v1.PodResizeInProgress {
reason = c.Reason
message = c.Message
}
}
}
m.podResizeConditions[podUID] = podResizeConditions{
PodResizeInProgress: updatedPodResizeCondition(v1.PodResizeInProgress, m.podResizeConditions[podUID].PodResizeInProgress, reason, message),
PodResizePending: m.podResizeConditions[podUID].PodResizePending,
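
The new allowReasonToBeCleared parameter controls whether an empty reason and message overwrite a previously recorded error. A sketch of the intended call patterns, using a hypothetical demonstrateResizeCondition walkthrough (the status_manager test cases below exercise the same sequences):

package example

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/kubernetes/pkg/kubelet/status"
)

// demonstrateResizeCondition walks through the three call patterns the new
// parameter distinguishes.
func demonstrateResizeCondition(m status.Manager, uid types.UID) {
	// Record an actuation error surfaced from the runtime.
	m.SetPodResizeInProgressCondition(uid, v1.PodReasonError, "could not resize pod", false)
	// Re-assert the condition without clobbering the stored error
	// (e.g. when the kubelet restarts mid-resize): reason and message are preserved.
	m.SetPodResizeInProgressCondition(uid, "", "", false)
	// A successful resize keeps the condition but clears the stale reason and message.
	m.SetPodResizeInProgressCondition(uid, "", "", true)
}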

@@ -2047,7 +2047,7 @@ func TestPodResizeConditions(t *testing.T) {
{
name: "set pod resize in progress condition with reason and message",
updateFunc: func(podUID types.UID) {
m.SetPodResizeInProgressCondition(podUID, "some-reason", "some-message")
m.SetPodResizeInProgressCondition(podUID, "some-reason", "some-message", false)
},
expected: []*v1.PodCondition{
{
@@ -2059,9 +2059,23 @@ func TestPodResizeConditions(t *testing.T) {
},
},
{
name: "set pod resize in progress condition without reason and message",
name: "set pod resize in progress condition without reason and message/allowReasonToBeCleared=false",
updateFunc: func(podUID types.UID) {
m.SetPodResizeInProgressCondition(podUID, "", "")
m.SetPodResizeInProgressCondition(podUID, "", "", false)
},
expected: []*v1.PodCondition{
{
Type: v1.PodResizeInProgress,
Status: v1.ConditionTrue,
Reason: "some-reason",
Message: "some-message",
},
},
},
{
name: "set pod resize in progress condition without reason and message/allowReasonToBeCleared=true",
updateFunc: func(podUID types.UID) {
m.SetPodResizeInProgressCondition(podUID, "", "", true)
},
expected: []*v1.PodCondition{
{