Merge pull request #116702 from vinaykul/restart-free-pod-vertical-scaling-podmutation-fix

Fix pod object update that may cause data race
Commit: c7cc7886e2
Author: Kubernetes Prow Robot
Date: 2023-03-21 19:26:36 -07:00
Committed by: GitHub (GPG key ID: 4AEE18F83AFDEB23)
2 changed files with 28 additions and 22 deletions
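Background on the race being fixed: the old handlePodResourcesResize wrote the resized pod back through the caller's shared pointer (*pod = *updatedPod), which can race with other goroutines still reading the same pod object; the new version returns the updated deep copy and lets the single caller rebind its local variable. Below is a minimal, hypothetical Go sketch of the two patterns; the Pod struct and function names are illustrative only, not kubelet code.

// race_sketch.go — hypothetical illustration; Pod stands in for v1.Pod.
package main

import (
	"fmt"
	"sync"
)

type Pod struct {
	Name string
	CPU  int64
}

// racyResize writes back through the shared pointer. Any other goroutine
// still reading through the same pointer races with this write; this is
// the `*pod = *updatedPod` pattern the commit removes.
func racyResize(p *Pod, cpu int64) {
	updated := *p
	updated.CPU = cpu
	*p = updated // in-place overwrite of a shared object: data race with concurrent readers
}

// safeResize never writes through the shared pointer; it returns an updated
// copy and the caller rebinds its own local variable. A real v1.Pod would
// need DeepCopy(); this flat toy struct only needs a value copy.
func safeResize(p *Pod, cpu int64) *Pod {
	updated := *p
	updated.CPU = cpu
	return &updated
}

func main() {
	pod := &Pod{Name: "demo", CPU: 500}

	var wg sync.WaitGroup
	wg.Add(1)
	go func(p *Pod) { // a concurrent reader holding the old pointer
		defer wg.Done()
		_ = p.CPU // safe alongside safeResize; `go run -race` would flag racyResize overwriting *p
	}(pod)

	pod = safeResize(pod, 1000) // rebind locally; the reader's snapshot stays consistent
	wg.Wait()
	fmt.Println(pod.Name, pod.CPU)
}

In the diff below this shows up as handlePodResourcesResize returning *v1.Pod instead of assigning *pod = *updatedPod, with SyncPod rebinding via pod = kl.handlePodResourcesResize(pod).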

pkg/kubelet/kubelet.go

@@ -1889,7 +1889,7 @@ func (kl *Kubelet) SyncPod(_ context.Context, updateType kubetypes.SyncPodType,
 		// TODO(vinaykul,InPlacePodVerticalScaling): Investigate doing this in HandlePodUpdates + periodic SyncLoop scan
 		// See: https://github.com/kubernetes/kubernetes/pull/102884#discussion_r663160060
 		if kl.podWorkers.CouldHaveRunningContainers(pod.UID) && !kubetypes.IsStaticPod(pod) {
-			kl.handlePodResourcesResize(pod)
+			pod = kl.handlePodResourcesResize(pod)
 		}
 	}
@@ -2631,13 +2631,14 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus)
 		klog.ErrorS(err, "getNodeAnyway function failed")
 		return false, nil, ""
 	}
+	podCopy := pod.DeepCopy()
 	cpuAvailable := node.Status.Allocatable.Cpu().MilliValue()
 	memAvailable := node.Status.Allocatable.Memory().Value()
-	cpuRequests := resource.GetResourceRequest(pod, v1.ResourceCPU)
-	memRequests := resource.GetResourceRequest(pod, v1.ResourceMemory)
+	cpuRequests := resource.GetResourceRequest(podCopy, v1.ResourceCPU)
+	memRequests := resource.GetResourceRequest(podCopy, v1.ResourceMemory)
 	if cpuRequests > cpuAvailable || memRequests > memAvailable {
-		klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", pod.Name)
-		return false, nil, v1.PodResizeStatusInfeasible
+		klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", podCopy.Name)
+		return false, podCopy, v1.PodResizeStatusInfeasible
 	}
 
 	// Treat the existing pod needing resize as a new pod with desired resources seeking admit.
@@ -2649,13 +2650,12 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus)
 		}
 	}
 
-	if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, pod); !ok {
+	if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, podCopy); !ok {
 		// Log reason and return. Let the next sync iteration retry the resize
-		klog.V(3).InfoS("Resize cannot be accommodated", "pod", pod.Name, "reason", failReason, "message", failMessage)
-		return false, nil, v1.PodResizeStatusDeferred
+		klog.V(3).InfoS("Resize cannot be accommodated", "pod", podCopy.Name, "reason", failReason, "message", failMessage)
+		return false, podCopy, v1.PodResizeStatusDeferred
 	}
 
-	podCopy := pod.DeepCopy()
 	for _, container := range podCopy.Spec.Containers {
 		idx, found := podutil.GetIndexOfContainerStatus(podCopy.Status.ContainerStatuses, container.Name)
 		if found {
@@ -2667,9 +2667,9 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus)
 	return true, podCopy, v1.PodResizeStatusInProgress
 }
 
-func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) {
+func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) *v1.Pod {
 	if pod.Status.Phase != v1.PodRunning {
-		return
+		return pod
 	}
 	podResized := false
 	for _, container := range pod.Spec.Containers {
@@ -2691,31 +2691,35 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) {
 		}
 	}
 	if !podResized {
-		return
+		return pod
 	}
 
 	kl.podResizeMutex.Lock()
 	defer kl.podResizeMutex.Unlock()
 	fit, updatedPod, resizeStatus := kl.canResizePod(pod)
+	if updatedPod == nil {
+		return pod
+	}
 	if fit {
 		// Update pod resource allocation checkpoint
 		if err := kl.statusManager.SetPodAllocation(updatedPod); err != nil {
 			//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
-			klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod))
+			klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(updatedPod))
+			return pod
 		}
-		*pod = *updatedPod
 	}
 	if resizeStatus != "" {
 		// Save resize decision to checkpoint
-		if err := kl.statusManager.SetPodResizeStatus(pod.UID, resizeStatus); err != nil {
+		if err := kl.statusManager.SetPodResizeStatus(updatedPod.UID, resizeStatus); err != nil {
 			//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
-			klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(pod))
+			klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(updatedPod))
+			return pod
 		}
-		pod.Status.Resize = resizeStatus
+		updatedPod.Status.Resize = resizeStatus
 	}
-	kl.podManager.UpdatePod(pod)
-	kl.statusManager.SetPodStatus(pod, pod.Status)
-	return
+	kl.podManager.UpdatePod(updatedPod)
+	kl.statusManager.SetPodStatus(updatedPod, updatedPod.Status)
+	return updatedPod
 }
 
 // LatestLoopEntryTime returns the last time in the sync loop monitor.

pkg/kubelet/kubelet_test.go

@@ -2585,8 +2585,10 @@ func TestHandlePodResourcesResize(t *testing.T) {
 		tt.pod.Spec.Containers[0].Resources.Requests = tt.newRequests
 		tt.pod.Status.ContainerStatuses[0].AllocatedResources = v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}
 		kubelet.handlePodResourcesResize(tt.pod)
-		assert.Equal(t, tt.expectedAllocations, tt.pod.Status.ContainerStatuses[0].AllocatedResources, tt.name)
-		assert.Equal(t, tt.expectedResize, tt.pod.Status.Resize, tt.name)
+		updatedPod, found := kubelet.podManager.GetPodByName(tt.pod.Namespace, tt.pod.Name)
+		assert.True(t, found, "expected to find pod %s", tt.pod.Name)
+		assert.Equal(t, tt.expectedAllocations, updatedPod.Status.ContainerStatuses[0].AllocatedResources, tt.name)
+		assert.Equal(t, tt.expectedResize, updatedPod.Status.Resize, tt.name)
 		testKubelet.fakeKubeClient.ClearActions()
 	}
 }