Emit events for resize error states

Tim Allclair 2024-11-08 16:24:33 -08:00
parent 61e6242967
commit 3a2555ee93
2 changed files with 32 additions and 12 deletions

@@ -34,6 +34,8 @@ const (
 	FailedToCreatePodContainer     = "FailedCreatePodContainer"
 	FailedToMakePodDataDirectories = "Failed"
 	NetworkNotReady                = "NetworkNotReady"
+	ResizeDeferred                 = "ResizeDeferred"
+	ResizeInfeasible               = "ResizeInfeasible"
 )
 
 // Image event reason list
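
For context: these reason strings become the Reason of Warning events that the kubelet records on the pod (see the kubelet changes further down). Below is a minimal, self-contained sketch of that pattern, using client-go's FakeRecorder as a stand-in for the kubelet's real recorder; the pod name, namespace, and resource numbers are made up, and the string literal stands in for the events.ResizeInfeasible constant.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/record"
)

func main() {
	// FakeRecorder buffers formatted event strings on a channel instead of
	// posting Event objects to the API server.
	recorder := record.NewFakeRecorder(10)
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}}

	// Same call shape as the kubelet code below: a Warning event keyed by the
	// new reason string, carrying the human-readable failure message.
	recorder.Eventf(pod, v1.EventTypeWarning, "ResizeInfeasible",
		"Node didn't have enough capacity: cpu, requested: 4000, capacity: 2000")

	// FakeRecorder renders each event roughly as "<type> <reason> <message>".
	fmt.Println(<-recorder.Events)
}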

@@ -118,6 +118,7 @@ import (
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/kubelet/userns"
 	"k8s.io/kubernetes/pkg/kubelet/util"
+	"k8s.io/kubernetes/pkg/kubelet/util/format"
 	"k8s.io/kubernetes/pkg/kubelet/util/manager"
 	"k8s.io/kubernetes/pkg/kubelet/util/queue"
 	"k8s.io/kubernetes/pkg/kubelet/util/sliceutils"
@@ -2829,22 +2830,24 @@ func isPodResizeInProgress(pod *v1.Pod, podStatus *kubecontainer.PodStatus) bool
 // canResizePod determines if the requested resize is currently feasible.
 // pod should hold the desired (pre-allocated) spec.
 // Returns true if the resize can proceed.
-func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) {
+func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus, string) {
 	if goos == "windows" {
-		return false, v1.PodResizeStatusInfeasible
+		return false, v1.PodResizeStatusInfeasible, "Resizing Windows pods is not supported"
 	}
 
 	if v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && !utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScalingExclusiveCPUs) {
 		if utilfeature.DefaultFeatureGate.Enabled(features.CPUManager) {
 			if kl.containerManager.GetNodeConfig().CPUManagerPolicy == "static" {
-				klog.V(3).InfoS("Resize is infeasible for Guaranteed Pods alongside CPU Manager static policy")
-				return false, v1.PodResizeStatusInfeasible
+				msg := "Resize is infeasible for Guaranteed Pods alongside CPU Manager static policy"
+				klog.V(3).InfoS(msg, "pod", format.Pod(pod))
+				return false, v1.PodResizeStatusInfeasible, msg
 			}
 		}
 		if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
 			if kl.containerManager.GetNodeConfig().ExperimentalMemoryManagerPolicy == "static" {
-				klog.V(3).InfoS("Resize is infeasible for Guaranteed Pods alongside Memory Manager static policy")
-				return false, v1.PodResizeStatusInfeasible
+				msg := "Resize is infeasible for Guaranteed Pods alongside Memory Manager static policy"
+				klog.V(3).InfoS(msg, "pod", format.Pod(pod))
+				return false, v1.PodResizeStatusInfeasible, msg
 			}
 		}
 	}
@@ -2852,15 +2855,22 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) {
 	node, err := kl.getNodeAnyWay()
 	if err != nil {
 		klog.ErrorS(err, "getNodeAnyway function failed")
-		return false, ""
+		return false, "", ""
 	}
 	cpuAvailable := node.Status.Allocatable.Cpu().MilliValue()
 	memAvailable := node.Status.Allocatable.Memory().Value()
 	cpuRequests := resource.GetResourceRequest(pod, v1.ResourceCPU)
 	memRequests := resource.GetResourceRequest(pod, v1.ResourceMemory)
 	if cpuRequests > cpuAvailable || memRequests > memAvailable {
-		klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", klog.KObj(pod))
-		return false, v1.PodResizeStatusInfeasible
+		var msg string
+		if memRequests > memAvailable {
+			msg = fmt.Sprintf("memory, requested: %d, capacity: %d", memRequests, memAvailable)
+		} else {
+			msg = fmt.Sprintf("cpu, requested: %d, capacity: %d", cpuRequests, cpuAvailable)
+		}
+		msg = "Node didn't have enough capacity: " + msg
+		klog.V(3).InfoS(msg, "pod", klog.KObj(pod))
+		return false, v1.PodResizeStatusInfeasible, msg
 	}
 
 	// Treat the existing pod needing resize as a new pod with desired resources seeking admit.
@@ -2871,10 +2881,10 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) {
 	if ok, failReason, failMessage := kl.canAdmitPod(allocatedPods, pod); !ok {
 		// Log reason and return. Let the next sync iteration retry the resize
 		klog.V(3).InfoS("Resize cannot be accommodated", "pod", klog.KObj(pod), "reason", failReason, "message", failMessage)
-		return false, v1.PodResizeStatusDeferred
+		return false, v1.PodResizeStatusDeferred, failMessage
 	}
 
-	return true, v1.PodResizeStatusInProgress
+	return true, v1.PodResizeStatusInProgress, ""
 }
 
 // handlePodResourcesResize returns the "allocated pod", which should be used for all resource
@@ -2899,7 +2909,7 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
 	kl.podResizeMutex.Lock()
 	defer kl.podResizeMutex.Unlock()
 	// Desired resources != allocated resources. Can we update the allocation to the desired resources?
-	fit, resizeStatus := kl.canResizePod(pod)
+	fit, resizeStatus, resizeMsg := kl.canResizePod(pod)
 	if fit {
 		// Update pod resource allocation checkpoint
 		if err := kl.statusManager.SetPodAllocation(pod); err != nil {
@@ -2925,6 +2935,14 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
 	}
 	if resizeStatus != "" {
 		kl.statusManager.SetPodResizeStatus(pod.UID, resizeStatus)
+		if resizeMsg != "" {
+			switch resizeStatus {
+			case v1.PodResizeStatusDeferred:
+				kl.recorder.Eventf(pod, v1.EventTypeWarning, events.ResizeDeferred, resizeMsg)
+			case v1.PodResizeStatusInfeasible:
+				kl.recorder.Eventf(pod, v1.EventTypeWarning, events.ResizeInfeasible, resizeMsg)
+			}
+		}
 	}
 	return allocatedPod, nil
 }
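
Taken together: canResizePod now reports why a resize cannot proceed (the new third return value), and handlePodResourcesResize surfaces that message as a Warning event keyed by ResizeDeferred or ResizeInfeasible. Because the events are recorded against the pod object, they should appear in the Events section of kubectl describe pod for the resized pod, alongside the resize status already set through the status manager.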