Emit events for resize error states

Tim Allclair 2024-11-08 16:24:33 -08:00
parent 61e6242967
commit 3a2555ee93
2 changed files with 32 additions and 12 deletions


@@ -34,6 +34,8 @@ const (
 	FailedToCreatePodContainer = "FailedCreatePodContainer"
 	FailedToMakePodDataDirectories = "Failed"
 	NetworkNotReady = "NetworkNotReady"
+	ResizeDeferred = "ResizeDeferred"
+	ResizeInfeasible = "ResizeInfeasible"
 )
 
 // Image event reason list
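These reason strings become the Reason field of the v1.Event objects the kubelet emits (see the recorder calls in the second file's last hunk). A minimal, self-contained sketch of that flow using client-go's record package; the broadcaster/recorder wiring and the pod below are illustrative stand-ins, since the kubelet already owns kl.recorder:

// Sketch only: shows where a reason string like "ResizeInfeasible" ends up.
// The literal matches the constant added above; everything else is local setup.
package main

import (
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/tools/record"
)

func main() {
	// Route events to structured logs instead of the API server.
	broadcaster := record.NewBroadcaster()
	defer broadcaster.Shutdown()
	broadcaster.StartStructuredLogging(0)

	recorder := broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "kubelet"})
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "web-0", Namespace: "default"}}

	// The reason becomes Event.Reason; the formatted string becomes Event.Message.
	recorder.Eventf(pod, v1.EventTypeWarning, "ResizeInfeasible",
		"Node didn't have enough capacity: cpu, requested: %d, capacity: %d", 6000, 4000)

	// Event delivery is asynchronous; give the watcher a moment before exiting.
	time.Sleep(100 * time.Millisecond)
}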


@@ -118,6 +118,7 @@ import (
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/kubelet/userns"
 	"k8s.io/kubernetes/pkg/kubelet/util"
+	"k8s.io/kubernetes/pkg/kubelet/util/format"
 	"k8s.io/kubernetes/pkg/kubelet/util/manager"
 	"k8s.io/kubernetes/pkg/kubelet/util/queue"
 	"k8s.io/kubernetes/pkg/kubelet/util/sliceutils"
@@ -2829,22 +2830,24 @@ func isPodResizeInProgress(pod *v1.Pod, podStatus *kubecontainer.PodStatus) bool
 // canResizePod determines if the requested resize is currently feasible.
 // pod should hold the desired (pre-allocated) spec.
 // Returns true if the resize can proceed.
-func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) {
+func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus, string) {
 	if goos == "windows" {
-		return false, v1.PodResizeStatusInfeasible
+		return false, v1.PodResizeStatusInfeasible, "Resizing Windows pods is not supported"
 	}
 
 	if v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && !utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScalingExclusiveCPUs) {
 		if utilfeature.DefaultFeatureGate.Enabled(features.CPUManager) {
 			if kl.containerManager.GetNodeConfig().CPUManagerPolicy == "static" {
-				klog.V(3).InfoS("Resize is infeasible for Guaranteed Pods alongside CPU Manager static policy")
-				return false, v1.PodResizeStatusInfeasible
+				msg := "Resize is infeasible for Guaranteed Pods alongside CPU Manager static policy"
+				klog.V(3).InfoS(msg, "pod", format.Pod(pod))
+				return false, v1.PodResizeStatusInfeasible, msg
 			}
 		}
 		if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
 			if kl.containerManager.GetNodeConfig().ExperimentalMemoryManagerPolicy == "static" {
-				klog.V(3).InfoS("Resize is infeasible for Guaranteed Pods alongside Memory Manager static policy")
-				return false, v1.PodResizeStatusInfeasible
+				msg := "Resize is infeasible for Guaranteed Pods alongside Memory Manager static policy"
+				klog.V(3).InfoS(msg, "pod", format.Pod(pod))
+				return false, v1.PodResizeStatusInfeasible, msg
 			}
 		}
 	}
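For reference, the newly imported format.Pod helper used in these log lines renders a pod as a single name_namespace(uid) token. A hedged sketch of that output shape; the real implementation lives in the imported k8s.io/kubernetes/pkg/kubelet/util/format package:

// Illustrative re-implementation of format.Pod's output shape only;
// kubelet code should keep using the real format package.
package main

import "fmt"

type podMeta struct{ Name, Namespace, UID string }

func formatPod(p podMeta) string {
	// Same "%s_%s(%s)" shape the format package produces for a pod.
	return fmt.Sprintf("%s_%s(%s)", p.Name, p.Namespace, p.UID)
}

func main() {
	fmt.Println(formatPod(podMeta{"web-0", "default", "0f9c..."}))
	// Prints: web-0_default(0f9c...)
}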
@@ -2852,15 +2855,22 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) {
 	node, err := kl.getNodeAnyWay()
 	if err != nil {
 		klog.ErrorS(err, "getNodeAnyway function failed")
-		return false, ""
+		return false, "", ""
 	}
 	cpuAvailable := node.Status.Allocatable.Cpu().MilliValue()
 	memAvailable := node.Status.Allocatable.Memory().Value()
 	cpuRequests := resource.GetResourceRequest(pod, v1.ResourceCPU)
 	memRequests := resource.GetResourceRequest(pod, v1.ResourceMemory)
 	if cpuRequests > cpuAvailable || memRequests > memAvailable {
-		klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", klog.KObj(pod))
-		return false, v1.PodResizeStatusInfeasible
+		var msg string
+		if memRequests > memAvailable {
+			msg = fmt.Sprintf("memory, requested: %d, capacity: %d", memRequests, memAvailable)
+		} else {
+			msg = fmt.Sprintf("cpu, requested: %d, capacity: %d", cpuRequests, cpuAvailable)
+		}
+		msg = "Node didn't have enough capacity: " + msg
+		klog.V(3).InfoS(msg, "pod", klog.KObj(pod))
+		return false, v1.PodResizeStatusInfeasible, msg
 	}
 
 	// Treat the existing pod needing resize as a new pod with desired resources seeking admit.
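When both CPU and memory exceed allocatable, the message above reports only memory; otherwise it falls through to CPU. A standalone sketch of that construction; capacityMessage is a hypothetical helper, not a kubelet function, and the units match the reads above (CPU in millicores via MilliValue(), memory in bytes via Value()):

// Sketch of the message construction in the hunk above.
package main

import "fmt"

func capacityMessage(cpuRequests, cpuAvailable, memRequests, memAvailable int64) string {
	var msg string
	if memRequests > memAvailable {
		// Memory takes precedence when both resources are over capacity.
		msg = fmt.Sprintf("memory, requested: %d, capacity: %d", memRequests, memAvailable)
	} else {
		msg = fmt.Sprintf("cpu, requested: %d, capacity: %d", cpuRequests, cpuAvailable)
	}
	return "Node didn't have enough capacity: " + msg
}

func main() {
	// 6000m CPU requested on a 4000m node; memory fits (1 GiB of 4 GiB).
	fmt.Println(capacityMessage(6000, 4000, 1<<30, 4<<30))
	// Output: Node didn't have enough capacity: cpu, requested: 6000, capacity: 4000
}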
@@ -2871,10 +2881,10 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) {
 	if ok, failReason, failMessage := kl.canAdmitPod(allocatedPods, pod); !ok {
 		// Log reason and return. Let the next sync iteration retry the resize
 		klog.V(3).InfoS("Resize cannot be accommodated", "pod", klog.KObj(pod), "reason", failReason, "message", failMessage)
-		return false, v1.PodResizeStatusDeferred
+		return false, v1.PodResizeStatusDeferred, failMessage
 	}
-	return true, v1.PodResizeStatusInProgress
+	return true, v1.PodResizeStatusInProgress, ""
 }
 
 // handlePodResourcesResize returns the "allocated pod", which should be used for all resource
@@ -2899,7 +2909,7 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (*v1.Pod, error) {
 	kl.podResizeMutex.Lock()
 	defer kl.podResizeMutex.Unlock()
 	// Desired resources != allocated resources. Can we update the allocation to the desired resources?
-	fit, resizeStatus := kl.canResizePod(pod)
+	fit, resizeStatus, resizeMsg := kl.canResizePod(pod)
 	if fit {
 		// Update pod resource allocation checkpoint
 		if err := kl.statusManager.SetPodAllocation(pod); err != nil {
@@ -2925,6 +2935,14 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (*v1.Pod, error) {
 	}
 	if resizeStatus != "" {
 		kl.statusManager.SetPodResizeStatus(pod.UID, resizeStatus)
+		if resizeMsg != "" {
+			switch resizeStatus {
+			case v1.PodResizeStatusDeferred:
+				kl.recorder.Eventf(pod, v1.EventTypeWarning, events.ResizeDeferred, resizeMsg)
+			case v1.PodResizeStatusInfeasible:
+				kl.recorder.Eventf(pod, v1.EventTypeWarning, events.ResizeInfeasible, resizeMsg)
+			}
+		}
 	}
 	return allocatedPod, nil
 }
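Net effect: a rejected resize is now visible without reading kubelet logs. The status manager still records the Deferred/Infeasible resize status as before, and the recorder additionally surfaces the same message on the pod's event stream, so it appears under kubectl describe pod. Assuming a standard cluster, a query along these lines should list the new warnings:

    kubectl get events --field-selector reason=ResizeInfeasible

Note that the Deferred event reuses canAdmitPod's failMessage, so the event text matches the admission failure that is blocking the resize.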