From 3a2555ee93cac90e2f797b8fe4de0d0d52254000 Mon Sep 17 00:00:00 2001 From: Tim Allclair Date: Fri, 8 Nov 2024 16:24:33 -0800 Subject: [PATCH] Emit events for resize error states --- pkg/kubelet/events/event.go | 2 ++ pkg/kubelet/kubelet.go | 42 ++++++++++++++++++++++++++----------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/pkg/kubelet/events/event.go b/pkg/kubelet/events/event.go index a9948ba6a54..0eaa82fc77f 100644 --- a/pkg/kubelet/events/event.go +++ b/pkg/kubelet/events/event.go @@ -34,6 +34,8 @@ const ( FailedToCreatePodContainer = "FailedCreatePodContainer" FailedToMakePodDataDirectories = "Failed" NetworkNotReady = "NetworkNotReady" + ResizeDeferred = "ResizeDeferred" + ResizeInfeasible = "ResizeInfeasible" ) // Image event reason list diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index e4ab7a79dee..55ab7444849 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -118,6 +118,7 @@ import ( kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/userns" "k8s.io/kubernetes/pkg/kubelet/util" + "k8s.io/kubernetes/pkg/kubelet/util/format" "k8s.io/kubernetes/pkg/kubelet/util/manager" "k8s.io/kubernetes/pkg/kubelet/util/queue" "k8s.io/kubernetes/pkg/kubelet/util/sliceutils" @@ -2829,22 +2830,24 @@ func isPodResizeInProgress(pod *v1.Pod, podStatus *kubecontainer.PodStatus) bool // canResizePod determines if the requested resize is currently feasible. // pod should hold the desired (pre-allocated) spec. // Returns true if the resize can proceed. -func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) { +func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus, string) { if goos == "windows" { - return false, v1.PodResizeStatusInfeasible + return false, v1.PodResizeStatusInfeasible, "Resizing Windows pods is not supported" } if v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && !utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScalingExclusiveCPUs) { if utilfeature.DefaultFeatureGate.Enabled(features.CPUManager) { if kl.containerManager.GetNodeConfig().CPUManagerPolicy == "static" { - klog.V(3).InfoS("Resize is infeasible for Guaranteed Pods alongside CPU Manager static policy") - return false, v1.PodResizeStatusInfeasible + msg := "Resize is infeasible for Guaranteed Pods alongside CPU Manager static policy" + klog.V(3).InfoS(msg, "pod", format.Pod(pod)) + return false, v1.PodResizeStatusInfeasible, msg } } if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) { if kl.containerManager.GetNodeConfig().ExperimentalMemoryManagerPolicy == "static" { - klog.V(3).InfoS("Resize is infeasible for Guaranteed Pods alongside Memory Manager static policy") - return false, v1.PodResizeStatusInfeasible + msg := "Resize is infeasible for Guaranteed Pods alongside Memory Manager static policy" + klog.V(3).InfoS(msg, "pod", format.Pod(pod)) + return false, v1.PodResizeStatusInfeasible, msg } } } @@ -2852,15 +2855,22 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) { node, err := kl.getNodeAnyWay() if err != nil { klog.ErrorS(err, "getNodeAnyway function failed") - return false, "" + return false, "", "" } cpuAvailable := node.Status.Allocatable.Cpu().MilliValue() memAvailable := node.Status.Allocatable.Memory().Value() cpuRequests := resource.GetResourceRequest(pod, v1.ResourceCPU) memRequests := resource.GetResourceRequest(pod, v1.ResourceMemory) if cpuRequests > cpuAvailable || memRequests > memAvailable { - klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", klog.KObj(pod)) - return false, v1.PodResizeStatusInfeasible + var msg string + if memRequests > memAvailable { + msg = fmt.Sprintf("memory, requested: %d, capacity: %d", memRequests, memAvailable) + } else { + msg = fmt.Sprintf("cpu, requested: %d, capacity: %d", cpuRequests, cpuAvailable) + } + msg = "Node didn't have enough capacity: " + msg + klog.V(3).InfoS(msg, "pod", klog.KObj(pod)) + return false, v1.PodResizeStatusInfeasible, msg } // Treat the existing pod needing resize as a new pod with desired resources seeking admit. @@ -2871,10 +2881,10 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) { if ok, failReason, failMessage := kl.canAdmitPod(allocatedPods, pod); !ok { // Log reason and return. Let the next sync iteration retry the resize klog.V(3).InfoS("Resize cannot be accommodated", "pod", klog.KObj(pod), "reason", failReason, "message", failMessage) - return false, v1.PodResizeStatusDeferred + return false, v1.PodResizeStatusDeferred, failMessage } - return true, v1.PodResizeStatusInProgress + return true, v1.PodResizeStatusInProgress, "" } // handlePodResourcesResize returns the "allocated pod", which should be used for all resource @@ -2899,7 +2909,7 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine kl.podResizeMutex.Lock() defer kl.podResizeMutex.Unlock() // Desired resources != allocated resources. Can we update the allocation to the desired resources? - fit, resizeStatus := kl.canResizePod(pod) + fit, resizeStatus, resizeMsg := kl.canResizePod(pod) if fit { // Update pod resource allocation checkpoint if err := kl.statusManager.SetPodAllocation(pod); err != nil { @@ -2925,6 +2935,14 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine } if resizeStatus != "" { kl.statusManager.SetPodResizeStatus(pod.UID, resizeStatus) + if resizeMsg != "" { + switch resizeStatus { + case v1.PodResizeStatusDeferred: + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.ResizeDeferred, resizeMsg) + case v1.PodResizeStatusInfeasible: + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.ResizeInfeasible, resizeMsg) + } + } } return allocatedPod, nil }