Merge pull request #39114 from dchen1107/kube-proxy

Automatic merge from submit-queue (batch tested with PRs 39114, 36004). Assign -998 as the oom_score_adj for critical pods (e.g. kube-proxy). I also validated this on a test cluster: built a fresh cluster, killed the kube-proxy pod, etc. ``` root 2660 2643 0 Dec21 ? 00:00:00 /bin/sh -c kube-proxy --master=https://104.198.79.64 --kubeconfig=/var/lib/kube-proxy/kubeconfig --cluster-cidr=10.180.0.0/14 --resource-container="" --v=4 1>>/var/log/kube-proxy.log 2>&1 root 2667 2660 0 Dec21 ? 00:03:14 kube-proxy --master=https://104.198.79.64 --kubeconfig=/var/lib/kube-proxy/kubeconfig --cluster-cidr=10.180.0.0/14 --resource-container= --v=4 # cat /proc/2660/oom_score_adj -998 # cat /proc/2667/oom_score_adj -998 ``` In this PR, I also include a small fix for an import cycle issue. The right fix would be to remove the dependency on the qos package from pkg/apis/componentconfig/v1alpha1, but since we plan to cherry-pick this PR to 1.5 and possibly 1.4, I want to touch the source as little as possible. Partial fix: #38322
2025-09-07 20:21:20 +00:00 · 2016-12-21 18:51:28 -08:00
parent f87edaacac b03fca9783
commit 66152b9066
8 changed files with 51 additions and 14 deletions
--- a/pkg/kubelet/eviction/BUILD
+++ b/pkg/kubelet/eviction/BUILD
@@ -28,9 +28,9 @@ go_library(
        "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
        "//pkg/kubelet/cm:go_default_library",
        "//pkg/kubelet/lifecycle:go_default_library",
-        "//pkg/kubelet/pod:go_default_library",
        "//pkg/kubelet/qos:go_default_library",
        "//pkg/kubelet/server/stats:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
        "//pkg/kubelet/util/format:go_default_library",
        "//pkg/quota/evaluator/core:go_default_library",
        "//pkg/util/clock:go_default_library",
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -28,9 +28,9 @@ import (
 	"k8s.io/kubernetes/pkg/client/record"
 	"k8s.io/kubernetes/pkg/kubelet/cm"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
-	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	"k8s.io/kubernetes/pkg/kubelet/server/stats"
+	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 	"k8s.io/kubernetes/pkg/util/clock"
 	"k8s.io/kubernetes/pkg/util/wait"
@@ -109,7 +109,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 	// the node has memory pressure, admit if not best-effort
 	if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
 		notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod)
-		if notBestEffort || kubepod.IsCriticalPod(attrs.Pod) {
+		if notBestEffort || kubetypes.IsCriticalPod(attrs.Pod) {
 			return lifecycle.PodAdmitResult{Admit: true}
 		}
 	}
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -1915,7 +1915,7 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
 	var criticalPods []*v1.Pod
 	var nonCriticalPods []*v1.Pod
 	for _, p := range pods {
-		if kubepod.IsCriticalPod(p) {
+		if kubetypes.IsCriticalPod(p) {
 			criticalPods = append(criticalPods, p)
 		} else {
 			nonCriticalPods = append(nonCriticalPods, p)
--- a/pkg/kubelet/pod/pod_manager.go
+++ b/pkg/kubelet/pod/pod_manager.go
@@ -21,7 +21,6 @@ import (

 	"k8s.io/kubernetes/pkg/api/v1"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
-	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/types"
 )

@@ -307,11 +306,3 @@ func (pm *basicManager) GetPodByMirrorPod(mirrorPod *v1.Pod) (*v1.Pod, bool) {
 	pod, ok := pm.podByFullName[kubecontainer.GetPodFullName(mirrorPod)]
 	return pod, ok
 }
-
-// IsCriticalPod returns true if the pod bears the critical pod annotation
-// key. Both the rescheduler and the kubelet use this key to make admission
-// and scheduling decisions.
-func IsCriticalPod(pod *v1.Pod) bool {
-	_, ok := pod.Annotations[kubetypes.CriticalPodAnnotationKey]
-	return ok
-}
--- a/pkg/kubelet/qos/BUILD
+++ b/pkg/kubelet/qos/BUILD
@@ -21,6 +21,7 @@ go_library(
        "//pkg/api:go_default_library",
        "//pkg/api/resource:go_default_library",
        "//pkg/api/v1:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
        "//pkg/util/sets:go_default_library",
    ],
 )
@@ -36,5 +37,6 @@ go_test(
    deps = [
        "//pkg/api/resource:go_default_library",
        "//pkg/api/v1:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
    ],
 )
--- a/pkg/kubelet/qos/policy.go
+++ b/pkg/kubelet/qos/policy.go
@@ -16,14 +16,20 @@ limitations under the License.

 package qos

-import "k8s.io/kubernetes/pkg/api/v1"
+import (
+	"k8s.io/kubernetes/pkg/api/v1"
+	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+)

 const (
 	// PodInfraOOMAdj is very docker specific. For arbitrary runtime, it may not make
 	// sense to set sandbox level oom score, e.g. a sandbox could only be a namespace
 	// without a process.
 	// TODO: Handle infra container oom score adj in a runtime agnostic way.
+	// TODO: Handle the critical pod OOM score adjustment (CriticalPodOOMAdj) with a proper preemption priority.
+	// CriticalPodOOMAdj is a workaround for https://github.com/kubernetes/kubernetes/issues/38322.
 	PodInfraOOMAdj        int = -998
+	CriticalPodOOMAdj     int = -998
 	KubeletOOMScoreAdj    int = -999
 	DockerOOMScoreAdj     int = -999
 	KubeProxyOOMScoreAdj  int = -999
@@ -38,6 +44,10 @@ const (
 // and 1000. Containers with higher OOM scores are killed if the system runs out of memory.
 // See https://lwn.net/Articles/391222/ for more information.
 func GetContainerOOMScoreAdjust(pod *v1.Pod, container *v1.Container, memoryCapacity int64) int {
+	if kubetypes.IsCriticalPod(pod) {
+		return CriticalPodOOMAdj
+	}
+
 	switch GetPodQOS(pod) {
 	case Guaranteed:
 		// Guaranteed containers should be the last to get killed.
--- a/pkg/kubelet/qos/policy_test.go
+++ b/pkg/kubelet/qos/policy_test.go
@@ -22,6 +22,7 @@ import (

 	"k8s.io/kubernetes/pkg/api/resource"
 	"k8s.io/kubernetes/pkg/api/v1"
+	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 )

 const (
@@ -135,6 +136,25 @@ var (
 			},
 		},
 	}
+	criticalPodWithNoLimit = v1.Pod{
+		ObjectMeta: v1.ObjectMeta{
+			Annotations: map[string]string{
+				kubetypes.CriticalPodAnnotationKey: "",
+			},
+		},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{
+				{
+					Resources: v1.ResourceRequirements{
+						Requests: v1.ResourceList{
+							v1.ResourceName(v1.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)),
+							v1.ResourceName(v1.ResourceCPU):    resource.MustParse("5m"),
+						},
+					},
+				},
+			},
+		},
+	}
 )

 type oomTest struct {
@@ -188,6 +208,12 @@ func TestGetContainerOOMScoreAdjust(t *testing.T) {
 			lowOOMScoreAdj:  2,
 			highOOMScoreAdj: 2,
 		},
+		{
+			pod:             &criticalPodWithNoLimit,
+			memoryCapacity:  standardMemoryAmount,
+			lowOOMScoreAdj:  -998,
+			highOOMScoreAdj: -998,
+		},
 	}
 	for _, test := range oomTests {
 		oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity)
--- a/pkg/kubelet/types/pod_update.go
+++ b/pkg/kubelet/types/pod_update.go
@@ -140,3 +140,11 @@ func (sp SyncPodType) String() string {
 		return "unknown"
 	}
 }
+
+// IsCriticalPod returns true if the pod bears the critical pod annotation
+// key. Both the rescheduler and the kubelet use this key to make admission
+// and scheduling decisions.
+func IsCriticalPod(pod *v1.Pod) bool {
+	_, ok := pod.Annotations[CriticalPodAnnotationKey]
+	return ok
+}