From b8a63537dd0ef8b1f61ed46b77d4a9650a146f4d Mon Sep 17 00:00:00 2001
From: Vishnu Kannan
Date: Wed, 25 Jan 2017 20:24:38 -0800
Subject: [PATCH 1/7] Revert "Don't evict static pods"

This reverts commit 1743c6b6ab23919c8f10573a33fdac421aa88288.
---
 pkg/kubelet/eviction/BUILD               |  1 -
 pkg/kubelet/eviction/eviction_manager.go | 10 ----------
 2 files changed, 11 deletions(-)

diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD
index b2345bd9b90..65617ac5790 100644
--- a/pkg/kubelet/eviction/BUILD
+++ b/pkg/kubelet/eviction/BUILD
@@ -25,7 +25,6 @@ go_library(
         "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
         "//pkg/kubelet/cm:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
-        "//pkg/kubelet/pod:go_default_library",
         "//pkg/kubelet/qos:go_default_library",
         "//pkg/kubelet/server/stats:go_default_library",
         "//pkg/kubelet/types:go_default_library",
diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index 04d6e9a7514..8d55989faa3 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -31,7 +31,6 @@ import (
 	"k8s.io/kubernetes/pkg/api/v1"
 	"k8s.io/kubernetes/pkg/kubelet/cm"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
-	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	"k8s.io/kubernetes/pkg/kubelet/server/stats"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
@@ -313,15 +312,6 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// we kill at most a single pod during each eviction interval
 	for i := range activePods {
 		pod := activePods[i]
-		if kubepod.IsStaticPod(pod) {
-			// The eviction manager doesn't evict static pods. To stop a static
-			// pod, the admin needs to remove the manifest from kubelet's
-			// --config directory.
-			// TODO(39124): This is a short term fix, we can't assume static pods
-			// are always well behaved.
-			glog.Infof("eviction manager: NOT evicting static pod %v", pod.Name)
-			continue
-		}
 		status := v1.PodStatus{
 			Phase:   v1.PodFailed,
 			Message: fmt.Sprintf(message, resourceToReclaim),

From a3ae8c2b21f5fa87771fae1593652fbcd94eacdb Mon Sep 17 00:00:00 2001
From: Vishnu Kannan
Date: Wed, 25 Jan 2017 20:30:10 -0800
Subject: [PATCH 2/7] Revert "assign -998 as the oom_score_adj for critical
 pods."

This reverts commit 53931fbce4d2919f22e99f9dfa681f55e9278721.
---
 pkg/kubelet/qos/policy.go      | 12 +-----------
 pkg/kubelet/qos/policy_test.go | 27 ---------------------------
 2 files changed, 1 insertion(+), 38 deletions(-)

diff --git a/pkg/kubelet/qos/policy.go b/pkg/kubelet/qos/policy.go
index 313c044cafd..8eb97b80093 100644
--- a/pkg/kubelet/qos/policy.go
+++ b/pkg/kubelet/qos/policy.go
@@ -16,20 +16,14 @@ limitations under the License.

 package qos

-import (
-	"k8s.io/kubernetes/pkg/api/v1"
-	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
-)
+import "k8s.io/kubernetes/pkg/api/v1"

 const (
 	// PodInfraOOMAdj is very docker specific. For arbitrary runtime, it may not make
 	// sense to set sandbox level oom score, e.g. a sandbox could only be a namespace
 	// without a process.
 	// TODO: Handle infra container oom score adj in a runtime agnostic way.
-	// TODO: Should handle critical pod oom score adj with a proper preemption priority.
-	// This is the workaround for https://github.com/kubernetes/kubernetes/issues/38322.
 	PodInfraOOMAdj        int = -998
-	CriticalPodOOMAdj     int = -998
 	KubeletOOMScoreAdj    int = -999
 	DockerOOMScoreAdj     int = -999
 	KubeProxyOOMScoreAdj  int = -999
@@ -44,10 +38,6 @@ const (
 // and 1000.
 // Containers with higher OOM scores are killed if the system runs out of memory.
 // See https://lwn.net/Articles/391222/ for more information.
 func GetContainerOOMScoreAdjust(pod *v1.Pod, container *v1.Container, memoryCapacity int64) int {
-	if kubetypes.IsCriticalPod(pod) {
-		return CriticalPodOOMAdj
-	}
-
 	switch GetPodQOS(pod) {
 	case v1.PodQOSGuaranteed:
 		// Guaranteed containers should be the last to get killed.
diff --git a/pkg/kubelet/qos/policy_test.go b/pkg/kubelet/qos/policy_test.go
index b6247ec6125..6cd6f2d6c8c 100644
--- a/pkg/kubelet/qos/policy_test.go
+++ b/pkg/kubelet/qos/policy_test.go
@@ -21,9 +21,7 @@ import (
 	"testing"

 	"k8s.io/apimachinery/pkg/api/resource"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/kubernetes/pkg/api/v1"
-	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 )

 const (
@@ -137,25 +135,6 @@ var (
 			},
 		},
 	}
-	criticalPodWithNoLimit = v1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Annotations: map[string]string{
-				kubetypes.CriticalPodAnnotationKey: "",
-			},
-		},
-		Spec: v1.PodSpec{
-			Containers: []v1.Container{
-				{
-					Resources: v1.ResourceRequirements{
-						Requests: v1.ResourceList{
-							v1.ResourceName(v1.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)),
-							v1.ResourceName(v1.ResourceCPU):    resource.MustParse("5m"),
-						},
-					},
-				},
-			},
-		},
-	}
 )

 type oomTest struct {
@@ -209,12 +188,6 @@ func TestGetContainerOOMScoreAdjust(t *testing.T) {
 			lowOOMScoreAdj:  2,
 			highOOMScoreAdj: 2,
 		},
-		{
-			pod:             &criticalPodWithNoLimit,
-			memoryCapacity:  standardMemoryAmount,
-			lowOOMScoreAdj:  -998,
-			highOOMScoreAdj: -998,
-		},
 	}
 	for _, test := range oomTests {
 		oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity)

From ffd7dda2343d6ea05f2feec33b6d684ba640c346 Mon Sep 17 00:00:00 2001
From: Vishnu Kannan
Date: Wed, 25 Jan 2017 20:39:13 -0800
Subject: [PATCH 3/7] Revert "Kubelet admits critical pods even under memory
 pressure"

This reverts commit afd676d94c614fc7336737bad1880784e0228c08.
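
For context: after this revert, admission under memory pressure depends only
on the pod's QoS class. A minimal sketch of the resulting check, condensed
from the eviction_manager.go hunk below (other node conditions omitted):

    // Under memory pressure, admit only pods that are not BestEffort.
    if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
        if v1.PodQOSBestEffort != qos.GetPodQOS(attrs.Pod) {
            return lifecycle.PodAdmitResult{Admit: true}
        }
    }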
---
 pkg/kubelet/eviction/BUILD                    |  1 -
 pkg/kubelet/eviction/eviction_manager.go      |  3 +--
 pkg/kubelet/eviction/eviction_manager_test.go | 26 +++++++++----------
 pkg/kubelet/pod/pod_manager.go                |  1 +
 pkg/kubelet/types/pod_update.go               |  9 -------
 5 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD
index 65617ac5790..06b695d89a4 100644
--- a/pkg/kubelet/eviction/BUILD
+++ b/pkg/kubelet/eviction/BUILD
@@ -54,7 +54,6 @@ go_test(
         "//pkg/api/v1:go_default_library",
         "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
-        "//pkg/kubelet/types:go_default_library",
         "//pkg/quota:go_default_library",
         "//vendor:k8s.io/apimachinery/pkg/api/resource",
         "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index 8d55989faa3..4726df19fa7 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -33,7 +33,6 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	"k8s.io/kubernetes/pkg/kubelet/server/stats"
-	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 )
@@ -110,7 +109,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 	// the node has memory pressure, admit if not best-effort
 	if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
 		notBestEffort := v1.PodQOSBestEffort != qos.GetPodQOS(attrs.Pod)
-		if notBestEffort || kubetypes.IsCriticalPod(attrs.Pod) {
+		if notBestEffort {
 			return lifecycle.PodAdmitResult{Admit: true}
 		}
 	}
diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go
index e441b4204bb..f6e74bbe077 100644
--- a/pkg/kubelet/eviction/eviction_manager_test.go
+++ b/pkg/kubelet/eviction/eviction_manager_test.go
@@ -28,7 +28,8 @@ import (
 	"k8s.io/kubernetes/pkg/api/v1"
 	statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
-	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+	"k8s.io/kubernetes/pkg/types"
+	"k8s.io/kubernetes/pkg/util/clock"
 )

 // mockPodKiller is used to testing which pod is killed
@@ -212,8 +213,6 @@ func TestMemoryPressure(t *testing.T) {
 	// create a best effort pod to test admission
 	bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
 	burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
-	criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
-	criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""}

 	// synchronize
 	manager.synchronize(diskInfoProvider, activePodsFunc)

 	// we should not have memory pressure
 	if manager.IsUnderMemoryPressure() {
 	}

 	// try to admit our pods (they should succeed)
-	expected := []bool{true, true, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
+	expected := []bool{true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
 		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
 			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
 		}
@@ -300,10 +299,9 @@ func TestMemoryPressure(t *testing.T) {
 		t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
 	}

-	// the best-effort pod without critical annotation should not admit,
-	// burstable and critical pods should
-	expected = []bool{false, true, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
+	// the best-effort pod should not admit, burstable should
+	expected = []bool{false, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
 		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
 			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
 		}
@@ -325,9 +323,9 @@ func TestMemoryPressure(t *testing.T) {
 		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
 	}

-	// the best-effort pod should not admit, burstable and critical pods should
-	expected = []bool{false, true, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
+	// the best-effort pod should not admit, burstable should
+	expected = []bool{false, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
 		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
 			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
 		}
@@ -350,8 +348,8 @@ func TestMemoryPressure(t *testing.T) {
 	}

 	// all pods should admit now
-	expected = []bool{true, true, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
+	expected = []bool{true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
 		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
 			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
 		}
diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go
index dd069b0fa3e..39d9e3c4791 100644
--- a/pkg/kubelet/pod/pod_manager.go
+++ b/pkg/kubelet/pod/pod_manager.go
@@ -23,6 +23,7 @@ import (
 	"k8s.io/kubernetes/pkg/api/v1"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	"k8s.io/kubernetes/pkg/kubelet/secret"
+	"k8s.io/kubernetes/pkg/types"
 )

 // Manager stores and manages access to pods, maintaining the mappings
diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go
index c91077c1c23..b99ec9de6ae 100644
--- a/pkg/kubelet/types/pod_update.go
+++ b/pkg/kubelet/types/pod_update.go
@@ -28,15 +28,6 @@ const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
 const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
 const ConfigHashAnnotationKey = "kubernetes.io/config.hash"

-// This key needs to sync with the key used by the rescheduler, which currently
-// lives in contrib. Its presence indicates 2 things, as far as the kubelet is
-// concerned:
-// 1. Resource related admission checks will prioritize the admission of
-//    pods bearing the key, over pods without the key, regardless of QoS.
-// 2. The OOM score of pods bearing the key will be <= pods without
-//    the key (where the <= part is determied by QoS).
-const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
-
 // PodOperation defines what changes will be made on a pod configuration.
 type PodOperation int

From 6ddb52844677db554c65d3ec8c0e1311c16d624b Mon Sep 17 00:00:00 2001
From: Vishnu Kannan
Date: Wed, 25 Jan 2017 20:41:09 -0800
Subject: [PATCH 4/7] Revert "Sort critical pods before admission"

This reverts commit b7409e00380b349ec971d089363ad7ff22b37c76.
---
 pkg/kubelet/kubelet.go      | 17 ++--------
 pkg/kubelet/kubelet_test.go | 63 -------------------------------------
 2 files changed, 2 insertions(+), 78 deletions(-)

diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index d229e7413f4..63e40a85303 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -1898,21 +1898,8 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *v1.Pod, start time.Time) {
 // a config source.
 func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
 	start := kl.clock.Now()
-
-	// Pass critical pods through admission check first.
-	var criticalPods []*v1.Pod
-	var nonCriticalPods []*v1.Pod
-	for _, p := range pods {
-		if kubetypes.IsCriticalPod(p) {
-			criticalPods = append(criticalPods, p)
-		} else {
-			nonCriticalPods = append(nonCriticalPods, p)
-		}
-	}
-	sort.Sort(sliceutils.PodsByCreationTime(criticalPods))
-	sort.Sort(sliceutils.PodsByCreationTime(nonCriticalPods))
-
-	for _, pod := range append(criticalPods, nonCriticalPods...) {
+	sort.Sort(sliceutils.PodsByCreationTime(pods))
+	for _, pod := range pods {
 		existingPods := kl.podManager.GetPods()
 		// Always add the pod to the pod manager. Kubelet relies on the pod
 		// manager as the source of truth for the desired state. If a pod does
diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go
index 4a834f0c22e..9db0360ffa8 100644
--- a/pkg/kubelet/kubelet_test.go
+++ b/pkg/kubelet/kubelet_test.go
@@ -479,69 +479,6 @@ func TestHandlePortConflicts(t *testing.T) {
 	require.Equal(t, v1.PodPending, status.Phase)
 }

-// Tests that we sort pods based on criticality.
-func TestCriticalPrioritySorting(t *testing.T) {
-	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
-	kl := testKubelet.kubelet
-	nodes := []v1.Node{
-		{ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname},
-			Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: v1.ResourceList{
-				v1.ResourceCPU:    *resource.NewMilliQuantity(10, resource.DecimalSI),
-				v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI),
-				v1.ResourcePods:   *resource.NewQuantity(40, resource.DecimalSI),
-			}}},
-	}
-	kl.nodeLister = testNodeLister{nodes: nodes}
-	kl.nodeInfo = testNodeInfo{nodes: nodes}
-	testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorapi.MachineInfo{}, nil)
-	testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
-	testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
-
-	spec := v1.PodSpec{NodeName: string(kl.nodeName),
-		Containers: []v1.Container{{Resources: v1.ResourceRequirements{
-			Requests: v1.ResourceList{
-				"memory": resource.MustParse("90"),
-			},
-		}}},
-	}
-	pods := []*v1.Pod{
-		podWithUidNameNsSpec("000000000", "newpod", "foo", spec),
-		podWithUidNameNsSpec("987654321", "oldpod", "foo", spec),
-		podWithUidNameNsSpec("123456789", "middlepod", "foo", spec),
-	}
-
-	// Pods are not sorted by creation time.
-	startTime := time.Now()
-	pods[0].CreationTimestamp = metav1.NewTime(startTime.Add(10 * time.Second))
-	pods[1].CreationTimestamp = metav1.NewTime(startTime)
-	pods[2].CreationTimestamp = metav1.NewTime(startTime.Add(1 * time.Second))
-
-	// Make the middle and new pod critical, the middle pod should win
-	// even though it comes later in the list
-	critical := map[string]string{kubetypes.CriticalPodAnnotationKey: ""}
-	pods[0].Annotations = critical
-	pods[1].Annotations = map[string]string{}
-	pods[2].Annotations = critical
-
-	// The non-critical pod should be rejected
-	notfittingPods := []*v1.Pod{pods[0], pods[1]}
-	fittingPod := pods[2]
-
-	kl.HandlePodAdditions(pods)
-	// Check pod status stored in the status map.
-	// notfittingPod should be Failed
-	for _, p := range notfittingPods {
-		status, found := kl.statusManager.GetPodStatus(p.UID)
-		require.True(t, found, "Status of pod %q is not found in the status map", p.UID)
-		require.Equal(t, v1.PodFailed, status.Phase)
-	}
-
-	// fittingPod should be Pending
-	status, found := kl.statusManager.GetPodStatus(fittingPod.UID)
-	require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID)
-	require.Equal(t, v1.PodPending, status.Phase)
-}
-
 // Tests that we handle host name conflicts correctly by setting the failed status in status map.
 func TestHandleHostNameConflicts(t *testing.T) {
 	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)

From f85bbcb78d0e6bff2959bb47c5d516bfa76d323c Mon Sep 17 00:00:00 2001
From: Vishnu Kannan
Date: Wed, 25 Jan 2017 20:41:42 -0800
Subject: [PATCH 5/7] update kube proxy critical pod annotation comments to
 reflect reality

Signed-off-by: Vishnu Kannan
---
 cluster/saltbase/salt/kube-proxy/kube-proxy.manifest | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest b/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
index 43c251f156f..af43473c6da 100644
--- a/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
+++ b/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
@@ -38,9 +38,8 @@ kind: Pod
 metadata:
   name: kube-proxy
   namespace: kube-system
-  # This annotation lowers the possibility that kube-proxy gets evicted when the
-  # node is under memory pressure, and prioritizes it for admission, even if
-  # the node is under memory pressure.
+  # This annotation ensures that kube-proxy does not get evicted if the node
+  # supports the critical pod annotation based priority scheme.
   # Note that kube-proxy runs as a static pod so this annotation does NOT have
   # any effect on rescheduler (default scheduler and rescheduler are not
   # involved in scheduling kube-proxy).
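
For reference, the "static pod" distinction the manifest comment above leans
on comes from the kubelet's config-source annotation (see the
ConfigSourceAnnotationKey constant touched in patch 3). A simplified sketch of
the check, assuming "api" is the source string recorded for API-server pods
(the real helper lives in pkg/kubelet/pod):

    // A pod is static when its config source annotation says it was read
    // from a file or HTTP manifest rather than the API server.
    func isStaticPod(pod *v1.Pod) bool {
        source, ok := pod.Annotations["kubernetes.io/config.source"]
        return ok && source != "api"
    }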
From c967ab7b99b02c01ece60bf971e908961a5b52e6 Mon Sep 17 00:00:00 2001
From: Vishnu Kannan
Date: Sat, 28 Jan 2017 14:48:35 -0800
Subject: [PATCH 6/7] Avoid evicting critical pods in Kubelet if a special
 feature gate is enabled

Signed-off-by: Vishnu Kannan
---
 cluster/gce/config-default.sh                 |   2 +-
 cluster/gce/config-test.sh                    |   2 +-
 pkg/features/kube_features.go                 |   8 +
 pkg/kubelet/eviction/BUILD                    |  76 +++++-----
 pkg/kubelet/eviction/eviction_manager.go      |   9 ++
 pkg/kubelet/eviction/eviction_manager_test.go | 137 +++++++++++++++++-
 pkg/kubelet/pod/pod_manager.go                |   1 -
 pkg/kubelet/qos/BUILD                         |  32 ++--
 pkg/kubelet/types/BUILD                       |   1 +
 pkg/kubelet/types/pod_update.go               |  23 ++-
 10 files changed, 227 insertions(+), 64 deletions(-)

diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh
index d2b01ecf64b..dce4dbb6653 100755
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@@ -123,7 +123,7 @@ fi
 RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"

 # Optional: set feature gates
-FEATURE_GATES="${KUBE_FEATURE_GATES:-}"
+FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"

 # Optional: Install cluster DNS.
 ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"
diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh
index 25ef5b59315..0d45c710d7f 100755
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@@ -83,7 +83,7 @@ MASTER_IP_RANGE="${MASTER_IP_RANGE:-10.246.0.0/24}"
 RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"

 # Optional: set feature gates
-FEATURE_GATES="${KUBE_FEATURE_GATES:-}"
+FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"

 TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}

diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go
index ddaf44df7d4..61b4d26ece0 100644
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@@ -58,6 +58,13 @@ const (
 	// contains a privileged container, or specific non-namespaced capabilities (MKNOD, SYS_MODULE,
 	// SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon.
 	ExperimentalHostUserNamespaceDefaultingGate utilfeature.Feature = "ExperimentalHostUserNamespaceDefaulting"
+
+	// owner: @vishh
+	// alpha: v1.5
+	//
+	// Ensures guaranteed scheduling of pods marked with a special pod annotation `scheduler.alpha.kubernetes.io/critical-pod`
+	// and also prevents them from being evicted from a node.
+	ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation"
 )

 func init() {
@@ -73,6 +80,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
 	DynamicKubeletConfig:                        {Default: false, PreRelease: utilfeature.Alpha},
 	DynamicVolumeProvisioning:                   {Default: true, PreRelease: utilfeature.Alpha},
 	ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
+	ExperimentalCriticalPodAnnotation:           {Default: false, PreRelease: utilfeature.Alpha},

 	// inherited features from generic apiserver, relisted here to get a conflict if it is changed
 	// unintentionally on either side:
diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD
index 06b695d89a4..7d94cd83a64 100644
--- a/pkg/kubelet/eviction/BUILD
+++ b/pkg/kubelet/eviction/BUILD
@@ -9,36 +9,16 @@ load(
     "go_test",
 )

-go_library(
-    name = "go_default_library",
-    srcs = [
-        "doc.go",
-        "eviction_manager.go",
-        "helpers.go",
-        "types.go",
+cgo_genrule(
+    name = "cgo_codegen",
+    srcs = ["threshold_notifier_linux.go"],
+    clinkopts = [
+        "-lz",
+        "-lm",
+        "-lpthread",
+        "-ldl",
     ],
-    library = ":cgo_codegen",
     tags = ["automanaged"],
-    deps = [
-        "//pkg/api:go_default_library",
-        "//pkg/api/v1:go_default_library",
-        "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
-        "//pkg/kubelet/cm:go_default_library",
-        "//pkg/kubelet/lifecycle:go_default_library",
-        "//pkg/kubelet/qos:go_default_library",
-        "//pkg/kubelet/server/stats:go_default_library",
-        "//pkg/kubelet/types:go_default_library",
-        "//pkg/kubelet/util/format:go_default_library",
-        "//pkg/quota/evaluator/core:go_default_library",
-        "//vendor:github.com/golang/glog",
-        "//vendor:k8s.io/apimachinery/pkg/api/resource",
-        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
-        "//vendor:k8s.io/apimachinery/pkg/util/sets",
-        "//vendor:k8s.io/apimachinery/pkg/util/wait",
-        "//vendor:k8s.io/client-go/pkg/api/v1",
-        "//vendor:k8s.io/client-go/tools/record",
-        "//vendor:k8s.io/client-go/util/clock",
-    ],
 )

 go_test(
@@ -54,26 +34,50 @@ go_test(
         "//pkg/api/v1:go_default_library",
         "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
         "//pkg/quota:go_default_library",
         "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
         "//vendor:k8s.io/apimachinery/pkg/types",
+        "//vendor:k8s.io/apiserver/pkg/util/feature",
         "//vendor:k8s.io/client-go/pkg/api/v1",
         "//vendor:k8s.io/client-go/tools/record",
         "//vendor:k8s.io/client-go/util/clock",
     ],
 )

-cgo_genrule(
-    name = "cgo_codegen",
-    srcs = ["threshold_notifier_linux.go"],
-    clinkopts = [
-        "-lz",
-        "-lm",
-        "-lpthread",
-        "-ldl",
+go_library(
+    name = "go_default_library",
+    srcs = [
+        "doc.go",
+        "eviction_manager.go",
+        "helpers.go",
+        "types.go",
     ],
+    library = ":cgo_codegen",
     tags = ["automanaged"],
+    deps = [
+        "//pkg/api:go_default_library",
+        "//pkg/api/v1:go_default_library",
+        "//pkg/features:go_default_library",
+        "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
+        "//pkg/kubelet/cm:go_default_library",
+        "//pkg/kubelet/lifecycle:go_default_library",
+        "//pkg/kubelet/qos:go_default_library",
+        "//pkg/kubelet/server/stats:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
+        "//pkg/kubelet/util/format:go_default_library",
+        "//pkg/quota/evaluator/core:go_default_library",
+        "//vendor:github.com/golang/glog",
+        "//vendor:k8s.io/apimachinery/pkg/api/resource",
+        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/util/sets", + "//vendor:k8s.io/apimachinery/pkg/util/wait", + "//vendor:k8s.io/apiserver/pkg/util/feature", + "//vendor:k8s.io/client-go/pkg/api/v1", + "//vendor:k8s.io/client-go/tools/record", + "//vendor:k8s.io/client-go/util/clock", + ], ) filegroup( diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 4726df19fa7..665c8721a80 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -25,14 +25,17 @@ import ( "github.com/golang/glog" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientv1 "k8s.io/client-go/pkg/api/v1" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/clock" "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/cm" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util/format" ) @@ -311,6 +314,12 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // we kill at most a single pod during each eviction interval for i := range activePods { pod := activePods[i] + // If the pod is marked as critical and support for critical pod annotations is enabled, + // do not evict such pods. Once Kubelet supports preemptions, these pods can be safely evicted. + if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && + kubelettypes.IsCriticalPod(pod) { + continue + } status := v1.PodStatus{ Phase: v1.PodFailed, Message: fmt.Sprintf(message, resourceToReclaim), diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index f6e74bbe077..2554056a1da 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -22,14 +22,15 @@ import ( "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientv1 "k8s.io/client-go/pkg/api/v1" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/clock" + kubeapi "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" "k8s.io/kubernetes/pkg/kubelet/lifecycle" - "k8s.io/kubernetes/pkg/types" - "k8s.io/kubernetes/pkg/util/clock" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" ) // mockPodKiller is used to testing which pod is killed @@ -1087,3 +1088,135 @@ func TestInodePressureNodeFsInodes(t *testing.T) { t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) } } + +// TestCriticalPodsAreNotEvicted +func TestCriticalPodsAreNotEvicted(t *testing.T) { + podMaker := makePodWithMemoryStats + summaryStatsMaker := makeMemoryStats + podsToMake := []podToMake{ + {name: "critical", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"}, + } + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet) + pods = append(pods, pod) + podStats[pod] = podStat + } + + // Mark the pod as critical + pods[0].Annotations = map[string]string{ + kubelettypes.CriticalPodAnnotationKey: "", + } + pods[0].Namespace = kubeapi.NamespaceSystem + + 
+	podToEvict := pods[0]
+	activePodsFunc := func() []*v1.Pod {
+		return pods
+	}
+
+	fakeClock := clock.NewFakeClock(time.Now())
+	podKiller := &mockPodKiller{}
+	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	imageGC := &mockImageGC{freed: int64(0), err: nil}
+	nodeRef := &clientv1.ObjectReference{
+		Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
+	}
+
+	config := Config{
+		MaxPodGracePeriodSeconds: 5,
+		PressureTransitionPeriod: time.Minute * 5,
+		Thresholds: []Threshold{
+			{
+				Signal:   SignalMemoryAvailable,
+				Operator: OpLessThan,
+				Value: ThresholdValue{
+					Quantity: quantityMustParse("1Gi"),
+				},
+			},
+			{
+				Signal:   SignalMemoryAvailable,
+				Operator: OpLessThan,
+				Value: ThresholdValue{
+					Quantity: quantityMustParse("2Gi"),
+				},
+				GracePeriod: time.Minute * 2,
+			},
+		},
+	}
+	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)}
+	manager := &managerImpl{
+		clock:                        fakeClock,
+		killPodFunc:                  podKiller.killPodNow,
+		imageGC:                      imageGC,
+		config:                       config,
+		recorder:                     &record.FakeRecorder{},
+		summaryProvider:              summaryProvider,
+		nodeRef:                      nodeRef,
+		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
+		thresholdsFirstObservedAt:    thresholdsObservedAt{},
+	}
+
+	// Enable critical pod annotation feature gate
+	utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=True")
+	// induce soft threshold
+	fakeClock.Step(1 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should have memory pressure
+	if !manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should report memory pressure since soft threshold was met")
+	}
+
+	// verify no pod was yet killed because there has not yet been enough time passed.
+	if podKiller.pod != nil {
+		t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name)
+	}
+
+	// step forward in time pass the grace period
+	fakeClock.Step(3 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should have memory pressure
+	if !manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should report memory pressure since soft threshold was met")
+	}
+
+	// verify the right pod was killed with the right grace period.
+	if podKiller.pod == podToEvict {
+		t.Errorf("Manager chose to kill critical pod: %v, but should have ignored it", podKiller.pod.Name)
+	}
+	// reset state
+	podKiller.pod = nil
+	podKiller.gracePeriodOverride = nil
+
+	// remove memory pressure
+	fakeClock.Step(20 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("3Gi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should not have memory pressure
+	if manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should not report memory pressure")
+	}
+
+	// Disable critical pod annotation feature gate
+	utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=False")
+
+	// induce memory pressure!
+	fakeClock.Step(1 * time.Minute)
+	summaryProvider.result = summaryStatsMaker("500Mi", podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc)
+
+	// we should have memory pressure
+	if !manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should report memory pressure")
+	}
+
+	// check the right pod was killed
+	if podKiller.pod != podToEvict {
+		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
+	}
+}
diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go
index 39d9e3c4791..dd069b0fa3e 100644
--- a/pkg/kubelet/pod/pod_manager.go
+++ b/pkg/kubelet/pod/pod_manager.go
@@ -23,7 +23,6 @@ import (
 	"k8s.io/kubernetes/pkg/api/v1"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	"k8s.io/kubernetes/pkg/kubelet/secret"
-	"k8s.io/kubernetes/pkg/types"
 )

 // Manager stores and manages access to pods, maintaining the mappings
diff --git a/pkg/kubelet/qos/BUILD b/pkg/kubelet/qos/BUILD
index 177d4eafff6..36383f9e074 100644
--- a/pkg/kubelet/qos/BUILD
+++ b/pkg/kubelet/qos/BUILD
@@ -8,6 +8,21 @@ load(
     "go_test",
 )

+go_test(
+    name = "go_default_test",
+    srcs = [
+        "policy_test.go",
+        "qos_test.go",
+    ],
+    library = ":go_default_library",
+    tags = ["automanaged"],
+    deps = [
+        "//pkg/api/v1:go_default_library",
+        "//vendor:k8s.io/apimachinery/pkg/api/resource",
+        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
+    ],
+)
+
 go_library(
     name = "go_default_library",
     srcs = [
@@ -19,28 +34,11 @@ go_library(
     deps = [
         "//pkg/api:go_default_library",
         "//pkg/api/v1:go_default_library",
-        "//pkg/kubelet/types:go_default_library",
         "//vendor:k8s.io/apimachinery/pkg/api/resource",
         "//vendor:k8s.io/apimachinery/pkg/util/sets",
     ],
 )

-go_test(
-    name = "go_default_test",
-    srcs = [
-        "policy_test.go",
-        "qos_test.go",
-    ],
-    library = ":go_default_library",
-    tags = ["automanaged"],
-    deps = [
-        "//pkg/api/v1:go_default_library",
-        "//pkg/kubelet/types:go_default_library",
-        "//vendor:k8s.io/apimachinery/pkg/api/resource",
-        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
-    ],
-)
-
 filegroup(
     name = "package-srcs",
     srcs = glob(["**"]),
diff --git a/pkg/kubelet/types/BUILD b/pkg/kubelet/types/BUILD
index e93b7c34632..d37d38ff05a 100644
--- a/pkg/kubelet/types/BUILD
+++ b/pkg/kubelet/types/BUILD
@@ -19,6 +19,7 @@ go_library(
     ],
     tags = ["automanaged"],
    deps = [
+        "//pkg/api:go_default_library",
         "//pkg/api/v1:go_default_library",
         "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
     ],
diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go
index b99ec9de6ae..72e1b14d65b 100644
--- a/pkg/kubelet/types/pod_update.go
+++ b/pkg/kubelet/types/pod_update.go
@@ -20,13 +20,17 @@ import (
 	"fmt"

 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kubeapi "k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/api/v1"
 )

-const ConfigSourceAnnotationKey = "kubernetes.io/config.source"
-const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
-const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
-const ConfigHashAnnotationKey = "kubernetes.io/config.hash"
+const (
+	ConfigSourceAnnotationKey    = "kubernetes.io/config.source"
+	ConfigMirrorAnnotationKey    = "kubernetes.io/config.mirror"
+	ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
+	ConfigHashAnnotationKey      = "kubernetes.io/config.hash"
+	CriticalPodAnnotationKey     = "scheduler.alpha.kubernetes.io/critical-pod"
+)

 // PodOperation defines what changes will be made on a pod configuration.
 type PodOperation int

@@ -137,6 +141,13 @@ func (sp SyncPodType) String() string {
 // key. Both the rescheduler and the kubelet use this key to make admission
 // and scheduling decisions.
 func IsCriticalPod(pod *v1.Pod) bool {
-	_, ok := pod.Annotations[CriticalPodAnnotationKey]
-	return ok
+	// Critical pods are restricted to "kube-system" namespace as of now.
+	if pod.Namespace != kubeapi.NamespaceSystem {
+		return false
+	}
+	val, ok := pod.Annotations[CriticalPodAnnotationKey]
+	if ok && val == "" {
+		return true
+	}
+	return false
 }

From 77a88f7e8b99ed97a76821c9a19ccb87c06b73e6 Mon Sep 17 00:00:00 2001
From: Vishnu kannan
Date: Thu, 2 Feb 2017 10:27:52 -0800
Subject: [PATCH 7/7] update critical pod annotation feature gate to mention
 that BestEffort pods are not supported

Signed-off-by: Vishnu kannan
---
 pkg/features/kube_features.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go
index 61b4d26ece0..18856478b3d 100644
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@@ -64,6 +64,7 @@ const (
 	//
 	// Ensures guaranteed scheduling of pods marked with a special pod annotation `scheduler.alpha.kubernetes.io/critical-pod`
 	// and also prevents them from being evicted from a node.
+	// Note: This feature is not supported for `BestEffort` pods.
 	ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation"
 )
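
Taken together, the series makes critical-pod handling opt-in: a pod is
exempt from eviction only when the ExperimentalCriticalPodAnnotation gate is
enabled and IsCriticalPod returns true. A condensed sketch of the combined
check, mirroring the eviction-manager logic in patch 6 (the helper name is
illustrative, not kubelet code):

    // Skip eviction for critical pods only while the feature gate is on.
    // IsCriticalPod additionally requires the kube-system namespace and an
    // empty-valued scheduler.alpha.kubernetes.io/critical-pod annotation.
    func skipEviction(pod *v1.Pod) bool {
        return utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
            kubelettypes.IsCriticalPod(pod)
    }

Operators turn the behavior on by passing
ExperimentalCriticalPodAnnotation=true in the component feature gates (e.g.
via --feature-gates), which is what the KUBE_FEATURE_GATES defaults in the
GCE configs of patch 6 do.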