diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index d2b01ecf64b..dce4dbb6653 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -123,7 +123,7 @@ fi RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}" # Optional: set feature gates -FEATURE_GATES="${KUBE_FEATURE_GATES:-}" +FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}" # Optional: Install cluster DNS. ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}" diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index 25ef5b59315..0d45c710d7f 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -83,7 +83,7 @@ MASTER_IP_RANGE="${MASTER_IP_RANGE:-10.246.0.0/24}" RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}" # Optional: set feature gates -FEATURE_GATES="${KUBE_FEATURE_GATES:-}" +FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}" TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100} diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index ddaf44df7d4..61b4d26ece0 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -58,6 +58,13 @@ const ( // contains a privileged container, or specific non-namespaced capabilities (MKNOD, SYS_MODULE, // SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon. ExperimentalHostUserNamespaceDefaultingGate utilfeature.Feature = "ExperimentalHostUserNamespaceDefaulting" + + // owner: @vishh + // alpha: v1.5 + // + // Ensures guaranteed scheduling of pods marked with a special pod annotation `scheduler.alpha.kubernetes.io/critical-pod` + // and also prevents them from being evicted from a node. 
+ ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation" ) func init() { @@ -73,6 +80,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS DynamicKubeletConfig: {Default: false, PreRelease: utilfeature.Alpha}, DynamicVolumeProvisioning: {Default: true, PreRelease: utilfeature.Alpha}, ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta}, + ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 06b695d89a4..7d94cd83a64 100644 --- a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -9,36 +9,16 @@ load( "go_test", ) -go_library( - name = "go_default_library", - srcs = [ - "doc.go", - "eviction_manager.go", - "helpers.go", - "types.go", +cgo_genrule( + name = "cgo_codegen", + srcs = ["threshold_notifier_linux.go"], + clinkopts = [ + "-lz", + "-lm", + "-lpthread", + "-ldl", ], - library = ":cgo_codegen", tags = ["automanaged"], - deps = [ - "//pkg/api:go_default_library", - "//pkg/api/v1:go_default_library", - "//pkg/kubelet/api/v1alpha1/stats:go_default_library", - "//pkg/kubelet/cm:go_default_library", - "//pkg/kubelet/lifecycle:go_default_library", - "//pkg/kubelet/qos:go_default_library", - "//pkg/kubelet/server/stats:go_default_library", - "//pkg/kubelet/types:go_default_library", - "//pkg/kubelet/util/format:go_default_library", - "//pkg/quota/evaluator/core:go_default_library", - "//vendor:github.com/golang/glog", - "//vendor:k8s.io/apimachinery/pkg/api/resource", - "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", - "//vendor:k8s.io/apimachinery/pkg/util/sets", - "//vendor:k8s.io/apimachinery/pkg/util/wait", - "//vendor:k8s.io/client-go/pkg/api/v1", - "//vendor:k8s.io/client-go/tools/record", - 
"//vendor:k8s.io/client-go/util/clock", - ], ) go_test( @@ -54,26 +34,50 @@ go_test( "//pkg/api/v1:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/types:go_default_library", "//pkg/quota:go_default_library", "//vendor:k8s.io/apimachinery/pkg/api/resource", "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", "//vendor:k8s.io/apimachinery/pkg/types", + "//vendor:k8s.io/apiserver/pkg/util/feature", "//vendor:k8s.io/client-go/pkg/api/v1", "//vendor:k8s.io/client-go/tools/record", "//vendor:k8s.io/client-go/util/clock", ], ) -cgo_genrule( - name = "cgo_codegen", - srcs = ["threshold_notifier_linux.go"], - clinkopts = [ - "-lz", - "-lm", - "-lpthread", - "-ldl", +go_library( + name = "go_default_library", + srcs = [ + "doc.go", + "eviction_manager.go", + "helpers.go", + "types.go", ], + library = ":cgo_codegen", tags = ["automanaged"], + deps = [ + "//pkg/api:go_default_library", + "//pkg/api/v1:go_default_library", + "//pkg/features:go_default_library", + "//pkg/kubelet/api/v1alpha1/stats:go_default_library", + "//pkg/kubelet/cm:go_default_library", + "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/qos:go_default_library", + "//pkg/kubelet/server/stats:go_default_library", + "//pkg/kubelet/types:go_default_library", + "//pkg/kubelet/util/format:go_default_library", + "//pkg/quota/evaluator/core:go_default_library", + "//vendor:github.com/golang/glog", + "//vendor:k8s.io/apimachinery/pkg/api/resource", + "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", + "//vendor:k8s.io/apimachinery/pkg/util/sets", + "//vendor:k8s.io/apimachinery/pkg/util/wait", + "//vendor:k8s.io/apiserver/pkg/util/feature", + "//vendor:k8s.io/client-go/pkg/api/v1", + "//vendor:k8s.io/client-go/tools/record", + "//vendor:k8s.io/client-go/util/clock", + ], ) filegroup( diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 4726df19fa7..665c8721a80 100644 
--- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -25,14 +25,17 @@ import ( "github.com/golang/glog" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientv1 "k8s.io/client-go/pkg/api/v1" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/clock" "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/cm" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util/format" ) @@ -311,6 +314,12 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // we kill at most a single pod during each eviction interval for i := range activePods { pod := activePods[i] + // If the pod is marked as critical and support for critical pod annotations is enabled, + // do not evict such pods. Once Kubelet supports preemptions, these pods can be safely evicted. 
+ if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && + kubelettypes.IsCriticalPod(pod) { + continue + } status := v1.PodStatus{ Phase: v1.PodFailed, Message: fmt.Sprintf(message, resourceToReclaim), diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index f6e74bbe077..2554056a1da 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -22,14 +22,15 @@ import ( "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" + utilfeature "k8s.io/apiserver/pkg/util/feature" clientv1 "k8s.io/client-go/pkg/api/v1" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/clock" + kubeapi "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" "k8s.io/kubernetes/pkg/kubelet/lifecycle" - "k8s.io/kubernetes/pkg/types" - "k8s.io/kubernetes/pkg/util/clock" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" ) // mockPodKiller is used to testing which pod is killed @@ -1087,3 +1088,144 @@ func TestInodePressureNodeFsInodes(t *testing.T) { t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) } } + +// TestCriticalPodsAreNotEvicted verifies that a critical pod is skipped by eviction while the +// ExperimentalCriticalPodAnnotation feature gate is enabled, and is evicted once the gate is disabled. +func TestCriticalPodsAreNotEvicted(t *testing.T) { + podMaker := makePodWithMemoryStats + summaryStatsMaker := makeMemoryStats + podsToMake := []podToMake{ + {name: "critical", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"}, + } + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet) + pods = append(pods, pod) + podStats[pod] = podStat + } + + // Mark the pod as critical + pods[0].Annotations = map[string]string{ + kubelettypes.CriticalPodAnnotationKey: "", + } + pods[0].Namespace = 
kubeapi.NamespaceSystem + + podToEvict := pods[0] + activePodsFunc := func() []*v1.Pod { + return pods + } + + fakeClock := clock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + imageGC := &mockImageGC{freed: int64(0), err: nil} + nodeRef := &clientv1.ObjectReference{ + Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "", + } + + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: []Threshold{ + { + Signal: SignalMemoryAvailable, + Operator: OpLessThan, + Value: ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: SignalMemoryAvailable, + Operator: OpLessThan, + Value: ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: time.Minute * 2, + }, + }, + } + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: imageGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } + + // Enable critical pod annotation feature gate + if err := utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=True"); err != nil { + t.Fatalf("failed to enable ExperimentalCriticalPodAnnotation: %v", err) + } + // induce soft threshold + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("1500Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have memory pressure + if !manager.IsUnderMemoryPressure() { + t.Errorf("Manager should report memory pressure since soft threshold was met") + } + + // verify no pod was killed yet because the grace period has not elapsed. 
+ if podKiller.pod != nil { + t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name) + } + + // step forward in time past the grace period + fakeClock.Step(3 * time.Minute) + summaryProvider.result = summaryStatsMaker("1500Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have memory pressure + if !manager.IsUnderMemoryPressure() { + t.Errorf("Manager should report memory pressure since soft threshold was met") + } + + // verify the critical pod was not chosen for eviction. + if podKiller.pod == podToEvict { + t.Errorf("Manager chose to kill critical pod: %v, but should have ignored it", podKiller.pod.Name) + } + // reset state + podKiller.pod = nil + podKiller.gracePeriodOverride = nil + + // remove memory pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker("3Gi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have memory pressure + if manager.IsUnderMemoryPressure() { + t.Errorf("Manager should not report memory pressure") + } + + // Disable critical pod annotation feature gate + if err := utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=False"); err != nil { + t.Fatalf("failed to disable ExperimentalCriticalPodAnnotation: %v", err) + } + + // induce memory pressure! 
+ fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("500Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have memory pressure + if !manager.IsUnderMemoryPressure() { + t.Errorf("Manager should report memory pressure") + } + + // check the right pod was killed + // podKiller.pod stays nil when nothing was killed, so guard before dereferencing it + if podKiller.pod == nil { + t.Fatalf("Manager did not kill any pod, but should have chosen %v", podToEvict.Name) + } + if podKiller.pod != podToEvict { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } +} diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go index 39d9e3c4791..dd069b0fa3e 100644 --- a/pkg/kubelet/pod/pod_manager.go +++ b/pkg/kubelet/pod/pod_manager.go @@ -23,7 +23,6 @@ import ( "k8s.io/kubernetes/pkg/api/v1" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/secret" - "k8s.io/kubernetes/pkg/types" ) // Manager stores and manages access to pods, maintaining the mappings diff --git a/pkg/kubelet/qos/BUILD b/pkg/kubelet/qos/BUILD index 177d4eafff6..36383f9e074 100644 --- a/pkg/kubelet/qos/BUILD +++ b/pkg/kubelet/qos/BUILD @@ -8,6 +8,21 @@ load( "go_test", ) +go_test( + name = "go_default_test", + srcs = [ + "policy_test.go", + "qos_test.go", + ], + library = ":go_default_library", + tags = ["automanaged"], + deps = [ + "//pkg/api/v1:go_default_library", + "//vendor:k8s.io/apimachinery/pkg/api/resource", + "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", + ], +) + go_library( name = "go_default_library", srcs = [ @@ -19,28 +34,11 @@ go_library( deps = [ "//pkg/api:go_default_library", "//pkg/api/v1:go_default_library", - "//pkg/kubelet/types:go_default_library", "//vendor:k8s.io/apimachinery/pkg/api/resource", "//vendor:k8s.io/apimachinery/pkg/util/sets", ], ) -go_test( - name = "go_default_test", - srcs = [ - "policy_test.go", - "qos_test.go", - ], - library = ":go_default_library", - tags = ["automanaged"], - deps = [ - "//pkg/api/v1:go_default_library", - "//pkg/kubelet/types:go_default_library", - 
"//vendor:k8s.io/apimachinery/pkg/api/resource", - "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", - ], -) - filegroup( name = "package-srcs", srcs = glob(["**"]), diff --git a/pkg/kubelet/types/BUILD b/pkg/kubelet/types/BUILD index e93b7c34632..d37d38ff05a 100644 --- a/pkg/kubelet/types/BUILD +++ b/pkg/kubelet/types/BUILD @@ -19,6 +19,7 @@ go_library( ], tags = ["automanaged"], deps = [ + "//pkg/api:go_default_library", "//pkg/api/v1:go_default_library", "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", ], diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go index b99ec9de6ae..72e1b14d65b 100644 --- a/pkg/kubelet/types/pod_update.go +++ b/pkg/kubelet/types/pod_update.go @@ -20,13 +20,17 @@ import ( "fmt" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeapi "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" ) -const ConfigSourceAnnotationKey = "kubernetes.io/config.source" -const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror" -const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen" -const ConfigHashAnnotationKey = "kubernetes.io/config.hash" +const ( + ConfigSourceAnnotationKey = "kubernetes.io/config.source" + ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror" + ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen" + ConfigHashAnnotationKey = "kubernetes.io/config.hash" + CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod" +) // PodOperation defines what changes will be made on a pod configuration. type PodOperation int @@ -137,6 +141,11 @@ func (sp SyncPodType) String() string { // key. Both the rescheduler and the kubelet use this key to make admission // and scheduling decisions. func IsCriticalPod(pod *v1.Pod) bool { - _, ok := pod.Annotations[CriticalPodAnnotationKey] - return ok + // Critical pods are restricted to "kube-system" namespace as of now. 
+ if pod.Namespace != kubeapi.NamespaceSystem { + return false + } + // The annotation only marks a pod critical when it is present with an empty value. + val, ok := pod.Annotations[CriticalPodAnnotationKey] + return ok && val == "" }