Avoid evicting critical pods in Kubelet if a special feature gate is enabled

Signed-off-by: Vishnu Kannan <vishnuk@google.com>
This commit is contained in:
Vishnu Kannan 2017-01-28 14:48:35 -08:00 committed by Vishnu kannan
parent f85bbcb78d
commit c967ab7b99
10 changed files with 227 additions and 64 deletions

View File

@ -123,7 +123,7 @@ fi
RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
# Optional: set feature gates
FEATURE_GATES="${KUBE_FEATURE_GATES:-}"
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
# Optional: Install cluster DNS.
ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"

View File

@ -83,7 +83,7 @@ MASTER_IP_RANGE="${MASTER_IP_RANGE:-10.246.0.0/24}"
RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
# Optional: set feature gates
FEATURE_GATES="${KUBE_FEATURE_GATES:-}"
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}

View File

@ -58,6 +58,13 @@ const (
// contains a privileged container, or specific non-namespaced capabilities (MKNOD, SYS_MODULE,
// SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon.
ExperimentalHostUserNamespaceDefaultingGate utilfeature.Feature = "ExperimentalHostUserNamespaceDefaulting"
// owner: @vishh
// alpha: v1.5
//
// Ensures guaranteed scheduling of pods marked with a special pod annotation `scheduler.alpha.kubernetes.io/critical-pod`
// and also prevents them from being evicted from a node.
ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation"
)
func init() {
@ -73,6 +80,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
DynamicKubeletConfig: {Default: false, PreRelease: utilfeature.Alpha},
DynamicVolumeProvisioning: {Default: true, PreRelease: utilfeature.Alpha},
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha},
// inherited features from generic apiserver, relisted here to get a conflict if it is changed
// unintentionally on either side:

View File

@ -9,36 +9,16 @@ load(
"go_test",
)
go_library(
name = "go_default_library",
srcs = [
"doc.go",
"eviction_manager.go",
"helpers.go",
"types.go",
cgo_genrule(
name = "cgo_codegen",
srcs = ["threshold_notifier_linux.go"],
clinkopts = [
"-lz",
"-lm",
"-lpthread",
"-ldl",
],
library = ":cgo_codegen",
tags = ["automanaged"],
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/cm:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/qos:go_default_library",
"//pkg/kubelet/server/stats:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/quota/evaluator/core:go_default_library",
"//vendor:github.com/golang/glog",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/util/wait",
"//vendor:k8s.io/client-go/pkg/api/v1",
"//vendor:k8s.io/client-go/tools/record",
"//vendor:k8s.io/client-go/util/clock",
],
)
go_test(
@ -54,26 +34,50 @@ go_test(
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/quota:go_default_library",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/types",
"//vendor:k8s.io/apiserver/pkg/util/feature",
"//vendor:k8s.io/client-go/pkg/api/v1",
"//vendor:k8s.io/client-go/tools/record",
"//vendor:k8s.io/client-go/util/clock",
],
)
cgo_genrule(
name = "cgo_codegen",
srcs = ["threshold_notifier_linux.go"],
clinkopts = [
"-lz",
"-lm",
"-lpthread",
"-ldl",
go_library(
name = "go_default_library",
srcs = [
"doc.go",
"eviction_manager.go",
"helpers.go",
"types.go",
],
library = ":cgo_codegen",
tags = ["automanaged"],
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/features:go_default_library",
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/cm:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/qos:go_default_library",
"//pkg/kubelet/server/stats:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/quota/evaluator/core:go_default_library",
"//vendor:github.com/golang/glog",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/util/wait",
"//vendor:k8s.io/apiserver/pkg/util/feature",
"//vendor:k8s.io/client-go/pkg/api/v1",
"//vendor:k8s.io/client-go/tools/record",
"//vendor:k8s.io/client-go/util/clock",
],
)
filegroup(

View File

@ -25,14 +25,17 @@ import (
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/clock"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/server/stats"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util/format"
)
@ -311,6 +314,12 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
// If the pod is marked as critical and support for critical pod annotations is enabled,
// do not evict such pods. Once Kubelet supports preemptions, these pods can be safely evicted.
if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCriticalPod(pod) {
continue
}
status := v1.PodStatus{
Phase: v1.PodFailed,
Message: fmt.Sprintf(message, resourceToReclaim),

View File

@ -22,14 +22,15 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/clock"
kubeapi "k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/types"
"k8s.io/kubernetes/pkg/util/clock"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)
// mockPodKiller is used to testing which pod is killed
@ -1087,3 +1088,135 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit)
}
}
// TestCriticalPodsAreNotEvicted
func TestCriticalPodsAreNotEvicted(t *testing.T) {
podMaker := makePodWithMemoryStats
summaryStatsMaker := makeMemoryStats
podsToMake := []podToMake{
{name: "critical", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"},
}
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
pods = append(pods, pod)
podStats[pod] = podStat
}
// Mark the pod as critical
pods[0].Annotations = map[string]string{
kubelettypes.CriticalPodAnnotationKey: "",
}
pods[0].Namespace = kubeapi.NamespaceSystem
podToEvict := pods[0]
activePodsFunc := func() []*v1.Pod {
return pods
}
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{
Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
}
config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []Threshold{
{
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
{
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: ThresholdValue{
Quantity: quantityMustParse("2Gi"),
},
GracePeriod: time.Minute * 2,
},
},
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
}
// Enable critical pod annotation feature gate
utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=True")
// induce soft threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure since soft threshold was met")
}
// verify no pod was yet killed because there has not yet been enough time passed.
if podKiller.pod != nil {
t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name)
}
// step forward in time pass the grace period
fakeClock.Step(3 * time.Minute)
summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure since soft threshold was met")
}
// verify the right pod was killed with the right grace period.
if podKiller.pod == podToEvict {
t.Errorf("Manager chose to kill critical pod: %v, but should have ignored it", podKiller.pod.Name)
}
// reset state
podKiller.pod = nil
podKiller.gracePeriodOverride = nil
// remove memory pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("3Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
t.Errorf("Manager should not report memory pressure")
}
// Disable critical pod annotation feature gate
utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=False")
// induce memory pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// check the right pod was killed
if podKiller.pod != podToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
}

View File

@ -23,7 +23,6 @@ import (
"k8s.io/kubernetes/pkg/api/v1"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/secret"
"k8s.io/kubernetes/pkg/types"
)
// Manager stores and manages access to pods, maintaining the mappings

View File

@ -8,6 +8,21 @@ load(
"go_test",
)
go_test(
name = "go_default_test",
srcs = [
"policy_test.go",
"qos_test.go",
],
library = ":go_default_library",
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
],
)
go_library(
name = "go_default_library",
srcs = [
@ -19,28 +34,11 @@ go_library(
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
],
)
go_test(
name = "go_default_test",
srcs = [
"policy_test.go",
"qos_test.go",
],
library = ":go_default_library",
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),

View File

@ -19,6 +19,7 @@ go_library(
],
tags = ["automanaged"],
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
],

View File

@ -20,13 +20,17 @@ import (
"fmt"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kubeapi "k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
)
const ConfigSourceAnnotationKey = "kubernetes.io/config.source"
const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
const ConfigHashAnnotationKey = "kubernetes.io/config.hash"
const (
ConfigSourceAnnotationKey = "kubernetes.io/config.source"
ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
ConfigHashAnnotationKey = "kubernetes.io/config.hash"
CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
)
// PodOperation defines what changes will be made on a pod configuration.
type PodOperation int
@ -137,6 +141,13 @@ func (sp SyncPodType) String() string {
// key. Both the rescheduler and the kubelet use this key to make admission
// and scheduling decisions.
func IsCriticalPod(pod *v1.Pod) bool {
_, ok := pod.Annotations[CriticalPodAnnotationKey]
return ok
// Critical pods are restricted to "kube-system" namespace as of now.
if pod.Namespace != kubeapi.NamespaceSystem {
return false
}
val, ok := pod.Annotations[CriticalPodAnnotationKey]
if ok && val == "" {
return true
}
return false
}