diff --git a/cluster/addons/calico-policy-controller/calico-node-daemonset.yaml b/cluster/addons/calico-policy-controller/calico-node-daemonset.yaml index 5d6ab990aa9..491b2172e6e 100644 --- a/cluster/addons/calico-policy-controller/calico-node-daemonset.yaml +++ b/cluster/addons/calico-policy-controller/calico-node-daemonset.yaml @@ -17,6 +17,8 @@ spec: metadata: labels: k8s-app: calico-node + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical nodeSelector: diff --git a/cluster/addons/calico-policy-controller/calico-node-vertical-autoscaler-deployment.yaml b/cluster/addons/calico-policy-controller/calico-node-vertical-autoscaler-deployment.yaml index 96d5aeb9e79..fc6fab3dd0f 100644 --- a/cluster/addons/calico-policy-controller/calico-node-vertical-autoscaler-deployment.yaml +++ b/cluster/addons/calico-policy-controller/calico-node-vertical-autoscaler-deployment.yaml @@ -16,6 +16,8 @@ spec: metadata: labels: k8s-app: calico-node-autoscaler + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical containers: diff --git a/cluster/addons/calico-policy-controller/typha-deployment.yaml b/cluster/addons/calico-policy-controller/typha-deployment.yaml index a521df42121..f8f143f57ed 100644 --- a/cluster/addons/calico-policy-controller/typha-deployment.yaml +++ b/cluster/addons/calico-policy-controller/typha-deployment.yaml @@ -16,6 +16,8 @@ spec: metadata: labels: k8s-app: calico-typha + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical tolerations: diff --git a/cluster/addons/calico-policy-controller/typha-horizontal-autoscaler-deployment.yaml b/cluster/addons/calico-policy-controller/typha-horizontal-autoscaler-deployment.yaml index b9ae5bf1bbf..82c5a935db9 100644 --- a/cluster/addons/calico-policy-controller/typha-horizontal-autoscaler-deployment.yaml +++ 
b/cluster/addons/calico-policy-controller/typha-horizontal-autoscaler-deployment.yaml @@ -16,6 +16,8 @@ spec: metadata: labels: k8s-app: calico-typha-autoscaler + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical securityContext: diff --git a/cluster/addons/calico-policy-controller/typha-vertical-autoscaler-deployment.yaml b/cluster/addons/calico-policy-controller/typha-vertical-autoscaler-deployment.yaml index 3e66cfe565a..0e4f22355a1 100644 --- a/cluster/addons/calico-policy-controller/typha-vertical-autoscaler-deployment.yaml +++ b/cluster/addons/calico-policy-controller/typha-vertical-autoscaler-deployment.yaml @@ -16,6 +16,8 @@ spec: metadata: labels: k8s-app: calico-typha-autoscaler + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical containers: diff --git a/cluster/addons/cluster-monitoring/google/heapster-controller.yaml b/cluster/addons/cluster-monitoring/google/heapster-controller.yaml index 533e7e2a806..4ff89886741 100644 --- a/cluster/addons/cluster-monitoring/google/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/google/heapster-controller.yaml @@ -51,6 +51,7 @@ spec: k8s-app: heapster version: v1.6.0-beta.1 annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml index 1d1cd9a31d4..9f359f41418 100644 --- a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml +++ b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml @@ -51,6 +51,7 @@ spec: k8s-app: heapster version: v1.6.0-beta.1 annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' 
seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml b/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml index 26820387bef..c5b78d12680 100644 --- a/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml @@ -51,6 +51,7 @@ spec: k8s-app: heapster version: v1.6.0-beta.1 annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml b/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml index 769683b9c30..a4f95cb6607 100644 --- a/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml @@ -19,6 +19,7 @@ spec: k8s-app: influxGrafana version: v4 annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/cluster-monitoring/stackdriver/heapster-controller.yaml b/cluster/addons/cluster-monitoring/stackdriver/heapster-controller.yaml index fb3a82499ca..6bb898baf91 100644 --- a/cluster/addons/cluster-monitoring/stackdriver/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/stackdriver/heapster-controller.yaml @@ -39,6 +39,7 @@ spec: k8s-app: heapster version: v1.6.0-beta.1 annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml 
b/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml index b69083c07fe..414bb1af00c 100644 --- a/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml @@ -39,6 +39,7 @@ spec: k8s-app: heapster version: v1.6.0-beta.1 annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/dashboard/dashboard-controller.yaml b/cluster/addons/dashboard/dashboard-controller.yaml index 830fa7696e6..c4dee396bba 100644 --- a/cluster/addons/dashboard/dashboard-controller.yaml +++ b/cluster/addons/dashboard/dashboard-controller.yaml @@ -24,6 +24,7 @@ spec: labels: k8s-app: kubernetes-dashboard annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml b/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml index 708184acd5a..75d0ea1df67 100644 --- a/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml +++ b/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml @@ -14,6 +14,8 @@ spec: metadata: labels: k8s-app: nvidia-gpu-device-plugin + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical affinity: diff --git a/cluster/addons/dns-horizontal-autoscaler/dns-horizontal-autoscaler.yaml b/cluster/addons/dns-horizontal-autoscaler/dns-horizontal-autoscaler.yaml index 3e8ef1da643..49fd35e76a7 100644 --- a/cluster/addons/dns-horizontal-autoscaler/dns-horizontal-autoscaler.yaml +++ b/cluster/addons/dns-horizontal-autoscaler/dns-horizontal-autoscaler.yaml @@ -76,6 +76,7 @@ spec: labels: k8s-app: kube-dns-autoscaler annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' 
seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/dns/kube-dns/kube-dns.yaml.base b/cluster/addons/dns/kube-dns/kube-dns.yaml.base index 6a827ce89a0..cd8abb1a82f 100644 --- a/cluster/addons/dns/kube-dns/kube-dns.yaml.base +++ b/cluster/addons/dns/kube-dns/kube-dns.yaml.base @@ -82,6 +82,7 @@ spec: labels: k8s-app: kube-dns annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' prometheus.io/port: "10054" prometheus.io/scrape: "true" diff --git a/cluster/addons/dns/kube-dns/kube-dns.yaml.in b/cluster/addons/dns/kube-dns/kube-dns.yaml.in index b677a232d3c..f4160658aaa 100644 --- a/cluster/addons/dns/kube-dns/kube-dns.yaml.in +++ b/cluster/addons/dns/kube-dns/kube-dns.yaml.in @@ -82,6 +82,7 @@ spec: labels: k8s-app: kube-dns annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' prometheus.io/port: "10054" prometheus.io/scrape: "true" diff --git a/cluster/addons/dns/kube-dns/kube-dns.yaml.sed b/cluster/addons/dns/kube-dns/kube-dns.yaml.sed index ea5e6bae54a..2e397d29175 100644 --- a/cluster/addons/dns/kube-dns/kube-dns.yaml.sed +++ b/cluster/addons/dns/kube-dns/kube-dns.yaml.sed @@ -82,6 +82,7 @@ spec: labels: k8s-app: kube-dns annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' prometheus.io/port: "10054" prometheus.io/scrape: "true" diff --git a/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml b/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml index 5a379ac20d5..047133466e7 100644 --- a/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml +++ b/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml @@ -65,6 +65,7 @@ spec: # supports critical pod annotation based priority scheme. # Note that this does not guarantee admission on the nodes (#40573). 
annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-node-critical diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml index 5921f9ab969..49c97ad35a0 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml @@ -21,6 +21,11 @@ spec: k8s-app: fluentd-gcp kubernetes.io/cluster-service: "true" version: {{ fluentd_gcp_yaml_version }} + # This annotation ensures that fluentd does not get evicted if the node + # supports critical pod annotation based priority scheme. + # Note that this does not guarantee admission on the nodes (#40573). + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical serviceAccountName: fluentd-gcp diff --git a/cluster/addons/ip-masq-agent/ip-masq-agent.yaml b/cluster/addons/ip-masq-agent/ip-masq-agent.yaml index 0436c6ba1d9..18dc76ad284 100644 --- a/cluster/addons/ip-masq-agent/ip-masq-agent.yaml +++ b/cluster/addons/ip-masq-agent/ip-masq-agent.yaml @@ -24,6 +24,8 @@ spec: metadata: labels: k8s-app: ip-masq-agent + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical serviceAccountName: ip-masq-agent diff --git a/cluster/addons/kube-proxy/kube-proxy-ds.yaml b/cluster/addons/kube-proxy/kube-proxy-ds.yaml index ea81f7c5dc5..aaa9641d13f 100644 --- a/cluster/addons/kube-proxy/kube-proxy-ds.yaml +++ b/cluster/addons/kube-proxy/kube-proxy-ds.yaml @@ -21,6 +21,8 @@ spec: metadata: labels: k8s-app: kube-proxy + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical hostNetwork: true diff --git a/cluster/addons/metadata-proxy/gce/metadata-proxy.yaml b/cluster/addons/metadata-proxy/gce/metadata-proxy.yaml index 7267a393dc5..f5b9ba720c7 100644 --- 
a/cluster/addons/metadata-proxy/gce/metadata-proxy.yaml +++ b/cluster/addons/metadata-proxy/gce/metadata-proxy.yaml @@ -31,6 +31,11 @@ spec: k8s-app: metadata-proxy kubernetes.io/cluster-service: "true" version: v0.1 + # This annotation ensures that the proxy does not get evicted if the node + # supports critical pod annotation based priority scheme. + # Note that this does not guarantee admission on the nodes (#40573). + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical serviceAccountName: metadata-proxy diff --git a/cluster/addons/metrics-server/metrics-server-deployment.yaml b/cluster/addons/metrics-server/metrics-server-deployment.yaml index 6da5ca70459..2306dc98f9f 100644 --- a/cluster/addons/metrics-server/metrics-server-deployment.yaml +++ b/cluster/addons/metrics-server/metrics-server-deployment.yaml @@ -42,6 +42,7 @@ spec: k8s-app: metrics-server version: v0.3.3 annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' spec: priorityClassName: system-cluster-critical diff --git a/cluster/addons/prometheus/alertmanager-deployment.yaml b/cluster/addons/prometheus/alertmanager-deployment.yaml index 38ec99d3055..85cecd3dd21 100644 --- a/cluster/addons/prometheus/alertmanager-deployment.yaml +++ b/cluster/addons/prometheus/alertmanager-deployment.yaml @@ -19,6 +19,8 @@ spec: labels: k8s-app: alertmanager version: v0.14.0 + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical containers: diff --git a/cluster/addons/prometheus/kube-state-metrics-deployment.yaml b/cluster/addons/prometheus/kube-state-metrics-deployment.yaml index 14ea9802b40..5d81e8e002d 100644 --- a/cluster/addons/prometheus/kube-state-metrics-deployment.yaml +++ b/cluster/addons/prometheus/kube-state-metrics-deployment.yaml @@ -19,6 +19,8 @@ spec: labels: k8s-app: kube-state-metrics version: v1.3.0 + annotations: + 
scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical serviceAccountName: kube-state-metrics diff --git a/cluster/addons/prometheus/node-exporter-ds.yml b/cluster/addons/prometheus/node-exporter-ds.yml index bc1766a1d38..f5f88f76d41 100644 --- a/cluster/addons/prometheus/node-exporter-ds.yml +++ b/cluster/addons/prometheus/node-exporter-ds.yml @@ -20,6 +20,8 @@ spec: labels: k8s-app: node-exporter version: v0.15.2 + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical containers: diff --git a/cluster/addons/prometheus/prometheus-statefulset.yaml b/cluster/addons/prometheus/prometheus-statefulset.yaml index 01a12d0a969..3fee8e4d6a1 100644 --- a/cluster/addons/prometheus/prometheus-statefulset.yaml +++ b/cluster/addons/prometheus/prometheus-statefulset.yaml @@ -21,6 +21,8 @@ spec: metadata: labels: k8s-app: prometheus + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-cluster-critical serviceAccountName: prometheus diff --git a/cluster/gce/addons/node-termination-handler/daemonset.yaml b/cluster/gce/addons/node-termination-handler/daemonset.yaml index c3b42149c87..789a1ba41ae 100644 --- a/cluster/gce/addons/node-termination-handler/daemonset.yaml +++ b/cluster/gce/addons/node-termination-handler/daemonset.yaml @@ -17,6 +17,8 @@ spec: metadata: labels: k8s-app: node-termination-handler + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical # Necessary to reboot node diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index e9e19297f08..fe68c3816ab 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -250,14 +250,10 @@ if [[ "${KUBE_FEATURE_GATES:-}" == "AllAlpha=true" ]]; then fi # Optional: set feature gates -FEATURE_GATES="${KUBE_FEATURE_GATES:-}" 
+FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}" if [[ ! -z "${NODE_ACCELERATORS}" ]]; then - if [[ -z "${FEATURE_GATES:-}" ]]; then - FEATURE_GATES="DevicePlugins=true" - else - FEATURE_GATES="${FEATURE_GATES},DevicePlugins=true" - fi + FEATURE_GATES="${FEATURE_GATES},DevicePlugins=true" if [[ "${NODE_ACCELERATORS}" =~ .*type=([a-zA-Z0-9-]+).* ]]; then NON_MASTER_NODE_LABELS="${NON_MASTER_NODE_LABELS},cloud.google.com/gke-accelerator=${BASH_REMATCH[1]}" fi diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index 9bcb381d0d1..7950767c069 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -139,7 +139,7 @@ if [[ "${KUBE_FEATURE_GATES:-}" == "AllAlpha=true" ]]; then fi # Optional: set feature gates -FEATURE_GATES="${KUBE_FEATURE_GATES:-}" +FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}" TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100} @@ -283,11 +283,7 @@ if [[ ${KUBE_ENABLE_INSECURE_REGISTRY:-false} == "true" ]]; then fi if [[ ! 
-z "${NODE_ACCELERATORS}" ]]; then - if [[ -z "${FEATURE_GATES:-}" ]]; then - FEATURE_GATES="DevicePlugins=true" - else - FEATURE_GATES="${FEATURE_GATES},DevicePlugins=true" - fi + FEATURE_GATES="${FEATURE_GATES},DevicePlugins=true" if [[ "${NODE_ACCELERATORS}" =~ .*type=([a-zA-Z0-9-]+).* ]]; then NON_MASTER_NODE_LABELS="${NON_MASTER_NODE_LABELS},cloud.google.com/gke-accelerator=${BASH_REMATCH[1]}" fi diff --git a/cluster/gce/manifests/etcd-empty-dir-cleanup.yaml b/cluster/gce/manifests/etcd-empty-dir-cleanup.yaml index 34ae25baf05..51750a06304 100644 --- a/cluster/gce/manifests/etcd-empty-dir-cleanup.yaml +++ b/cluster/gce/manifests/etcd-empty-dir-cleanup.yaml @@ -4,6 +4,7 @@ metadata: name: etcd-empty-dir-cleanup namespace: kube-system annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' labels: k8s-app: etcd-empty-dir-cleanup diff --git a/cluster/gce/manifests/etcd.manifest b/cluster/gce/manifests/etcd.manifest index 377cc60d5f5..fa54fbc0725 100644 --- a/cluster/gce/manifests/etcd.manifest +++ b/cluster/gce/manifests/etcd.manifest @@ -5,11 +5,11 @@ "name":"etcd-server{{ suffix }}", "namespace": "kube-system", "annotations": { + "scheduler.alpha.kubernetes.io/critical-pod": "", "seccomp.security.alpha.kubernetes.io/pod": "docker/default" } }, "spec":{ -"priorityClass": "system-node-critical", "hostNetwork": true, "containers":[ { diff --git a/cluster/gce/manifests/glbc.manifest b/cluster/gce/manifests/glbc.manifest index 13e1a0fc936..319037d0ef4 100644 --- a/cluster/gce/manifests/glbc.manifest +++ b/cluster/gce/manifests/glbc.manifest @@ -4,13 +4,13 @@ metadata: name: l7-lb-controller-v1.2.3 namespace: kube-system annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' labels: k8s-app: gcp-lb-controller version: v1.2.3 kubernetes.io/name: "GLBC" spec: - priorityClassName: system-node-critical terminationGracePeriodSeconds: 600 
hostNetwork: true containers: diff --git a/cluster/gce/manifests/kube-addon-manager.yaml b/cluster/gce/manifests/kube-addon-manager.yaml index f3a4f8b2c3c..f5ac42e940e 100644 --- a/cluster/gce/manifests/kube-addon-manager.yaml +++ b/cluster/gce/manifests/kube-addon-manager.yaml @@ -4,11 +4,11 @@ metadata: name: kube-addon-manager namespace: kube-system annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' seccomp.security.alpha.kubernetes.io/pod: 'docker/default' labels: component: kube-addon-manager spec: - priorityClassName: system-node-critical hostNetwork: true containers: - name: kube-addon-manager diff --git a/cluster/gce/manifests/kube-apiserver.manifest b/cluster/gce/manifests/kube-apiserver.manifest index 300ded9e01f..7b5031ac781 100644 --- a/cluster/gce/manifests/kube-apiserver.manifest +++ b/cluster/gce/manifests/kube-apiserver.manifest @@ -5,6 +5,7 @@ "name":"kube-apiserver", "namespace": "kube-system", "annotations": { + "scheduler.alpha.kubernetes.io/critical-pod": "", "seccomp.security.alpha.kubernetes.io/pod": "docker/default" }, "labels": { @@ -13,7 +14,6 @@ } }, "spec":{ -"priorityClass": "system-node-critical", "hostNetwork": true, "containers":[ { diff --git a/cluster/gce/manifests/kube-controller-manager.manifest b/cluster/gce/manifests/kube-controller-manager.manifest index 3c7a7cf1290..bd5ede264b2 100644 --- a/cluster/gce/manifests/kube-controller-manager.manifest +++ b/cluster/gce/manifests/kube-controller-manager.manifest @@ -5,6 +5,7 @@ "name":"kube-controller-manager", "namespace": "kube-system", "annotations": { + "scheduler.alpha.kubernetes.io/critical-pod": "", "seccomp.security.alpha.kubernetes.io/pod": "docker/default" }, "labels": { @@ -13,7 +14,6 @@ } }, "spec":{ -"priorityClass": "system-node-critical", "hostNetwork": true, "containers":[ { diff --git a/cluster/gce/manifests/kube-proxy.manifest b/cluster/gce/manifests/kube-proxy.manifest index bbde04a8a4c..1bdd395da1d 100644 --- a/cluster/gce/manifests/kube-proxy.manifest 
+++ b/cluster/gce/manifests/kube-proxy.manifest @@ -3,6 +3,12 @@ kind: Pod metadata: name: kube-proxy namespace: kube-system + # This annotation ensures that kube-proxy does not get evicted if the node + # supports critical pod annotation based priority scheme. + # Note that kube-proxy runs as a static pod so this annotation does NOT have + # any effect on the default scheduler, which is not involved in scheduling kube-proxy. + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' labels: tier: node component: kube-proxy diff --git a/cluster/gce/manifests/kube-scheduler.manifest b/cluster/gce/manifests/kube-scheduler.manifest index 0f0a984806c..b1de109ea35 100644 --- a/cluster/gce/manifests/kube-scheduler.manifest +++ b/cluster/gce/manifests/kube-scheduler.manifest @@ -5,6 +5,7 @@ "name":"kube-scheduler", "namespace": "kube-system", "annotations": { + "scheduler.alpha.kubernetes.io/critical-pod": "", "seccomp.security.alpha.kubernetes.io/pod": "docker/default" }, "labels": { @@ -13,7 +14,6 @@ } }, "spec":{ -"priorityClass": "system-node-critical", "hostNetwork": true, "containers":[ { diff --git a/cluster/gce/windows/k8s-node-setup.psm1 b/cluster/gce/windows/k8s-node-setup.psm1 index d8d288d8b26..7c0cd38be5e 100644 --- a/cluster/gce/windows/k8s-node-setup.psm1 +++ b/cluster/gce/windows/k8s-node-setup.psm1 @@ -973,6 +973,7 @@ function Start-WorkerServices { # kube-proxy --master=https://35.239.84.171 # --kubeconfig=/var/lib/kube-proxy/kubeconfig --cluster-cidr=10.64.0.0/14 # --oom-score-adj=-998 --v=2 + # --feature-gates=ExperimentalCriticalPodAnnotation=true # --iptables-sync-period=1m --iptables-min-sync-period=10s # --ipvs-sync-period=1m --ipvs-min-sync-period=10s # And also with various volumeMounts and "securityContext: privileged: true". 
diff --git a/pkg/controller/daemon/BUILD b/pkg/controller/daemon/BUILD index 0b32d734c9e..8e3e2d55ee1 100644 --- a/pkg/controller/daemon/BUILD +++ b/pkg/controller/daemon/BUILD @@ -66,9 +66,9 @@ go_test( "//pkg/api/legacyscheme:go_default_library", "//pkg/api/v1/pod:go_default_library", "//pkg/apis/core:go_default_library", - "//pkg/apis/scheduling:go_default_library", "//pkg/controller:go_default_library", "//pkg/features:go_default_library", + "//pkg/kubelet/types:go_default_library", "//pkg/scheduler/api:go_default_library", "//pkg/securitycontext:go_default_library", "//pkg/util/labels:go_default_library", diff --git a/pkg/controller/daemon/daemon_controller_test.go b/pkg/controller/daemon/daemon_controller_test.go index 09f5d1bbda9..95a77f086eb 100644 --- a/pkg/controller/daemon/daemon_controller_test.go +++ b/pkg/controller/daemon/daemon_controller_test.go @@ -46,9 +46,9 @@ import ( "k8s.io/kubernetes/pkg/api/legacyscheme" podutil "k8s.io/kubernetes/pkg/api/v1/pod" api "k8s.io/kubernetes/pkg/apis/core" - "k8s.io/kubernetes/pkg/apis/scheduling" "k8s.io/kubernetes/pkg/controller" "k8s.io/kubernetes/pkg/features" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" "k8s.io/kubernetes/pkg/securitycontext" labelsutil "k8s.io/kubernetes/pkg/util/labels" @@ -1815,34 +1815,6 @@ func TestTaintPressureNodeDaemonLaunchesPod(t *testing.T) { // When ScheduleDaemonSetPods is disabled, DaemonSet should launch a critical pod even when the node has insufficient free resource. 
func TestInsufficientCapacityNodeDaemonLaunchesCriticalPod(t *testing.T) { defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ScheduleDaemonSetPods, false)() - for _, strategy := range updateStrategies() { - podSpec := resourcePodSpec("too-much-mem", "75M", "75m") - ds := newDaemonSet("critical") - ds.Spec.UpdateStrategy = *strategy - ds.Spec.Template.Spec = podSpec - - manager, podControl, _, err := newTestController(ds) - if err != nil { - t.Fatalf("error creating DaemonSets controller: %v", err) - } - node := newNode("too-much-mem", nil) - node.Status.Allocatable = allocatableResources("100M", "200m") - manager.nodeStore.Add(node) - manager.podStore.Add(&v1.Pod{ - Spec: podSpec, - }) - - manager.dsStore.Add(ds) - switch strategy.Type { - case apps.OnDeleteDaemonSetStrategyType: - syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0, 2) - case apps.RollingUpdateDaemonSetStrategyType: - syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0, 3) - default: - t.Fatalf("unexpected UpdateStrategy %+v", strategy) - } - } - for _, strategy := range updateStrategies() { podSpec := resourcePodSpec("too-much-mem", "75M", "75m") ds := newDaemonSet("critical") @@ -1861,13 +1833,25 @@ func TestInsufficientCapacityNodeDaemonLaunchesCriticalPod(t *testing.T) { Spec: podSpec, }) + // Without enabling critical pod annotation feature gate, we shouldn't create critical pod + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, false)() manager.dsStore.Add(ds) - switch strategy.Type { case apps.OnDeleteDaemonSetStrategyType: - syncAndValidateDaemonSets(t, manager, ds, podControl, 1, 0, 0) + syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0, 2) case apps.RollingUpdateDaemonSetStrategyType: - syncAndValidateDaemonSets(t, manager, ds, podControl, 1, 0, 0) + syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0, 3) + default: + 
t.Fatalf("unexpected UpdateStrategy %+v", strategy) + } + + // Enabling critical pod annotation feature gate should create critical pod + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, true)() + switch strategy.Type { + case apps.OnDeleteDaemonSetStrategyType: + syncAndValidateDaemonSets(t, manager, ds, podControl, 1, 0, 2) + case apps.RollingUpdateDaemonSetStrategyType: + syncAndValidateDaemonSets(t, manager, ds, podControl, 1, 0, 3) default: t.Fatalf("unexpected UpdateStrategy %+v", strategy) } @@ -1896,6 +1880,7 @@ func TestPortConflictNodeDaemonDoesNotLaunchCriticalPod(t *testing.T) { Spec: podSpec, }) + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, true)() ds := newDaemonSet("critical") ds.Spec.UpdateStrategy = *strategy ds.Spec.Template.Spec = podSpec @@ -1910,8 +1895,7 @@ func setDaemonSetCritical(ds *apps.DaemonSet) { if ds.Spec.Template.ObjectMeta.Annotations == nil { ds.Spec.Template.ObjectMeta.Annotations = make(map[string]string) } - podPriority := scheduling.SystemCriticalPriority - ds.Spec.Template.Spec.Priority = &podPriority + ds.Spec.Template.ObjectMeta.Annotations[kubelettypes.CriticalPodAnnotationKey] = "" } func TestNodeShouldRunDaemonPod(t *testing.T) { diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index ad19f4c2600..92178d129f1 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -48,6 +48,15 @@ const ( // SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon. ExperimentalHostUserNamespaceDefaultingGate featuregate.Feature = "ExperimentalHostUserNamespaceDefaulting" + // owner: @vishh + // alpha: v1.5 + // + // DEPRECATED - This feature is deprecated by Pod Priority and Preemption as of Kubernetes 1.13. 
+ // Ensures guaranteed scheduling of pods marked with a special pod annotation `scheduler.alpha.kubernetes.io/critical-pod` + // and also prevents them from being evicted from a node. + // Note: This feature is not supported for `BestEffort` pods. + ExperimentalCriticalPodAnnotation featuregate.Feature = "ExperimentalCriticalPodAnnotation" + // owner: @jiayingz // beta: v1.10 // @@ -476,6 +485,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS DynamicKubeletConfig: {Default: true, PreRelease: featuregate.Beta}, ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: featuregate.Beta}, DevicePlugins: {Default: true, PreRelease: featuregate.Beta}, + ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: featuregate.Alpha}, TaintBasedEvictions: {Default: true, PreRelease: featuregate.Beta}, RotateKubeletServerCertificate: {Default: true, PreRelease: featuregate.Beta}, RotateKubeletClientCertificate: {Default: true, PreRelease: featuregate.Beta}, diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 1c81a0003cf..2e6a49bd6f0 100644 --- a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -17,7 +17,6 @@ go_test( embed = [":go_default_library"], deps = [ "//pkg/apis/core:go_default_library", - "//pkg/apis/scheduling:go_default_library", "//pkg/features:go_default_library", "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", "//pkg/kubelet/eviction/api:go_default_library", diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 38c2cba983d..c74faf854e8 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -29,7 +29,6 @@ import ( "k8s.io/client-go/tools/record" featuregatetesting "k8s.io/component-base/featuregate/testing" kubeapi "k8s.io/kubernetes/pkg/apis/core" - "k8s.io/kubernetes/pkg/apis/scheduling" "k8s.io/kubernetes/pkg/features" statsapi 
"k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" @@ -1133,12 +1132,12 @@ func TestInodePressureNodeFsInodes(t *testing.T) { } } -// TestStaticCriticalPodsAreNotEvicted -func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { +// TestCriticalPodsAreNotEvicted +func TestCriticalPodsAreNotEvicted(t *testing.T) { podMaker := makePodWithMemoryStats summaryStatsMaker := makeMemoryStats podsToMake := []podToMake{ - {name: "critical", priority: scheduling.SystemCriticalPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "800Mi"}, + {name: "critical", priority: defaultPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "800Mi"}, } pods := []*v1.Pod{} podStats := map[*v1.Pod]statsapi.PodStats{} @@ -1148,12 +1147,11 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { podStats[pod] = podStat } + // Mark the pod as critical pods[0].Annotations = map[string]string{ + kubelettypes.CriticalPodAnnotationKey: "", kubelettypes.ConfigSourceAnnotationKey: kubelettypes.FileSource, } - // Mark the pod as critical - podPriority := scheduling.SystemCriticalPriority - pods[0].Spec.Priority = &podPriority pods[0].Namespace = kubeapi.NamespaceSystem podToEvict := pods[0] @@ -1210,6 +1208,9 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { thresholdsFirstObservedAt: thresholdsObservedAt{}, } + // Enable critical pod annotation feature gate + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, true)() + // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) manager.synchronize(diskInfoProvider, activePodsFunc) @@ -1252,11 +1253,8 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { t.Errorf("Manager should not report memory pressure") } - 
pods[0].Annotations = map[string]string{ - kubelettypes.ConfigSourceAnnotationKey: kubelettypes.FileSource, - } - pods[0].Spec.Priority = nil - pods[0].Namespace = kubeapi.NamespaceSystem + // Disable critical pod annotation feature gate + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, false)() // induce memory pressure! fakeClock.Step(1 * time.Minute) diff --git a/pkg/kubelet/preemption/BUILD b/pkg/kubelet/preemption/BUILD index 7869ab83683..f3b80386bf1 100644 --- a/pkg/kubelet/preemption/BUILD +++ b/pkg/kubelet/preemption/BUILD @@ -45,9 +45,13 @@ go_test( deps = [ "//pkg/apis/core:go_default_library", "//pkg/apis/scheduling:go_default_library", + "//pkg/features:go_default_library", + "//pkg/kubelet/types:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//staging/src/k8s.io/client-go/tools/record:go_default_library", + "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library", ], ) diff --git a/pkg/kubelet/preemption/preemption_test.go b/pkg/kubelet/preemption/preemption_test.go index 00a0c2e5475..2110db33585 100644 --- a/pkg/kubelet/preemption/preemption_test.go +++ b/pkg/kubelet/preemption/preemption_test.go @@ -23,12 +23,17 @@ import ( "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/tools/record" + featuregatetesting "k8s.io/component-base/featuregate/testing" kubeapi "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/scheduling" + "k8s.io/kubernetes/pkg/features" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" ) const ( + critical = "critical" clusterCritical = "cluster-critical" 
nodeCritical = "node-critical" bestEffort = "bestEffort" @@ -91,6 +96,7 @@ func getTestCriticalPodAdmissionHandler(podProvider *fakePodProvider, podKiller } func TestEvictPodsToFreeRequestsWithError(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, true)() type testRun struct { testName string inputPods []*v1.Pod @@ -106,7 +112,7 @@ func TestEvictPodsToFreeRequestsWithError(t *testing.T) { { testName: "multiple pods eviction error", inputPods: []*v1.Pod{ - allPods[clusterCritical], allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], + allPods[critical], allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], allPods[guaranteed], allPods[highRequestGuaranteed]}, insufficientResources: getAdmissionRequirementList(0, 550, 0), expectErr: false, @@ -115,7 +121,7 @@ func TestEvictPodsToFreeRequestsWithError(t *testing.T) { } for _, r := range runs { podProvider.setPods(r.inputPods) - outErr := criticalPodAdmissionHandler.evictPodsToFreeRequests(allPods[clusterCritical], r.insufficientResources) + outErr := criticalPodAdmissionHandler.evictPodsToFreeRequests(allPods[critical], r.insufficientResources) outputPods := podKiller.getKilledPods() if !r.expectErr && outErr != nil { t.Errorf("evictPodsToFreeRequests returned an unexpected error during the %s test. 
Err: %v", r.testName, outErr) @@ -129,6 +135,7 @@ func TestEvictPodsToFreeRequestsWithError(t *testing.T) { } func TestEvictPodsToFreeRequests(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, true)() type testRun struct { testName string inputPods []*v1.Pod @@ -143,7 +150,7 @@ func TestEvictPodsToFreeRequests(t *testing.T) { runs := []testRun{ { testName: "critical pods cannot be preempted", - inputPods: []*v1.Pod{allPods[clusterCritical]}, + inputPods: []*v1.Pod{allPods[critical]}, insufficientResources: getAdmissionRequirementList(0, 0, 1), expectErr: true, expectedOutput: nil, @@ -158,7 +165,7 @@ func TestEvictPodsToFreeRequests(t *testing.T) { { testName: "multiple pods evicted", inputPods: []*v1.Pod{ - allPods[clusterCritical], allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], + allPods[critical], allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], allPods[guaranteed], allPods[highRequestGuaranteed]}, insufficientResources: getAdmissionRequirementList(0, 550, 0), expectErr: false, @@ -167,7 +174,7 @@ func TestEvictPodsToFreeRequests(t *testing.T) { } for _, r := range runs { podProvider.setPods(r.inputPods) - outErr := criticalPodAdmissionHandler.evictPodsToFreeRequests(allPods[clusterCritical], r.insufficientResources) + outErr := criticalPodAdmissionHandler.evictPodsToFreeRequests(allPods[critical], r.insufficientResources) outputPods := podKiller.getKilledPods() if !r.expectErr && outErr != nil { t.Errorf("evictPodsToFreeRequests returned an unexpected error during the %s test. 
Err: %v", r.testName, outErr) @@ -196,6 +203,7 @@ func BenchmarkGetPodsToPreempt(t *testing.B) { } func TestGetPodsToPreempt(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, true)() type testRun struct { testName string preemptor *v1.Pod @@ -208,7 +216,7 @@ func TestGetPodsToPreempt(t *testing.T) { runs := []testRun{ { testName: "no requirements", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{}, insufficientResources: getAdmissionRequirementList(0, 0, 0), expectErr: false, @@ -216,7 +224,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "no pods", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{}, insufficientResources: getAdmissionRequirementList(0, 0, 1), expectErr: true, @@ -224,7 +232,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "equal pods and resources requirements", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{allPods[burstable]}, insufficientResources: getAdmissionRequirementList(100, 100, 1), expectErr: false, @@ -232,7 +240,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "higher requirements than pod requests", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{allPods[burstable]}, insufficientResources: getAdmissionRequirementList(200, 200, 2), expectErr: true, @@ -240,7 +248,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "choose between bestEffort and burstable", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{allPods[burstable], allPods[bestEffort]}, insufficientResources: getAdmissionRequirementList(0, 0, 1), expectErr: false, @@ -248,7 +256,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "choose between burstable and guaranteed", - preemptor: 
allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{allPods[burstable], allPods[guaranteed]}, insufficientResources: getAdmissionRequirementList(0, 0, 1), expectErr: false, @@ -256,7 +264,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "choose lower request burstable if it meets requirements", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{allPods[bestEffort], allPods[highRequestBurstable], allPods[burstable]}, insufficientResources: getAdmissionRequirementList(100, 100, 0), expectErr: false, @@ -264,7 +272,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "choose higher request burstable if lower does not meet requirements", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable]}, insufficientResources: getAdmissionRequirementList(150, 150, 0), expectErr: false, @@ -272,7 +280,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "multiple pods required", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], allPods[guaranteed], allPods[highRequestGuaranteed]}, insufficientResources: getAdmissionRequirementList(350, 350, 0), expectErr: false, @@ -280,7 +288,7 @@ func TestGetPodsToPreempt(t *testing.T) { }, { testName: "evict guaranteed when we have to, and dont evict the extra burstable", - preemptor: allPods[clusterCritical], + preemptor: allPods[critical], inputPods: []*v1.Pod{allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], allPods[guaranteed], allPods[highRequestGuaranteed]}, insufficientResources: getAdmissionRequirementList(0, 550, 0), expectErr: false, @@ -415,6 +423,12 @@ func getTestPods() map[string]*v1.Pod { }, }), bestEffort: getPodWithResources(bestEffort, v1.ResourceRequirements{}), + critical: 
getPodWithResources(critical, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("100Mi"), + }, + }), clusterCritical: getPodWithResources(clusterCritical, v1.ResourceRequirements{ Requests: v1.ResourceList{ v1.ResourceCPU: resource.MustParse("100m"), @@ -460,6 +474,9 @@ func getTestPods() map[string]*v1.Pod { }, }), } + allPods[critical].Namespace = kubeapi.NamespaceSystem + allPods[critical].Annotations[kubetypes.CriticalPodAnnotationKey] = "" + allPods[clusterCritical].Namespace = kubeapi.NamespaceSystem allPods[clusterCritical].Spec.PriorityClassName = scheduling.SystemClusterCritical clusterPriority := scheduling.SystemCriticalPriority diff --git a/pkg/kubelet/types/BUILD b/pkg/kubelet/types/BUILD index 437153301de..61a4583b905 100644 --- a/pkg/kubelet/types/BUILD +++ b/pkg/kubelet/types/BUILD @@ -18,10 +18,13 @@ go_library( ], importpath = "k8s.io/kubernetes/pkg/kubelet/types", deps = [ + "//pkg/apis/core:go_default_library", "//pkg/apis/scheduling:go_default_library", + "//pkg/features:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", ], ) @@ -35,8 +38,11 @@ go_test( ], embed = [":go_default_library"], deps = [ + "//pkg/features:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", + "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library", "//vendor/github.com/stretchr/testify/assert:go_default_library", "//vendor/github.com/stretchr/testify/require:go_default_library", ], diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go 
index c854b958e50..82a5f9a4020 100644 --- a/pkg/kubelet/types/pod_update.go +++ b/pkg/kubelet/types/pod_update.go @@ -19,9 +19,12 @@ package types import ( "fmt" - v1 "k8s.io/api/core/v1" + "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" + kubeapi "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/scheduling" + "k8s.io/kubernetes/pkg/features" ) const ( @@ -29,6 +32,7 @@ const ( ConfigMirrorAnnotationKey = v1.MirrorPodAnnotationKey ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen" ConfigHashAnnotationKey = "kubernetes.io/config.hash" + CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod" ) // PodOperation defines what changes will be made on a pod configuration. @@ -138,11 +142,18 @@ func (sp SyncPodType) String() string { } } -// IsCriticalPod returns true if pod's priority is greater than or equal to SystemCriticalPriority. +// IsCriticalPod returns true if the pod bears the critical pod annotation key or if pod's priority is greater than +// or equal to SystemCriticalPriority. Both the default scheduler and the kubelet use this function +// to make admission and scheduling decisions. func IsCriticalPod(pod *v1.Pod) bool { if pod.Spec.Priority != nil && IsCriticalPodBasedOnPriority(*pod.Spec.Priority) { return true } + if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) { + if IsCritical(pod.Namespace, pod.Annotations) { + return true + } + } return false } @@ -160,6 +171,21 @@ func Preemptable(preemptor, preemptee *v1.Pod) bool { return false } +// IsCritical returns true if parameters bear the critical pod annotation +// key. The DaemonSetController uses this key directly to make scheduling decisions. +// TODO: @ravig - Deprecated. Remove this when we move to resolving critical pods based on priorityClassName.
+func IsCritical(ns string, annotations map[string]string) bool { + // Critical pods are restricted to "kube-system" namespace as of now. + if ns != kubeapi.NamespaceSystem { + return false + } + val, ok := annotations[CriticalPodAnnotationKey] + if ok && val == "" { + return true + } + return false +} + // IsCriticalPodBasedOnPriority checks if the given pod is a critical pod based on priority resolved from pod Spec. func IsCriticalPodBasedOnPriority(priority int32) bool { if priority >= scheduling.SystemCriticalPriority { diff --git a/pkg/kubelet/types/pod_update_test.go b/pkg/kubelet/types/pod_update_test.go index 6b45d1fdb9d..42cc2fae97c 100644 --- a/pkg/kubelet/types/pod_update_test.go +++ b/pkg/kubelet/types/pod_update_test.go @@ -23,6 +23,9 @@ import ( "github.com/stretchr/testify/require" "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" + featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/kubernetes/pkg/features" ) func TestGetValidatedSources(t *testing.T) { @@ -114,6 +117,70 @@ func TestString(t *testing.T) { } } +func TestIsCriticalPod(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, true)() + cases := []struct { + pod v1.Pod + expected bool + }{ + { + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "ns", + Annotations: map[string]string{ + "scheduler.alpha.kubernetes.io/critical-pod": "", + }, + }, + }, + expected: false, + }, + { + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + Namespace: "ns", + Annotations: map[string]string{ + "scheduler.alpha.kubernetes.io/critical-pod": "abc", + }, + }, + }, + expected: false, + }, + { + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod3", + Namespace: "kube-system", + Annotations: map[string]string{ + "scheduler.alpha.kubernetes.io/critical-pod": "abc", + }, + }, + }, + 
expected: false, + }, + { + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod4", + Namespace: "kube-system", + Annotations: map[string]string{ + "scheduler.alpha.kubernetes.io/critical-pod": "", + }, + }, + }, + expected: true, + }, + } + for i, data := range cases { + actual := IsCriticalPod(&data.pod) + if actual != data.expected { + t.Errorf("IsCriticalPod result wrong:\nexpected: %v\nactual: %v for test[%d] with Annotations: %v", + data.expected, actual, i, data.pod.Annotations) + } + } +} + func TestIsCriticalPodBasedOnPriority(t *testing.T) { tests := []struct { priority int32 diff --git a/plugin/pkg/admission/priority/BUILD b/plugin/pkg/admission/priority/BUILD index 50f3e71ebf7..2b34a81b71d 100644 --- a/plugin/pkg/admission/priority/BUILD +++ b/plugin/pkg/admission/priority/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/apis/core:go_default_library", "//pkg/apis/scheduling:go_default_library", "//pkg/features:go_default_library", + "//pkg/kubelet/types:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", "//staging/src/k8s.io/api/scheduling/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library", diff --git a/plugin/pkg/admission/priority/admission.go b/plugin/pkg/admission/priority/admission.go index 481f4f893ee..75eae31d3ca 100644 --- a/plugin/pkg/admission/priority/admission.go +++ b/plugin/pkg/admission/priority/admission.go @@ -35,6 +35,7 @@ import ( api "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/scheduling" "k8s.io/kubernetes/pkg/features" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" ) const ( @@ -177,6 +178,13 @@ func (p *priorityPlugin) admitPod(a admission.Attributes) error { if operation == admission.Create { var priority int32 var preemptionPolicy *apiv1.PreemptionPolicy + // TODO: @ravig - This is for backwards compatibility to ensure that critical pods with annotations just work fine. + // Remove when no longer needed. 
+ if len(pod.Spec.PriorityClassName) == 0 && + utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && + kubelettypes.IsCritical(a.GetNamespace(), pod.Annotations) { + pod.Spec.PriorityClassName = scheduling.SystemClusterCritical + } if len(pod.Spec.PriorityClassName) == 0 { var err error var pcName string diff --git a/plugin/pkg/admission/priority/admission_test.go b/plugin/pkg/admission/priority/admission_test.go index 1feff4318b5..b84fa4bcf26 100644 --- a/plugin/pkg/admission/priority/admission_test.go +++ b/plugin/pkg/admission/priority/admission_test.go @@ -422,7 +422,23 @@ func TestPodAdmission(t *testing.T) { Priority: &intPriority, }, }, - // pod[7]: Pod with a system priority class name in non-system namespace + // pod[7]: Pod with a critical priority annotation. This needs to be automatically assigned + // system-cluster-critical + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-w-system-priority", + Namespace: "kube-system", + Annotations: map[string]string{"scheduler.alpha.kubernetes.io/critical-pod": ""}, + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: containerName, + }, + }, + }, + }, + // pod[8]: Pod with a system priority class name in non-system namespace { ObjectMeta: metav1.ObjectMeta{ Name: "pod-w-system-priority-in-nonsystem-namespace", @@ -437,7 +453,7 @@ func TestPodAdmission(t *testing.T) { PriorityClassName: scheduling.SystemClusterCritical, }, }, - // pod[8]: Pod with a priority value that matches the resolved priority + // pod[9]: Pod with a priority value that matches the resolved priority { ObjectMeta: metav1.ObjectMeta{ Name: "pod-w-zero-priority-in-nonsystem-namespace", @@ -452,7 +468,7 @@ func TestPodAdmission(t *testing.T) { Priority: &zeroPriority, }, }, - // pod[9]: Pod with a priority value that matches the resolved default priority + // pod[10]: Pod with a priority value that matches the resolved default priority { ObjectMeta: metav1.ObjectMeta{ Name:
"pod-w-priority-matching-default-priority", @@ -467,7 +483,7 @@ func TestPodAdmission(t *testing.T) { Priority: &defaultClass2.Value, }, }, - // pod[10]: Pod with a priority value that matches the resolved priority + // pod[11]: Pod with a priority value that matches the resolved priority { ObjectMeta: metav1.ObjectMeta{ Name: "pod-w-priority-matching-resolved-default-priority", @@ -483,7 +499,7 @@ func TestPodAdmission(t *testing.T) { Priority: &systemClusterCritical.Value, }, }, - // pod[11]: Pod without a preemption policy that matches the resolved preemption policy + // pod[12]: Pod without a preemption policy that matches the resolved preemption policy { ObjectMeta: metav1.ObjectMeta{ Name: "pod-never-preemption-policy-matching-resolved-preemption-policy", @@ -500,7 +516,7 @@ func TestPodAdmission(t *testing.T) { PreemptionPolicy: nil, }, }, - // pod[12]: Pod with a preemption policy that matches the resolved preemption policy + // pod[13]: Pod with a preemption policy that matches the resolved preemption policy { ObjectMeta: metav1.ObjectMeta{ Name: "pod-preemption-policy-matching-resolved-preemption-policy", @@ -517,7 +533,7 @@ func TestPodAdmission(t *testing.T) { PreemptionPolicy: &preemptLowerPriority, }, }, - // pod[13]: Pod with a preemption policy that does't match the resolved preemption policy + // pod[14]: Pod with a preemption policy that doesn't match the resolved preemption policy { ObjectMeta: metav1.ObjectMeta{ Name: "pod-preemption-policy-not-matching-resolved-preemption-policy", @@ -535,6 +551,8 @@ func TestPodAdmission(t *testing.T) { }, }, } + // Enable ExperimentalCriticalPodAnnotation feature gate. + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ExperimentalCriticalPodAnnotation, true)() // Enable NonPreemptingPriority feature gate.
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NonPreemptingPriority, true)() tests := []struct { @@ -621,17 +639,25 @@ func TestPodAdmission(t *testing.T) { nil, }, { - "pod with system critical priority in non-system namespace", + "pod with critical pod annotation", []*scheduling.PriorityClass{systemClusterCritical}, *pods[7], scheduling.SystemCriticalPriority, + false, + nil, + }, + { + "pod with system critical priority in non-system namespace", + []*scheduling.PriorityClass{systemClusterCritical}, + *pods[8], + scheduling.SystemCriticalPriority, true, nil, }, { "pod with priority that matches computed priority", []*scheduling.PriorityClass{nondefaultClass1}, - *pods[8], + *pods[9], 0, false, nil, @@ -639,7 +665,7 @@ func TestPodAdmission(t *testing.T) { { "pod with priority that matches default priority", []*scheduling.PriorityClass{defaultClass2}, - *pods[9], + *pods[10], defaultClass2.Value, false, nil, @@ -647,7 +673,7 @@ func TestPodAdmission(t *testing.T) { { "pod with priority that matches resolved priority", []*scheduling.PriorityClass{systemClusterCritical}, - *pods[10], + *pods[11], systemClusterCritical.Value, false, nil, @@ -655,7 +681,7 @@ func TestPodAdmission(t *testing.T) { { "pod with nil preemtpion policy", []*scheduling.PriorityClass{preemptionPolicyClass}, - *pods[11], + *pods[12], preemptionPolicyClass.Value, false, nil, @@ -663,7 +689,7 @@ func TestPodAdmission(t *testing.T) { { "pod with preemtpion policy that matches resolved preemtpion policy", []*scheduling.PriorityClass{preemptionPolicyClass}, - *pods[12], + *pods[13], preemptionPolicyClass.Value, false, &preemptLowerPriority, @@ -671,7 +697,7 @@ func TestPodAdmission(t *testing.T) { { "pod with preemtpion policy that does't matches resolved preemtpion policy", []*scheduling.PriorityClass{preemptionPolicyClass}, - *pods[13], + *pods[14], preemptionPolicyClass.Value, true, &preemptLowerPriority, diff --git 
a/test/e2e/testing-manifests/sample-device-plugin.yaml b/test/e2e/testing-manifests/sample-device-plugin.yaml index c2512737881..1c7baff5eb0 100644 --- a/test/e2e/testing-manifests/sample-device-plugin.yaml +++ b/test/e2e/testing-manifests/sample-device-plugin.yaml @@ -14,6 +14,7 @@ spec: labels: k8s-app: sample-device-plugin annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: priorityClassName: system-node-critical tolerations: diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 9028683e52f..4c2c0c45ab2 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -126,7 +126,6 @@ go_test( tags = ["e2e"], deps = [ "//pkg/apis/core:go_default_library", - "//pkg/apis/scheduling:go_default_library", "//pkg/features:go_default_library", "//pkg/kubelet:go_default_library", "//pkg/kubelet/apis/config:go_default_library", diff --git a/test/e2e_node/critical_pod_test.go b/test/e2e_node/critical_pod_test.go index 4c71738e5c4..297be277070 100644 --- a/test/e2e_node/critical_pod_test.go +++ b/test/e2e_node/critical_pod_test.go @@ -23,7 +23,8 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeapi "k8s.io/kubernetes/pkg/apis/core" - "k8s.io/kubernetes/pkg/apis/scheduling" + "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/test/e2e/framework" imageutils "k8s.io/kubernetes/test/utils/image" @@ -43,6 +44,13 @@ var _ = framework.KubeDescribe("CriticalPod [Serial] [Disruptive] [NodeFeature:C f := framework.NewDefaultFramework("critical-pod-test") Context("when we need to admit a critical pod", func() { + tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) { + if initialConfig.FeatureGates == nil { + initialConfig.FeatureGates = make(map[string]bool) + } + initialConfig.FeatureGates[string(features.ExperimentalCriticalPodAnnotation)] = true + }) + It("should be 
able to create and delete a critical pod", func() { configEnabled, err := isKubeletConfigEnabled(f) framework.ExpectNoError(err) @@ -134,11 +142,12 @@ func getTestPod(critical bool, name string, resources v1.ResourceRequirements) * } if critical { pod.ObjectMeta.Namespace = kubeapi.NamespaceSystem - pod.ObjectMeta.Annotations = map[string]string{} - pod.Spec.PriorityClassName = scheduling.SystemClusterCritical - Expect(kubelettypes.IsCriticalPod(pod)).To(BeTrue(), "pod should be a critical pod") + pod.ObjectMeta.Annotations = map[string]string{ + kubelettypes.CriticalPodAnnotationKey: "", + } + Expect(kubelettypes.IsCritical(pod.Namespace, pod.Annotations)).To(BeTrue(), "pod should be a critical pod") } else { - Expect(kubelettypes.IsCriticalPod(pod)).To(BeFalse(), "pod should not be a critical pod") + Expect(kubelettypes.IsCritical(pod.Namespace, pod.Annotations)).To(BeFalse(), "pod should not be a critical pod") } return pod } diff --git a/test/kubemark/resources/kube_dns_template.yaml b/test/kubemark/resources/kube_dns_template.yaml index 5857f43271e..02c2a68a2e5 100644 --- a/test/kubemark/resources/kube_dns_template.yaml +++ b/test/kubemark/resources/kube_dns_template.yaml @@ -57,8 +57,9 @@ spec: metadata: labels: k8s-app: kube-dns + annotations: + scheduler.alpha.kubernetes.io/critical-pod: '' spec: - priorityClassName: system-node-critical tolerations: - key: "CriticalAddonsOnly" operator: "Exists"