Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-24 20:24:09 +00:00
Merge pull request #38836 from bprashanth/kubelet_critical
Automatic merge from submit-queue.

Admit critical pods in the kubelet. Haven't verified in a live cluster yet, just unit-tested, so applying the do-not-merge label.
This commit is contained in:
commit 699964c972
@@ -38,6 +38,14 @@ kind: Pod
 metadata:
   name: kube-proxy
   namespace: kube-system
+  # This annotation lowers the possibility that kube-proxy gets evicted when
+  # the node is under memory pressure, and prioritizes its admission even
+  # while that pressure persists.
+  # Note that kube-proxy runs as a static pod, so this annotation has NO effect
+  # on the rescheduler (neither the default scheduler nor the rescheduler is
+  # involved in scheduling kube-proxy).
+  annotations:
+    scheduler.alpha.kubernetes.io/critical-pod: ''
   labels:
     tier: node
     component: kube-proxy
@@ -28,6 +28,7 @@ go_library(
         "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
         "//pkg/kubelet/cm:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
+        "//pkg/kubelet/pod:go_default_library",
         "//pkg/kubelet/qos:go_default_library",
         "//pkg/kubelet/server/stats:go_default_library",
         "//pkg/kubelet/util/format:go_default_library",
@@ -55,6 +56,7 @@ go_test(
         "//pkg/client/record:go_default_library",
         "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
         "//pkg/quota:go_default_library",
         "//pkg/types:go_default_library",
         "//pkg/util/clock:go_default_library",
@@ -28,6 +28,7 @@ import (
 	"k8s.io/kubernetes/pkg/client/record"
 	"k8s.io/kubernetes/pkg/kubelet/cm"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
+	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	"k8s.io/kubernetes/pkg/kubelet/server/stats"
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
@@ -108,7 +109,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 	// the node has memory pressure, admit if not best-effort
 	if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
 		notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod)
-		if notBestEffort {
+		if notBestEffort || kubepod.IsCriticalPod(attrs.Pod) {
 			return lifecycle.PodAdmitResult{Admit: true}
 		}
 	}
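The admission rule above is easiest to read in isolation. Below is a minimal sketch of the decision in plain Go, using stand-in types rather than the kubelet's real lifecycle and qos packages: under memory pressure, a pod is admitted if it is not best-effort or if it carries the critical-pod annotation; with no pressure reported, this path admits everything.

package main

import "fmt"

// qosClass is a stand-in for the kubelet's QoS classes, not the real
// pkg/kubelet/qos type.
type qosClass string

const (
	bestEffort qosClass = "BestEffort"
	burstable  qosClass = "Burstable"
	guaranteed qosClass = "Guaranteed"
)

// admitUnderMemoryPressure models the check changed in this hunk: when the
// node reports memory pressure, reject only pods that are best-effort AND
// not marked critical.
func admitUnderMemoryPressure(memoryPressure bool, qos qosClass, critical bool) bool {
	if !memoryPressure {
		return true // nothing to reject on this path
	}
	notBestEffort := qos != bestEffort
	return notBestEffort || critical
}

func main() {
	fmt.Println(admitUnderMemoryPressure(true, bestEffort, false)) // false: rejected
	fmt.Println(admitUnderMemoryPressure(true, bestEffort, true))  // true: critical pod wins
	fmt.Println(admitUnderMemoryPressure(true, burstable, false))  // true: not best-effort
}

This mirrors the expectations encoded in the TestMemoryPressure changes further down: under pressure, the expected admission results change from {false, true} to {false, true, true} once the critical best-effort pod is added.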
@@ -25,6 +25,7 @@ import (
 	"k8s.io/kubernetes/pkg/client/record"
 	statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
+	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/types"
 	"k8s.io/kubernetes/pkg/util/clock"
 )
@@ -210,6 +211,8 @@ func TestMemoryPressure(t *testing.T) {
 	// create a best effort pod to test admission
 	bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
 	burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
+	criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
+	criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""}
 
 	// synchronize
 	manager.synchronize(diskInfoProvider, activePodsFunc)
@@ -220,8 +223,8 @@ func TestMemoryPressure(t *testing.T) {
 	}
 
 	// try to admit our pods (they should succeed)
-	expected := []bool{true, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+	expected := []bool{true, true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
 		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
 			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
 		}
@@ -296,9 +299,10 @@ func TestMemoryPressure(t *testing.T) {
 		t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
 	}
 
-	// the best-effort pod should not admit, burstable should
-	expected = []bool{false, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+	// the best-effort pod without critical annotation should not admit,
+	// burstable and critical pods should
+	expected = []bool{false, true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
 		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
 			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
 		}
@@ -320,9 +324,9 @@ func TestMemoryPressure(t *testing.T) {
 		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
 	}
 
-	// the best-effort pod should not admit, burstable should
-	expected = []bool{false, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+	// the best-effort pod should not admit, burstable and critical pods should
+	expected = []bool{false, true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
 		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
 			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
 		}
@@ -345,8 +349,8 @@ func TestMemoryPressure(t *testing.T) {
 	}
 
 	// all pods should admit now
-	expected = []bool{true, true}
-	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+	expected = []bool{true, true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
 		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
 			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
 		}
@@ -1909,8 +1909,21 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *v1.Pod, start time.Time) {
 // a config source.
 func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
 	start := kl.clock.Now()
-	sort.Sort(sliceutils.PodsByCreationTime(pods))
-	for _, pod := range pods {
+
+	// Pass critical pods through admission check first.
+	var criticalPods []*v1.Pod
+	var nonCriticalPods []*v1.Pod
+	for _, p := range pods {
+		if kubepod.IsCriticalPod(p) {
+			criticalPods = append(criticalPods, p)
+		} else {
+			nonCriticalPods = append(nonCriticalPods, p)
+		}
+	}
+	sort.Sort(sliceutils.PodsByCreationTime(criticalPods))
+	sort.Sort(sliceutils.PodsByCreationTime(nonCriticalPods))
+
+	for _, pod := range append(criticalPods, nonCriticalPods...) {
 		existingPods := kl.podManager.GetPods()
 		// Always add the pod to the pod manager. Kubelet relies on the pod
 		// manager as the source of truth for the desired state. If a pod does
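The new ordering in HandlePodAdditions is a simple partition-and-sort: split incoming pods into critical and non-critical groups, sort each group by creation time, then run admission on the critical group first. A minimal standalone sketch of that ordering, with a stand-in pod struct and sort.Slice in place of the kubelet's v1.Pod and sliceutils.PodsByCreationTime:

package main

import (
	"fmt"
	"sort"
	"time"
)

// pod is a stand-in for v1.Pod with only the fields the ordering needs.
type pod struct {
	name     string
	created  time.Time
	critical bool // presence of the critical-pod annotation
}

// orderForAdmission returns critical pods first, each group sorted by
// creation time, mirroring the loop order used above.
func orderForAdmission(pods []pod) []pod {
	var critical, nonCritical []pod
	for _, p := range pods {
		if p.critical {
			critical = append(critical, p)
		} else {
			nonCritical = append(nonCritical, p)
		}
	}
	byCreation := func(s []pod) {
		sort.Slice(s, func(i, j int) bool { return s[i].created.Before(s[j].created) })
	}
	byCreation(critical)
	byCreation(nonCritical)
	return append(critical, nonCritical...)
}

func main() {
	t0 := time.Now()
	in := []pod{
		{"newpod", t0.Add(10 * time.Second), true},
		{"oldpod", t0, false},
		{"middlepod", t0.Add(1 * time.Second), true},
	}
	for _, p := range orderForAdmission(in) {
		fmt.Println(p.name) // middlepod, newpod, oldpod
	}
}

The names and timestamps match the new TestCriticalPrioritySorting below: with room for only one 90-unit memory request on a 100-unit node, this ordering is why middlepod ends up Pending while newpod and oldpod are Failed.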
@@ -461,6 +461,68 @@ func TestHandlePortConflicts(t *testing.T) {
 	require.Equal(t, v1.PodPending, status.Phase)
 }
 
+// Tests that we sort pods based on criticality.
+func TestCriticalPrioritySorting(t *testing.T) {
+	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
+	kl := testKubelet.kubelet
+	nodes := []v1.Node{
+		{ObjectMeta: v1.ObjectMeta{Name: testKubeletHostname},
+			Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: v1.ResourceList{
+				v1.ResourceCPU:    *resource.NewMilliQuantity(10, resource.DecimalSI),
+				v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI),
+				v1.ResourcePods:   *resource.NewQuantity(40, resource.DecimalSI),
+			}}},
+	}
+	kl.nodeLister = testNodeLister{nodes: nodes}
+	kl.nodeInfo = testNodeInfo{nodes: nodes}
+	testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorapi.MachineInfo{}, nil)
+	testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
+	testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
+
+	spec := v1.PodSpec{NodeName: string(kl.nodeName),
+		Containers: []v1.Container{{Resources: v1.ResourceRequirements{
+			Requests: v1.ResourceList{
+				"memory": resource.MustParse("90"),
+			},
+		}}}}
+	pods := []*v1.Pod{
+		podWithUidNameNsSpec("000000000", "newpod", "foo", spec),
+		podWithUidNameNsSpec("987654321", "oldpod", "foo", spec),
+		podWithUidNameNsSpec("123456789", "middlepod", "foo", spec),
+	}
+
+	// Pods are not sorted by creation time.
+	startTime := time.Now()
+	pods[0].CreationTimestamp = metav1.NewTime(startTime.Add(10 * time.Second))
+	pods[1].CreationTimestamp = metav1.NewTime(startTime)
+	pods[2].CreationTimestamp = metav1.NewTime(startTime.Add(1 * time.Second))
+
+	// Make the middle and new pod critical; the middle pod should win
+	// even though it comes later in the list.
+	critical := map[string]string{kubetypes.CriticalPodAnnotationKey: ""}
+	pods[0].Annotations = critical
+	pods[1].Annotations = map[string]string{}
+	pods[2].Annotations = critical
+
+	// The non-critical pod should be rejected
+	notfittingPods := []*v1.Pod{pods[0], pods[1]}
+	fittingPod := pods[2]
+
+	kl.HandlePodAdditions(pods)
+	// Check pod status stored in the status map.
+	// notfittingPods should be Failed
+	for _, p := range notfittingPods {
+		status, found := kl.statusManager.GetPodStatus(p.UID)
+		require.True(t, found, "Status of pod %q is not found in the status map", p.UID)
+		require.Equal(t, v1.PodFailed, status.Phase)
+	}
+
+	// fittingPod should be Pending
+	status, found := kl.statusManager.GetPodStatus(fittingPod.UID)
+	require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID)
+	require.Equal(t, v1.PodPending, status.Phase)
+}
+
 // Tests that we handle host name conflicts correctly by setting the failed status in status map.
 func TestHandleHostNameConflicts(t *testing.T) {
 	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
@@ -21,6 +21,7 @@ import (
 
 	"k8s.io/kubernetes/pkg/api/v1"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
+	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/types"
 )
 
@@ -306,3 +307,11 @@ func (pm *basicManager) GetPodByMirrorPod(mirrorPod *v1.Pod) (*v1.Pod, bool) {
 	pod, ok := pm.podByFullName[kubecontainer.GetPodFullName(mirrorPod)]
 	return pod, ok
 }
+
+// IsCriticalPod returns true if the pod bears the critical pod annotation
+// key. Both the rescheduler and the kubelet use this key to make admission
+// and scheduling decisions.
+func IsCriticalPod(pod *v1.Pod) bool {
+	_, ok := pod.Annotations[kubetypes.CriticalPodAnnotationKey]
+	return ok
+}
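IsCriticalPod only tests for the presence of the annotation key; the value is ignored (the tests above set it to the empty string). Indexing a nil Annotations map is also safe in Go, so the helper works for pods with no annotations at all. A minimal standalone version of the same check, using a plain map instead of v1.Pod:

package main

import "fmt"

// criticalPodAnnotationKey matches the constant added in pkg/kubelet/types.
const criticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"

// isCritical reports whether the annotation key is present, regardless of its
// value, mirroring kubepod.IsCriticalPod.
func isCritical(annotations map[string]string) bool {
	_, ok := annotations[criticalPodAnnotationKey]
	return ok
}

func main() {
	fmt.Println(isCritical(map[string]string{criticalPodAnnotationKey: ""})) // true
	fmt.Println(isCritical(map[string]string{"tier": "node"}))               // false
	fmt.Println(isCritical(nil))                                             // false: reads from a nil map are allowed
}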
@@ -27,6 +27,15 @@ const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
 const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
 const ConfigHashAnnotationKey = "kubernetes.io/config.hash"
 
+// This key needs to sync with the key used by the rescheduler, which currently
+// lives in contrib. Its presence indicates 2 things, as far as the kubelet is
+// concerned:
+// 1. Resource related admission checks will prioritize the admission of
+//    pods bearing the key, over pods without the key, regardless of QoS.
+// 2. The OOM score of pods bearing the key will be <= pods without
+//    the key (where the <= part is determined by QoS).
+const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
+
 // PodOperation defines what changes will be made on a pod configuration.
 type PodOperation int
 
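Point 2 in the comment is a constraint rather than an algorithm: a critical pod must never receive a worse (higher) OOM score adjustment than a comparable non-critical pod. The sketch below is purely illustrative; the numbers are made up and are not the kubelet's actual OOM policy. It only demonstrates the "<=" ordering within each QoS class.

package main

import "fmt"

// illustrativeOOMScoreAdj returns a made-up score per QoS class (lower means
// less likely to be OOM-killed). Critical pods are nudged just below the base
// for their class, so critical <= non-critical always holds within a class.
func illustrativeOOMScoreAdj(qos string, critical bool) int {
	base := map[string]int{"Guaranteed": -100, "Burstable": 500, "BestEffort": 1000}[qos]
	if critical {
		return base - 1
	}
	return base
}

func main() {
	for _, qos := range []string{"Guaranteed", "Burstable", "BestEffort"} {
		fmt.Printf("%-10s critical=%d non-critical=%d\n",
			qos, illustrativeOOMScoreAdj(qos, true), illustrativeOOMScoreAdj(qos, false))
	}
}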