mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-02 08:17:26 +00:00
Merge pull request #128556 from AnishShah/kubelet-reject-metric
Introduce a metric to track kubelet admission failure.
This commit is contained in:
commit
099449954e
@ -29,6 +29,7 @@ import (
|
|||||||
sysruntime "runtime"
|
sysruntime "runtime"
|
||||||
"slices"
|
"slices"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
@ -81,6 +82,7 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/kubelet/cloudresource"
|
"k8s.io/kubernetes/pkg/kubelet/cloudresource"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
|
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/cm"
|
"k8s.io/kubernetes/pkg/kubelet/cm"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/config"
|
"k8s.io/kubernetes/pkg/kubelet/config"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/configmap"
|
"k8s.io/kubernetes/pkg/kubelet/configmap"
|
||||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||||
@ -120,6 +122,7 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/kubelet/volumemanager"
|
"k8s.io/kubernetes/pkg/kubelet/volumemanager"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/watchdog"
|
"k8s.io/kubernetes/pkg/kubelet/watchdog"
|
||||||
httpprobe "k8s.io/kubernetes/pkg/probe/http"
|
httpprobe "k8s.io/kubernetes/pkg/probe/http"
|
||||||
|
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
|
||||||
"k8s.io/kubernetes/pkg/security/apparmor"
|
"k8s.io/kubernetes/pkg/security/apparmor"
|
||||||
"k8s.io/kubernetes/pkg/util/oom"
|
"k8s.io/kubernetes/pkg/util/oom"
|
||||||
"k8s.io/kubernetes/pkg/volume"
|
"k8s.io/kubernetes/pkg/volume"
|
||||||
@ -220,6 +223,26 @@ var (
|
|||||||
// ContainerLogsDir can be overwritten for testing usage
|
// ContainerLogsDir can be overwritten for testing usage
|
||||||
ContainerLogsDir = DefaultContainerLogsDir
|
ContainerLogsDir = DefaultContainerLogsDir
|
||||||
etcHostsPath = getContainerEtcHostsPath()
|
etcHostsPath = getContainerEtcHostsPath()
|
||||||
|
|
||||||
|
admissionRejectionReasons = sets.New[string](
|
||||||
|
lifecycle.AppArmorNotAdmittedReason,
|
||||||
|
lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
|
||||||
|
lifecycle.PodOSNotSupported,
|
||||||
|
lifecycle.InvalidNodeInfo,
|
||||||
|
lifecycle.InitContainerRestartPolicyForbidden,
|
||||||
|
lifecycle.UnexpectedAdmissionError,
|
||||||
|
lifecycle.UnknownReason,
|
||||||
|
lifecycle.UnexpectedPredicateFailureType,
|
||||||
|
lifecycle.OutOfCPU,
|
||||||
|
lifecycle.OutOfMemory,
|
||||||
|
lifecycle.OutOfEphemeralStorage,
|
||||||
|
lifecycle.OutOfPods,
|
||||||
|
tainttoleration.ErrReasonNotMatch,
|
||||||
|
eviction.Reason,
|
||||||
|
sysctl.ForbiddenReason,
|
||||||
|
topologymanager.ErrorTopologyAffinity,
|
||||||
|
nodeshutdown.NodeShutdownNotAdmittedReason,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
func getContainerEtcHostsPath() string {
|
func getContainerEtcHostsPath() string {
|
||||||
@ -2310,7 +2333,6 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
|
|||||||
attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods}
|
attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods}
|
||||||
for _, podAdmitHandler := range kl.admitHandlers {
|
for _, podAdmitHandler := range kl.admitHandlers {
|
||||||
if result := podAdmitHandler.Admit(attrs); !result.Admit {
|
if result := podAdmitHandler.Admit(attrs); !result.Admit {
|
||||||
|
|
||||||
klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message)
|
klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message)
|
||||||
|
|
||||||
return false, result.Reason, result.Message
|
return false, result.Reason, result.Message
|
||||||
@ -2320,6 +2342,22 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
|
|||||||
return true, "", ""
|
return true, "", ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func recordAdmissionRejection(reason string) {
|
||||||
|
// It is possible that the "reason" label can have high cardinality.
|
||||||
|
// To avoid this metric from exploding, we create an allowlist of known
|
||||||
|
// reasons, and only record reasons from this list. Use "Other" reason
|
||||||
|
// for the rest.
|
||||||
|
if admissionRejectionReasons.Has(reason) {
|
||||||
|
metrics.AdmissionRejectionsTotal.WithLabelValues(reason).Inc()
|
||||||
|
} else if strings.HasPrefix(reason, lifecycle.InsufficientResourcePrefix) {
|
||||||
|
// non-extended resources (like cpu, memory, ephemeral-storage, pods)
|
||||||
|
// are already included in admissionRejectionReasons.
|
||||||
|
metrics.AdmissionRejectionsTotal.WithLabelValues("OutOfExtendedResources").Inc()
|
||||||
|
} else {
|
||||||
|
metrics.AdmissionRejectionsTotal.WithLabelValues("Other").Inc()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// syncLoop is the main loop for processing changes. It watches for changes from
|
// syncLoop is the main loop for processing changes. It watches for changes from
|
||||||
// three channels (file, apiserver, and http) and creates a union of them. For
|
// three channels (file, apiserver, and http) and creates a union of them. For
|
||||||
// any new change seen, will run a sync against desired state and running state. If
|
// any new change seen, will run a sync against desired state and running state. If
|
||||||
@ -2590,6 +2628,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
|
|||||||
// Check if we can admit the pod; if not, reject it.
|
// Check if we can admit the pod; if not, reject it.
|
||||||
if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok {
|
if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok {
|
||||||
kl.rejectPod(pod, reason, message)
|
kl.rejectPod(pod, reason, message)
|
||||||
|
// We avoid recording the metric in canAdmitPod because it's called
|
||||||
|
// repeatedly during a resize, which would inflate the metric.
|
||||||
|
// Instead, we record the metric here in HandlePodAdditions for new pods
|
||||||
|
// and capture resize events separately.
|
||||||
|
recordAdmissionRejection(reason)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// For new pod, checkpoint the resource values at which the Pod has been admitted
|
// For new pod, checkpoint the resource values at which the Pod has been admitted
|
||||||
@ -2601,6 +2644,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
|
|||||||
// Check if we can admit the pod; if not, reject it.
|
// Check if we can admit the pod; if not, reject it.
|
||||||
if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok {
|
if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok {
|
||||||
kl.rejectPod(pod, reason, message)
|
kl.rejectPod(pod, reason, message)
|
||||||
|
// We avoid recording the metric in canAdmitPod because it's called
|
||||||
|
// repeatedly during a resize, which would inflate the metric.
|
||||||
|
// Instead, we record the metric here in HandlePodAdditions for new pods
|
||||||
|
// and capture resize events separately.
|
||||||
|
recordAdmissionRejection(reason)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -55,6 +55,7 @@ import (
|
|||||||
"k8s.io/client-go/tools/record"
|
"k8s.io/client-go/tools/record"
|
||||||
"k8s.io/client-go/util/flowcontrol"
|
"k8s.io/client-go/util/flowcontrol"
|
||||||
featuregatetesting "k8s.io/component-base/featuregate/testing"
|
featuregatetesting "k8s.io/component-base/featuregate/testing"
|
||||||
|
"k8s.io/component-base/metrics/testutil"
|
||||||
internalapi "k8s.io/cri-api/pkg/apis"
|
internalapi "k8s.io/cri-api/pkg/apis"
|
||||||
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
|
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
remote "k8s.io/cri-client/pkg"
|
remote "k8s.io/cri-client/pkg"
|
||||||
@ -65,6 +66,7 @@ import (
|
|||||||
cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing"
|
cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
|
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/cm"
|
"k8s.io/kubernetes/pkg/kubelet/cm"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/config"
|
"k8s.io/kubernetes/pkg/kubelet/config"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/configmap"
|
"k8s.io/kubernetes/pkg/kubelet/configmap"
|
||||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||||
@ -74,6 +76,7 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
|
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/logs"
|
"k8s.io/kubernetes/pkg/kubelet/logs"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/network/dns"
|
"k8s.io/kubernetes/pkg/kubelet/network/dns"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
|
"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/pleg"
|
"k8s.io/kubernetes/pkg/kubelet/pleg"
|
||||||
@ -89,12 +92,14 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/kubelet/status"
|
"k8s.io/kubernetes/pkg/kubelet/status"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/status/state"
|
"k8s.io/kubernetes/pkg/kubelet/status/state"
|
||||||
statustest "k8s.io/kubernetes/pkg/kubelet/status/testing"
|
statustest "k8s.io/kubernetes/pkg/kubelet/status/testing"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/sysctl"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/token"
|
"k8s.io/kubernetes/pkg/kubelet/token"
|
||||||
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
|
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
|
||||||
kubeletutil "k8s.io/kubernetes/pkg/kubelet/util"
|
kubeletutil "k8s.io/kubernetes/pkg/kubelet/util"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/util/queue"
|
"k8s.io/kubernetes/pkg/kubelet/util/queue"
|
||||||
kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager"
|
kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager"
|
||||||
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
|
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
|
||||||
|
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
|
||||||
"k8s.io/kubernetes/pkg/util/oom"
|
"k8s.io/kubernetes/pkg/util/oom"
|
||||||
"k8s.io/kubernetes/pkg/volume"
|
"k8s.io/kubernetes/pkg/volume"
|
||||||
_ "k8s.io/kubernetes/pkg/volume/hostpath"
|
_ "k8s.io/kubernetes/pkg/volume/hostpath"
|
||||||
@ -3461,3 +3466,200 @@ func TestIsPodResizeInProgress(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRecordAdmissionRejection(t *testing.T) {
|
||||||
|
metrics.Register()
|
||||||
|
|
||||||
|
testCases := []struct {
|
||||||
|
name string
|
||||||
|
reason string
|
||||||
|
wants string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "AppArmor",
|
||||||
|
reason: lifecycle.AppArmorNotAdmittedReason,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="AppArmor"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "PodOSSelectorNodeLabelDoesNotMatch",
|
||||||
|
reason: lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="PodOSSelectorNodeLabelDoesNotMatch"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "PodOSNotSupported",
|
||||||
|
reason: lifecycle.PodOSNotSupported,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="PodOSNotSupported"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "InvalidNodeInfo",
|
||||||
|
reason: lifecycle.InvalidNodeInfo,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="InvalidNodeInfo"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "InitContainerRestartPolicyForbidden",
|
||||||
|
reason: lifecycle.InitContainerRestartPolicyForbidden,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="InitContainerRestartPolicyForbidden"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "UnexpectedAdmissionError",
|
||||||
|
reason: lifecycle.UnexpectedAdmissionError,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="UnexpectedAdmissionError"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "UnknownReason",
|
||||||
|
reason: lifecycle.UnknownReason,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="UnknownReason"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "UnexpectedPredicateFailureType",
|
||||||
|
reason: lifecycle.UnexpectedPredicateFailureType,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="UnexpectedPredicateFailureType"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "node(s) had taints that the pod didn't tolerate",
|
||||||
|
reason: tainttoleration.ErrReasonNotMatch,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="node(s) had taints that the pod didn't tolerate"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Evicted",
|
||||||
|
reason: eviction.Reason,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="Evicted"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SysctlForbidden",
|
||||||
|
reason: sysctl.ForbiddenReason,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="SysctlForbidden"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "TopologyAffinityError",
|
||||||
|
reason: topologymanager.ErrorTopologyAffinity,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="TopologyAffinityError"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "NodeShutdown",
|
||||||
|
reason: nodeshutdown.NodeShutdownNotAdmittedReason,
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="NodeShutdown"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "OutOfcpu",
|
||||||
|
reason: "OutOfcpu",
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="OutOfcpu"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "OutOfmemory",
|
||||||
|
reason: "OutOfmemory",
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="OutOfmemory"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "OutOfephemeral-storage",
|
||||||
|
reason: "OutOfephemeral-storage",
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="OutOfephemeral-storage"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "OutOfpods",
|
||||||
|
reason: "OutOfpods",
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="OutOfpods"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "OutOfgpu",
|
||||||
|
reason: "OutOfgpu",
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="OutOfExtendedResources"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "OtherReason",
|
||||||
|
reason: "OtherReason",
|
||||||
|
wants: `
|
||||||
|
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||||
|
# TYPE kubelet_admission_rejections_total counter
|
||||||
|
kubelet_admission_rejections_total{reason="Other"} 1
|
||||||
|
`,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run tests.
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
// Clear the metrics after the test.
|
||||||
|
metrics.AdmissionRejectionsTotal.Reset()
|
||||||
|
|
||||||
|
// Call the function.
|
||||||
|
recordAdmissionRejection(tc.reason)
|
||||||
|
|
||||||
|
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(tc.wants), "kubelet_admission_rejections_total"); err != nil {
|
||||||
|
t.Error(err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -44,6 +44,8 @@ import (
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
maxRespBodyLength = 10 * 1 << 10 // 10KB
|
maxRespBodyLength = 10 * 1 << 10 // 10KB
|
||||||
|
|
||||||
|
AppArmorNotAdmittedReason = "AppArmor"
|
||||||
)
|
)
|
||||||
|
|
||||||
type handlerRunner struct {
|
type handlerRunner struct {
|
||||||
@ -224,7 +226,7 @@ func (a *appArmorAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult {
|
|||||||
}
|
}
|
||||||
return PodAdmitResult{
|
return PodAdmitResult{
|
||||||
Admit: false,
|
Admit: false,
|
||||||
Reason: "AppArmor",
|
Reason: AppArmorNotAdmittedReason,
|
||||||
Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err),
|
Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -32,6 +32,52 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
|
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// PodOSSelectorNodeLabelDoesNotMatch is used to denote that the pod was
|
||||||
|
// rejected admission to the node because the pod's node selector
|
||||||
|
// corresponding to kubernetes.io/os label didn't match the node label.
|
||||||
|
PodOSSelectorNodeLabelDoesNotMatch = "PodOSSelectorNodeLabelDoesNotMatch"
|
||||||
|
|
||||||
|
// PodOSNotSupported is used to denote that the pod was rejected admission
|
||||||
|
// to the node because the pod's OS field didn't match the node OS.
|
||||||
|
PodOSNotSupported = "PodOSNotSupported"
|
||||||
|
|
||||||
|
// InvalidNodeInfo is used to denote that the pod was rejected admission
|
||||||
|
// to the node because the kubelet was unable to retrieve the node info.
|
||||||
|
InvalidNodeInfo = "InvalidNodeInfo"
|
||||||
|
|
||||||
|
// InitContainerRestartPolicyForbidden is used to denote that the pod was
|
||||||
|
// rejected admission to the node because it uses a restart policy other
|
||||||
|
// than Always for some of its init containers.
|
||||||
|
InitContainerRestartPolicyForbidden = "InitContainerRestartPolicyForbidden"
|
||||||
|
|
||||||
|
// UnexpectedAdmissionError is used to denote that the pod was rejected
|
||||||
|
// admission to the node because of an error during admission that could not
|
||||||
|
// be categorized.
|
||||||
|
UnexpectedAdmissionError = "UnexpectedAdmissionError"
|
||||||
|
|
||||||
|
// UnknownReason is used to denote that the pod was rejected admission to
|
||||||
|
// the node because a predicate failed for a reason that could not be
|
||||||
|
// determined.
|
||||||
|
UnknownReason = "UnknownReason"
|
||||||
|
|
||||||
|
// UnexpectedPredicateFailureType is used to denote that the pod was
|
||||||
|
// rejected admission to the node because a predicate returned a reason
|
||||||
|
// object that was not an InsufficientResourceError or a PredicateFailureError.
|
||||||
|
UnexpectedPredicateFailureType = "UnexpectedPredicateFailureType"
|
||||||
|
|
||||||
|
// InsufficientResourcePrefix is the prefix of the admission reason used when the kubelet rejects a pod due to insufficient
|
||||||
|
// resources available.
|
||||||
|
InsufficientResourcePrefix = "OutOf"
|
||||||
|
|
||||||
|
// These reasons are used to denote that the pod was rejected admission
|
||||||
|
// to the node because there's not enough resources to run the pod.
|
||||||
|
OutOfCPU = "OutOfcpu"
|
||||||
|
OutOfMemory = "OutOfmemory"
|
||||||
|
OutOfEphemeralStorage = "OutOfephemeral-storage"
|
||||||
|
OutOfPods = "OutOfpods"
|
||||||
|
)
|
||||||
|
|
||||||
type getNodeAnyWayFuncType func() (*v1.Node, error)
|
type getNodeAnyWayFuncType func() (*v1.Node, error)
|
||||||
|
|
||||||
type pluginResourceUpdateFuncType func(*schedulerframework.NodeInfo, *PodAdmitAttributes) error
|
type pluginResourceUpdateFuncType func(*schedulerframework.NodeInfo, *PodAdmitAttributes) error
|
||||||
@ -66,7 +112,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
|||||||
klog.ErrorS(err, "Cannot get Node info")
|
klog.ErrorS(err, "Cannot get Node info")
|
||||||
return PodAdmitResult{
|
return PodAdmitResult{
|
||||||
Admit: false,
|
Admit: false,
|
||||||
Reason: "InvalidNodeInfo",
|
Reason: InvalidNodeInfo,
|
||||||
Message: "Kubelet cannot get node info.",
|
Message: "Kubelet cannot get node info.",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -76,14 +122,14 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
|||||||
if rejectPodAdmissionBasedOnOSSelector(admitPod, node) {
|
if rejectPodAdmissionBasedOnOSSelector(admitPod, node) {
|
||||||
return PodAdmitResult{
|
return PodAdmitResult{
|
||||||
Admit: false,
|
Admit: false,
|
||||||
Reason: "PodOSSelectorNodeLabelDoesNotMatch",
|
Reason: PodOSSelectorNodeLabelDoesNotMatch,
|
||||||
Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label",
|
Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if rejectPodAdmissionBasedOnOSField(admitPod) {
|
if rejectPodAdmissionBasedOnOSField(admitPod) {
|
||||||
return PodAdmitResult{
|
return PodAdmitResult{
|
||||||
Admit: false,
|
Admit: false,
|
||||||
Reason: "PodOSNotSupported",
|
Reason: PodOSNotSupported,
|
||||||
Message: "Failed to admit pod as the OS field doesn't match node OS",
|
Message: "Failed to admit pod as the OS field doesn't match node OS",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -100,7 +146,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
|||||||
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
|
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
|
||||||
return PodAdmitResult{
|
return PodAdmitResult{
|
||||||
Admit: false,
|
Admit: false,
|
||||||
Reason: "InitContainerRestartPolicyForbidden",
|
Reason: InitContainerRestartPolicyForbidden,
|
||||||
Message: message,
|
Message: message,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -113,7 +159,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
|||||||
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
|
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
|
||||||
return PodAdmitResult{
|
return PodAdmitResult{
|
||||||
Admit: false,
|
Admit: false,
|
||||||
Reason: "UnexpectedAdmissionError",
|
Reason: UnexpectedAdmissionError,
|
||||||
Message: message,
|
Message: message,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -138,7 +184,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
|||||||
klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err)
|
klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err)
|
||||||
return PodAdmitResult{
|
return PodAdmitResult{
|
||||||
Admit: fit,
|
Admit: fit,
|
||||||
Reason: "UnexpectedAdmissionError",
|
Reason: UnexpectedAdmissionError,
|
||||||
Message: message,
|
Message: message,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -151,7 +197,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
|||||||
klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod))
|
klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod))
|
||||||
return PodAdmitResult{
|
return PodAdmitResult{
|
||||||
Admit: fit,
|
Admit: fit,
|
||||||
Reason: "UnknownReason",
|
Reason: UnknownReason,
|
||||||
Message: message,
|
Message: message,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -163,11 +209,22 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
|||||||
message = re.Error()
|
message = re.Error()
|
||||||
klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)
|
klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)
|
||||||
case *InsufficientResourceError:
|
case *InsufficientResourceError:
|
||||||
reason = fmt.Sprintf("OutOf%s", re.ResourceName)
|
switch re.ResourceName {
|
||||||
|
case v1.ResourceCPU:
|
||||||
|
reason = OutOfCPU
|
||||||
|
case v1.ResourceMemory:
|
||||||
|
reason = OutOfMemory
|
||||||
|
case v1.ResourceEphemeralStorage:
|
||||||
|
reason = OutOfEphemeralStorage
|
||||||
|
case v1.ResourcePods:
|
||||||
|
reason = OutOfPods
|
||||||
|
default:
|
||||||
|
reason = fmt.Sprintf("%s%s", InsufficientResourcePrefix, re.ResourceName)
|
||||||
|
}
|
||||||
message = re.Error()
|
message = re.Error()
|
||||||
klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)
|
klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)
|
||||||
default:
|
default:
|
||||||
reason = "UnexpectedPredicateFailureType"
|
reason = UnexpectedPredicateFailureType
|
||||||
message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r)
|
message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r)
|
||||||
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message)
|
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message)
|
||||||
}
|
}
|
||||||
|
@ -149,6 +149,9 @@ const (
|
|||||||
|
|
||||||
AlignedPhysicalCPU = "physical_cpu"
|
AlignedPhysicalCPU = "physical_cpu"
|
||||||
AlignedNUMANode = "numa_node"
|
AlignedNUMANode = "numa_node"
|
||||||
|
|
||||||
|
// Metrics to track kubelet admission rejections.
|
||||||
|
AdmissionRejectionsTotalKey = "admission_rejections_total"
|
||||||
)
|
)
|
||||||
|
|
||||||
type imageSizeBucket struct {
|
type imageSizeBucket struct {
|
||||||
@ -994,6 +997,17 @@ var (
|
|||||||
},
|
},
|
||||||
[]string{"driver_name", "method_name", "grpc_status_code"},
|
[]string{"driver_name", "method_name", "grpc_status_code"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// AdmissionRejectionsTotal tracks the number of pod admission rejections; currently it is only recorded for pod additions.
|
||||||
|
AdmissionRejectionsTotal = metrics.NewCounterVec(
|
||||||
|
&metrics.CounterOpts{
|
||||||
|
Subsystem: KubeletSubsystem,
|
||||||
|
Name: AdmissionRejectionsTotalKey,
|
||||||
|
Help: "Cumulative number pod admission rejections by the Kubelet.",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
},
|
||||||
|
[]string{"reason"},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
var registerMetrics sync.Once
|
var registerMetrics sync.Once
|
||||||
@ -1091,6 +1105,8 @@ func Register(collectors ...metrics.StableCollector) {
|
|||||||
legacyregistry.MustRegister(DRAOperationsDuration)
|
legacyregistry.MustRegister(DRAOperationsDuration)
|
||||||
legacyregistry.MustRegister(DRAGRPCOperationsDuration)
|
legacyregistry.MustRegister(DRAGRPCOperationsDuration)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
legacyregistry.MustRegister(AdmissionRejectionsTotal)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -83,6 +83,10 @@ func (managerStub) ShutdownStatus() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
NodeShutdownNotAdmittedReason = "NodeShutdown"
|
||||||
|
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
|
||||||
|
localStorageStateFile = "graceful_node_shutdown_state"
|
||||||
|
|
||||||
nodeShutdownReason = "Terminated"
|
nodeShutdownReason = "Terminated"
|
||||||
nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown."
|
nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown."
|
||||||
)
|
)
|
||||||
|
@ -40,10 +40,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
nodeShutdownNotAdmittedReason = "NodeShutdown"
|
dbusReconnectPeriod = 1 * time.Second
|
||||||
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
|
|
||||||
dbusReconnectPeriod = 1 * time.Second
|
|
||||||
localStorageStateFile = "graceful_node_shutdown_state"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var systemDbus = func() (dbusInhibiter, error) {
|
var systemDbus = func() (dbusInhibiter, error) {
|
||||||
@ -123,7 +120,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
|
|||||||
if nodeShuttingDown {
|
if nodeShuttingDown {
|
||||||
return lifecycle.PodAdmitResult{
|
return lifecycle.PodAdmitResult{
|
||||||
Admit: false,
|
Admit: false,
|
||||||
Reason: nodeShutdownNotAdmittedReason,
|
Reason: NodeShutdownNotAdmittedReason,
|
||||||
Message: nodeShutdownNotAdmittedMessage,
|
Message: nodeShutdownNotAdmittedMessage,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -51,12 +51,6 @@ const (
|
|||||||
shutdownOrderStringValue = "PreshutdownOrder"
|
shutdownOrderStringValue = "PreshutdownOrder"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
|
||||||
nodeShutdownNotAdmittedReason = "NodeShutdown"
|
|
||||||
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
|
|
||||||
localStorageStateFile = "graceful_node_shutdown_state"
|
|
||||||
)
|
|
||||||
|
|
||||||
// managerImpl has functions that can be used to interact with the Node Shutdown Manager.
|
// managerImpl has functions that can be used to interact with the Node Shutdown Manager.
|
||||||
type managerImpl struct {
|
type managerImpl struct {
|
||||||
logger klog.Logger
|
logger klog.Logger
|
||||||
@ -120,7 +114,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
|
|||||||
if nodeShuttingDown {
|
if nodeShuttingDown {
|
||||||
return lifecycle.PodAdmitResult{
|
return lifecycle.PodAdmitResult{
|
||||||
Admit: false,
|
Admit: false,
|
||||||
Reason: nodeShutdownNotAdmittedReason,
|
Reason: NodeShutdownNotAdmittedReason,
|
||||||
Message: nodeShutdownNotAdmittedMessage,
|
Message: nodeShutdownNotAdmittedMessage,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user