Merge pull request #128556 from AnishShah/kubelet-reject-metric

Introduce a metric to track kubelet admission failure.
This commit is contained in:
Kubernetes Prow Robot 2024-11-06 20:10:33 +00:00 committed by GitHub
commit 099449954e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 343 additions and 23 deletions

View File

@ -29,6 +29,7 @@ import (
sysruntime "runtime" sysruntime "runtime"
"slices" "slices"
"sort" "sort"
"strings"
"sync" "sync"
"sync/atomic" "sync/atomic"
"time" "time"
@ -81,6 +82,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cloudresource" "k8s.io/kubernetes/pkg/kubelet/cloudresource"
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" "k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
"k8s.io/kubernetes/pkg/kubelet/cm" "k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config" "k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/configmap" "k8s.io/kubernetes/pkg/kubelet/configmap"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@ -120,6 +122,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/volumemanager" "k8s.io/kubernetes/pkg/kubelet/volumemanager"
"k8s.io/kubernetes/pkg/kubelet/watchdog" "k8s.io/kubernetes/pkg/kubelet/watchdog"
httpprobe "k8s.io/kubernetes/pkg/probe/http" httpprobe "k8s.io/kubernetes/pkg/probe/http"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
"k8s.io/kubernetes/pkg/security/apparmor" "k8s.io/kubernetes/pkg/security/apparmor"
"k8s.io/kubernetes/pkg/util/oom" "k8s.io/kubernetes/pkg/util/oom"
"k8s.io/kubernetes/pkg/volume" "k8s.io/kubernetes/pkg/volume"
@ -220,6 +223,26 @@ var (
// ContainerLogsDir can be overwritten for testing usage // ContainerLogsDir can be overwritten for testing usage
ContainerLogsDir = DefaultContainerLogsDir ContainerLogsDir = DefaultContainerLogsDir
etcHostsPath = getContainerEtcHostsPath() etcHostsPath = getContainerEtcHostsPath()
// admissionRejectionReasons is the allowlist of pod admission rejection
// reasons. recordAdmissionRejection reports a reason found in this set
// verbatim as the "reason" label of the admission rejections metric; reasons
// outside the set are folded into a generic label so the label cardinality
// stays bounded.
admissionRejectionReasons = sets.New[string](
lifecycle.AppArmorNotAdmittedReason,
lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
lifecycle.PodOSNotSupported,
lifecycle.InvalidNodeInfo,
lifecycle.InitContainerRestartPolicyForbidden,
lifecycle.UnexpectedAdmissionError,
lifecycle.UnknownReason,
lifecycle.UnexpectedPredicateFailureType,
lifecycle.OutOfCPU,
lifecycle.OutOfMemory,
lifecycle.OutOfEphemeralStorage,
lifecycle.OutOfPods,
tainttoleration.ErrReasonNotMatch,
eviction.Reason,
sysctl.ForbiddenReason,
topologymanager.ErrorTopologyAffinity,
nodeshutdown.NodeShutdownNotAdmittedReason,
)
) )
func getContainerEtcHostsPath() string { func getContainerEtcHostsPath() string {
@ -2310,7 +2333,6 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods} attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods}
for _, podAdmitHandler := range kl.admitHandlers { for _, podAdmitHandler := range kl.admitHandlers {
if result := podAdmitHandler.Admit(attrs); !result.Admit { if result := podAdmitHandler.Admit(attrs); !result.Admit {
klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message) klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message)
return false, result.Reason, result.Message return false, result.Reason, result.Message
@ -2320,6 +2342,22 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
return true, "", "" return true, "", ""
} }
// recordAdmissionRejection increments the admission rejections counter for the
// given rejection reason.
//
// The raw reason can take many distinct values, so it is first mapped to a
// bounded label to keep the metric's cardinality under control:
//   - a reason in the admissionRejectionReasons allowlist is used as-is;
//   - any other reason starting with the insufficient-resource prefix is
//     reported as "OutOfExtendedResources";
//   - everything else is reported as "Other".
func recordAdmissionRejection(reason string) {
	label := "Other"
	switch {
	case admissionRejectionReasons.Has(reason):
		label = reason
	case strings.HasPrefix(reason, lifecycle.InsufficientResourcePrefix):
		// The non-extended resources (cpu, memory, ephemeral-storage, pods)
		// are part of the allowlist, so only extended resources reach here.
		label = "OutOfExtendedResources"
	}
	metrics.AdmissionRejectionsTotal.WithLabelValues(label).Inc()
}
// syncLoop is the main loop for processing changes. It watches for changes from // syncLoop is the main loop for processing changes. It watches for changes from
// three channels (file, apiserver, and http) and creates a union of them. For // three channels (file, apiserver, and http) and creates a union of them. For
// any new change seen, will run a sync against desired state and running state. If // any new change seen, will run a sync against desired state and running state. If
@ -2590,6 +2628,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
// Check if we can admit the pod; if not, reject it. // Check if we can admit the pod; if not, reject it.
if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok { if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok {
kl.rejectPod(pod, reason, message) kl.rejectPod(pod, reason, message)
// We avoid recording the metric in canAdmitPod because it's called
// repeatedly during a resize, which would inflate the metric.
// Instead, we record the metric here in HandlePodAdditions for new pods
// and capture resize events separately.
recordAdmissionRejection(reason)
continue continue
} }
// For new pod, checkpoint the resource values at which the Pod has been admitted // For new pod, checkpoint the resource values at which the Pod has been admitted
@ -2601,6 +2644,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
// Check if we can admit the pod; if not, reject it. // Check if we can admit the pod; if not, reject it.
if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok { if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok {
kl.rejectPod(pod, reason, message) kl.rejectPod(pod, reason, message)
// We avoid recording the metric in canAdmitPod because it's called
// repeatedly during a resize, which would inflate the metric.
// Instead, we record the metric here in HandlePodAdditions for new pods
// and capture resize events separately.
recordAdmissionRejection(reason)
continue continue
} }
} }

View File

@ -55,6 +55,7 @@ import (
"k8s.io/client-go/tools/record" "k8s.io/client-go/tools/record"
"k8s.io/client-go/util/flowcontrol" "k8s.io/client-go/util/flowcontrol"
featuregatetesting "k8s.io/component-base/featuregate/testing" featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/component-base/metrics/testutil"
internalapi "k8s.io/cri-api/pkg/apis" internalapi "k8s.io/cri-api/pkg/apis"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
remote "k8s.io/cri-client/pkg" remote "k8s.io/cri-client/pkg"
@ -65,6 +66,7 @@ import (
cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing" cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing"
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" "k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
"k8s.io/kubernetes/pkg/kubelet/cm" "k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config" "k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/configmap" "k8s.io/kubernetes/pkg/kubelet/configmap"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@ -74,6 +76,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/kuberuntime" "k8s.io/kubernetes/pkg/kubelet/kuberuntime"
"k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/logs" "k8s.io/kubernetes/pkg/kubelet/logs"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/network/dns" "k8s.io/kubernetes/pkg/kubelet/network/dns"
"k8s.io/kubernetes/pkg/kubelet/nodeshutdown" "k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
"k8s.io/kubernetes/pkg/kubelet/pleg" "k8s.io/kubernetes/pkg/kubelet/pleg"
@ -89,12 +92,14 @@ import (
"k8s.io/kubernetes/pkg/kubelet/status" "k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/kubelet/status/state" "k8s.io/kubernetes/pkg/kubelet/status/state"
statustest "k8s.io/kubernetes/pkg/kubelet/status/testing" statustest "k8s.io/kubernetes/pkg/kubelet/status/testing"
"k8s.io/kubernetes/pkg/kubelet/sysctl"
"k8s.io/kubernetes/pkg/kubelet/token" "k8s.io/kubernetes/pkg/kubelet/token"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types" kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
kubeletutil "k8s.io/kubernetes/pkg/kubelet/util" kubeletutil "k8s.io/kubernetes/pkg/kubelet/util"
"k8s.io/kubernetes/pkg/kubelet/util/queue" "k8s.io/kubernetes/pkg/kubelet/util/queue"
kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager" kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
"k8s.io/kubernetes/pkg/util/oom" "k8s.io/kubernetes/pkg/util/oom"
"k8s.io/kubernetes/pkg/volume" "k8s.io/kubernetes/pkg/volume"
_ "k8s.io/kubernetes/pkg/volume/hostpath" _ "k8s.io/kubernetes/pkg/volume/hostpath"
@ -3461,3 +3466,200 @@ func TestIsPodResizeInProgress(t *testing.T) {
}) })
} }
} }
// TestRecordAdmissionRejection verifies the "reason" label that
// recordAdmissionRejection attaches to kubelet_admission_rejections_total:
// allowlisted reasons are reported verbatim, an unlisted "OutOf..." reason is
// reported as "OutOfExtendedResources", and any other reason as "Other".
func TestRecordAdmissionRejection(t *testing.T) {
	metrics.Register()

	// Every expected exposition starts with the same HELP/TYPE header; only the
	// label value of the single sample differs between cases.
	const wantHeader = `
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
# TYPE kubelet_admission_rejections_total counter
`

	testCases := []struct {
		name   string
		reason string
		// wantLabel is the expected value of the "reason" label. It is spelled
		// out as a literal (not derived from the constant under test) so that a
		// change to a reason constant is caught here.
		wantLabel string
	}{
		{
			name:      "AppArmor",
			reason:    lifecycle.AppArmorNotAdmittedReason,
			wantLabel: "AppArmor",
		},
		{
			name:      "PodOSSelectorNodeLabelDoesNotMatch",
			reason:    lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
			wantLabel: "PodOSSelectorNodeLabelDoesNotMatch",
		},
		{
			name:      "PodOSNotSupported",
			reason:    lifecycle.PodOSNotSupported,
			wantLabel: "PodOSNotSupported",
		},
		{
			name:      "InvalidNodeInfo",
			reason:    lifecycle.InvalidNodeInfo,
			wantLabel: "InvalidNodeInfo",
		},
		{
			name:      "InitContainerRestartPolicyForbidden",
			reason:    lifecycle.InitContainerRestartPolicyForbidden,
			wantLabel: "InitContainerRestartPolicyForbidden",
		},
		{
			name:      "UnexpectedAdmissionError",
			reason:    lifecycle.UnexpectedAdmissionError,
			wantLabel: "UnexpectedAdmissionError",
		},
		{
			name:      "UnknownReason",
			reason:    lifecycle.UnknownReason,
			wantLabel: "UnknownReason",
		},
		{
			name:      "UnexpectedPredicateFailureType",
			reason:    lifecycle.UnexpectedPredicateFailureType,
			wantLabel: "UnexpectedPredicateFailureType",
		},
		{
			name:      "node(s) had taints that the pod didn't tolerate",
			reason:    tainttoleration.ErrReasonNotMatch,
			wantLabel: "node(s) had taints that the pod didn't tolerate",
		},
		{
			name:      "Evicted",
			reason:    eviction.Reason,
			wantLabel: "Evicted",
		},
		{
			name:      "SysctlForbidden",
			reason:    sysctl.ForbiddenReason,
			wantLabel: "SysctlForbidden",
		},
		{
			name:      "TopologyAffinityError",
			reason:    topologymanager.ErrorTopologyAffinity,
			wantLabel: "TopologyAffinityError",
		},
		{
			name:      "NodeShutdown",
			reason:    nodeshutdown.NodeShutdownNotAdmittedReason,
			wantLabel: "NodeShutdown",
		},
		{
			name:      "OutOfcpu",
			reason:    "OutOfcpu",
			wantLabel: "OutOfcpu",
		},
		{
			name:      "OutOfmemory",
			reason:    "OutOfmemory",
			wantLabel: "OutOfmemory",
		},
		{
			name:      "OutOfephemeral-storage",
			reason:    "OutOfephemeral-storage",
			wantLabel: "OutOfephemeral-storage",
		},
		{
			name:      "OutOfpods",
			reason:    "OutOfpods",
			wantLabel: "OutOfpods",
		},
		{
			// An "OutOf" reason that is not allowlisted is treated as an
			// extended resource.
			name:      "OutOfgpu",
			reason:    "OutOfgpu",
			wantLabel: "OutOfExtendedResources",
		},
		{
			name:      "OtherReason",
			reason:    "OtherReason",
			wantLabel: "Other",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Reset the counter before recording so that samples left over from
			// earlier cases do not show up in this case's gathered output.
			metrics.AdmissionRejectionsTotal.Reset()

			recordAdmissionRejection(tc.reason)

			want := wantHeader +
				`kubelet_admission_rejections_total{reason="` + tc.wantLabel + `"} 1` + "\n"
			if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(want), "kubelet_admission_rejections_total"); err != nil {
				t.Error(err)
			}
		})
	}
}

View File

@ -44,6 +44,8 @@ import (
const ( const (
maxRespBodyLength = 10 * 1 << 10 // 10KB maxRespBodyLength = 10 * 1 << 10 // 10KB
// AppArmorNotAdmittedReason is the rejection reason returned by the AppArmor
// admit handler when it does not admit a pod.
AppArmorNotAdmittedReason = "AppArmor"
) )
type handlerRunner struct { type handlerRunner struct {
@ -224,7 +226,7 @@ func (a *appArmorAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult {
} }
return PodAdmitResult{ return PodAdmitResult{
Admit: false, Admit: false,
Reason: "AppArmor", Reason: AppArmorNotAdmittedReason,
Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err), Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err),
} }
} }

View File

@ -32,6 +32,52 @@ import (
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
) )
// Reasons reported when the kubelet rejects admission of a pod to the node.
const (
// PodOSSelectorNodeLabelDoesNotMatch is used to denote that the pod was
// rejected admission to the node because the pod's node selector
// corresponding to kubernetes.io/os label didn't match the node label.
PodOSSelectorNodeLabelDoesNotMatch = "PodOSSelectorNodeLabelDoesNotMatch"
// PodOSNotSupported is used to denote that the pod was rejected admission
// to the node because the pod's OS field didn't match the node OS.
PodOSNotSupported = "PodOSNotSupported"
// InvalidNodeInfo is used to denote that the pod was rejected admission
// to the node because the kubelet was unable to retrieve the node info.
InvalidNodeInfo = "InvalidNodeInfo"
// InitContainerRestartPolicyForbidden is used to denote that the pod was
// rejected admission to the node because it uses a restart policy other
// than Always for some of its init containers.
InitContainerRestartPolicyForbidden = "InitContainerRestartPolicyForbidden"
// UnexpectedAdmissionError is used to denote that the pod was rejected
// admission to the node because of an error during admission that could not
// be categorized.
UnexpectedAdmissionError = "UnexpectedAdmissionError"
// UnknownReason is used to denote that the pod was rejected admission to
// the node because a predicate failed for a reason that could not be
// determined.
UnknownReason = "UnknownReason"
// UnexpectedPredicateFailureType is used to denote that the pod was
// rejected admission to the node because a predicate returned a reason
// object that was not an InsufficientResourceError or a PredicateFailureError.
UnexpectedPredicateFailureType = "UnexpectedPredicateFailureType"
// InsufficientResourcePrefix is the prefix of the admission reason used
// when the kubelet rejects a pod because the node does not have enough of
// a resource available. The full reason is this prefix followed by the
// resource name.
InsufficientResourcePrefix = "OutOf"
// OutOfCPU, OutOfMemory, OutOfEphemeralStorage and OutOfPods are used to
// denote that the pod was rejected admission to the node because the node
// does not have enough cpu, memory, ephemeral-storage or pods,
// respectively, to run the pod.
OutOfCPU = "OutOfcpu"
OutOfMemory = "OutOfmemory"
OutOfEphemeralStorage = "OutOfephemeral-storage"
OutOfPods = "OutOfpods"
)
type getNodeAnyWayFuncType func() (*v1.Node, error) type getNodeAnyWayFuncType func() (*v1.Node, error)
type pluginResourceUpdateFuncType func(*schedulerframework.NodeInfo, *PodAdmitAttributes) error type pluginResourceUpdateFuncType func(*schedulerframework.NodeInfo, *PodAdmitAttributes) error
@ -66,7 +112,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
klog.ErrorS(err, "Cannot get Node info") klog.ErrorS(err, "Cannot get Node info")
return PodAdmitResult{ return PodAdmitResult{
Admit: false, Admit: false,
Reason: "InvalidNodeInfo", Reason: InvalidNodeInfo,
Message: "Kubelet cannot get node info.", Message: "Kubelet cannot get node info.",
} }
} }
@ -76,14 +122,14 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
if rejectPodAdmissionBasedOnOSSelector(admitPod, node) { if rejectPodAdmissionBasedOnOSSelector(admitPod, node) {
return PodAdmitResult{ return PodAdmitResult{
Admit: false, Admit: false,
Reason: "PodOSSelectorNodeLabelDoesNotMatch", Reason: PodOSSelectorNodeLabelDoesNotMatch,
Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label", Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label",
} }
} }
if rejectPodAdmissionBasedOnOSField(admitPod) { if rejectPodAdmissionBasedOnOSField(admitPod) {
return PodAdmitResult{ return PodAdmitResult{
Admit: false, Admit: false,
Reason: "PodOSNotSupported", Reason: PodOSNotSupported,
Message: "Failed to admit pod as the OS field doesn't match node OS", Message: "Failed to admit pod as the OS field doesn't match node OS",
} }
} }
@ -100,7 +146,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message) klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
return PodAdmitResult{ return PodAdmitResult{
Admit: false, Admit: false,
Reason: "InitContainerRestartPolicyForbidden", Reason: InitContainerRestartPolicyForbidden,
Message: message, Message: message,
} }
} }
@ -113,7 +159,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message) klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
return PodAdmitResult{ return PodAdmitResult{
Admit: false, Admit: false,
Reason: "UnexpectedAdmissionError", Reason: UnexpectedAdmissionError,
Message: message, Message: message,
} }
} }
@ -138,7 +184,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err) klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err)
return PodAdmitResult{ return PodAdmitResult{
Admit: fit, Admit: fit,
Reason: "UnexpectedAdmissionError", Reason: UnexpectedAdmissionError,
Message: message, Message: message,
} }
} }
@ -151,7 +197,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod)) klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod))
return PodAdmitResult{ return PodAdmitResult{
Admit: fit, Admit: fit,
Reason: "UnknownReason", Reason: UnknownReason,
Message: message, Message: message,
} }
} }
@ -163,11 +209,22 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
message = re.Error() message = re.Error()
klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message) klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)
case *InsufficientResourceError: case *InsufficientResourceError:
reason = fmt.Sprintf("OutOf%s", re.ResourceName) switch re.ResourceName {
case v1.ResourceCPU:
reason = OutOfCPU
case v1.ResourceMemory:
reason = OutOfMemory
case v1.ResourceEphemeralStorage:
reason = OutOfEphemeralStorage
case v1.ResourcePods:
reason = OutOfPods
default:
reason = fmt.Sprintf("%s%s", InsufficientResourcePrefix, re.ResourceName)
}
message = re.Error() message = re.Error()
klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message) klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)
default: default:
reason = "UnexpectedPredicateFailureType" reason = UnexpectedPredicateFailureType
message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r) message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r)
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message) klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message)
} }

View File

@ -149,6 +149,9 @@ const (
AlignedPhysicalCPU = "physical_cpu" AlignedPhysicalCPU = "physical_cpu"
AlignedNUMANode = "numa_node" AlignedNUMANode = "numa_node"
// AdmissionRejectionsTotalKey is the name of the metric that tracks kubelet
// admission rejections.
AdmissionRejectionsTotalKey = "admission_rejections_total"
) )
type imageSizeBucket struct { type imageSizeBucket struct {
@ -994,6 +997,17 @@ var (
}, },
[]string{"driver_name", "method_name", "grpc_status_code"}, []string{"driver_name", "method_name", "grpc_status_code"},
) )
// AdmissionRejectionsTotal counts the pod admissions rejected by the kubelet,
// partitioned by the "reason" label. Currently it is only recorded for pod
// additions.
AdmissionRejectionsTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: AdmissionRejectionsTotalKey,
Help: "Cumulative number pod admission rejections by the Kubelet.",
StabilityLevel: metrics.ALPHA,
},
[]string{"reason"},
)
) )
var registerMetrics sync.Once var registerMetrics sync.Once
@ -1091,6 +1105,8 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(DRAOperationsDuration) legacyregistry.MustRegister(DRAOperationsDuration)
legacyregistry.MustRegister(DRAGRPCOperationsDuration) legacyregistry.MustRegister(DRAGRPCOperationsDuration)
} }
legacyregistry.MustRegister(AdmissionRejectionsTotal)
}) })
} }

View File

@ -83,6 +83,10 @@ func (managerStub) ShutdownStatus() error {
} }
const ( const (
// NodeShutdownNotAdmittedReason is the rejection reason returned when a pod
// is not admitted because the node is shutting down.
NodeShutdownNotAdmittedReason = "NodeShutdown"
// nodeShutdownNotAdmittedMessage is the message that accompanies
// NodeShutdownNotAdmittedReason.
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
localStorageStateFile = "graceful_node_shutdown_state"
nodeShutdownReason = "Terminated" nodeShutdownReason = "Terminated"
nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown." nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown."
) )

View File

@ -40,10 +40,7 @@ import (
) )
const ( const (
nodeShutdownNotAdmittedReason = "NodeShutdown" dbusReconnectPeriod = 1 * time.Second
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
dbusReconnectPeriod = 1 * time.Second
localStorageStateFile = "graceful_node_shutdown_state"
) )
var systemDbus = func() (dbusInhibiter, error) { var systemDbus = func() (dbusInhibiter, error) {
@ -123,7 +120,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
if nodeShuttingDown { if nodeShuttingDown {
return lifecycle.PodAdmitResult{ return lifecycle.PodAdmitResult{
Admit: false, Admit: false,
Reason: nodeShutdownNotAdmittedReason, Reason: NodeShutdownNotAdmittedReason,
Message: nodeShutdownNotAdmittedMessage, Message: nodeShutdownNotAdmittedMessage,
} }
} }

View File

@ -51,12 +51,6 @@ const (
shutdownOrderStringValue = "PreshutdownOrder" shutdownOrderStringValue = "PreshutdownOrder"
) )
const (
nodeShutdownNotAdmittedReason = "NodeShutdown"
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
localStorageStateFile = "graceful_node_shutdown_state"
)
// managerImpl has functions that can be used to interact with the Node Shutdown Manager. // managerImpl has functions that can be used to interact with the Node Shutdown Manager.
type managerImpl struct { type managerImpl struct {
logger klog.Logger logger klog.Logger
@ -120,7 +114,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
if nodeShuttingDown { if nodeShuttingDown {
return lifecycle.PodAdmitResult{ return lifecycle.PodAdmitResult{
Admit: false, Admit: false,
Reason: nodeShutdownNotAdmittedReason, Reason: NodeShutdownNotAdmittedReason,
Message: nodeShutdownNotAdmittedMessage, Message: nodeShutdownNotAdmittedMessage,
} }
} }