mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-29 14:37:00 +00:00
Introduce a metric to track kubelet admission failure.
This commit is contained in:
parent
aafcf4e932
commit
d4f05fdda5
@ -29,6 +29,7 @@ import (
|
||||
sysruntime "runtime"
|
||||
"slices"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
@ -81,6 +82,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/cloudresource"
|
||||
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||
"k8s.io/kubernetes/pkg/kubelet/config"
|
||||
"k8s.io/kubernetes/pkg/kubelet/configmap"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
@ -120,6 +122,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/volumemanager"
|
||||
"k8s.io/kubernetes/pkg/kubelet/watchdog"
|
||||
httpprobe "k8s.io/kubernetes/pkg/probe/http"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
|
||||
"k8s.io/kubernetes/pkg/security/apparmor"
|
||||
"k8s.io/kubernetes/pkg/util/oom"
|
||||
"k8s.io/kubernetes/pkg/volume"
|
||||
@ -220,6 +223,26 @@ var (
|
||||
// ContainerLogsDir can be overwritten for testing usage
|
||||
ContainerLogsDir = DefaultContainerLogsDir
|
||||
etcHostsPath = getContainerEtcHostsPath()
|
||||
|
||||
admissionRejectionReasons = sets.New[string](
|
||||
lifecycle.AppArmorNotAdmittedReason,
|
||||
lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
|
||||
lifecycle.PodOSNotSupported,
|
||||
lifecycle.InvalidNodeInfo,
|
||||
lifecycle.InitContainerRestartPolicyForbidden,
|
||||
lifecycle.UnexpectedAdmissionError,
|
||||
lifecycle.UnknownReason,
|
||||
lifecycle.UnexpectedPredicateFailureType,
|
||||
lifecycle.OutOfCPU,
|
||||
lifecycle.OutOfMemory,
|
||||
lifecycle.OutOfEphemeralStorage,
|
||||
lifecycle.OutOfPods,
|
||||
tainttoleration.ErrReasonNotMatch,
|
||||
eviction.Reason,
|
||||
sysctl.ForbiddenReason,
|
||||
topologymanager.ErrorTopologyAffinity,
|
||||
nodeshutdown.NodeShutdownNotAdmittedReason,
|
||||
)
|
||||
)
|
||||
|
||||
func getContainerEtcHostsPath() string {
|
||||
@ -2304,7 +2327,6 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
|
||||
attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods}
|
||||
for _, podAdmitHandler := range kl.admitHandlers {
|
||||
if result := podAdmitHandler.Admit(attrs); !result.Admit {
|
||||
|
||||
klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message)
|
||||
|
||||
return false, result.Reason, result.Message
|
||||
@ -2314,6 +2336,22 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
|
||||
return true, "", ""
|
||||
}
|
||||
|
||||
func recordAdmissionRejection(reason string) {
|
||||
// It is possible that the "reason" label can have high cardinality.
|
||||
// To avoid this metric from exploding, we create an allowlist of known
|
||||
// reasons, and only record reasons from this list. Use "Other" reason
|
||||
// for the rest.
|
||||
if admissionRejectionReasons.Has(reason) {
|
||||
metrics.AdmissionRejectionsTotal.WithLabelValues(reason).Inc()
|
||||
} else if strings.HasPrefix(reason, lifecycle.InsufficientResourcePrefix) {
|
||||
// non-extended resources (like cpu, memory, ephemeral-storage, pods)
|
||||
// are already included in admissionRejectionReasons.
|
||||
metrics.AdmissionRejectionsTotal.WithLabelValues("OutOfExtendedResources").Inc()
|
||||
} else {
|
||||
metrics.AdmissionRejectionsTotal.WithLabelValues("Other").Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// syncLoop is the main loop for processing changes. It watches for changes from
|
||||
// three channels (file, apiserver, and http) and creates a union of them. For
|
||||
// any new change seen, will run a sync against desired state and running state. If
|
||||
@ -2584,6 +2622,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
|
||||
// Check if we can admit the pod; if not, reject it.
|
||||
if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok {
|
||||
kl.rejectPod(pod, reason, message)
|
||||
// We avoid recording the metric in canAdmitPod because it's called
|
||||
// repeatedly during a resize, which would inflate the metric.
|
||||
// Instead, we record the metric here in HandlePodAdditions for new pods
|
||||
// and capture resize events separately.
|
||||
recordAdmissionRejection(reason)
|
||||
continue
|
||||
}
|
||||
// For new pod, checkpoint the resource values at which the Pod has been admitted
|
||||
@ -2595,6 +2638,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
|
||||
// Check if we can admit the pod; if not, reject it.
|
||||
if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok {
|
||||
kl.rejectPod(pod, reason, message)
|
||||
// We avoid recording the metric in canAdmitPod because it's called
|
||||
// repeatedly during a resize, which would inflate the metric.
|
||||
// Instead, we record the metric here in HandlePodAdditions for new pods
|
||||
// and capture resize events separately.
|
||||
recordAdmissionRejection(reason)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
@ -55,6 +55,7 @@ import (
|
||||
"k8s.io/client-go/tools/record"
|
||||
"k8s.io/client-go/util/flowcontrol"
|
||||
featuregatetesting "k8s.io/component-base/featuregate/testing"
|
||||
"k8s.io/component-base/metrics/testutil"
|
||||
internalapi "k8s.io/cri-api/pkg/apis"
|
||||
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
remote "k8s.io/cri-client/pkg"
|
||||
@ -66,6 +67,7 @@ import (
|
||||
cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing"
|
||||
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||
"k8s.io/kubernetes/pkg/kubelet/config"
|
||||
"k8s.io/kubernetes/pkg/kubelet/configmap"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
@ -75,6 +77,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/logs"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
"k8s.io/kubernetes/pkg/kubelet/network/dns"
|
||||
"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
|
||||
"k8s.io/kubernetes/pkg/kubelet/pleg"
|
||||
@ -90,12 +93,14 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/status"
|
||||
"k8s.io/kubernetes/pkg/kubelet/status/state"
|
||||
statustest "k8s.io/kubernetes/pkg/kubelet/status/testing"
|
||||
"k8s.io/kubernetes/pkg/kubelet/sysctl"
|
||||
"k8s.io/kubernetes/pkg/kubelet/token"
|
||||
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
|
||||
kubeletutil "k8s.io/kubernetes/pkg/kubelet/util"
|
||||
"k8s.io/kubernetes/pkg/kubelet/util/queue"
|
||||
kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager"
|
||||
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
|
||||
"k8s.io/kubernetes/pkg/util/oom"
|
||||
"k8s.io/kubernetes/pkg/volume"
|
||||
_ "k8s.io/kubernetes/pkg/volume/hostpath"
|
||||
@ -3460,3 +3465,200 @@ func TestIsPodResizeInProgress(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecordAdmissionRejection(t *testing.T) {
|
||||
metrics.Register()
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
reason string
|
||||
wants string
|
||||
}{
|
||||
{
|
||||
name: "AppArmor",
|
||||
reason: lifecycle.AppArmorNotAdmittedReason,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="AppArmor"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "PodOSSelectorNodeLabelDoesNotMatch",
|
||||
reason: lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="PodOSSelectorNodeLabelDoesNotMatch"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "PodOSNotSupported",
|
||||
reason: lifecycle.PodOSNotSupported,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="PodOSNotSupported"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "InvalidNodeInfo",
|
||||
reason: lifecycle.InvalidNodeInfo,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="InvalidNodeInfo"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "InitContainerRestartPolicyForbidden",
|
||||
reason: lifecycle.InitContainerRestartPolicyForbidden,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="InitContainerRestartPolicyForbidden"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "UnexpectedAdmissionError",
|
||||
reason: lifecycle.UnexpectedAdmissionError,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="UnexpectedAdmissionError"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "UnknownReason",
|
||||
reason: lifecycle.UnknownReason,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="UnknownReason"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "UnexpectedPredicateFailureType",
|
||||
reason: lifecycle.UnexpectedPredicateFailureType,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="UnexpectedPredicateFailureType"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "node(s) had taints that the pod didn't tolerate",
|
||||
reason: tainttoleration.ErrReasonNotMatch,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="node(s) had taints that the pod didn't tolerate"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "Evicted",
|
||||
reason: eviction.Reason,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="Evicted"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "SysctlForbidden",
|
||||
reason: sysctl.ForbiddenReason,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="SysctlForbidden"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "TopologyAffinityError",
|
||||
reason: topologymanager.ErrorTopologyAffinity,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="TopologyAffinityError"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "NodeShutdown",
|
||||
reason: nodeshutdown.NodeShutdownNotAdmittedReason,
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="NodeShutdown"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "OutOfcpu",
|
||||
reason: "OutOfcpu",
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="OutOfcpu"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "OutOfmemory",
|
||||
reason: "OutOfmemory",
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="OutOfmemory"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "OutOfephemeral-storage",
|
||||
reason: "OutOfephemeral-storage",
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="OutOfephemeral-storage"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "OutOfpods",
|
||||
reason: "OutOfpods",
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="OutOfpods"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "OutOfgpu",
|
||||
reason: "OutOfgpu",
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="OutOfExtendedResources"} 1
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "OtherReason",
|
||||
reason: "OtherReason",
|
||||
wants: `
|
||||
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
|
||||
# TYPE kubelet_admission_rejections_total counter
|
||||
kubelet_admission_rejections_total{reason="Other"} 1
|
||||
`,
|
||||
},
|
||||
}
|
||||
|
||||
// Run tests.
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// Clear the metrics after the test.
|
||||
metrics.AdmissionRejectionsTotal.Reset()
|
||||
|
||||
// Call the function.
|
||||
recordAdmissionRejection(tc.reason)
|
||||
|
||||
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(tc.wants), "kubelet_admission_rejections_total"); err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -44,6 +44,8 @@ import (
|
||||
|
||||
const (
|
||||
maxRespBodyLength = 10 * 1 << 10 // 10KB
|
||||
|
||||
AppArmorNotAdmittedReason = "AppArmor"
|
||||
)
|
||||
|
||||
type handlerRunner struct {
|
||||
@ -224,7 +226,7 @@ func (a *appArmorAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult {
|
||||
}
|
||||
return PodAdmitResult{
|
||||
Admit: false,
|
||||
Reason: "AppArmor",
|
||||
Reason: AppArmorNotAdmittedReason,
|
||||
Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err),
|
||||
}
|
||||
}
|
||||
|
@ -32,6 +32,52 @@ import (
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
|
||||
)
|
||||
|
||||
const (
|
||||
// PodOSSelectorNodeLabelDoesNotMatch is used to denote that the pod was
|
||||
// rejected admission to the node because the pod's node selector
|
||||
// corresponding to kubernetes.io/os label didn't match the node label.
|
||||
PodOSSelectorNodeLabelDoesNotMatch = "PodOSSelectorNodeLabelDoesNotMatch"
|
||||
|
||||
// PodOSNotSupported is used to denote that the pod was rejected admission
|
||||
// to the node because the pod's OS field didn't match the node OS.
|
||||
PodOSNotSupported = "PodOSNotSupported"
|
||||
|
||||
// InvalidNodeInfo is used to denote that the pod was rejected admission
|
||||
// to the node because the kubelet was unable to retrieve the node info.
|
||||
InvalidNodeInfo = "InvalidNodeInfo"
|
||||
|
||||
// InitContainerRestartPolicyForbidden is used to denote that the pod was
|
||||
// rejected admission to the node because it uses a restart policy other
|
||||
// than Always for some of its init containers.
|
||||
InitContainerRestartPolicyForbidden = "InitContainerRestartPolicyForbidden"
|
||||
|
||||
// UnexpectedAdmissionError is used to denote that the pod was rejected
|
||||
// admission to the node because of an error during admission that could not
|
||||
// be categorized.
|
||||
UnexpectedAdmissionError = "UnexpectedAdmissionError"
|
||||
|
||||
// UnknownReason is used to denote that the pod was rejected admission to
|
||||
// the node because a predicate failed for a reason that could not be
|
||||
// determined.
|
||||
UnknownReason = "UnknownReason"
|
||||
|
||||
// UnexpectedPredicateFailureType is used to denote that the pod was
|
||||
// rejected admission to the node because a predicate returned a reason
|
||||
// object that was not an InsufficientResourceError or a PredicateFailureError.
|
||||
UnexpectedPredicateFailureType = "UnexpectedPredicateFailureType"
|
||||
|
||||
// Prefix for admission reason when kubelet rejects a pod due to insufficient
|
||||
// resources available.
|
||||
InsufficientResourcePrefix = "OutOf"
|
||||
|
||||
// These reasons are used to denote that the pod was rejected admission
|
||||
// to the node because there's not enough resources to run the pod.
|
||||
OutOfCPU = "OutOfcpu"
|
||||
OutOfMemory = "OutOfmemory"
|
||||
OutOfEphemeralStorage = "OutOfephemeral-storage"
|
||||
OutOfPods = "OutOfpods"
|
||||
)
|
||||
|
||||
type getNodeAnyWayFuncType func() (*v1.Node, error)
|
||||
|
||||
type pluginResourceUpdateFuncType func(*schedulerframework.NodeInfo, *PodAdmitAttributes) error
|
||||
@ -66,7 +112,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
||||
klog.ErrorS(err, "Cannot get Node info")
|
||||
return PodAdmitResult{
|
||||
Admit: false,
|
||||
Reason: "InvalidNodeInfo",
|
||||
Reason: InvalidNodeInfo,
|
||||
Message: "Kubelet cannot get node info.",
|
||||
}
|
||||
}
|
||||
@ -76,14 +122,14 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
||||
if rejectPodAdmissionBasedOnOSSelector(admitPod, node) {
|
||||
return PodAdmitResult{
|
||||
Admit: false,
|
||||
Reason: "PodOSSelectorNodeLabelDoesNotMatch",
|
||||
Reason: PodOSSelectorNodeLabelDoesNotMatch,
|
||||
Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label",
|
||||
}
|
||||
}
|
||||
if rejectPodAdmissionBasedOnOSField(admitPod) {
|
||||
return PodAdmitResult{
|
||||
Admit: false,
|
||||
Reason: "PodOSNotSupported",
|
||||
Reason: PodOSNotSupported,
|
||||
Message: "Failed to admit pod as the OS field doesn't match node OS",
|
||||
}
|
||||
}
|
||||
@ -100,7 +146,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
||||
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
|
||||
return PodAdmitResult{
|
||||
Admit: false,
|
||||
Reason: "InitContainerRestartPolicyForbidden",
|
||||
Reason: InitContainerRestartPolicyForbidden,
|
||||
Message: message,
|
||||
}
|
||||
}
|
||||
@ -113,7 +159,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
||||
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
|
||||
return PodAdmitResult{
|
||||
Admit: false,
|
||||
Reason: "UnexpectedAdmissionError",
|
||||
Reason: UnexpectedAdmissionError,
|
||||
Message: message,
|
||||
}
|
||||
}
|
||||
@ -138,7 +184,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
||||
klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err)
|
||||
return PodAdmitResult{
|
||||
Admit: fit,
|
||||
Reason: "UnexpectedAdmissionError",
|
||||
Reason: UnexpectedAdmissionError,
|
||||
Message: message,
|
||||
}
|
||||
}
|
||||
@ -151,7 +197,7 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
||||
klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod))
|
||||
return PodAdmitResult{
|
||||
Admit: fit,
|
||||
Reason: "UnknownReason",
|
||||
Reason: UnknownReason,
|
||||
Message: message,
|
||||
}
|
||||
}
|
||||
@ -163,11 +209,22 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
|
||||
message = re.Error()
|
||||
klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)
|
||||
case *InsufficientResourceError:
|
||||
reason = fmt.Sprintf("OutOf%s", re.ResourceName)
|
||||
switch re.ResourceName {
|
||||
case v1.ResourceCPU:
|
||||
reason = OutOfCPU
|
||||
case v1.ResourceMemory:
|
||||
reason = OutOfMemory
|
||||
case v1.ResourceEphemeralStorage:
|
||||
reason = OutOfEphemeralStorage
|
||||
case v1.ResourcePods:
|
||||
reason = OutOfPods
|
||||
default:
|
||||
reason = fmt.Sprintf("%s%s", InsufficientResourcePrefix, re.ResourceName)
|
||||
}
|
||||
message = re.Error()
|
||||
klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)
|
||||
default:
|
||||
reason = "UnexpectedPredicateFailureType"
|
||||
reason = UnexpectedPredicateFailureType
|
||||
message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r)
|
||||
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message)
|
||||
}
|
||||
|
@ -149,6 +149,9 @@ const (
|
||||
|
||||
AlignedPhysicalCPU = "physical_cpu"
|
||||
AlignedNUMANode = "numa_node"
|
||||
|
||||
// Metrics to track kubelet admission rejections.
|
||||
AdmissionRejectionsTotalKey = "admission_rejections_total"
|
||||
)
|
||||
|
||||
type imageSizeBucket struct {
|
||||
@ -994,6 +997,17 @@ var (
|
||||
},
|
||||
[]string{"driver_name", "method_name", "grpc_status_code"},
|
||||
)
|
||||
|
||||
// AdmissionRejectionsTotal tracks the number of pod admission rejections. Currently it is only recorded for pod additions.
|
||||
AdmissionRejectionsTotal = metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: AdmissionRejectionsTotalKey,
|
||||
Help: "Cumulative number pod admission rejections by the Kubelet.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
[]string{"reason"},
|
||||
)
|
||||
)
|
||||
|
||||
var registerMetrics sync.Once
|
||||
@ -1091,6 +1105,8 @@ func Register(collectors ...metrics.StableCollector) {
|
||||
legacyregistry.MustRegister(DRAOperationsDuration)
|
||||
legacyregistry.MustRegister(DRAGRPCOperationsDuration)
|
||||
}
|
||||
|
||||
legacyregistry.MustRegister(AdmissionRejectionsTotal)
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -83,6 +83,10 @@ func (managerStub) ShutdownStatus() error {
|
||||
}
|
||||
|
||||
const (
|
||||
NodeShutdownNotAdmittedReason = "NodeShutdown"
|
||||
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
|
||||
localStorageStateFile = "graceful_node_shutdown_state"
|
||||
|
||||
nodeShutdownReason = "Terminated"
|
||||
nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown."
|
||||
)
|
||||
|
@ -40,10 +40,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
nodeShutdownNotAdmittedReason = "NodeShutdown"
|
||||
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
|
||||
dbusReconnectPeriod = 1 * time.Second
|
||||
localStorageStateFile = "graceful_node_shutdown_state"
|
||||
dbusReconnectPeriod = 1 * time.Second
|
||||
)
|
||||
|
||||
var systemDbus = func() (dbusInhibiter, error) {
|
||||
@ -123,7 +120,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
|
||||
if nodeShuttingDown {
|
||||
return lifecycle.PodAdmitResult{
|
||||
Admit: false,
|
||||
Reason: nodeShutdownNotAdmittedReason,
|
||||
Reason: NodeShutdownNotAdmittedReason,
|
||||
Message: nodeShutdownNotAdmittedMessage,
|
||||
}
|
||||
}
|
||||
|
@ -51,12 +51,6 @@ const (
|
||||
shutdownOrderStringValue = "PreshutdownOrder"
|
||||
)
|
||||
|
||||
const (
|
||||
nodeShutdownNotAdmittedReason = "NodeShutdown"
|
||||
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
|
||||
localStorageStateFile = "graceful_node_shutdown_state"
|
||||
)
|
||||
|
||||
// managerImpl has functions that can be used to interact with the Node Shutdown Manager.
|
||||
type managerImpl struct {
|
||||
logger klog.Logger
|
||||
@ -120,7 +114,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
|
||||
if nodeShuttingDown {
|
||||
return lifecycle.PodAdmitResult{
|
||||
Admit: false,
|
||||
Reason: nodeShutdownNotAdmittedReason,
|
||||
Reason: NodeShutdownNotAdmittedReason,
|
||||
Message: nodeShutdownNotAdmittedMessage,
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user