fix gc metrics

Move the PodGC prometheus metrics into a dedicated metrics package, export the
two counters, label them by pod namespace and GC reason, and register them when
the controller is constructed instead of from a package init() hook.

carlory 2023-06-06 08:45:40 +08:00
parent 8ffbbe455b
commit 322da7c1aa
3 changed files with 55 additions and 25 deletions

View File

@@ -37,6 +37,7 @@ import (
     "k8s.io/client-go/tools/cache"
     "k8s.io/client-go/util/workqueue"
     "k8s.io/klog/v2"
+    "k8s.io/kubernetes/pkg/controller/podgc/metrics"
     "k8s.io/kubernetes/pkg/features"
     "k8s.io/kubernetes/pkg/kubelet/eviction"
     nodeutil "k8s.io/kubernetes/pkg/util/node"
@@ -69,11 +70,6 @@ type PodGCController struct {
     quarantineTime time.Duration
 }
 
-func init() {
-    // Register prometheus metrics
-    RegisterMetrics()
-}
-
 func NewPodGC(ctx context.Context, kubeClient clientset.Interface, podInformer coreinformers.PodInformer,
     nodeInformer coreinformers.NodeInformer, terminatedPodThreshold int) *PodGCController {
     return NewPodGCInternal(ctx, kubeClient, podInformer, nodeInformer, terminatedPodThreshold, gcCheckPeriod, quarantineTime)
@@ -94,6 +90,8 @@ func NewPodGCInternal(ctx context.Context, kubeClient clientset.Interface, podIn
         quarantineTime: quarantineTime,
     }
 
+    // Register prometheus metrics
+    metrics.RegisterMetrics()
     return gcc
 }
@@ -179,11 +177,11 @@ func (gcc *PodGCController) gcTerminating(ctx context.Context, pods []*v1.Pod) {
         wait.Add(1)
         go func(pod *v1.Pod) {
             defer wait.Done()
-            deletingPodsTotal.WithLabelValues().Inc()
+            metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc()
             if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
                 // ignore not founds
                 utilruntime.HandleError(err)
-                deletingPodsErrorTotal.WithLabelValues().Inc()
+                metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc()
             }
         }(terminatingPods[i])
     }
@@ -216,7 +214,9 @@ func (gcc *PodGCController) gcTerminated(ctx context.Context, pods []*v1.Pod) {
             if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
                 // ignore not founds
                 defer utilruntime.HandleError(err)
+                metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc()
             }
+            metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc()
         }(terminatedPods[i])
     }
     wait.Wait()
@@ -254,9 +254,11 @@ func (gcc *PodGCController) gcOrphaned(ctx context.Context, pods []*v1.Pod, node
             WithLastTransitionTime(metav1.Now())
         if err := gcc.markFailedAndDeletePodWithCondition(ctx, pod, condition); err != nil {
             utilruntime.HandleError(err)
+            metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc()
         } else {
             klog.InfoS("Forced deletion of orphaned Pod succeeded", "pod", klog.KObj(pod))
         }
+        metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc()
     }
 }
@@ -303,9 +305,11 @@ func (gcc *PodGCController) gcUnscheduledTerminating(ctx context.Context, pods [
         klog.V(2).InfoS("Found unscheduled terminating Pod not assigned to any Node, deleting.", "pod", klog.KObj(pod))
         if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
             utilruntime.HandleError(err)
+            metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc()
         } else {
             klog.InfoS("Forced deletion of unscheduled terminating Pod succeeded", "pod", klog.KObj(pod))
         }
+        metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc()
     }
 }
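Taken together, the controller-side changes swap the old unlabeled, package-local counters for the exported ones, so every GC path now records the pod's namespace and the reason it was force-deleted. A minimal sketch of the resulting pattern, assuming only the exported names visible in this diff (recordForcedDelete itself is a hypothetical helper, not part of the commit):

package podgcsketch

import "k8s.io/kubernetes/pkg/controller/podgc/metrics"

// recordForcedDelete illustrates the labeling scheme used above: every
// forced-deletion attempt bumps DeletingPodsTotal, and a failed attempt
// additionally bumps DeletingPodsErrorTotal, both partitioned by the
// pod's namespace and the GC reason ("terminated", "orphaned", ...).
func recordForcedDelete(namespace, reason string, err error) {
    metrics.DeletingPodsTotal.WithLabelValues(namespace, reason).Inc()
    if err != nil {
        metrics.DeletingPodsErrorTotal.WithLabelValues(namespace, reason).Inc()
    }
}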

View File

@@ -37,6 +37,7 @@ import (
     featuregatetesting "k8s.io/component-base/featuregate/testing"
     metricstestutil "k8s.io/component-base/metrics/testutil"
     "k8s.io/kubernetes/pkg/controller"
+    "k8s.io/kubernetes/pkg/controller/podgc/metrics"
     "k8s.io/kubernetes/pkg/controller/testutil"
     "k8s.io/kubernetes/pkg/features"
     "k8s.io/kubernetes/pkg/kubelet/eviction"
@@ -159,7 +160,7 @@ func TestGCTerminated(t *testing.T) {
             for _, pod := range test.pods {
                 creationTime = creationTime.Add(1 * time.Hour)
                 pods = append(pods, &v1.Pod{
-                    ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime}},
+                    ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime}},
                     Status:     v1.PodStatus{Phase: pod.phase, Reason: pod.reason},
                     Spec:       v1.PodSpec{NodeName: "node"},
                 })
@@ -175,12 +176,16 @@ func TestGCTerminated(t *testing.T) {
             verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
         })
     }
+
+    // deletingPodsTotal is 9 in this test
+    testDeletingPodsMetrics(t, 9, metrics.PodGCReasonTerminated)
 }
 
 func makePod(name string, nodeName string, phase v1.PodPhase) *v1.Pod {
     return &v1.Pod{
         ObjectMeta: metav1.ObjectMeta{
             Name: name,
+            Namespace: metav1.NamespaceDefault,
         },
         Spec:   v1.PodSpec{NodeName: nodeName},
         Status: v1.PodStatus{Phase: phase},
@@ -406,6 +411,9 @@ func TestGCOrphaned(t *testing.T) {
             verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
         })
     }
+
+    // deletingPodsTotal is 10 in this test
+    testDeletingPodsMetrics(t, 10, metrics.PodGCReasonOrphaned)
 }
 
 func TestGCUnscheduledTerminating(t *testing.T) {
@@ -463,7 +471,7 @@ func TestGCUnscheduledTerminating(t *testing.T) {
             for _, pod := range test.pods {
                 creationTime = creationTime.Add(1 * time.Hour)
                 pods = append(pods, &v1.Pod{
-                    ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime},
+                    ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime},
                         DeletionTimestamp: pod.deletionTimeStamp},
                     Status: v1.PodStatus{Phase: pod.phase},
                     Spec:   v1.PodSpec{NodeName: pod.nodeName},
@@ -486,6 +494,9 @@ func TestGCUnscheduledTerminating(t *testing.T) {
             verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
         })
     }
+
+    // deletingPodsTotal is 6 in this test
+    testDeletingPodsMetrics(t, 6, metrics.PodGCReasonTerminatingUnscheduled)
 }
 
 func TestGCTerminating(t *testing.T) {
@@ -633,7 +644,7 @@ func TestGCTerminating(t *testing.T) {
             for _, pod := range test.pods {
                 creationTime = creationTime.Add(1 * time.Hour)
                 pods = append(pods, &v1.Pod{
-                    ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime},
+                    ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime},
                         DeletionTimestamp: pod.deletionTimeStamp},
                     Status: v1.PodStatus{Phase: pod.phase},
                     Spec:   v1.PodSpec{NodeName: pod.nodeName},
@@ -653,8 +664,8 @@ func TestGCTerminating(t *testing.T) {
             verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
         })
     }
     // deletingPodsTotal is 7 in this test
-    testDeletingPodsMetrics(t, 7)
+    testDeletingPodsMetrics(t, 7, metrics.PodGCReasonTerminatingOutOfService)
 }
@@ -669,18 +680,18 @@ func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDelet
     }
 }
 
-func testDeletingPodsMetrics(t *testing.T, inputDeletingPodsTotal int) {
+func testDeletingPodsMetrics(t *testing.T, total int, reason string) {
     t.Helper()
 
-    actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsTotal.WithLabelValues())
+    actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsTotal.WithLabelValues(metav1.NamespaceDefault, reason))
     if err != nil {
         t.Errorf("Error getting actualDeletingPodsTotal")
     }
-    if actualDeletingPodsTotal != float64(inputDeletingPodsTotal) {
-        t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", inputDeletingPodsTotal, actualDeletingPodsTotal)
+    if actualDeletingPodsTotal != float64(total) {
+        t.Errorf("Expected deletingPodsTotal to be %d, got %v", total, actualDeletingPodsTotal)
     }
 
-    actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsErrorTotal.WithLabelValues())
+    actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsErrorTotal.WithLabelValues("", reason))
     if err != nil {
         t.Errorf("Error getting actualDeletingPodsErrorTotal")
     }
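Because the counters live in the process-global legacy registry, values accumulate across every case a test function runs, so each test asserts the cumulative total for its own reason label; using a distinct reason per test keeps the assertions independent. A self-contained sketch of reading a labeled counter back the way the updated helper does (the test name and logged value are illustrative only):

package podgc_test

import (
    "testing"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    metricstestutil "k8s.io/component-base/metrics/testutil"
    "k8s.io/kubernetes/pkg/controller/podgc/metrics"
)

func TestReadPodGCCounterSketch(t *testing.T) {
    metrics.RegisterMetrics() // idempotent; guarded by sync.Once

    // Read the cumulative count for one (namespace, reason) label pair.
    got, err := metricstestutil.GetCounterMetricValue(
        metrics.DeletingPodsTotal.WithLabelValues(metav1.NamespaceDefault, metrics.PodGCReasonOrphaned))
    if err != nil {
        t.Fatalf("reading force_delete_pods_total: %v", err)
    }
    t.Logf("cumulative orphaned force-deletions so far: %v", got)
}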

View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package podgc
+package metrics
 
 import (
     "sync"
@@ -28,32 +28,47 @@ const (
 )
 
 var (
-    deletingPodsTotal = metrics.NewCounterVec(
+    DeletingPodsTotal = metrics.NewCounterVec(
         &metrics.CounterOpts{
             Subsystem:      podGCController,
             Name:           "force_delete_pods_total",
             Help:           "Number of pods that are being forcefully deleted since the Pod GC Controller started.",
             StabilityLevel: metrics.ALPHA,
         },
-        []string{},
+        []string{"namespace", "reason"},
     )
-    deletingPodsErrorTotal = metrics.NewCounterVec(
+    DeletingPodsErrorTotal = metrics.NewCounterVec(
         &metrics.CounterOpts{
             Subsystem:      podGCController,
             Name:           "force_delete_pod_errors_total",
             Help:           "Number of errors encountered when forcefully deleting the pods since the Pod GC Controller started.",
             StabilityLevel: metrics.ALPHA,
         },
-        []string{},
+        []string{"namespace", "reason"},
    )
 )
 
+const (
+    // Possible values for the "reason" label in the above metrics.
+
+    // PodGCReasonTerminated is used when the pod is terminated.
+    PodGCReasonTerminated = "terminated"
+    // PodGCReasonTerminatingOutOfService is used when the pod is terminating and the corresponding node
+    // is not ready and has the `node.kubernetes.io/out-of-service` taint.
+    PodGCReasonTerminatingOutOfService = "out-of-service"
+    // PodGCReasonOrphaned is used when the pod is orphaned, meaning the corresponding node
+    // has been deleted.
+    PodGCReasonOrphaned = "orphaned"
+    // PodGCReasonTerminatingUnscheduled is used when the pod is terminating and was never scheduled to a node.
+    PodGCReasonTerminatingUnscheduled = "unscheduled"
+)
+
 var registerMetrics sync.Once
 
 // Register the metrics that are to be monitored.
 func RegisterMetrics() {
     registerMetrics.Do(func() {
-        legacyregistry.MustRegister(deletingPodsTotal)
-        legacyregistry.MustRegister(deletingPodsErrorTotal)
+        legacyregistry.MustRegister(DeletingPodsTotal)
+        legacyregistry.MustRegister(DeletingPodsErrorTotal)
     })
 }
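A note on the registration change: dropping the init() hook means importing the podgc package no longer mutates the global registry as a side effect; the counters are registered only when a controller is actually constructed, and the sync.Once guard makes the repeated construction common in tests safe. A small usage sketch under that reading (the main function is illustrative only):

package main

import "k8s.io/kubernetes/pkg/controller/podgc/metrics"

func main() {
    // Safe to call any number of times: only the first call registers the
    // counters; without the sync.Once guard a second MustRegister of the
    // same collectors would panic with a duplicate-registration error.
    metrics.RegisterMetrics()
    metrics.RegisterMetrics()

    metrics.DeletingPodsTotal.
        WithLabelValues("default", metrics.PodGCReasonTerminated).Inc()
}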