Graduate kubelet resource metrics to GA

Richa Banker 2023-03-23 15:15:11 -07:00
parent 93bf570d46
commit 4712025ea8
5 changed files with 192 additions and 25 deletions


@@ -31,14 +31,14 @@ var (
"Cumulative cpu time consumed by the node in core-seconds",
nil,
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
nodeMemoryUsageDesc = metrics.NewDesc("node_memory_working_set_bytes",
"Current working set of the node in bytes",
nil,
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
nodeSwapUsageDesc = metrics.NewDesc("node_swap_usage_bytes",
@@ -52,14 +52,14 @@ var (
"Cumulative cpu time consumed by the container in core-seconds",
[]string{"container", "pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
containerMemoryUsageDesc = metrics.NewDesc("container_memory_working_set_bytes",
"Current working set of the container in bytes",
[]string{"container", "pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
containerSwapUsageDesc = metrics.NewDesc("container_swap_usage_bytes",
@@ -73,14 +73,14 @@ var (
"Cumulative cpu time consumed by the pod in core-seconds",
[]string{"pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
podMemoryUsageDesc = metrics.NewDesc("pod_memory_working_set_bytes",
"Current working set of the pod in bytes",
[]string{"pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
podSwapUsageDesc = metrics.NewDesc("pod_swap_usage_bytes",
@@ -95,13 +95,20 @@ var (
nil,
nil,
metrics.ALPHA,
-"")
+"1.29.0")
+resourceScrapeErrorResultDesc = metrics.NewDesc("resource_scrape_error",
+"1 if there was an error while getting container metrics, 0 otherwise",
+nil,
+nil,
+metrics.STABLE,
+"")
containerStartTimeDesc = metrics.NewDesc("container_start_time_seconds",
"Start time of the container since unix epoch in seconds",
[]string{"container", "pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
)
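The ALPHA-to-STABLE flips above are what "graduating to GA" means mechanically: kubelet metrics are declared through k8s.io/component-base/metrics, and each descriptor carries an explicit stability level. A STABLE descriptor falls under the metrics stability guarantees (no renames, label changes, or removal without a deprecation cycle) and must appear in the stable-metrics list updated in the last file of this diff. A minimal sketch of the declaration pattern, using a hypothetical example_metric name:

package metricsexample

import "k8s.io/component-base/metrics"

// exampleDesc mirrors the descriptors above. The final argument is the
// version the metric was deprecated in; "" means not deprecated, which is
// why scrape_error gains "1.29.0" while the graduated metrics keep "".
var exampleDesc = metrics.NewDesc(
	"example_metric",             // fully-qualified metric name (hypothetical)
	"Help text for the metric",   // help string; the registry prefixes it with [STABLE]
	[]string{"pod", "namespace"}, // variable label names
	nil,                          // const labels
	metrics.STABLE,               // stability level (these were metrics.ALPHA before this commit)
	"",                           // deprecated-since version
)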
@@ -134,6 +141,7 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
ch <- podMemoryUsageDesc
ch <- podSwapUsageDesc
ch <- resourceScrapeResultDesc
+ch <- resourceScrapeErrorResultDesc
}
// CollectWithStability implements metrics.StableCollector
@@ -145,6 +153,7 @@ func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metric) {
var errorCount float64
defer func() {
ch <- metrics.NewLazyConstMetric(resourceScrapeResultDesc, metrics.GaugeValue, errorCount)
+ch <- metrics.NewLazyConstMetric(resourceScrapeErrorResultDesc, metrics.GaugeValue, errorCount)
}()
statsSummary, err := rc.provider.GetCPUAndMemoryStats(ctx)
if err != nil {
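The deferred send above guarantees that the deprecated scrape_error gauge and its stable replacement resource_scrape_error are both emitted on every scrape with the same errorCount, so dashboards and alerts can migrate before the old series is removed. A self-contained sketch of this StableCollector pattern, with a hypothetical example_scrape_error metric and a scrape func standing in for the kubelet's stats provider:

package metricsexample

import "k8s.io/component-base/metrics"

var errDesc = metrics.NewDesc("example_scrape_error",
	"1 if there was an error while scraping, 0 otherwise",
	nil, nil, metrics.STABLE, "")

type exampleCollector struct {
	metrics.BaseStableCollector
	scrape func() error // stand-in for provider.GetCPUAndMemoryStats
}

func (c *exampleCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
	ch <- errDesc
}

// CollectWithStability mirrors the kubelet collector: errorCount is settled
// by the time the deferred send runs, so the gauge is emitted exactly once
// per scrape whether or not the provider call failed.
func (c *exampleCollector) CollectWithStability(ch chan<- metrics.Metric) {
	var errorCount float64
	defer func() {
		ch <- metrics.NewLazyConstMetric(errDesc, metrics.GaugeValue, errorCount)
	}()
	if err := c.scrape(); err != nil {
		errorCount = 1
	}
}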


@@ -36,6 +36,7 @@ func TestCollectResourceMetrics(t *testing.T) {
testTime := metav1.NewTime(staticTimestamp)
interestedMetrics := []string{
"scrape_error",
+"resource_scrape_error",
"node_cpu_usage_seconds_total",
"node_memory_working_set_bytes",
"node_swap_usage_bytes",
@@ -64,6 +65,9 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 1
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 1
`,
},
{
@@ -86,10 +90,10 @@ func TestCollectResourceMetrics(t *testing.T) {
},
summaryErr: nil,
expectedMetrics: `
-# HELP node_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the node in core-seconds
+# HELP node_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the node in core-seconds
# TYPE node_cpu_usage_seconds_total counter
node_cpu_usage_seconds_total 10 1624396278302
-# HELP node_memory_working_set_bytes [ALPHA] Current working set of the node in bytes
+# HELP node_memory_working_set_bytes [STABLE] Current working set of the node in bytes
# TYPE node_memory_working_set_bytes gauge
node_memory_working_set_bytes 1000 1624396278302
# HELP node_swap_usage_bytes [ALPHA] Current swap usage of the node in bytes. Reported only on non-windows systems
@@ -98,6 +102,9 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
`,
},
{
@@ -119,6 +126,9 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
`,
},
{
@@ -188,17 +198,20 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
-# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
+# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
# TYPE container_cpu_usage_seconds_total counter
container_cpu_usage_seconds_total{container="container_a",namespace="namespace_a",pod="pod_a"} 10 1624396278302
container_cpu_usage_seconds_total{container="container_a",namespace="namespace_b",pod="pod_b"} 10 1624396278302
container_cpu_usage_seconds_total{container="container_b",namespace="namespace_a",pod="pod_a"} 10 1624396278302
-# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
# TYPE container_memory_working_set_bytes gauge
container_memory_working_set_bytes{container="container_a",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
container_memory_working_set_bytes{container="container_a",namespace="namespace_b",pod="pod_b"} 1000 1624396278302
container_memory_working_set_bytes{container="container_b",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
-# HELP container_start_time_seconds [ALPHA] Start time of the container since unix epoch in seconds
+# HELP container_start_time_seconds [STABLE] Start time of the container since unix epoch in seconds
# TYPE container_start_time_seconds gauge
container_start_time_seconds{container="container_a",namespace="namespace_a",pod="pod_a"} 1.6243962483020916e+09 1624396248302
container_start_time_seconds{container="container_a",namespace="namespace_b",pod="pod_b"} 1.6243956783020916e+09 1624395678302
@@ -239,10 +252,13 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
-# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
+# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
# TYPE container_cpu_usage_seconds_total counter
container_cpu_usage_seconds_total{container="container_a",namespace="namespace_a",pod="pod_a"} 10 1624396278302
-# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
# TYPE container_memory_working_set_bytes gauge
container_memory_working_set_bytes{container="container_a",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
`,
@@ -295,19 +311,22 @@ func TestCollectResourceMetrics(t *testing.T) {
},
summaryErr: nil,
expectedMetrics: `
-# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
# TYPE container_cpu_usage_seconds_total counter
container_cpu_usage_seconds_total{container="container_a",namespace="namespace_b",pod="pod_b"} 10 1624396278302
-# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
# TYPE container_memory_working_set_bytes gauge
container_memory_working_set_bytes{container="container_a",namespace="namespace_b",pod="pod_b"} 1000 1624396278302
-# HELP container_start_time_seconds [ALPHA] Start time of the container since unix epoch in seconds
+# HELP container_start_time_seconds [STABLE] Start time of the container since unix epoch in seconds
# TYPE container_start_time_seconds gauge
container_start_time_seconds{container="container_a",namespace="namespace_a",pod="pod_a"} 1.6243962483020916e+09 1624396248302
container_start_time_seconds{container="container_a",namespace="namespace_b",pod="pod_b"} 1.6243956783020916e+09 1624395678302
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
`,
},
{
@@ -339,10 +358,13 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
-# HELP pod_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the pod in core-seconds
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
+# HELP pod_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the pod in core-seconds
# TYPE pod_cpu_usage_seconds_total counter
pod_cpu_usage_seconds_total{namespace="namespace_a",pod="pod_a"} 10 1624396278302
-# HELP pod_memory_working_set_bytes [ALPHA] Current working set of the pod in bytes
+# HELP pod_memory_working_set_bytes [STABLE] Current working set of the pod in bytes
# TYPE pod_memory_working_set_bytes gauge
pod_memory_working_set_bytes{namespace="namespace_a",pod="pod_a"} 1000 1624396278302
# HELP pod_swap_usage_bytes [ALPHA] Current amount of the pod swap usage in bytes. Reported only on non-windows systems
@@ -375,6 +397,9 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
`,
},
}
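Each expectedMetrics block above is Prometheus exposition text compared against what the collector actually produces, restricted to the names in interestedMetrics. A minimal sketch of that flow with component-base's testutil, reusing the hypothetical exampleCollector from the earlier sketch:

package metricsexample

import (
	"strings"
	"testing"

	"k8s.io/component-base/metrics/testutil"
)

// TestExampleCollector shows the comparison pattern: CustomCollectAndCompare
// registers the StableCollector, scrapes it, and diffs the exposition-format
// output against the expected text for the listed metric names. Note the
// [STABLE] tag that the stability framework injects into the HELP line.
func TestExampleCollector(t *testing.T) {
	collector := &exampleCollector{scrape: func() error { return nil }}
	expected := `
# HELP example_scrape_error [STABLE] 1 if there was an error while scraping, 0 otherwise
# TYPE example_scrape_error gauge
example_scrape_error 0
`
	if err := testutil.CustomCollectAndCompare(collector, strings.NewReader(expected), "example_scrape_error"); err != nil {
		t.Fatal(err)
	}
}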


@@ -181,21 +181,34 @@ func (g *Grabber) GrabFromKubelet(ctx context.Context, nodeName string) (KubeletMetrics, error) {
return KubeletMetrics{}, fmt.Errorf("Error listing nodes with name %v, got %v", nodeName, nodes.Items)
}
kubeletPort := nodes.Items[0].Status.DaemonEndpoints.KubeletEndpoint.Port
-return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort))
+return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort), "metrics")
}
-func (g *Grabber) grabFromKubeletInternal(ctx context.Context, nodeName string, kubeletPort int) (KubeletMetrics, error) {
+// GrabResourceMetricsFromKubelet returns resource metrics from the kubelet
+func (g *Grabber) GrabResourceMetricsFromKubelet(ctx context.Context, nodeName string) (KubeletMetrics, error) {
+nodes, err := g.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{"metadata.name": nodeName}.AsSelector().String()})
+if err != nil {
+return KubeletMetrics{}, err
+}
+if len(nodes.Items) != 1 {
+return KubeletMetrics{}, fmt.Errorf("Error listing nodes with name %v, got %v", nodeName, nodes.Items)
+}
+kubeletPort := nodes.Items[0].Status.DaemonEndpoints.KubeletEndpoint.Port
+return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort), "metrics/resource")
+}
+
+func (g *Grabber) grabFromKubeletInternal(ctx context.Context, nodeName string, kubeletPort int, pathSuffix string) (KubeletMetrics, error) {
if kubeletPort <= 0 || kubeletPort > 65535 {
return KubeletMetrics{}, fmt.Errorf("Invalid Kubelet port %v. Skipping Kubelet's metrics gathering", kubeletPort)
}
-output, err := g.getMetricsFromNode(ctx, nodeName, int(kubeletPort))
+output, err := g.getMetricsFromNode(ctx, nodeName, int(kubeletPort), pathSuffix)
if err != nil {
return KubeletMetrics{}, err
}
return parseKubeletMetrics(output)
}
-func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int) (string, error) {
+func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int, pathSuffix string) (string, error) {
// There's a problem with timing out during proxy. Wrapping this in a goroutine to prevent deadlock.
finished := make(chan struct{}, 1)
var err error
@@ -205,7 +218,7 @@ func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int) (string, error) {
Resource("nodes").
SubResource("proxy").
Name(fmt.Sprintf("%v:%v", nodeName, kubeletPort)).
-Suffix("metrics").
+Suffix(pathSuffix).
Do(ctx).Raw()
finished <- struct{}{}
}()
@@ -432,7 +445,7 @@ func (g *Grabber) Grab(ctx context.Context) (Collection, error) {
} else {
for _, node := range nodes.Items {
kubeletPort := node.Status.DaemonEndpoints.KubeletEndpoint.Port
-metrics, err := g.grabFromKubeletInternal(ctx, node.Name, int(kubeletPort))
+metrics, err := g.grabFromKubeletInternal(ctx, node.Name, int(kubeletPort), "metrics")
if err != nil {
if err != nil {
errs = append(errs, err)
}
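grabFromKubeletInternal never talks to the kubelet directly; the request is proxied through the API server as GET /api/v1/nodes/<name>:<port>/proxy/<pathSuffix>. A standalone sketch of the equivalent client-go call for the new resource endpoint; the kubeconfig path and node name are placeholders, and 10250 is the kubelet's default port:

package main

import (
	"context"
	"fmt"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Placeholder kubeconfig path; in-cluster config would work equally well.
	config, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	// Equivalent of grabFromKubeletInternal(ctx, "node-1", 10250, "metrics/resource"):
	// GET /api/v1/nodes/node-1:10250/proxy/metrics/resource via the API server proxy.
	raw, err := client.CoreV1().RESTClient().Get().
		Resource("nodes").
		SubResource("proxy").
		Name(fmt.Sprintf("%v:%v", "node-1", 10250)).
		Suffix("metrics/resource").
		Do(context.TODO()).Raw()
	if err != nil {
		panic(err)
	}
	fmt.Println(string(raw))
}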


@@ -0,0 +1,70 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package instrumentation
import (
"context"
"errors"
"time"
"github.com/onsi/gomega"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
"k8s.io/kubernetes/test/e2e/instrumentation/common"
admissionapi "k8s.io/pod-security-admission/api"
"github.com/onsi/ginkgo/v2"
)
var _ = common.SIGDescribe("Metrics", func() {
f := framework.NewDefaultFramework("metrics")
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
var c, ec clientset.Interface
var grabber *e2emetrics.Grabber
ginkgo.BeforeEach(func(ctx context.Context) {
var err error
c = f.ClientSet
ec = f.KubemarkExternalClusterClientSet
gomega.Eventually(ctx, func() error {
grabber, err = e2emetrics.NewMetricsGrabber(ctx, c, ec, f.ClientConfig(), true, true, true, true, true, true)
if err != nil {
framework.ExpectNoError(err, "failed to create metrics grabber")
}
return nil
}, 5*time.Minute, 10*time.Second).Should(gomega.BeNil())
})
/*
Release: v1.29
Testname: Kubelet resource metrics
Description: Should attempt to grab all resource metrics from kubelet metrics/resource endpoint.
*/
ginkgo.It("should grab all metrics from kubelet /metrics/resource endpoint", func(ctx context.Context) {
ginkgo.By("Connecting to kubelet's /metrics/resource endpoint")
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
if errors.Is(err, e2emetrics.MetricsGrabbingDisabledError) {
e2eskipper.Skipf("%v", err)
}
framework.ExpectNoError(err)
response, err := grabber.GrabResourceMetricsFromKubelet(ctx, node.Name)
framework.ExpectNoError(err)
gomega.Expect(response).NotTo(gomega.BeEmpty())
})
})
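The response returned by GrabResourceMetricsFromKubelet behaves as a map from metric name to parsed samples, so the NotTo(BeEmpty()) check could in principle be tightened to assert the graduated series individually. A hypothetical helper, not part of this commit, sketching that:

package metricsexample

import (
	"fmt"

	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
)

// requireStableSeries returns an error unless every named series appears in
// the scraped output, e.g. requireStableSeries(response,
// "node_cpu_usage_seconds_total", "resource_scrape_error").
func requireStableSeries(m e2emetrics.KubeletMetrics, names ...string) error {
	for _, name := range names {
		if _, ok := m[name]; !ok {
			return fmt.Errorf("stable metric %q missing from /metrics/resource output", name)
		}
	}
	return nil
}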


@@ -74,6 +74,56 @@
  stabilityLevel: STABLE
  labels:
  - zone
+- name: container_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the container in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: container_memory_working_set_bytes
+  help: Current working set of the container in bytes
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: container_start_time_seconds
+  help: Start time of the container since unix epoch in seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: node_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the node in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+- name: node_memory_working_set_bytes
+  help: Current working set of the node in bytes
+  type: Custom
+  stabilityLevel: STABLE
+- name: pod_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the pod in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - pod
+  - namespace
+- name: pod_memory_working_set_bytes
+  help: Current working set of the pod in bytes
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - pod
+  - namespace
+- name: resource_scrape_error
+  help: 1 if there was an error while getting container metrics, 0 otherwise
+  type: Custom
+  stabilityLevel: STABLE
- name: pod_scheduling_sli_duration_seconds
  subsystem: scheduler
  help: E2e latency for a pod being scheduled, from the time the pod enters the scheduling