diff --git a/pkg/kubelet/metrics/collectors/resource_metrics.go b/pkg/kubelet/metrics/collectors/resource_metrics.go
index 1b80b29c96a..36d29be4f6f 100644
--- a/pkg/kubelet/metrics/collectors/resource_metrics.go
+++ b/pkg/kubelet/metrics/collectors/resource_metrics.go
@@ -31,14 +31,14 @@ var (
 		"Cumulative cpu time consumed by the node in core-seconds",
 		nil,
 		nil,
-		metrics.ALPHA,
+		metrics.STABLE,
 		"")
 
 	nodeMemoryUsageDesc = metrics.NewDesc("node_memory_working_set_bytes",
 		"Current working set of the node in bytes",
 		nil,
 		nil,
-		metrics.ALPHA,
+		metrics.STABLE,
 		"")
 
 	nodeSwapUsageDesc = metrics.NewDesc("node_swap_usage_bytes",
@@ -52,14 +52,14 @@ var (
 		"Cumulative cpu time consumed by the container in core-seconds",
 		[]string{"container", "pod", "namespace"},
 		nil,
-		metrics.ALPHA,
+		metrics.STABLE,
 		"")
 
 	containerMemoryUsageDesc = metrics.NewDesc("container_memory_working_set_bytes",
 		"Current working set of the container in bytes",
 		[]string{"container", "pod", "namespace"},
 		nil,
-		metrics.ALPHA,
+		metrics.STABLE,
 		"")
 
 	containerSwapUsageDesc = metrics.NewDesc("container_swap_usage_bytes",
@@ -73,14 +73,14 @@ var (
 		"Cumulative cpu time consumed by the pod in core-seconds",
 		[]string{"pod", "namespace"},
 		nil,
-		metrics.ALPHA,
+		metrics.STABLE,
 		"")
 
 	podMemoryUsageDesc = metrics.NewDesc("pod_memory_working_set_bytes",
 		"Current working set of the pod in bytes",
 		[]string{"pod", "namespace"},
 		nil,
-		metrics.ALPHA,
+		metrics.STABLE,
 		"")
 
 	podSwapUsageDesc = metrics.NewDesc("pod_swap_usage_bytes",
@@ -95,13 +95,20 @@ var (
 		nil,
 		nil,
 		metrics.ALPHA,
+		"1.29.0")
+
+	resourceScrapeErrorResultDesc = metrics.NewDesc("resource_scrape_error",
+		"1 if there was an error while getting container metrics, 0 otherwise",
+		nil,
+		nil,
+		metrics.STABLE,
 		"")
 
 	containerStartTimeDesc = metrics.NewDesc("container_start_time_seconds",
 		"Start time of the container since unix epoch in seconds",
 		[]string{"container", "pod", "namespace"},
 		nil,
-		metrics.ALPHA,
+		metrics.STABLE,
 		"")
 )
 
@@ -134,6 +141,7 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Des
 	ch <- podMemoryUsageDesc
 	ch <- podSwapUsageDesc
 	ch <- resourceScrapeResultDesc
+	ch <- resourceScrapeErrorResultDesc
 }
 
 // CollectWithStability implements metrics.StableCollector
@@ -145,6 +153,7 @@ func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metri
 	var errorCount float64
 	defer func() {
 		ch <- metrics.NewLazyConstMetric(resourceScrapeResultDesc, metrics.GaugeValue, errorCount)
+		ch <- metrics.NewLazyConstMetric(resourceScrapeErrorResultDesc, metrics.GaugeValue, errorCount)
 	}()
 	statsSummary, err := rc.provider.GetCPUAndMemoryStats(ctx)
 	if err != nil {
diff --git a/pkg/kubelet/metrics/collectors/resource_metrics_test.go b/pkg/kubelet/metrics/collectors/resource_metrics_test.go
index b4e36339d7d..f2bf39433c5 100644
--- a/pkg/kubelet/metrics/collectors/resource_metrics_test.go
+++ b/pkg/kubelet/metrics/collectors/resource_metrics_test.go
@@ -36,6 +36,7 @@ func TestCollectResourceMetrics(t *testing.T) {
 	testTime := metav1.NewTime(staticTimestamp)
 	interestedMetrics := []string{
 		"scrape_error",
+		"resource_scrape_error",
 		"node_cpu_usage_seconds_total",
 		"node_memory_working_set_bytes",
 		"node_swap_usage_bytes",
@@ -64,6 +65,9 @@ func TestCollectResourceMetrics(t *testing.T) {
 			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
 			# TYPE scrape_error gauge
 			scrape_error 1
+			# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE resource_scrape_error gauge
+			resource_scrape_error 1
 			`,
 		},
 		{
@@ -86,10 +90,10 @@ func TestCollectResourceMetrics(t *testing.T) {
 			},
 			summaryErr: nil,
 			expectedMetrics: `
-			# HELP node_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the node in core-seconds
+			# HELP node_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the node in core-seconds
 			# TYPE node_cpu_usage_seconds_total counter
 			node_cpu_usage_seconds_total 10 1624396278302
-			# HELP node_memory_working_set_bytes [ALPHA] Current working set of the node in bytes
+			# HELP node_memory_working_set_bytes [STABLE] Current working set of the node in bytes
 			# TYPE node_memory_working_set_bytes gauge
 			node_memory_working_set_bytes 1000 1624396278302
 			# HELP node_swap_usage_bytes [ALPHA] Current swap usage of the node in bytes. Reported only on non-windows systems
@@ -98,6 +102,9 @@ func TestCollectResourceMetrics(t *testing.T) {
 			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
 			# TYPE scrape_error gauge
 			scrape_error 0
+			# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE resource_scrape_error gauge
+			resource_scrape_error 0
 			`,
 		},
 		{
@@ -119,6 +126,9 @@ func TestCollectResourceMetrics(t *testing.T) {
 			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
 			# TYPE scrape_error gauge
 			scrape_error 0
+			# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE resource_scrape_error gauge
+			resource_scrape_error 0
 			`,
 		},
 		{
@@ -188,17 +198,20 @@ func TestCollectResourceMetrics(t *testing.T) {
 			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
 			# TYPE scrape_error gauge
 			scrape_error 0
-			# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+			# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE resource_scrape_error gauge
+			resource_scrape_error 0
+			# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
 			# TYPE container_cpu_usage_seconds_total counter
 			container_cpu_usage_seconds_total{container="container_a",namespace="namespace_a",pod="pod_a"} 10 1624396278302
 			container_cpu_usage_seconds_total{container="container_a",namespace="namespace_b",pod="pod_b"} 10 1624396278302
 			container_cpu_usage_seconds_total{container="container_b",namespace="namespace_a",pod="pod_a"} 10 1624396278302
-			# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+			# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
 			# TYPE container_memory_working_set_bytes gauge
 			container_memory_working_set_bytes{container="container_a",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
 			container_memory_working_set_bytes{container="container_a",namespace="namespace_b",pod="pod_b"} 1000 1624396278302
 			container_memory_working_set_bytes{container="container_b",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
-			# HELP container_start_time_seconds [ALPHA] Start time of the container since unix epoch in seconds
+			# HELP container_start_time_seconds [STABLE] Start time of the container since unix epoch in seconds
 			# TYPE container_start_time_seconds gauge
 			container_start_time_seconds{container="container_a",namespace="namespace_a",pod="pod_a"} 1.6243962483020916e+09 1624396248302
 			container_start_time_seconds{container="container_a",namespace="namespace_b",pod="pod_b"} 1.6243956783020916e+09 1624395678302
@@ -239,10 +252,13 @@ func TestCollectResourceMetrics(t *testing.T) {
 			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
 			# TYPE scrape_error gauge
 			scrape_error 0
-			# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+			# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE resource_scrape_error gauge
+			resource_scrape_error 0
+			# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
 			# TYPE container_cpu_usage_seconds_total counter
 			container_cpu_usage_seconds_total{container="container_a",namespace="namespace_a",pod="pod_a"} 10 1624396278302
-			# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+			# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
 			# TYPE container_memory_working_set_bytes gauge
 			container_memory_working_set_bytes{container="container_a",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
 			`,
@@ -295,19 +311,22 @@ func TestCollectResourceMetrics(t *testing.T) {
 			},
 			summaryErr: nil,
 			expectedMetrics: `
-			# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+			# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
 			# TYPE container_cpu_usage_seconds_total counter
 			container_cpu_usage_seconds_total{container="container_a",namespace="namespace_b",pod="pod_b"} 10 1624396278302
-			# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+			# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
 			# TYPE container_memory_working_set_bytes gauge
 			container_memory_working_set_bytes{container="container_a",namespace="namespace_b",pod="pod_b"} 1000 1624396278302
-			# HELP container_start_time_seconds [ALPHA] Start time of the container since unix epoch in seconds
+			# HELP container_start_time_seconds [STABLE] Start time of the container since unix epoch in seconds
 			# TYPE container_start_time_seconds gauge
 			container_start_time_seconds{container="container_a",namespace="namespace_a",pod="pod_a"} 1.6243962483020916e+09 1624396248302
 			container_start_time_seconds{container="container_a",namespace="namespace_b",pod="pod_b"} 1.6243956783020916e+09 1624395678302
 			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
 			# TYPE scrape_error gauge
 			scrape_error 0
+			# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE resource_scrape_error gauge
+			resource_scrape_error 0
 			`,
 		},
 		{
@@ -339,10 +358,13 @@ func TestCollectResourceMetrics(t *testing.T) {
 			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
 			# TYPE scrape_error gauge
 			scrape_error 0
-			# HELP pod_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the pod in core-seconds
+			# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE resource_scrape_error gauge
+			resource_scrape_error 0
+			# HELP pod_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the pod in core-seconds
 			# TYPE pod_cpu_usage_seconds_total counter
 			pod_cpu_usage_seconds_total{namespace="namespace_a",pod="pod_a"} 10 1624396278302
-			# HELP pod_memory_working_set_bytes [ALPHA] Current working set of the pod in bytes
+			# HELP pod_memory_working_set_bytes [STABLE] Current working set of the pod in bytes
 			# TYPE pod_memory_working_set_bytes gauge
 			pod_memory_working_set_bytes{namespace="namespace_a",pod="pod_a"} 1000 1624396278302
 			# HELP pod_swap_usage_bytes [ALPHA] Current amount of the pod swap usage in bytes. Reported only on non-windows systems
@@ -375,6 +397,9 @@ func TestCollectResourceMetrics(t *testing.T) {
 			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
 			# TYPE scrape_error gauge
 			scrape_error 0
+			# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE resource_scrape_error gauge
+			resource_scrape_error 0
 			`,
 		},
 	}
diff --git a/test/e2e/framework/metrics/metrics_grabber.go b/test/e2e/framework/metrics/metrics_grabber.go
index 2fdcd842df4..227b6ff1eb3 100644
--- a/test/e2e/framework/metrics/metrics_grabber.go
+++ b/test/e2e/framework/metrics/metrics_grabber.go
@@ -181,21 +181,34 @@ func (g *Grabber) GrabFromKubelet(ctx context.Context, nodeName string) (Kubelet
 		return KubeletMetrics{}, fmt.Errorf("Error listing nodes with name %v, got %v", nodeName, nodes.Items)
 	}
 	kubeletPort := nodes.Items[0].Status.DaemonEndpoints.KubeletEndpoint.Port
-	return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort))
+	return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort), "metrics")
 }
 
-func (g *Grabber) grabFromKubeletInternal(ctx context.Context, nodeName string, kubeletPort int) (KubeletMetrics, error) {
+// GrabResourceMetricsFromKubelet returns resource metrics from the kubelet's /metrics/resource endpoint.
+func (g *Grabber) GrabResourceMetricsFromKubelet(ctx context.Context, nodeName string) (KubeletMetrics, error) {
+	nodes, err := g.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{"metadata.name": nodeName}.AsSelector().String()})
+	if err != nil {
+		return KubeletMetrics{}, err
+	}
+	if len(nodes.Items) != 1 {
+		return KubeletMetrics{}, fmt.Errorf("Error listing nodes with name %v, got %v", nodeName, nodes.Items)
+	}
+	kubeletPort := nodes.Items[0].Status.DaemonEndpoints.KubeletEndpoint.Port
+	return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort), "metrics/resource")
+}
+
+func (g *Grabber) grabFromKubeletInternal(ctx context.Context, nodeName string, kubeletPort int, pathSuffix string) (KubeletMetrics, error) {
 	if kubeletPort <= 0 || kubeletPort > 65535 {
 		return KubeletMetrics{}, fmt.Errorf("Invalid Kubelet port %v. Skipping Kubelet's metrics gathering", kubeletPort)
 	}
-	output, err := g.getMetricsFromNode(ctx, nodeName, int(kubeletPort))
+	output, err := g.getMetricsFromNode(ctx, nodeName, int(kubeletPort), pathSuffix)
 	if err != nil {
 		return KubeletMetrics{}, err
 	}
 	return parseKubeletMetrics(output)
 }
 
-func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int) (string, error) {
+func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int, pathSuffix string) (string, error) {
 	// There's a problem with timing out during proxy. Wrapping this in a goroutine to prevent deadlock.
 	finished := make(chan struct{}, 1)
 	var err error
@@ -205,7 +218,7 @@ func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubel
 			Resource("nodes").
 			SubResource("proxy").
 			Name(fmt.Sprintf("%v:%v", nodeName, kubeletPort)).
-			Suffix("metrics").
+			Suffix(pathSuffix).
 			Do(ctx).Raw()
 		finished <- struct{}{}
 	}()
@@ -432,7 +445,7 @@ func (g *Grabber) Grab(ctx context.Context) (Collection, error) {
 	} else {
 		for _, node := range nodes.Items {
 			kubeletPort := node.Status.DaemonEndpoints.KubeletEndpoint.Port
-			metrics, err := g.grabFromKubeletInternal(ctx, node.Name, int(kubeletPort))
+			metrics, err := g.grabFromKubeletInternal(ctx, node.Name, int(kubeletPort), "metrics")
 			if err != nil {
 				errs = append(errs, err)
 			}
diff --git a/test/e2e/instrumentation/metrics.go b/test/e2e/instrumentation/metrics.go
new file mode 100644
index 00000000000..e022ade6696
--- /dev/null
+++ b/test/e2e/instrumentation/metrics.go
@@ -0,0 +1,68 @@
+/*
+Copyright 2023 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package instrumentation
+
+import (
+	"context"
+	"errors"
+	"time"
+
+	"github.com/onsi/gomega"
+	clientset "k8s.io/client-go/kubernetes"
+	"k8s.io/kubernetes/test/e2e/framework"
+	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
+	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
+	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	"k8s.io/kubernetes/test/e2e/instrumentation/common"
+	admissionapi "k8s.io/pod-security-admission/api"
+
+	"github.com/onsi/ginkgo/v2"
+)
+
+var _ = common.SIGDescribe("Metrics", func() {
+	f := framework.NewDefaultFramework("metrics")
+	f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
+	var c, ec clientset.Interface
+	var grabber *e2emetrics.Grabber
+	ginkgo.BeforeEach(func(ctx context.Context) {
+		var err error
+		c = f.ClientSet
+		ec = f.KubemarkExternalClusterClientSet
+		gomega.Eventually(ctx, func() error {
+			grabber, err = e2emetrics.NewMetricsGrabber(ctx, c, ec, f.ClientConfig(), true, true, true, true, true, true)
+			// Return the error so Eventually retries grabber creation instead of failing the spec on the first attempt.
+			return err
+		}, 5*time.Minute, 10*time.Second).Should(gomega.BeNil())
+	})
+
+	/*
+		Release: v1.29
+		Testname: Kubelet resource metrics
+		Description: Should attempt to grab all resource metrics from the kubelet's /metrics/resource endpoint.
+	*/
+	ginkgo.It("should grab all metrics from kubelet /metrics/resource endpoint", func(ctx context.Context) {
+		ginkgo.By("Connecting to kubelet's /metrics/resource endpoint")
+		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
+		if errors.Is(err, e2emetrics.MetricsGrabbingDisabledError) {
+			e2eskipper.Skipf("%v", err)
+		}
+		framework.ExpectNoError(err)
+		response, err := grabber.GrabResourceMetricsFromKubelet(ctx, node.Name)
+		framework.ExpectNoError(err)
+		gomega.Expect(response).NotTo(gomega.BeEmpty())
+	})
+})
diff --git a/test/instrumentation/testdata/stable-metrics-list.yaml b/test/instrumentation/testdata/stable-metrics-list.yaml
index a33d6c6476f..6d8c6c642a0 100644
--- a/test/instrumentation/testdata/stable-metrics-list.yaml
+++ b/test/instrumentation/testdata/stable-metrics-list.yaml
@@ -74,6 +74,56 @@
   stabilityLevel: STABLE
   labels:
   - zone
+- name: container_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the container in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: container_memory_working_set_bytes
+  help: Current working set of the container in bytes
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: container_start_time_seconds
+  help: Start time of the container since unix epoch in seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: node_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the node in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+- name: node_memory_working_set_bytes
+  help: Current working set of the node in bytes
+  type: Custom
+  stabilityLevel: STABLE
+- name: pod_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the pod in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - pod
+  - namespace
+- name: pod_memory_working_set_bytes
+  help: Current working set of the pod in bytes
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - pod
+  - namespace
+- name: resource_scrape_error
+  help: 1 if there was an error while getting container metrics, 0 otherwise
+  type: Custom
+  stabilityLevel: STABLE
 - name: pod_scheduling_sli_duration_seconds
   subsystem: scheduler
   help: E2e latency for a pod being scheduled, from the time the pod enters the scheduling
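
Reviewer note: the kubelet's /metrics/resource endpoint can also be checked by hand through the API server's node proxy, which is the same path the grabber above takes, e.g. `kubectl get --raw "/api/v1/nodes/<node-name>/proxy/metrics/resource"`. The standalone Go sketch below is not part of the patch; the file name and the inlined sample payload are illustrative only. It uses the Prometheus expfmt text parser to check the invariant the collector establishes: scrape_error and its STABLE replacement resource_scrape_error are written from the same errorCount in the same defer, so every scrape should report the same value for both.

// verify_scrape_error_parity.go: illustrative sketch, not part of the patch.
package main

import (
	"fmt"
	"log"
	"strings"

	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
)

// gaugeValue returns the value of an unlabeled gauge from parsed metric families.
func gaugeValue(families map[string]*dto.MetricFamily, name string) (float64, error) {
	mf, ok := families[name]
	if !ok || len(mf.GetMetric()) == 0 {
		return 0, fmt.Errorf("metric %q not found in payload", name)
	}
	return mf.GetMetric()[0].GetGauge().GetValue(), nil
}

func main() {
	// Sample payload; in practice, substitute the output of
	// `kubectl get --raw /api/v1/nodes/<node-name>/proxy/metrics/resource`.
	payload := `# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE resource_scrape_error gauge
resource_scrape_error 0
`
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(payload))
	if err != nil {
		log.Fatalf("parsing metrics payload: %v", err)
	}
	deprecated, err := gaugeValue(families, "scrape_error")
	if err != nil {
		log.Fatal(err)
	}
	stable, err := gaugeValue(families, "resource_scrape_error")
	if err != nil {
		log.Fatal(err)
	}
	// Both gauges are emitted from the same errorCount in CollectWithStability,
	// so they must agree on every scrape.
	if deprecated != stable {
		log.Fatalf("mismatch: scrape_error=%v resource_scrape_error=%v", deprecated, stable)
	}
	fmt.Printf("scrape_error and resource_scrape_error agree: %v\n", deprecated)
}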