Graduate kubelet resource metrics to GA

Richa Banker 2023-03-23 15:15:11 -07:00
parent 93bf570d46
commit 4712025ea8
5 changed files with 192 additions and 25 deletions


@@ -31,14 +31,14 @@ var (
"Cumulative cpu time consumed by the node in core-seconds",
nil,
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
nodeMemoryUsageDesc = metrics.NewDesc("node_memory_working_set_bytes",
"Current working set of the node in bytes",
nil,
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
nodeSwapUsageDesc = metrics.NewDesc("node_swap_usage_bytes",
@@ -52,14 +52,14 @@ var (
"Cumulative cpu time consumed by the container in core-seconds",
[]string{"container", "pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
containerMemoryUsageDesc = metrics.NewDesc("container_memory_working_set_bytes",
"Current working set of the container in bytes",
[]string{"container", "pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
containerSwapUsageDesc = metrics.NewDesc("container_swap_usage_bytes",
@@ -73,14 +73,14 @@ var (
"Cumulative cpu time consumed by the pod in core-seconds",
[]string{"pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
podMemoryUsageDesc = metrics.NewDesc("pod_memory_working_set_bytes",
"Current working set of the pod in bytes",
[]string{"pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
podSwapUsageDesc = metrics.NewDesc("pod_swap_usage_bytes",
@@ -95,13 +95,20 @@ var (
nil,
nil,
metrics.ALPHA,
-"")
+"1.29.0")
+resourceScrapeErrorResultDesc = metrics.NewDesc("resource_scrape_error",
+"1 if there was an error while getting container metrics, 0 otherwise",
+nil,
+nil,
+metrics.STABLE,
+"")
containerStartTimeDesc = metrics.NewDesc("container_start_time_seconds",
"Start time of the container since unix epoch in seconds",
[]string{"container", "pod", "namespace"},
nil,
-metrics.ALPHA,
+metrics.STABLE,
"")
)
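The ALPHA-to-STABLE flips above are what "graduating to GA" means mechanically: kubelet metrics are declared through k8s.io/component-base/metrics, and each descriptor carries an explicit stability level. A STABLE descriptor falls under the metrics stability guarantees (no renames, label changes, or removal without a deprecation cycle) and must appear in the stable-metrics list updated in the last file of this diff. A minimal sketch of the declaration pattern, using a hypothetical example_metric name:

package metricsexample

import "k8s.io/component-base/metrics"

// exampleDesc mirrors the descriptors above. The final argument is the
// version the metric was deprecated in; "" means not deprecated, which is
// why scrape_error gains "1.29.0" while the graduated metrics keep "".
var exampleDesc = metrics.NewDesc(
	"example_metric",             // fully-qualified metric name (hypothetical)
	"Help text for the metric",   // help string; the registry prefixes it with [STABLE]
	[]string{"pod", "namespace"}, // variable label names
	nil,                          // const labels
	metrics.STABLE,               // stability level (these were metrics.ALPHA before this commit)
	"",                           // deprecated-since version
)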
@@ -134,6 +141,7 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
ch <- podMemoryUsageDesc
ch <- podSwapUsageDesc
ch <- resourceScrapeResultDesc
+ch <- resourceScrapeErrorResultDesc
}
// CollectWithStability implements metrics.StableCollector
@@ -145,6 +153,7 @@ func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metric) {
var errorCount float64
defer func() {
ch <- metrics.NewLazyConstMetric(resourceScrapeResultDesc, metrics.GaugeValue, errorCount)
+ch <- metrics.NewLazyConstMetric(resourceScrapeErrorResultDesc, metrics.GaugeValue, errorCount)
}()
statsSummary, err := rc.provider.GetCPUAndMemoryStats(ctx)
if err != nil {
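The deferred send above guarantees that the deprecated scrape_error gauge and its stable replacement resource_scrape_error are both emitted on every scrape with the same errorCount, so dashboards and alerts can migrate before the old series is removed. A self-contained sketch of this StableCollector pattern, with a hypothetical example_scrape_error metric and a scrape func standing in for the kubelet's stats provider:

package metricsexample

import "k8s.io/component-base/metrics"

var errDesc = metrics.NewDesc("example_scrape_error",
	"1 if there was an error while scraping, 0 otherwise",
	nil, nil, metrics.STABLE, "")

type exampleCollector struct {
	metrics.BaseStableCollector
	scrape func() error // stand-in for provider.GetCPUAndMemoryStats
}

func (c *exampleCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
	ch <- errDesc
}

// CollectWithStability mirrors the kubelet collector: errorCount is settled
// by the time the deferred send runs, so the gauge is emitted exactly once
// per scrape whether or not the provider call failed.
func (c *exampleCollector) CollectWithStability(ch chan<- metrics.Metric) {
	var errorCount float64
	defer func() {
		ch <- metrics.NewLazyConstMetric(errDesc, metrics.GaugeValue, errorCount)
	}()
	if err := c.scrape(); err != nil {
		errorCount = 1
	}
}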


@@ -36,6 +36,7 @@ func TestCollectResourceMetrics(t *testing.T) {
testTime := metav1.NewTime(staticTimestamp)
interestedMetrics := []string{
"scrape_error",
+"resource_scrape_error",
"node_cpu_usage_seconds_total",
"node_memory_working_set_bytes",
"node_swap_usage_bytes",
@@ -64,6 +65,9 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 1
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 1
`,
},
{
@@ -86,10 +90,10 @@ func TestCollectResourceMetrics(t *testing.T) {
},
summaryErr: nil,
expectedMetrics: `
-# HELP node_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the node in core-seconds
+# HELP node_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the node in core-seconds
# TYPE node_cpu_usage_seconds_total counter
node_cpu_usage_seconds_total 10 1624396278302
-# HELP node_memory_working_set_bytes [ALPHA] Current working set of the node in bytes
+# HELP node_memory_working_set_bytes [STABLE] Current working set of the node in bytes
# TYPE node_memory_working_set_bytes gauge
node_memory_working_set_bytes 1000 1624396278302
# HELP node_swap_usage_bytes [ALPHA] Current swap usage of the node in bytes. Reported only on non-windows systems
@@ -98,6 +102,9 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
`,
},
{
@@ -119,6 +126,9 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
`,
},
{
@@ -188,17 +198,20 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
-# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
+# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
# TYPE container_cpu_usage_seconds_total counter
container_cpu_usage_seconds_total{container="container_a",namespace="namespace_a",pod="pod_a"} 10 1624396278302
container_cpu_usage_seconds_total{container="container_a",namespace="namespace_b",pod="pod_b"} 10 1624396278302
container_cpu_usage_seconds_total{container="container_b",namespace="namespace_a",pod="pod_a"} 10 1624396278302
-# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
# TYPE container_memory_working_set_bytes gauge
container_memory_working_set_bytes{container="container_a",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
container_memory_working_set_bytes{container="container_a",namespace="namespace_b",pod="pod_b"} 1000 1624396278302
container_memory_working_set_bytes{container="container_b",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
-# HELP container_start_time_seconds [ALPHA] Start time of the container since unix epoch in seconds
+# HELP container_start_time_seconds [STABLE] Start time of the container since unix epoch in seconds
# TYPE container_start_time_seconds gauge
container_start_time_seconds{container="container_a",namespace="namespace_a",pod="pod_a"} 1.6243962483020916e+09 1624396248302
container_start_time_seconds{container="container_a",namespace="namespace_b",pod="pod_b"} 1.6243956783020916e+09 1624395678302
@@ -239,10 +252,13 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
-# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
+# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
# TYPE container_cpu_usage_seconds_total counter
container_cpu_usage_seconds_total{container="container_a",namespace="namespace_a",pod="pod_a"} 10 1624396278302
-# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
# TYPE container_memory_working_set_bytes gauge
container_memory_working_set_bytes{container="container_a",namespace="namespace_a",pod="pod_a"} 1000 1624396278302
`,
@@ -295,19 +311,22 @@ func TestCollectResourceMetrics(t *testing.T) {
},
summaryErr: nil,
expectedMetrics: `
-# HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds
+# HELP container_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the container in core-seconds
# TYPE container_cpu_usage_seconds_total counter
container_cpu_usage_seconds_total{container="container_a",namespace="namespace_b",pod="pod_b"} 10 1624396278302
-# HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes
+# HELP container_memory_working_set_bytes [STABLE] Current working set of the container in bytes
# TYPE container_memory_working_set_bytes gauge
container_memory_working_set_bytes{container="container_a",namespace="namespace_b",pod="pod_b"} 1000 1624396278302
-# HELP container_start_time_seconds [ALPHA] Start time of the container since unix epoch in seconds
+# HELP container_start_time_seconds [STABLE] Start time of the container since unix epoch in seconds
# TYPE container_start_time_seconds gauge
container_start_time_seconds{container="container_a",namespace="namespace_a",pod="pod_a"} 1.6243962483020916e+09 1624396248302
container_start_time_seconds{container="container_a",namespace="namespace_b",pod="pod_b"} 1.6243956783020916e+09 1624395678302
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
`,
},
{
@@ -339,10 +358,13 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
-# HELP pod_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the pod in core-seconds
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
+# HELP pod_cpu_usage_seconds_total [STABLE] Cumulative cpu time consumed by the pod in core-seconds
# TYPE pod_cpu_usage_seconds_total counter
pod_cpu_usage_seconds_total{namespace="namespace_a",pod="pod_a"} 10 1624396278302
-# HELP pod_memory_working_set_bytes [ALPHA] Current working set of the pod in bytes
+# HELP pod_memory_working_set_bytes [STABLE] Current working set of the pod in bytes
# TYPE pod_memory_working_set_bytes gauge
pod_memory_working_set_bytes{namespace="namespace_a",pod="pod_a"} 1000 1624396278302
# HELP pod_swap_usage_bytes [ALPHA] Current amount of the pod swap usage in bytes. Reported only on non-windows systems
@@ -375,6 +397,9 @@ func TestCollectResourceMetrics(t *testing.T) {
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
+# HELP resource_scrape_error [STABLE] 1 if there was an error while getting container metrics, 0 otherwise
+# TYPE resource_scrape_error gauge
+resource_scrape_error 0
`,
},
}
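Each expectedMetrics block above is Prometheus exposition text compared against what the collector actually produces, restricted to the names in interestedMetrics. A minimal sketch of that flow with component-base's testutil, reusing the hypothetical exampleCollector from the earlier sketch:

package metricsexample

import (
	"strings"
	"testing"

	"k8s.io/component-base/metrics/testutil"
)

// TestExampleCollector shows the comparison pattern: CustomCollectAndCompare
// registers the StableCollector, scrapes it, and diffs the exposition-format
// output against the expected text for the listed metric names. Note the
// [STABLE] tag that the stability framework injects into the HELP line.
func TestExampleCollector(t *testing.T) {
	collector := &exampleCollector{scrape: func() error { return nil }}
	expected := `
# HELP example_scrape_error [STABLE] 1 if there was an error while scraping, 0 otherwise
# TYPE example_scrape_error gauge
example_scrape_error 0
`
	if err := testutil.CustomCollectAndCompare(collector, strings.NewReader(expected), "example_scrape_error"); err != nil {
		t.Fatal(err)
	}
}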


@@ -181,21 +181,34 @@ func (g *Grabber) GrabFromKubelet(ctx context.Context, nodeName string) (KubeletMetrics, error) {
return KubeletMetrics{}, fmt.Errorf("Error listing nodes with name %v, got %v", nodeName, nodes.Items)
}
kubeletPort := nodes.Items[0].Status.DaemonEndpoints.KubeletEndpoint.Port
-return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort))
+return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort), "metrics")
}
-func (g *Grabber) grabFromKubeletInternal(ctx context.Context, nodeName string, kubeletPort int) (KubeletMetrics, error) {
+// GrabResourceMetricsFromKubelet returns resource metrics from the kubelet
+func (g *Grabber) GrabResourceMetricsFromKubelet(ctx context.Context, nodeName string) (KubeletMetrics, error) {
+nodes, err := g.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{"metadata.name": nodeName}.AsSelector().String()})
+if err != nil {
+return KubeletMetrics{}, err
+}
+if len(nodes.Items) != 1 {
+return KubeletMetrics{}, fmt.Errorf("Error listing nodes with name %v, got %v", nodeName, nodes.Items)
+}
+kubeletPort := nodes.Items[0].Status.DaemonEndpoints.KubeletEndpoint.Port
+return g.grabFromKubeletInternal(ctx, nodeName, int(kubeletPort), "metrics/resource")
+}
+
+func (g *Grabber) grabFromKubeletInternal(ctx context.Context, nodeName string, kubeletPort int, pathSuffix string) (KubeletMetrics, error) {
if kubeletPort <= 0 || kubeletPort > 65535 {
return KubeletMetrics{}, fmt.Errorf("Invalid Kubelet port %v. Skipping Kubelet's metrics gathering", kubeletPort)
}
-output, err := g.getMetricsFromNode(ctx, nodeName, int(kubeletPort))
+output, err := g.getMetricsFromNode(ctx, nodeName, int(kubeletPort), pathSuffix)
if err != nil {
return KubeletMetrics{}, err
}
return parseKubeletMetrics(output)
}
-func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int) (string, error) {
+func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int, pathSuffix string) (string, error) {
// There's a problem with timing out during proxy. Wrapping this in a goroutine to prevent deadlock.
finished := make(chan struct{}, 1)
var err error
@@ -205,7 +218,7 @@ func (g *Grabber) getMetricsFromNode(ctx context.Context, nodeName string, kubeletPort int) (string, error) {
Resource("nodes").
SubResource("proxy").
Name(fmt.Sprintf("%v:%v", nodeName, kubeletPort)).
-Suffix("metrics").
+Suffix(pathSuffix).
Do(ctx).Raw()
finished <- struct{}{}
}()
@@ -432,7 +445,7 @@ func (g *Grabber) Grab(ctx context.Context) (Collection, error) {
} else {
for _, node := range nodes.Items {
kubeletPort := node.Status.DaemonEndpoints.KubeletEndpoint.Port
-metrics, err := g.grabFromKubeletInternal(ctx, node.Name, int(kubeletPort))
+metrics, err := g.grabFromKubeletInternal(ctx, node.Name, int(kubeletPort), "metrics")
if err != nil {
if err != nil {
errs = append(errs, err)
}
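grabFromKubeletInternal never talks to the kubelet directly; the request is proxied through the API server as GET /api/v1/nodes/<name>:<port>/proxy/<pathSuffix>. A standalone sketch of the equivalent client-go call for the new resource endpoint; the kubeconfig path and node name are placeholders, and 10250 is the kubelet's default port:

package main

import (
	"context"
	"fmt"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Placeholder kubeconfig path; in-cluster config would work equally well.
	config, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	// Equivalent of grabFromKubeletInternal(ctx, "node-1", 10250, "metrics/resource"):
	// GET /api/v1/nodes/node-1:10250/proxy/metrics/resource via the API server proxy.
	raw, err := client.CoreV1().RESTClient().Get().
		Resource("nodes").
		SubResource("proxy").
		Name(fmt.Sprintf("%v:%v", "node-1", 10250)).
		Suffix("metrics/resource").
		Do(context.TODO()).Raw()
	if err != nil {
		panic(err)
	}
	fmt.Println(string(raw))
}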


@@ -0,0 +1,70 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package instrumentation
import (
"context"
"errors"
"time"
"github.com/onsi/gomega"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
"k8s.io/kubernetes/test/e2e/instrumentation/common"
admissionapi "k8s.io/pod-security-admission/api"
"github.com/onsi/ginkgo/v2"
)
var _ = common.SIGDescribe("Metrics", func() {
f := framework.NewDefaultFramework("metrics")
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
var c, ec clientset.Interface
var grabber *e2emetrics.Grabber
ginkgo.BeforeEach(func(ctx context.Context) {
var err error
c = f.ClientSet
ec = f.KubemarkExternalClusterClientSet
gomega.Eventually(ctx, func() error {
grabber, err = e2emetrics.NewMetricsGrabber(ctx, c, ec, f.ClientConfig(), true, true, true, true, true, true)
if err != nil {
framework.ExpectNoError(err, "failed to create metrics grabber")
}
return nil
}, 5*time.Minute, 10*time.Second).Should(gomega.BeNil())
})
/*
Release: v1.29
Testname: Kubelet resource metrics
Description: Should attempt to grab all resource metrics from kubelet metrics/resource endpoint.
*/
ginkgo.It("should grab all metrics from kubelet /metrics/resource endpoint", func(ctx context.Context) {
ginkgo.By("Connecting to kubelet's /metrics/resource endpoint")
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
if errors.Is(err, e2emetrics.MetricsGrabbingDisabledError) {
e2eskipper.Skipf("%v", err)
}
framework.ExpectNoError(err)
response, err := grabber.GrabResourceMetricsFromKubelet(ctx, node.Name)
framework.ExpectNoError(err)
gomega.Expect(response).NotTo(gomega.BeEmpty())
})
})
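The response returned by GrabResourceMetricsFromKubelet behaves as a map from metric name to parsed samples, so the NotTo(BeEmpty()) check could in principle be tightened to assert the graduated series individually. A hypothetical helper, not part of this commit, sketching that:

package metricsexample

import (
	"fmt"

	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
)

// requireStableSeries returns an error unless every named series appears in
// the scraped output, e.g. requireStableSeries(response,
// "node_cpu_usage_seconds_total", "resource_scrape_error").
func requireStableSeries(m e2emetrics.KubeletMetrics, names ...string) error {
	for _, name := range names {
		if _, ok := m[name]; !ok {
			return fmt.Errorf("stable metric %q missing from /metrics/resource output", name)
		}
	}
	return nil
}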


@@ -74,6 +74,56 @@
  stabilityLevel: STABLE
  labels:
  - zone
+- name: container_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the container in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: container_memory_working_set_bytes
+  help: Current working set of the container in bytes
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: container_start_time_seconds
+  help: Start time of the container since unix epoch in seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - container
+  - pod
+  - namespace
+- name: node_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the node in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+- name: node_memory_working_set_bytes
+  help: Current working set of the node in bytes
+  type: Custom
+  stabilityLevel: STABLE
+- name: pod_cpu_usage_seconds_total
+  help: Cumulative cpu time consumed by the pod in core-seconds
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - pod
+  - namespace
+- name: pod_memory_working_set_bytes
+  help: Current working set of the pod in bytes
+  type: Custom
+  stabilityLevel: STABLE
+  labels:
+  - pod
+  - namespace
+- name: resource_scrape_error
+  help: 1 if there was an error while getting container metrics, 0 otherwise
+  type: Custom
+  stabilityLevel: STABLE
- name: pod_scheduling_sli_duration_seconds
  subsystem: scheduler
  help: E2e latency for a pod being scheduled, from the time the pod enters the scheduling