From 8dfc548709b8cd9f1d6a4c49418b09bfc6b51b43 Mon Sep 17 00:00:00 2001
From: Eric Ernst
Date: Fri, 23 Oct 2020 11:23:59 -0700
Subject: [PATCH 1/2] resource-metrics: add pod/sandbox metrics to endpoint

Pod metrics may not be the same as the sum of container metrics. Add
support for pod specific metrics to allow for more accurate accounting
of resources.

Signed-off-by: Eric Ernst
---
 .../metrics/collectors/resource_metrics.go    | 44 +++++++++++++++++--
 .../collectors/resource_metrics_test.go       | 35 +++++++++++++++
 2 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/pkg/kubelet/metrics/collectors/resource_metrics.go b/pkg/kubelet/metrics/collectors/resource_metrics.go
index 00945f4ca6b..44a54254c73 100644
--- a/pkg/kubelet/metrics/collectors/resource_metrics.go
+++ b/pkg/kubelet/metrics/collectors/resource_metrics.go
@@ -54,7 +54,21 @@ var (
 		metrics.ALPHA,
 		"")
 
-	resouceScrapeResultDesc = metrics.NewDesc("scrape_error",
+	podCPUUsageDesc = metrics.NewDesc("pod_cpu_usage_seconds_total",
+		"Cumulative cpu time consumed by the pod in core-seconds",
+		[]string{"pod", "namespace"},
+		nil,
+		metrics.ALPHA,
+		"")
+
+	podMemoryUsageDesc = metrics.NewDesc("pod_memory_working_set_bytes",
+		"Current working set of the pod in bytes",
+		[]string{"pod", "namespace"},
+		nil,
+		metrics.ALPHA,
+		"")
+
+	resourceScrapeResultDesc = metrics.NewDesc("scrape_error",
 		"1 if there was an error while getting container metrics, 0 otherwise",
 		nil,
 		nil,
@@ -84,7 +98,9 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Des
 	ch <- nodeMemoryUsageDesc
 	ch <- containerCPUUsageDesc
 	ch <- containerMemoryUsageDesc
-	ch <- resouceScrapeResultDesc
+	ch <- podCPUUsageDesc
+	ch <- podMemoryUsageDesc
+	ch <- resourceScrapeResultDesc
 }
 
 // CollectWithStability implements metrics.StableCollector
@@ -94,7 +110,7 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Des
 func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metric) {
 	var errorCount float64
 	defer func() {
-		ch <- metrics.NewLazyConstMetric(resouceScrapeResultDesc, metrics.GaugeValue, errorCount)
+		ch <- metrics.NewLazyConstMetric(resourceScrapeResultDesc, metrics.GaugeValue, errorCount)
 	}()
 	statsSummary, err := rc.provider.GetCPUAndMemoryStats()
 	if err != nil {
@@ -111,6 +127,8 @@ func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metri
 			rc.collectContainerCPUMetrics(ch, pod, container)
 			rc.collectContainerMemoryMetrics(ch, pod, container)
 		}
+		rc.collectPodCPUMetrics(ch, pod)
+		rc.collectPodMemoryMetrics(ch, pod)
 	}
 }
 
@@ -151,3 +169,23 @@ func (rc *resourceMetricsCollector) collectContainerMemoryMetrics(ch chan<- metr
 		metrics.NewLazyConstMetric(containerMemoryUsageDesc, metrics.GaugeValue,
 			float64(*s.Memory.WorkingSetBytes), s.Name, pod.PodRef.Name, pod.PodRef.Namespace))
 }
+
+func (rc *resourceMetricsCollector) collectPodCPUMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
+	if pod.CPU == nil {
+		return
+	}
+
+	ch <- metrics.NewLazyMetricWithTimestamp(pod.CPU.Time.Time,
+		metrics.NewLazyConstMetric(podCPUUsageDesc, metrics.CounterValue,
+			float64(*pod.CPU.UsageCoreNanoSeconds)/float64(time.Second), pod.PodRef.Name, pod.PodRef.Namespace))
+}
+
+func (rc *resourceMetricsCollector) collectPodMemoryMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
+	if pod.Memory == nil {
+		return
+	}
+
+	ch <- metrics.NewLazyMetricWithTimestamp(pod.Memory.Time.Time,
+		metrics.NewLazyConstMetric(podMemoryUsageDesc, metrics.GaugeValue,
+			float64(*pod.Memory.WorkingSetBytes), pod.PodRef.Name, pod.PodRef.Namespace))
+}
diff --git a/pkg/kubelet/metrics/collectors/resource_metrics_test.go b/pkg/kubelet/metrics/collectors/resource_metrics_test.go
index 2470c1b21fb..aefb57d4480 100644
--- a/pkg/kubelet/metrics/collectors/resource_metrics_test.go
+++ b/pkg/kubelet/metrics/collectors/resource_metrics_test.go
@@ -51,6 +51,8 @@ func TestCollectResourceMetrics(t *testing.T) {
 		"node_memory_working_set_bytes",
 		"container_cpu_usage_seconds_total",
 		"container_memory_working_set_bytes",
+		"pod_cpu_usage_seconds_total",
+		"pod_memory_working_set_bytes",
 	}
 
 	tests := []struct {
@@ -168,6 +170,39 @@ func TestCollectResourceMetrics(t *testing.T) {
 			container_memory_working_set_bytes{container="container_b",namespace="namespace_a",pod="pod_a"} 1000 2000
 			`,
 		},
+		{
+			name: "arbitrary pod metrics",
+			summary: &statsapi.Summary{
+				Pods: []statsapi.PodStats{
+					{
+						PodRef: statsapi.PodReference{
+							Name:      "pod_a",
+							Namespace: "namespace_a",
+						},
+						CPU: &statsapi.CPUStats{
+							Time:                 testTime,
+							UsageCoreNanoSeconds: uint64Ptr(10000000000),
+						},
+						Memory: &statsapi.MemoryStats{
+							Time:            testTime,
+							WorkingSetBytes: uint64Ptr(1000),
+						},
+					},
+				},
+			},
+			summaryErr: nil,
+			expectedMetrics: `
+			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE scrape_error gauge
+			scrape_error 0
+			# HELP pod_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the pod in core-seconds
+			# TYPE pod_cpu_usage_seconds_total counter
+			pod_cpu_usage_seconds_total{namespace="namespace_a",pod="pod_a"} 10 2000
+			# HELP pod_memory_working_set_bytes [ALPHA] Current working set of the pod in bytes
+			# TYPE pod_memory_working_set_bytes gauge
+			pod_memory_working_set_bytes{namespace="namespace_a",pod="pod_a"} 1000 2000
+			`,
+		},
 	}
 
 	for _, test := range tests {
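For consumers, the new pod-level series appear in the same Prometheus text exposition as the existing node and container series. Below is a minimal client sketch, not part of the patch series, that scrapes the endpoint and prints the two new families using the text parser from github.com/prometheus/common/expfmt. The read-only port 10255 and the /metrics/resource/v1alpha1 path are assumptions matching the v1alpha1 endpoint exercised by the e2e test in the next patch; both may differ, or be disabled, in a given cluster.

```go
package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/prometheus/common/expfmt"
)

func main() {
	// Assumption: kubelet read-only port and v1alpha1 resource-metrics path.
	resp, err := http.Get("http://127.0.0.1:10255/metrics/resource/v1alpha1")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// Parse the Prometheus text exposition into metric families.
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	for _, name := range []string{"pod_cpu_usage_seconds_total", "pod_memory_working_set_bytes"} {
		family, ok := families[name]
		if !ok {
			continue // kubelet predates pod-level metrics
		}
		for _, m := range family.Metric {
			labels := make(map[string]string)
			for _, lp := range m.Label {
				labels[lp.GetName()] = lp.GetValue()
			}
			// The CPU series is a counter; the memory series is a gauge.
			var value float64
			switch {
			case m.Counter != nil:
				value = m.Counter.GetValue()
			case m.Gauge != nil:
				value = m.Gauge.GetValue()
			}
			fmt.Printf("%s{namespace=%q,pod=%q} %v\n",
				name, labels["namespace"], labels["pod"], value)
		}
	}
}
```

Since pod_cpu_usage_seconds_total is cumulative, a consumer derives instantaneous CPU usage in cores by taking the rate of the counter between two scrapes; pod_memory_working_set_bytes can be read directly.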
From b26b755e94a709f343d030e18391c3d0cc05e78a Mon Sep 17 00:00:00 2001
From: Eric Ernst
Date: Tue, 3 Nov 2020 10:14:13 -0800
Subject: [PATCH 2/2] resource-metrics: add pod metrics e2e test

Signed-off-by: Eric Ernst
---
 test/e2e_node/resource_metrics_test.go | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/e2e_node/resource_metrics_test.go b/test/e2e_node/resource_metrics_test.go
index 215ef1e400d..7a3a074bdd7 100644
--- a/test/e2e_node/resource_metrics_test.go
+++ b/test/e2e_node/resource_metrics_test.go
@@ -88,6 +88,16 @@ var _ = framework.KubeDescribe("ResourceMetricsAPI", func() {
 				fmt.Sprintf("%s::%s::%s", f.Namespace.Name, pod0, "busybox-container"): boundedSample(10*e2evolume.Kb, 80*e2evolume.Mb),
 				fmt.Sprintf("%s::%s::%s", f.Namespace.Name, pod1, "busybox-container"): boundedSample(10*e2evolume.Kb, 80*e2evolume.Mb),
 			}),
+
+			"pod_cpu_usage_seconds_total": gstruct.MatchElements(containerID, gstruct.IgnoreExtras, gstruct.Elements{
+				fmt.Sprintf("%s::%s", f.Namespace.Name, pod0): boundedSample(0, 100),
+				fmt.Sprintf("%s::%s", f.Namespace.Name, pod1): boundedSample(0, 100),
+			}),
+
+			"pod_memory_working_set_bytes": gstruct.MatchAllElements(containerID, gstruct.Elements{
+				fmt.Sprintf("%s::%s", f.Namespace.Name, pod0): boundedSample(10*e2evolume.Kb, 80*e2evolume.Mb),
+				fmt.Sprintf("%s::%s", f.Namespace.Name, pod1): boundedSample(10*e2evolume.Kb, 80*e2evolume.Mb),
+			}),
 		})
 		ginkgo.By("Giving pods a minute to start up and produce metrics")
 		gomega.Eventually(getV1alpha1ResourceMetrics, 1*time.Minute, 15*time.Second).Should(matchV1alpha1Expectations)
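A closing note on units: the expected value of 10 in the unit test of patch 1 follows directly from the conversion collectPodCPUMetrics performs, dividing UsageCoreNanoSeconds by float64(time.Second) (1e9 nanoseconds) to report core-seconds. A self-contained illustration of that arithmetic:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Fixture value from the unit test: 10,000,000,000 core-nanoseconds.
	usageCoreNanoSeconds := uint64(10000000000)

	// collectPodCPUMetrics divides by float64(time.Second), i.e. 1e9,
	// converting core-nanoseconds into core-seconds.
	coreSeconds := float64(usageCoreNanoSeconds) / float64(time.Second)

	fmt.Println(coreSeconds) // prints 10, matching pod_cpu_usage_seconds_total
}
```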