resource-metrics: add pod/sandbox metrics to endpoint

Pod metrics may not be the same as the sum of container metrics. Add support for pod-specific
metrics to allow for more accurate accounting of resources.

Signed-off-by: Eric Ernst <eric_ernst@apple.com>
This commit is contained in:
Eric Ernst 2020-10-23 11:23:59 -07:00
parent b3033da9a1
commit 8dfc548709
2 changed files with 76 additions and 3 deletions

View File

@ -54,7 +54,21 @@ var (
metrics.ALPHA,
"")
resouceScrapeResultDesc = metrics.NewDesc("scrape_error",
podCPUUsageDesc = metrics.NewDesc("pod_cpu_usage_seconds_total",
"Cumulative cpu time consumed by the pod in core-seconds",
[]string{"pod", "namespace"},
nil,
metrics.ALPHA,
"")
podMemoryUsageDesc = metrics.NewDesc("pod_memory_working_set_bytes",
"Current working set of the pod in bytes",
[]string{"pod", "namespace"},
nil,
metrics.ALPHA,
"")
resourceScrapeResultDesc = metrics.NewDesc("scrape_error",
"1 if there was an error while getting container metrics, 0 otherwise",
nil,
nil,
@ -84,7 +98,9 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Des
ch <- nodeMemoryUsageDesc
ch <- containerCPUUsageDesc
ch <- containerMemoryUsageDesc
ch <- resouceScrapeResultDesc
ch <- podCPUUsageDesc
ch <- podMemoryUsageDesc
ch <- resourceScrapeResultDesc
}
// CollectWithStability implements metrics.StableCollector
@ -94,7 +110,7 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Des
func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metric) {
var errorCount float64
defer func() {
ch <- metrics.NewLazyConstMetric(resouceScrapeResultDesc, metrics.GaugeValue, errorCount)
ch <- metrics.NewLazyConstMetric(resourceScrapeResultDesc, metrics.GaugeValue, errorCount)
}()
statsSummary, err := rc.provider.GetCPUAndMemoryStats()
if err != nil {
@ -111,6 +127,8 @@ func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metri
rc.collectContainerCPUMetrics(ch, pod, container)
rc.collectContainerMemoryMetrics(ch, pod, container)
}
rc.collectPodCPUMetrics(ch, pod)
rc.collectPodMemoryMetrics(ch, pod)
}
}
@ -151,3 +169,23 @@ func (rc *resourceMetricsCollector) collectContainerMemoryMetrics(ch chan<- metr
metrics.NewLazyConstMetric(containerMemoryUsageDesc, metrics.GaugeValue,
float64(*s.Memory.WorkingSetBytes), s.Name, pod.PodRef.Name, pod.PodRef.Namespace))
}
// collectPodCPUMetrics sends the pod-level cumulative CPU usage sample for pod on ch.
//
// The sample value is pod.CPU.UsageCoreNanoSeconds divided by the number of
// nanoseconds in a second, i.e. converted from core-nanoseconds to
// core-seconds. It is emitted as a counter under podCPUUsageDesc, labelled
// with the pod's name and namespace, and carries pod.CPU.Time as its
// timestamp.
//
// Nothing is sent when the pod has no CPU stats or when the cumulative usage
// value is unset.
func (rc *resourceMetricsCollector) collectPodCPUMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
	// UsageCoreNanoSeconds is a pointer field; a stats entry can be present
	// without it, and dereferencing it unguarded would panic the collector.
	if pod.CPU == nil || pod.CPU.UsageCoreNanoSeconds == nil {
		return
	}

	ch <- metrics.NewLazyMetricWithTimestamp(pod.CPU.Time.Time,
		metrics.NewLazyConstMetric(podCPUUsageDesc, metrics.CounterValue,
			float64(*pod.CPU.UsageCoreNanoSeconds)/float64(time.Second), pod.PodRef.Name, pod.PodRef.Namespace))
}
// collectPodMemoryMetrics sends the pod-level working-set memory sample for pod on ch.
//
// The sample value is pod.Memory.WorkingSetBytes. It is emitted as a gauge
// under podMemoryUsageDesc, labelled with the pod's name and namespace, and
// carries pod.Memory.Time as its timestamp.
//
// Nothing is sent when the pod has no memory stats or when the working-set
// value is unset.
func (rc *resourceMetricsCollector) collectPodMemoryMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
	// WorkingSetBytes is a pointer field; a stats entry can be present
	// without it, and dereferencing it unguarded would panic the collector.
	if pod.Memory == nil || pod.Memory.WorkingSetBytes == nil {
		return
	}

	ch <- metrics.NewLazyMetricWithTimestamp(pod.Memory.Time.Time,
		metrics.NewLazyConstMetric(podMemoryUsageDesc, metrics.GaugeValue,
			float64(*pod.Memory.WorkingSetBytes), pod.PodRef.Name, pod.PodRef.Namespace))
}

View File

@ -51,6 +51,8 @@ func TestCollectResourceMetrics(t *testing.T) {
"node_memory_working_set_bytes",
"container_cpu_usage_seconds_total",
"container_memory_working_set_bytes",
"pod_cpu_usage_seconds_total",
"pod_memory_working_set_bytes",
}
tests := []struct {
@ -168,6 +170,39 @@ func TestCollectResourceMetrics(t *testing.T) {
container_memory_working_set_bytes{container="container_b",namespace="namespace_a",pod="pod_a"} 1000 2000
`,
},
{
name: "arbitrary pod metrics",
summary: &statsapi.Summary{
Pods: []statsapi.PodStats{
{
PodRef: statsapi.PodReference{
Name: "pod_a",
Namespace: "namespace_a",
},
CPU: &statsapi.CPUStats{
Time: testTime,
UsageCoreNanoSeconds: uint64Ptr(10000000000),
},
Memory: &statsapi.MemoryStats{
Time: testTime,
WorkingSetBytes: uint64Ptr(1000),
},
},
},
},
summaryErr: nil,
expectedMetrics: `
# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
# TYPE scrape_error gauge
scrape_error 0
# HELP pod_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the pod in core-seconds
# TYPE pod_cpu_usage_seconds_total counter
pod_cpu_usage_seconds_total{namespace="namespace_a",pod="pod_a"} 10 2000
# HELP pod_memory_working_set_bytes [ALPHA] Current working set of the pod in bytes
# TYPE pod_memory_working_set_bytes gauge
pod_memory_working_set_bytes{namespace="namespace_a",pod="pod_a"} 1000 2000
`,
},
}
for _, test := range tests {