From f33652ce61e289d3d3b281517fb1895781d479e4 Mon Sep 17 00:00:00 2001 From: JunYang Date: Thu, 14 Jul 2022 16:38:48 +0800 Subject: [PATCH] Fix kubelet panic when accessing metrics/resource endpoint --- .../metrics/collectors/resource_metrics.go | 12 +- .../collectors/resource_metrics_test.go | 111 ++++++++++++++++++ 2 files changed, 117 insertions(+), 6 deletions(-) diff --git a/pkg/kubelet/metrics/collectors/resource_metrics.go b/pkg/kubelet/metrics/collectors/resource_metrics.go index 74d2ea7af10..0f0662a2cd8 100644 --- a/pkg/kubelet/metrics/collectors/resource_metrics.go +++ b/pkg/kubelet/metrics/collectors/resource_metrics.go @@ -142,7 +142,7 @@ func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metri } func (rc *resourceMetricsCollector) collectNodeCPUMetrics(ch chan<- metrics.Metric, s summary.NodeStats) { - if s.CPU == nil { + if s.CPU == nil || s.CPU.UsageCoreNanoSeconds == nil { return } @@ -151,7 +151,7 @@ func (rc *resourceMetricsCollector) collectNodeCPUMetrics(ch chan<- metrics.Metr } func (rc *resourceMetricsCollector) collectNodeMemoryMetrics(ch chan<- metrics.Metric, s summary.NodeStats) { - if s.Memory == nil { + if s.Memory == nil || s.Memory.WorkingSetBytes == nil { return } @@ -169,7 +169,7 @@ func (rc *resourceMetricsCollector) collectContainerStartTime(ch chan<- metrics. } func (rc *resourceMetricsCollector) collectContainerCPUMetrics(ch chan<- metrics.Metric, pod summary.PodStats, s summary.ContainerStats) { - if s.CPU == nil { + if s.CPU == nil || s.CPU.UsageCoreNanoSeconds == nil { return } @@ -179,7 +179,7 @@ func (rc *resourceMetricsCollector) collectContainerCPUMetrics(ch chan<- metrics } func (rc *resourceMetricsCollector) collectContainerMemoryMetrics(ch chan<- metrics.Metric, pod summary.PodStats, s summary.ContainerStats) { - if s.Memory == nil { + if s.Memory == nil || s.Memory.WorkingSetBytes == nil { return } @@ -189,7 +189,7 @@ func (rc *resourceMetricsCollector) collectContainerMemoryMetrics(ch chan<- metr } func (rc *resourceMetricsCollector) collectPodCPUMetrics(ch chan<- metrics.Metric, pod summary.PodStats) { - if pod.CPU == nil { + if pod.CPU == nil || pod.CPU.UsageCoreNanoSeconds == nil { return } @@ -199,7 +199,7 @@ func (rc *resourceMetricsCollector) collectPodCPUMetrics(ch chan<- metrics.Metri } func (rc *resourceMetricsCollector) collectPodMemoryMetrics(ch chan<- metrics.Metric, pod summary.PodStats) { - if pod.Memory == nil { + if pod.Memory == nil || pod.Memory.WorkingSetBytes == nil { return } diff --git a/pkg/kubelet/metrics/collectors/resource_metrics_test.go b/pkg/kubelet/metrics/collectors/resource_metrics_test.go index 5d2077815aa..6dd4fd8b531 100644 --- a/pkg/kubelet/metrics/collectors/resource_metrics_test.go +++ b/pkg/kubelet/metrics/collectors/resource_metrics_test.go @@ -89,6 +89,27 @@ func TestCollectResourceMetrics(t *testing.T) { scrape_error 0 `, }, + { + name: "nil node metrics", + summary: &statsapi.Summary{ + Node: statsapi.NodeStats{ + CPU: &statsapi.CPUStats{ + Time: testTime, + UsageCoreNanoSeconds: nil, + }, + Memory: &statsapi.MemoryStats{ + Time: testTime, + WorkingSetBytes: nil, + }, + }, + }, + summaryErr: nil, + expectedMetrics: ` + # HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise + # TYPE scrape_error gauge + scrape_error 0 + `, + }, { name: "arbitrary container metrics for different container, pods and namespaces", summary: &statsapi.Summary{ @@ -169,6 +190,69 @@ func TestCollectResourceMetrics(t *testing.T) { container_start_time_seconds{container="container_b",namespace="namespace_a",pod="pod_a"} 1.6243961583020916e+09 1624396158302 `, }, + { + name: "nil container metrics", + summary: &statsapi.Summary{ + Pods: []statsapi.PodStats{ + { + PodRef: statsapi.PodReference{ + Name: "pod_a", + Namespace: "namespace_a", + }, + Containers: []statsapi.ContainerStats{ + { + Name: "container_a", + StartTime: metav1.NewTime(staticTimestamp.Add(-30 * time.Second)), + CPU: &statsapi.CPUStats{ + Time: testTime, + UsageCoreNanoSeconds: nil, + }, + Memory: &statsapi.MemoryStats{ + Time: testTime, + WorkingSetBytes: nil, + }, + }, + }, + }, + { + PodRef: statsapi.PodReference{ + Name: "pod_b", + Namespace: "namespace_b", + }, + Containers: []statsapi.ContainerStats{ + { + Name: "container_a", + StartTime: metav1.NewTime(staticTimestamp.Add(-10 * time.Minute)), + CPU: &statsapi.CPUStats{ + Time: testTime, + UsageCoreNanoSeconds: uint64Ptr(10000000000), + }, + Memory: &statsapi.MemoryStats{ + Time: testTime, + WorkingSetBytes: uint64Ptr(1000), + }, + }, + }, + }, + }, + }, + summaryErr: nil, + expectedMetrics: ` + # HELP container_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the container in core-seconds + # TYPE container_cpu_usage_seconds_total counter + container_cpu_usage_seconds_total{container="container_a",namespace="namespace_b",pod="pod_b"} 10 1624396278302 + # HELP container_memory_working_set_bytes [ALPHA] Current working set of the container in bytes + # TYPE container_memory_working_set_bytes gauge + container_memory_working_set_bytes{container="container_a",namespace="namespace_b",pod="pod_b"} 1000 1624396278302 + # HELP container_start_time_seconds [ALPHA] Start time of the container since unix epoch in seconds + # TYPE container_start_time_seconds gauge + container_start_time_seconds{container="container_a",namespace="namespace_a",pod="pod_a"} 1.6243962483020916e+09 1624396248302 + container_start_time_seconds{container="container_a",namespace="namespace_b",pod="pod_b"} 1.6243956783020916e+09 1624395678302 + # HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise + # TYPE scrape_error gauge + scrape_error 0 + `, + }, { name: "arbitrary pod metrics", summary: &statsapi.Summary{ @@ -202,6 +286,33 @@ func TestCollectResourceMetrics(t *testing.T) { pod_memory_working_set_bytes{namespace="namespace_a",pod="pod_a"} 1000 1624396278302 `, }, + { + name: "nil pod metrics", + summary: &statsapi.Summary{ + Pods: []statsapi.PodStats{ + { + PodRef: statsapi.PodReference{ + Name: "pod_a", + Namespace: "namespace_a", + }, + CPU: &statsapi.CPUStats{ + Time: testTime, + UsageCoreNanoSeconds: nil, + }, + Memory: &statsapi.MemoryStats{ + Time: testTime, + WorkingSetBytes: nil, + }, + }, + }, + }, + summaryErr: nil, + expectedMetrics: ` + # HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise + # TYPE scrape_error gauge + scrape_error 0 + `, + }, } for _, test := range tests {