Work-around for missing metrics on CRI-O exited containers

HPA needs metrics for exited init containers before it will take action. By setting memory and CPU usage to zero for any containers that cAdvisor didn't provide statistics for, we are assured that HPA will be able to correctly calculate pod resource usage.
2025-09-26 12:46:06 +00:00 · 2020-03-02 08:08:35 -07:00
parent 0ad60b35c1
commit da988294ec
3 changed files with 82 additions and 4 deletions
--- a/pkg/kubelet/stats/cadvisor_stats_provider_test.go
+++ b/pkg/kubelet/stats/cadvisor_stats_provider_test.go
@@ -89,6 +89,9 @@ func TestCadvisorListPodStats(t *testing.T) {
 		seedPod1Container     = 4000
 		seedPod2Infra         = 5000
 		seedPod2Container     = 6000
+		seedPod3Infra         = 7000
+		seedPod3Container0    = 8000
+		seedPod3Container1    = 8001
 		seedEphemeralVolume1  = 10000
 		seedEphemeralVolume2  = 10001
 		seedPersistentVolume1 = 20000
@@ -98,12 +101,15 @@ func TestCadvisorListPodStats(t *testing.T) {
 		pName0 = "pod0"
 		pName1 = "pod1"
 		pName2 = "pod0" // ensure pName2 conflicts with pName0, but is in a different namespace
+		pName3 = "pod3"
 	)
 	const (
 		cName00 = "c0"
 		cName01 = "c1"
 		cName10 = "c0" // ensure cName10 conflicts with cName02, but is in a different pod
 		cName20 = "c1" // ensure cName20 conflicts with cName01, but is in a different pod + namespace
+		cName30 = "c0-init"
+		cName31 = "c1"
 	)
 	const (
 		rootfsCapacity    = uint64(10000000)
@@ -119,6 +125,7 @@ func TestCadvisorListPodStats(t *testing.T) {
 	prf0 := statsapi.PodReference{Name: pName0, Namespace: namespace0, UID: "UID" + pName0}
 	prf1 := statsapi.PodReference{Name: pName1, Namespace: namespace0, UID: "UID" + pName1}
 	prf2 := statsapi.PodReference{Name: pName2, Namespace: namespace2, UID: "UID" + pName2}
+	prf3 := statsapi.PodReference{Name: pName3, Namespace: namespace0, UID: "UID" + pName3}
 	infos := map[string]cadvisorapiv2.ContainerInfo{
 		"/":              getTestContainerInfo(seedRoot, "", "", ""),
 		"/docker-daemon": getTestContainerInfo(seedRuntime, "", "", ""),
@@ -136,6 +143,10 @@ func TestCadvisorListPodStats(t *testing.T) {
 		"/pod2-c0":                       getTestContainerInfo(seedPod2Container, pName2, namespace2, cName20),
 		"/kubepods/burstable/podUIDpod0": getTestContainerInfo(seedPod0Infra, pName0, namespace0, leaky.PodInfraContainerName),
 		"/kubepods/podUIDpod1":           getTestContainerInfo(seedPod1Infra, pName1, namespace0, leaky.PodInfraContainerName),
+		// Pod3 - Namespace0
+		"/pod3-i":       getTestContainerInfo(seedPod3Infra, pName3, namespace0, leaky.PodInfraContainerName),
+		"/pod3-c0-init": getTestContainerInfo(seedPod3Container0, pName3, namespace0, cName30),
+		"/pod3-c1":      getTestContainerInfo(seedPod3Container1, pName3, namespace0, cName31),
 	}

 	freeRootfsInodes := rootfsInodesFree
@@ -169,6 +180,21 @@ func TestCadvisorListPodStats(t *testing.T) {
 		info.Spec.Memory.Limit = memoryLimitOverride
 		infos[name] = info
 	}
+	// any container for which cadvisor should return no stats (as might be the case for an exited init container)
+	nostatsOverrides := []string{
+		"/pod3-c0-init",
+	}
+	for _, name := range nostatsOverrides {
+		info, found := infos[name]
+		if !found {
+			t.Errorf("No container defined with name %v", name)
+		}
+		info.Spec.Memory = cadvisorapiv2.MemorySpec{}
+		info.Spec.Cpu = cadvisorapiv2.CpuSpec{}
+		info.Spec.HasMemory = false
+		info.Spec.HasCpu = false
+		infos[name] = info
+	}

 	options := cadvisorapiv2.RequestOptions{
 		IdType:    cadvisorapiv2.TypeName,
@@ -197,10 +223,12 @@ func TestCadvisorListPodStats(t *testing.T) {
 	p0Time := metav1.Now()
 	p1Time := metav1.Now()
 	p2Time := metav1.Now()
+	p3Time := metav1.Now()
 	mockStatus := new(statustest.MockStatusProvider)
 	mockStatus.On("GetPodStatus", types.UID("UID"+pName0)).Return(v1.PodStatus{StartTime: &p0Time}, true)
 	mockStatus.On("GetPodStatus", types.UID("UID"+pName1)).Return(v1.PodStatus{StartTime: &p1Time}, true)
 	mockStatus.On("GetPodStatus", types.UID("UID"+pName2)).Return(v1.PodStatus{StartTime: &p2Time}, true)
+	mockStatus.On("GetPodStatus", types.UID("UID"+pName3)).Return(v1.PodStatus{StartTime: &p3Time}, true)

 	resourceAnalyzer := &fakeResourceAnalyzer{podVolumeStats: volumeStats}

@@ -208,7 +236,7 @@ func TestCadvisorListPodStats(t *testing.T) {
 	pods, err := p.ListPodStats()
 	assert.NoError(t, err)

-	assert.Equal(t, 3, len(pods))
+	assert.Equal(t, 4, len(pods))
 	indexPods := make(map[statsapi.PodReference]statsapi.PodStats, len(pods))
 	for _, pod := range pods {
 		indexPods[pod.PodRef] = pod
@@ -261,6 +289,24 @@ func TestCadvisorListPodStats(t *testing.T) {
 	checkCPUStats(t, "Pod2Container0", seedPod2Container, con.CPU)
 	checkMemoryStats(t, "Pod2Container0", seedPod2Container, infos["/pod2-c0"], con.Memory)
 	checkNetworkStats(t, "Pod2", seedPod2Infra, ps.Network)
+
+	// Validate Pod3 Results
+
+	ps, found = indexPods[prf3]
+	assert.True(t, found)
+	assert.Len(t, ps.Containers, 2)
+	indexCon = make(map[string]statsapi.ContainerStats, len(ps.Containers))
+	for _, con := range ps.Containers {
+		indexCon[con.Name] = con
+	}
+	con = indexCon[cName31]
+	assert.Equal(t, cName31, con.Name)
+	checkCPUStats(t, "Pod3Container1", seedPod3Container1, con.CPU)
+	checkMemoryStats(t, "Pod3Container1", seedPod3Container1, infos["/pod3-c1"], con.Memory)
+	con = indexCon[cName30]
+	assert.Equal(t, cName30, con.Name)
+	checkEmptyCPUStats(t, "Pod3Container0", seedPod3Container0, con.CPU)
+	checkEmptyMemoryStats(t, "Pod3Container0", seedPod3Container0, infos["/pod3-c0-init"], con.Memory)
 }

 func TestCadvisorListPodCPUAndMemoryStats(t *testing.T) {
--- a/pkg/kubelet/stats/helper.go
+++ b/pkg/kubelet/stats/helper.go
@@ -40,10 +40,12 @@ func cadvisorInfoToCPUandMemoryStats(info *cadvisorapiv2.ContainerInfo) (*statsa
 	}
 	var cpuStats *statsapi.CPUStats
 	var memoryStats *statsapi.MemoryStats
+	cpuStats = &statsapi.CPUStats{
+		Time:                 metav1.NewTime(cstat.Timestamp),
+		UsageNanoCores:       uint64Ptr(0),
+		UsageCoreNanoSeconds: uint64Ptr(0),
+	}
 	if info.Spec.HasCpu {
-		cpuStats = &statsapi.CPUStats{
-			Time: metav1.NewTime(cstat.Timestamp),
-		}
 		if cstat.CpuInst != nil {
 			cpuStats.UsageNanoCores = &cstat.CpuInst.Usage.Total
 		}
@@ -67,6 +69,11 @@ func cadvisorInfoToCPUandMemoryStats(info *cadvisorapiv2.ContainerInfo) (*statsa
 			availableBytes := info.Spec.Memory.Limit - cstat.Memory.WorkingSet
 			memoryStats.AvailableBytes = &availableBytes
 		}
+	} else {
+		memoryStats = &statsapi.MemoryStats{
+			Time:            metav1.NewTime(cstat.Timestamp),
+			WorkingSetBytes: uint64Ptr(0),
+		}
 	}
 	return cpuStats, memoryStats
 }
--- a/pkg/kubelet/stats/stats_provider_test.go
+++ b/pkg/kubelet/stats/stats_provider_test.go
@@ -621,7 +621,32 @@ func checkNetworkStats(t *testing.T, label string, seed int, stats *statsapi.Net

 }

+// container which had no stats should have zero-valued CPU usage
+func checkEmptyCPUStats(t *testing.T, label string, seed int, stats *statsapi.CPUStats) {
+	require.NotNil(t, stats.Time, label+".CPU.Time")
+	require.NotNil(t, stats.UsageNanoCores, label+".CPU.UsageNanoCores")
+	require.NotNil(t, stats.UsageNanoCores, label+".CPU.UsageCoreSeconds")
+	assert.EqualValues(t, testTime(timestamp, seed).Unix(), stats.Time.Time.Unix(), label+".CPU.Time")
+	assert.EqualValues(t, 0, *stats.UsageNanoCores, label+".CPU.UsageCores")
+	assert.EqualValues(t, 0, *stats.UsageCoreNanoSeconds, label+".CPU.UsageCoreSeconds")
+}
+
+// container which had no stats should have zero-valued Memory usage
+func checkEmptyMemoryStats(t *testing.T, label string, seed int, info cadvisorapiv2.ContainerInfo, stats *statsapi.MemoryStats) {
+	assert.EqualValues(t, testTime(timestamp, seed).Unix(), stats.Time.Time.Unix(), label+".Mem.Time")
+	require.NotNil(t, stats.WorkingSetBytes, label+".Mem.WorkingSetBytes")
+	assert.EqualValues(t, 0, *stats.WorkingSetBytes, label+".Mem.WorkingSetBytes")
+	assert.Nil(t, stats.UsageBytes, label+".Mem.UsageBytes")
+	assert.Nil(t, stats.RSSBytes, label+".Mem.RSSBytes")
+	assert.Nil(t, stats.PageFaults, label+".Mem.PageFaults")
+	assert.Nil(t, stats.MajorPageFaults, label+".Mem.MajorPageFaults")
+	assert.Nil(t, stats.AvailableBytes, label+".Mem.AvailableBytes")
+}
+
 func checkCPUStats(t *testing.T, label string, seed int, stats *statsapi.CPUStats) {
+	require.NotNil(t, stats.Time, label+".CPU.Time")
+	require.NotNil(t, stats.UsageNanoCores, label+".CPU.UsageNanoCores")
+	require.NotNil(t, stats.UsageNanoCores, label+".CPU.UsageCoreSeconds")
 	assert.EqualValues(t, testTime(timestamp, seed).Unix(), stats.Time.Time.Unix(), label+".CPU.Time")
 	assert.EqualValues(t, seed+offsetCPUUsageCores, *stats.UsageNanoCores, label+".CPU.UsageCores")
 	assert.EqualValues(t, seed+offsetCPUUsageCoreSeconds, *stats.UsageCoreNanoSeconds, label+".CPU.UsageCoreSeconds")