From 50e430b3e909544159ed8cc703939dabc4511ae7 Mon Sep 17 00:00:00 2001 From: Sascha Grunert Date: Mon, 29 Jul 2024 10:36:33 +0200 Subject: [PATCH] Fix kubelet cadvisor stats runtime panic Fixing a kubelet runtime panic when the runtime returns incomplete data: ``` E0729 08:17:47.260393 5218 panic.go:115] "Observed a panic" panic="runtime error: index out of range [0] with length 0" panicGoValue="runtime.boundsError{x:0, y:0, signed:true, code:0x0}" stacktrace=< goroutine 174 [running]: k8s.io/apimachinery/pkg/util/runtime.logPanic({0x33631e8, 0x4ddf5c0}, {0x2c9bfe0, 0xc000a563f0}) k8s.io/apimachinery/pkg/util/runtime/runtime.go:107 +0xbc k8s.io/apimachinery/pkg/util/runtime.handleCrash({0x33631e8, 0x4ddf5c0}, {0x2c9bfe0, 0xc000a563f0}, {0x4ddf5c0, 0x0, 0x10000000043c9e5?}) k8s.io/apimachinery/pkg/util/runtime/runtime.go:82 +0x5e k8s.io/apimachinery/pkg/util/runtime.HandleCrash({0x0, 0x0, 0xc000ae08c0?}) k8s.io/apimachinery/pkg/util/runtime/runtime.go:59 +0x108 panic({0x2c9bfe0?, 0xc000a563f0?}) runtime/panic.go:785 +0x132 k8s.io/kubernetes/pkg/kubelet/stats.(*cadvisorStatsProvider).ImageFsStats(0xc000535d10, {0x3363348, 0xc000afa330}) k8s.io/kubernetes/pkg/kubelet/stats/cadvisor_stats_provider.go:277 +0xaba k8s.io/kubernetes/pkg/kubelet/images.(*realImageGCManager).GarbageCollect(0xc000a3c820, {0x33631e8?, 0x4ddf5c0?}, {0x0?, 0x0?, 0x4dbca20?}) k8s.io/kubernetes/pkg/kubelet/images/image_gc_manager.go:354 +0x1d3 k8s.io/kubernetes/pkg/kubelet.(*Kubelet).StartGarbageCollection.func2() k8s.io/kubernetes/pkg/kubelet/kubelet.go:1472 +0x58 k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1(0x30?) 
k8s.io/apimachinery/pkg/util/wait/backoff.go:226 +0x33 k8s.io/apimachinery/pkg/util/wait.BackoffUntil(0xc000add110, {0x3330380, 0xc000afa300}, 0x1, 0xc0000ac150) k8s.io/apimachinery/pkg/util/wait/backoff.go:227 +0xaf k8s.io/apimachinery/pkg/util/wait.JitterUntil(0xc000add110, 0x45d964b800, 0x0, 0x1, 0xc0000ac150) k8s.io/apimachinery/pkg/util/wait/backoff.go:204 +0x7f k8s.io/apimachinery/pkg/util/wait.Until(...) k8s.io/apimachinery/pkg/util/wait/backoff.go:161 created by k8s.io/kubernetes/pkg/kubelet.(*Kubelet).StartGarbageCollection in goroutine 1 k8s.io/kubernetes/pkg/kubelet/kubelet.go:1470 +0x247 ``` This commit fixes panics if: - `len(imageStats.ImageFilesystems) == 0` - `len(imageStats.ContainerFilesystems) == 0` - `imageStats.ImageFilesystems[0].FsId == nil` - `imageStats.ContainerFilesystems[0].FsId == nil` - `imageStats.ImageFilesystems[0].UsedBytes == nil` - `imageStats.ContainerFilesystems[0].UsedBytes == nil` It also fixes the wrapped `nil` error for the check `err != nil || imageStats == nil` when `imageStats == nil`. 
Signed-off-by: Sascha Grunert --- pkg/kubelet/stats/cadvisor_stats_provider.go | 27 +++++-- .../stats/cadvisor_stats_provider_test.go | 81 +++++++++++++++++++ 2 files changed, 100 insertions(+), 8 deletions(-) diff --git a/pkg/kubelet/stats/cadvisor_stats_provider.go b/pkg/kubelet/stats/cadvisor_stats_provider.go index 25f8683d1c1..e4a5d00c55b 100644 --- a/pkg/kubelet/stats/cadvisor_stats_provider.go +++ b/pkg/kubelet/stats/cadvisor_stats_provider.go @@ -270,26 +270,34 @@ func (p *cadvisorStatsProvider) ImageFsStats(ctx context.Context) (imageFsRet *s return imageFs, imageFs, nil } imageStats, err := p.imageService.ImageFsInfo(ctx) - if err != nil || imageStats == nil { - return nil, nil, fmt.Errorf("failed to get image stats: %v", err) + if err != nil { + return nil, nil, fmt.Errorf("failed to get image stats: %w", err) + } + if imageStats == nil || len(imageStats.ImageFilesystems) == 0 || len(imageStats.ContainerFilesystems) == 0 { + return nil, nil, fmt.Errorf("missing image stats: %+v", imageStats) } splitFileSystem := false - if imageStats.ImageFilesystems[0].FsId.Mountpoint != imageStats.ContainerFilesystems[0].FsId.Mountpoint { - klog.InfoS("Detect Split Filesystem", "ImageFilesystems", imageStats.ImageFilesystems[0], "ContainerFilesystems", imageStats.ContainerFilesystems[0]) + imageFs := imageStats.ImageFilesystems[0] + containerFs := imageStats.ContainerFilesystems[0] + if imageFs.FsId != nil && containerFs.FsId != nil && imageFs.FsId.Mountpoint != containerFs.FsId.Mountpoint { + klog.InfoS("Detect Split Filesystem", "ImageFilesystems", imageFs, "ContainerFilesystems", containerFs) splitFileSystem = true } - imageFs := imageStats.ImageFilesystems[0] var imageFsInodesUsed *uint64 if imageFsInfo.Inodes != nil && imageFsInfo.InodesFree != nil { imageFsIU := *imageFsInfo.Inodes - *imageFsInfo.InodesFree imageFsInodesUsed = &imageFsIU } + var usedBytes uint64 + if imageFs.GetUsedBytes() != nil { + usedBytes = imageFs.GetUsedBytes().GetValue() + } fsStats := 
&statsapi.FsStats{ Time: metav1.NewTime(imageFsInfo.Timestamp), AvailableBytes: &imageFsInfo.Available, CapacityBytes: &imageFsInfo.Capacity, - UsedBytes: &imageFs.UsedBytes.Value, + UsedBytes: &usedBytes, InodesFree: imageFsInfo.InodesFree, Inodes: imageFsInfo.Inodes, InodesUsed: imageFsInodesUsed, @@ -305,18 +313,21 @@ func (p *cadvisorStatsProvider) ImageFsStats(ctx context.Context) (imageFsRet *s return nil, nil, fmt.Errorf("failed to get container fs info: %w", err) } - containerFs := imageStats.ContainerFilesystems[0] var containerFsInodesUsed *uint64 if containerFsInfo.Inodes != nil && containerFsInfo.InodesFree != nil { containerFsIU := *containerFsInfo.Inodes - *containerFsInfo.InodesFree containerFsInodesUsed = &containerFsIU } + var usedContainerBytes uint64 + if containerFs.GetUsedBytes() != nil { + usedContainerBytes = containerFs.GetUsedBytes().GetValue() + } fsContainerStats := &statsapi.FsStats{ Time: metav1.NewTime(containerFsInfo.Timestamp), AvailableBytes: &containerFsInfo.Available, CapacityBytes: &containerFsInfo.Capacity, - UsedBytes: &containerFs.UsedBytes.Value, + UsedBytes: &usedContainerBytes, InodesFree: containerFsInfo.InodesFree, Inodes: containerFsInfo.Inodes, InodesUsed: containerFsInodesUsed, diff --git a/pkg/kubelet/stats/cadvisor_stats_provider_test.go b/pkg/kubelet/stats/cadvisor_stats_provider_test.go index bff66f7acca..4cf988f4fe1 100644 --- a/pkg/kubelet/stats/cadvisor_stats_provider_test.go +++ b/pkg/kubelet/stats/cadvisor_stats_provider_test.go @@ -23,6 +23,7 @@ import ( cadvisorapiv2 "github.com/google/cadvisor/info/v2" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -546,6 +547,86 @@ func TestCadvisorImagesFsStatsKubeletSeparateDiskOff(t *testing.T) { assert.Equal(*imageFsInfo.Inodes-*imageFsInfo.InodesFree, *stats.InodesUsed) } +func TestImageFsStatsCustomResponse(t *testing.T) { + 
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, true) + for desc, tc := range map[string]struct { + response *runtimeapi.ImageFsInfoResponse + callContainerFsInfo, shouldErr bool + }{ + "image stats are nil": { + shouldErr: true, + }, + "no image filesystems in image stats": { + response: &runtimeapi.ImageFsInfoResponse{ + ImageFilesystems: []*runtimeapi.FilesystemUsage{}, + ContainerFilesystems: []*runtimeapi.FilesystemUsage{{}}, + }, + shouldErr: true, + }, + "no container filesystems in image stats": { + response: &runtimeapi.ImageFsInfoResponse{ + ImageFilesystems: []*runtimeapi.FilesystemUsage{{}}, + ContainerFilesystems: []*runtimeapi.FilesystemUsage{}, + }, + shouldErr: true, + }, + "image and container filesystem identifiers are nil": { + response: &runtimeapi.ImageFsInfoResponse{ + ImageFilesystems: []*runtimeapi.FilesystemUsage{{}}, + ContainerFilesystems: []*runtimeapi.FilesystemUsage{{}}, + }, + shouldErr: false, + }, + "using different mountpoints but no used bytes set": { + response: &runtimeapi.ImageFsInfoResponse{ + ImageFilesystems: []*runtimeapi.FilesystemUsage{{ + FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "mnt-1"}, + }}, + ContainerFilesystems: []*runtimeapi.FilesystemUsage{{ + FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "mnt-2"}, + }}, + }, + callContainerFsInfo: true, + shouldErr: false, + }, + "using different mountpoints and set used bytes": { + response: &runtimeapi.ImageFsInfoResponse{ + ImageFilesystems: []*runtimeapi.FilesystemUsage{{ + FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "mnt-1"}, + UsedBytes: &runtimeapi.UInt64Value{Value: 10}, + }}, + ContainerFilesystems: []*runtimeapi.FilesystemUsage{{ + FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "mnt-2"}, + UsedBytes: &runtimeapi.UInt64Value{Value: 20}, + }}, + }, + callContainerFsInfo: true, + shouldErr: false, + }, + } { + ctx := context.Background() + mockCadvisor := 
cadvisortest.NewMockInterface(t) + mockRuntime := containertest.NewMockRuntime(t) + + res := getTestFsInfo(1000) + mockCadvisor.EXPECT().ImagesFsInfo().Return(res, nil) + mockRuntime.EXPECT().ImageFsInfo(ctx).Return(tc.response, nil) + if tc.callContainerFsInfo { + mockCadvisor.EXPECT().ContainerFsInfo().Return(res, nil) + } + + provider := newCadvisorStatsProvider(mockCadvisor, &fakeResourceAnalyzer{}, mockRuntime, nil, NewFakeHostStatsProvider()) + stats, containerfs, err := provider.ImageFsStats(ctx) + if tc.shouldErr { + require.Error(t, err, desc) + assert.Nil(t, stats) + assert.Nil(t, containerfs) + } else { + assert.NoError(t, err, desc) + } + } +} + func TestCadvisorImagesFsStats(t *testing.T) { ctx := context.Background() var (