Fix issue in split filesystem detection where layers are stored on the same disk but under different paths

Kevin Hannon 2024-08-06 12:27:43 -04:00
parent 60c4c2b252
commit aea0b90652
9 changed files with 123 additions and 146 deletions
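The root cause: the old split-filesystem check compared CRI-reported mountpoints, so two directories on one disk (for example, a runtime configured to keep images under a separate directory) were misread as a split filesystem. The fix compares backing devices instead. As a rough standalone illustration of the principle (not code from this commit; the paths are hypothetical), device IDs distinguish a true split where paths and mountpoints cannot:

package main

import (
	"fmt"
	"os"
	"syscall"
)

// sameDevice reports whether two paths are backed by the same device,
// mirroring the device comparison the kubelet now delegates to cadvisor.
func sameDevice(a, b string) (bool, error) {
	statA, err := os.Stat(a)
	if err != nil {
		return false, err
	}
	statB, err := os.Stat(b)
	if err != nil {
		return false, err
	}
	sysA, okA := statA.Sys().(*syscall.Stat_t)
	sysB, okB := statB.Sys().(*syscall.Stat_t)
	if !okA || !okB {
		return false, fmt.Errorf("no stat_t data (non-Unix platform?)")
	}
	// Two paths on the same disk share a device ID even when their
	// reported mountpoints or directories differ.
	return sysA.Dev == sysB.Dev, nil
}

func main() {
	// Hypothetical layout: images and writeable layers in sibling directories.
	same, err := sameDevice("/var/lib/containers/storage", "/var/lib/containers/extra-storage")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println("same device:", same)
}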

View File

@@ -45,8 +45,6 @@ type GC interface {
GarbageCollect(ctx context.Context) error
// Deletes all unused containers, including containers belonging to pods that are terminated but not deleted
DeleteAllUnusedContainers(ctx context.Context) error
- // IsContainerFsSeparateFromImageFs tells if writeable layer and read-only layer are separate.
- IsContainerFsSeparateFromImageFs(ctx context.Context) bool
}
// SourcesReadyProvider knows how to determine if configuration sources are ready
@@ -88,22 +86,3 @@ func (cgc *realContainerGC) DeleteAllUnusedContainers(ctx context.Context) error
klog.InfoS("Attempting to delete unused containers")
return cgc.runtime.GarbageCollect(ctx, cgc.policy, cgc.sourcesReadyProvider.AllReady(), true)
}
- func (cgc *realContainerGC) IsContainerFsSeparateFromImageFs(ctx context.Context) bool {
- resp, err := cgc.runtime.ImageFsInfo(ctx)
- if err != nil {
- return false
- }
- // These fields can be empty if CRI implementation didn't populate.
- if resp.ContainerFilesystems == nil || resp.ImageFilesystems == nil || len(resp.ContainerFilesystems) == 0 || len(resp.ImageFilesystems) == 0 {
- return false
- }
- // KEP 4191 explains that multiple filesystems for images and containers is not
- // supported at the moment.
- // See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/4191-split-image-filesystem#comment-on-future-extensions
- // for work needed to support multiple filesystems.
- if resp.ContainerFilesystems[0].FsId != nil && resp.ImageFilesystems[0].FsId != nil {
- return resp.ContainerFilesystems[0].FsId.Mountpoint != resp.ImageFilesystems[0].FsId.Mountpoint
- }
- return false
- }
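The removed method above keyed off FsId.Mountpoint. A minimal sketch of the failure mode that invited (illustrative values only; FilesystemUsage and FilesystemIdentifier are the real CRI types used above):

imageFs := &runtimeapi.FilesystemUsage{
	FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "/var/lib/containers/storage"},
}
containerFs := &runtimeapi.FilesystemUsage{
	FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "/var/lib/containers/extra-storage"},
}
// Mountpoints differ, so the old check reported a split filesystem
// even when both directories live on the same disk.
split := imageFs.FsId.Mountpoint != containerFs.FsId.Mountpoint // true
_ = split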

View File

@@ -1,96 +0,0 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package container_test
import (
"context"
"reflect"
"testing"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
. "k8s.io/kubernetes/pkg/kubelet/container"
ctest "k8s.io/kubernetes/pkg/kubelet/container/testing"
)
func TestIsContainerFsSeparateFromImageFs(t *testing.T) {
runtime := &ctest.FakeRuntime{}
fakeSources := ctest.NewFakeReadyProvider()
gcContainer, err := NewContainerGC(runtime, GCPolicy{}, fakeSources)
if err != nil {
t.Errorf("unexpected error")
}
cases := []struct {
name string
containerFs []*runtimeapi.FilesystemUsage
imageFs []*runtimeapi.FilesystemUsage
writeableSeparateFromReadOnly bool
}{
{
name: "Only images",
imageFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}},
writeableSeparateFromReadOnly: false,
},
{
name: "images and containers",
imageFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}},
containerFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "container"}}},
writeableSeparateFromReadOnly: true,
},
{
name: "same filesystem",
imageFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}},
containerFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}},
writeableSeparateFromReadOnly: false,
},
{
name: "Only containers",
containerFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}},
writeableSeparateFromReadOnly: false,
},
{
name: "neither are specified",
writeableSeparateFromReadOnly: false,
},
{
name: "both are empty arrays",
writeableSeparateFromReadOnly: false,
containerFs: []*runtimeapi.FilesystemUsage{},
imageFs: []*runtimeapi.FilesystemUsage{},
},
{
name: "FsId does not exist",
writeableSeparateFromReadOnly: false,
containerFs: []*runtimeapi.FilesystemUsage{{UsedBytes: &runtimeapi.UInt64Value{Value: 10}}},
imageFs: []*runtimeapi.FilesystemUsage{{UsedBytes: &runtimeapi.UInt64Value{Value: 10}}},
},
}
for _, tc := range cases {
runtime.SetContainerFsStats(tc.containerFs)
runtime.SetImageFsStats(tc.imageFs)
actualCommand := gcContainer.IsContainerFsSeparateFromImageFs(context.TODO())
if e, a := tc.writeableSeparateFromReadOnly, actualCommand; !reflect.DeepEqual(e, a) {
t.Errorf("%v: unexpected value; expected %v, got %v", tc.name, e, a)
}
runtime.SetContainerFsStats(nil)
runtime.SetImageFsStats(nil)
}
}

View File

@@ -252,13 +252,17 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// build the ranking functions (if not yet known)
// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
if m.dedicatedImageFs == nil {
- hasImageFs, splitDiskError := diskInfoProvider.HasDedicatedImageFs(ctx)
- if splitDiskError != nil {
- klog.ErrorS(splitDiskError, "Eviction manager: failed to get HasDedicatedImageFs")
- return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedImageFs: %v", splitDiskError)
+ hasImageFs, imageFsErr := diskInfoProvider.HasDedicatedImageFs(ctx)
+ if imageFsErr != nil {
+ klog.ErrorS(imageFsErr, "Eviction manager: failed to get HasDedicatedImageFs")
+ return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedImageFs: %w", imageFsErr)
}
m.dedicatedImageFs = &hasImageFs
- splitContainerImageFs := m.containerGC.IsContainerFsSeparateFromImageFs(ctx)
+ splitContainerImageFs, splitErr := diskInfoProvider.HasDedicatedContainerFs(ctx)
+ if splitErr != nil {
+ klog.ErrorS(splitErr, "Eviction manager: failed to get HasDedicatedContainerFs")
+ return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedContainerFs: %w", splitErr)
+ }
// If we are a split filesystem but the feature is turned off
// we should return an error.
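The hunk is truncated here, but the comment describes gating on the feature gate. A hedged sketch of what that check plausibly looks like (the gate name KubeletSeparateDiskGC does appear in this commit's tests; the exact error message is an assumption):

if splitContainerImageFs && !utilfeature.DefaultFeatureGate.Enabled(features.KubeletSeparateDiskGC) {
	// Assumed shape: a split writeable/read-only layout without the
	// feature gate enabled is an unsupported configuration.
	return nil, fmt.Errorf("eviction manager: KubeletSeparateDiskGC is disabled but the container filesystem is split from the image filesystem")
}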

View File

@@ -66,7 +66,8 @@ func (m *mockPodKiller) killPodNow(pod *v1.Pod, evict bool, gracePeriodOverride
// mockDiskInfoProvider is used to simulate a disk info provider in tests.
type mockDiskInfoProvider struct {
- dedicatedImageFs *bool
+ dedicatedImageFs *bool
+ dedicatedContainerFs *bool
}
// HasDedicatedImageFs returns the mocked value
@@ -74,14 +75,18 @@ func (m *mockDiskInfoProvider) HasDedicatedImageFs(_ context.Context) (bool, err
return ptr.Deref(m.dedicatedImageFs, false), nil
}
+ // HasDedicatedContainerFs returns the mocked value
+ func (m *mockDiskInfoProvider) HasDedicatedContainerFs(_ context.Context) (bool, error) {
+ return ptr.Deref(m.dedicatedContainerFs, false), nil
+ }
// mockDiskGC is used to simulate invoking image and container garbage collection.
type mockDiskGC struct {
- err error
- imageGCInvoked bool
- containerGCInvoked bool
- readAndWriteSeparate bool
- fakeSummaryProvider *fakeSummaryProvider
- summaryAfterGC *statsapi.Summary
+ err error
+ imageGCInvoked bool
+ containerGCInvoked bool
+ fakeSummaryProvider *fakeSummaryProvider
+ summaryAfterGC *statsapi.Summary
}
// DeleteUnusedImages returns the mocked values.
@@ -102,10 +107,6 @@ func (m *mockDiskGC) DeleteAllUnusedContainers(_ context.Context) error {
return m.err
}
- func (m *mockDiskGC) IsContainerFsSeparateFromImageFs(_ context.Context) bool {
- return m.readAndWriteSeparate
- }
func makePodWithMemoryStats(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, memoryWorkingSet string) (*v1.Pod, statsapi.PodStats) {
pod := newPod(name, priority, []v1.Container{
newContainer(name, requests, limits),
@@ -384,7 +385,7 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) {
fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
- diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)}
+ diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false), dedicatedContainerFs: ptr.To(false)}
diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
@@ -562,8 +563,8 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
- diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs}
- diskGC := &mockDiskGC{err: nil, readAndWriteSeparate: tc.writeableSeparateFromReadOnly}
+ diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly}
+ diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@@ -1307,8 +1308,8 @@ func TestDiskPressureNodeFs(t *testing.T) {
fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
- diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs}
- diskGC := &mockDiskGC{err: nil, readAndWriteSeparate: tc.writeableSeparateFromReadOnly}
+ diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly}
+ diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@@ -1829,7 +1830,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
- diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs}
+ diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly}
nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@@ -1846,7 +1847,7 @@
// This is a constant that we use to test that disk pressure is over. Don't change!
diskStatConst := diskStatStart
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStatStart)}
- diskGC := &mockDiskGC{fakeSummaryProvider: summaryProvider, err: nil, readAndWriteSeparate: tc.writeableSeparateFromReadOnly}
+ diskGC := &mockDiskGC{fakeSummaryProvider: summaryProvider, err: nil}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
@@ -2292,8 +2293,8 @@ func TestInodePressureFsInodes(t *testing.T) {
fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
- diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs}
- diskGC := &mockDiskGC{err: nil, readAndWriteSeparate: tc.writeableSeparateFromReadOnly}
+ diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly}
+ diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{

View File

@@ -75,6 +75,8 @@ type Manager interface {
type DiskInfoProvider interface {
// HasDedicatedImageFs returns true if the imagefs is on a separate device from the rootfs.
HasDedicatedImageFs(ctx context.Context) (bool, error)
+ // HasDedicatedContainerFs returns true if the container fs is on a separate device from the image fs.
+ HasDedicatedContainerFs(ctx context.Context) (bool, error)
}
// ImageGC is responsible for performing garbage collection of unused images.
@@ -87,8 +89,6 @@ type ImageGC interface {
type ContainerGC interface {
// DeleteAllUnusedContainers deletes all unused containers, even those that belong to pods that are terminated, but not deleted.
DeleteAllUnusedContainers(ctx context.Context) error
- // IsContainerFsSeparateFromImageFs checks if container filesystem is split from image filesystem.
- IsContainerFsSeparateFromImageFs(ctx context.Context) bool
}
// KillPodFunc kills a pod.

View File

@@ -276,11 +276,11 @@ func (p *cadvisorStatsProvider) ImageFsStats(ctx context.Context) (imageFsRet *s
if imageStats == nil || len(imageStats.ImageFilesystems) == 0 || len(imageStats.ContainerFilesystems) == 0 {
return nil, nil, fmt.Errorf("missing image stats: %+v", imageStats)
}
- splitFileSystem := false
imageFs := imageStats.ImageFilesystems[0]
containerFs := imageStats.ContainerFilesystems[0]
- if imageFs.FsId != nil && containerFs.FsId != nil && imageFs.FsId.Mountpoint != containerFs.FsId.Mountpoint {
- klog.InfoS("Detect Split Filesystem", "ImageFilesystems", imageFs, "ContainerFilesystems", containerFs)
- splitFileSystem = true
- }
var imageFsInodesUsed *uint64
@@ -312,6 +312,12 @@ func (p *cadvisorStatsProvider) ImageFsStats(ctx context.Context) (imageFsRet *s
if err != nil {
return nil, nil, fmt.Errorf("failed to get container fs info: %w", err)
}
+ // ImageFs and ContainerFs could be on different paths on the same device.
+ if containerFsInfo.Device == imageFsInfo.Device {
+ return fsStats, fsStats, nil
+ }
+ klog.InfoS("Detect Split Filesystem", "ImageFilesystems", imageStats.ImageFilesystems[0], "ContainerFilesystems", imageStats.ContainerFilesystems[0])
var containerFsInodesUsed *uint64
if containerFsInfo.Inodes != nil && containerFsInfo.InodesFree != nil {
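With the early return added above, callers see identical stats for both filesystems when the devices match. A caller-side sketch (assumed usage, not from this commit):

imageFsStats, containerFsStats, err := provider.ImageFsStats(ctx)
if err == nil && imageFsStats == containerFsStats {
	// Same-device case: ImageFsStats returned one *statsapi.FsStats for
	// both, so eviction treats image and container usage as one filesystem.
}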

View File

@@ -680,7 +680,7 @@ func TestCadvisorSplitImagesFsStats(t *testing.T) {
mockRuntime = containertest.NewMockRuntime(t)
seed = 1000
- imageFsInfo = getTestFsInfo(seed)
+ imageFsInfo = getTestFsInfoWithDifferentMount(seed, "image")
containerSeed = 1001
containerFsInfo = getTestFsInfo(containerSeed)
)
@@ -723,7 +723,59 @@
assert.Equal(containerFsInfo.InodesFree, containerfs.InodesFree)
assert.Equal(containerFsInfo.Inodes, containerfs.Inodes)
assert.Equal(*containerFsInfo.Inodes-*containerFsInfo.InodesFree, *containerfs.InodesUsed)
}
+ func TestCadvisorSameDiskDifferentLocations(t *testing.T) {
+ ctx := context.Background()
+ var (
+ assert = assert.New(t)
+ mockCadvisor = cadvisortest.NewMockInterface(t)
+ mockRuntime = containertest.NewMockRuntime(t)
+ seed = 1000
+ imageFsInfo = getTestFsInfo(seed)
+ containerSeed = 1001
+ containerFsInfo = getTestFsInfo(containerSeed)
+ )
+ imageFsInfoCRI := &runtimeapi.FilesystemUsage{
+ Timestamp: imageFsInfo.Timestamp.Unix(),
+ FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "images"},
+ UsedBytes: &runtimeapi.UInt64Value{Value: imageFsInfo.Usage},
+ InodesUsed: &runtimeapi.UInt64Value{Value: *imageFsInfo.Inodes},
+ }
+ containerFsInfoCRI := &runtimeapi.FilesystemUsage{
+ Timestamp: containerFsInfo.Timestamp.Unix(),
+ FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "containers"},
+ UsedBytes: &runtimeapi.UInt64Value{Value: containerFsInfo.Usage},
+ InodesUsed: &runtimeapi.UInt64Value{Value: *containerFsInfo.Inodes},
+ }
+ imageFsInfoResponse := &runtimeapi.ImageFsInfoResponse{
+ ImageFilesystems: []*runtimeapi.FilesystemUsage{imageFsInfoCRI},
+ ContainerFilesystems: []*runtimeapi.FilesystemUsage{containerFsInfoCRI},
+ }
+ mockCadvisor.EXPECT().ImagesFsInfo().Return(imageFsInfo, nil)
+ mockCadvisor.EXPECT().ContainerFsInfo().Return(containerFsInfo, nil)
+ mockRuntime.EXPECT().ImageFsInfo(ctx).Return(imageFsInfoResponse, nil)
+ featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, true)
+ provider := newCadvisorStatsProvider(mockCadvisor, &fakeResourceAnalyzer{}, mockRuntime, nil, NewFakeHostStatsProvider())
+ stats, containerfs, err := provider.ImageFsStats(ctx)
+ require.NoError(t, err, "imageFsStats should have no error")
+ assert.Equal(imageFsInfo.Timestamp, stats.Time.Time)
+ assert.Equal(imageFsInfo.Available, *stats.AvailableBytes)
+ assert.Equal(imageFsInfo.Capacity, *stats.CapacityBytes)
+ assert.Equal(imageFsInfo.InodesFree, stats.InodesFree)
+ assert.Equal(imageFsInfo.Inodes, stats.Inodes)
+ assert.Equal(*imageFsInfo.Inodes-*imageFsInfo.InodesFree, *stats.InodesUsed)
+ assert.Equal(imageFsInfo.Timestamp, containerfs.Time.Time)
+ assert.Equal(imageFsInfo.Available, *containerfs.AvailableBytes)
+ assert.Equal(imageFsInfo.Capacity, *containerfs.CapacityBytes)
+ assert.Equal(imageFsInfo.InodesFree, containerfs.InodesFree)
+ assert.Equal(imageFsInfo.Inodes, containerfs.Inodes)
+ assert.Equal(*imageFsInfo.Inodes-*imageFsInfo.InodesFree, *containerfs.InodesUsed)
+ }
func TestCadvisorListPodStatsWhenContainerLogFound(t *testing.T) {

View File

@@ -197,6 +197,20 @@ func (p *Provider) HasDedicatedImageFs(ctx context.Context) (bool, error) {
return device != rootFsInfo.Device, nil
}
+ // HasDedicatedContainerFs returns true if the container filesystem is on a separate device from the image filesystem.
+ // KEP Issue Number 4191: Enhanced this to allow for the containers to be separate from images.
+ func (p *Provider) HasDedicatedContainerFs(ctx context.Context) (bool, error) {
+ imageFs, err := p.cadvisor.ImagesFsInfo()
+ if err != nil {
+ return false, err
+ }
+ containerFs, err := p.cadvisor.ContainerFsInfo()
+ if err != nil {
+ return false, err
+ }
+ return imageFs.Device != containerFs.Device, nil
+ }
func equalFileSystems(a, b *statsapi.FsStats) bool {
if a == nil || b == nil {
return false
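Taken together with HasDedicatedImageFs above, the new check lets callers classify the node's storage layout. A hedged sketch of the resulting decision table (provider is a *stats.Provider; errors elided for brevity; the combination logic is illustrative, not code from this commit):

dedicatedImageFs, _ := provider.HasDedicatedImageFs(ctx)         // imagefs device != rootfs device
dedicatedContainerFs, _ := provider.HasDedicatedContainerFs(ctx) // containerfs device != imagefs device
switch {
case !dedicatedImageFs && !dedicatedContainerFs:
	// Single filesystem: rootfs, images, and writeable layers share a device.
case dedicatedImageFs && !dedicatedContainerFs:
	// Classic dedicated imagefs: images and writeable layers together, off the rootfs.
default:
	// KEP-4191 split: writeable (container) layer on a different device than images.
}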

View File

@@ -347,6 +347,23 @@ func getTestFsInfo(seed int) cadvisorapiv2.FsInfo {
}
}
+ func getTestFsInfoWithDifferentMount(seed int, device string) cadvisorapiv2.FsInfo {
+ var (
+ inodes = uint64(seed + offsetFsInodes)
+ inodesFree = uint64(seed + offsetFsInodesFree)
+ )
+ return cadvisorapiv2.FsInfo{
+ Timestamp: time.Now(),
+ Device: device,
+ Mountpoint: "test-mount-point",
+ Capacity: uint64(seed + offsetFsCapacity),
+ Available: uint64(seed + offsetFsAvailable),
+ Usage: uint64(seed + offsetFsUsage),
+ Inodes: &inodes,
+ InodesFree: &inodesFree,
+ }
+ }
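This helper pins Device to a caller-chosen name so tests can steer the device comparison directly; getTestFsInfo (not shown here) evidently reports the same device regardless of seed, which is why TestCadvisorSameDiskDifferentLocations uses it for both filesystems and TestCadvisorSplitImagesFsStats switched its image side to this helper. A short usage sketch (hypothetical seeds and device names):

// Distinct devices → the provider detects a true split.
imageInfo := getTestFsInfoWithDifferentMount(1000, "image-disk")
containerInfo := getTestFsInfoWithDifferentMount(1001, "container-disk")
// imageInfo.Device ("image-disk") != containerInfo.Device ("container-disk"),
// so a provider wired with these reports a dedicated container filesystem.
_, _ = imageInfo, containerInfo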
func getPodVolumeStats(seed int, volumeName string) statsapi.VolumeStats {
availableBytes := uint64(seed + offsetFsAvailable)
capacityBytes := uint64(seed + offsetFsCapacity)