diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go
index 816c456ca59..14996843620 100644
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@@ -454,6 +454,12 @@ const (
 	// Enable POD resources API to return allocatable resources
 	KubeletPodResourcesGetAllocatable featuregate.Feature = "KubeletPodResourcesGetAllocatable"
 
+	// KubeletSeparateDiskGC enables the Kubelet to garbage collect images/containers on different filesystems
+	// owner: @kannon92
+	// kep: https://kep.k8s.io/4191
+	// alpha: v1.29
+	KubeletSeparateDiskGC featuregate.Feature = "KubeletSeparateDiskGC"
+
 	// owner: @sallyom
 	// kep: https://kep.k8s.io/2832
 	// alpha: v1.25
@@ -1088,6 +1094,8 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
 	KubeletPodResourcesGetAllocatable: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // GA in 1.28, remove in 1.30
 
+	KubeletSeparateDiskGC: {Default: false, PreRelease: featuregate.Alpha},
+
 	KubeletTracing: {Default: true, PreRelease: featuregate.Beta},
 
 	KubeProxyDrainingTerminatingNodes: {Default: false, PreRelease: featuregate.Alpha},
diff --git a/pkg/kubelet/cadvisor/cadvisor_linux.go b/pkg/kubelet/cadvisor/cadvisor_linux.go
index 4c9dc55cee4..f4857f7cb08 100644
--- a/pkg/kubelet/cadvisor/cadvisor_linux.go
+++ b/pkg/kubelet/cadvisor/cadvisor_linux.go
@@ -186,6 +186,14 @@ func (cc *cadvisorClient) getFsInfo(label string) (cadvisorapiv2.FsInfo, error)
 	return res[0], nil
 }
 
+func (cc *cadvisorClient) ContainerFsInfo() (cadvisorapiv2.FsInfo, error) {
+	label, err := cc.imageFsInfoProvider.ContainerFsInfoLabel()
+	if err != nil {
+		return cadvisorapiv2.FsInfo{}, err
+	}
+	return cc.getFsInfo(label)
+}
+
 func (cc *cadvisorClient) WatchEvents(request *events.Request) (*events.EventChannel, error) {
 	return cc.WatchForEvents(request)
 }
diff --git a/pkg/kubelet/cadvisor/cadvisor_linux_test.go b/pkg/kubelet/cadvisor/cadvisor_linux_test.go
index 144516939ce..bcd961fca8f 100644
--- a/pkg/kubelet/cadvisor/cadvisor_linux_test.go
+++ b/pkg/kubelet/cadvisor/cadvisor_linux_test.go
@@ -57,3 +57,32 @@ func TestImageFsInfoLabel(t *testing.T) {
 		})
 	}
 }
+
+func TestContainerFsInfoLabel(t *testing.T) {
+	testcases := []struct {
+		description     string
+		runtime         string
+		runtimeEndpoint string
+		expectedLabel   string
+		expectedError   error
+	}{{
+		description:     "LabelCrioContainers should be returned",
+		runtimeEndpoint: crio.CrioSocket,
+		expectedLabel:   LabelCrioContainers,
+		expectedError:   nil,
+	}, {
+		description:     "Cannot find valid containerfs label",
+		runtimeEndpoint: "",
+		expectedLabel:   "",
+		expectedError:   fmt.Errorf("no containerfs label for configured runtime"),
+	}}
+
+	for _, tc := range testcases {
+		t.Run(tc.description, func(t *testing.T) {
+			infoProvider := NewImageFsInfoProvider(tc.runtimeEndpoint)
+			label, err := infoProvider.ContainerFsInfoLabel()
+			assert.Equal(t, tc.expectedLabel, label)
+			assert.Equal(t, tc.expectedError, err)
+		})
+	}
+}
diff --git a/pkg/kubelet/cadvisor/cadvisor_unsupported.go b/pkg/kubelet/cadvisor/cadvisor_unsupported.go
index 40113a9ce82..eecf63c910e 100644
--- a/pkg/kubelet/cadvisor/cadvisor_unsupported.go
+++ b/pkg/kubelet/cadvisor/cadvisor_unsupported.go
@@ -79,6 +79,10 @@ func (cu *cadvisorUnsupported) RootFsInfo() (cadvisorapiv2.FsInfo, error) {
 	return cadvisorapiv2.FsInfo{}, errUnsupported
 }
 
+func (cu *cadvisorUnsupported) ContainerFsInfo() (cadvisorapiv2.FsInfo, error) {
+	return cadvisorapiv2.FsInfo{}, errUnsupported
+}
+
 func (cu *cadvisorUnsupported) WatchEvents(request *events.Request) (*events.EventChannel, error) {
 	return nil, errUnsupported
 }
diff --git a/pkg/kubelet/cadvisor/cadvisor_windows.go b/pkg/kubelet/cadvisor/cadvisor_windows.go
index ded51e7caa1..e09c7a4c990 100644
--- a/pkg/kubelet/cadvisor/cadvisor_windows.go
+++ b/pkg/kubelet/cadvisor/cadvisor_windows.go
@@ -79,6 +79,10 @@ func (cu *cadvisorClient) ImagesFsInfo() (cadvisorapiv2.FsInfo, error) {
 	return cadvisorapiv2.FsInfo{}, nil
 }
 
+func (cu *cadvisorClient) ContainerFsInfo() (cadvisorapiv2.FsInfo, error) {
+	return cadvisorapiv2.FsInfo{}, nil
+}
+
 func (cu *cadvisorClient) RootFsInfo() (cadvisorapiv2.FsInfo, error) {
 	return cu.GetDirFsInfo(cu.rootPath)
 }
diff --git a/pkg/kubelet/cadvisor/helpers_linux.go b/pkg/kubelet/cadvisor/helpers_linux.go
index 7851cf5376b..598425e503d 100644
--- a/pkg/kubelet/cadvisor/helpers_linux.go
+++ b/pkg/kubelet/cadvisor/helpers_linux.go
@@ -26,6 +26,11 @@ import (
 	cadvisorfs "github.com/google/cadvisor/fs"
 )
 
+// LabelCrioContainers is a label that allows cAdvisor to track writeable layers
+// separately from read-only layers.
+// Once the cAdvisor upstream changes are merged, we should remove this constant.
+const LabelCrioContainers string = "crio-containers"
+
 // imageFsInfoProvider knows how to translate the configured runtime
 // to its file system label for images.
 type imageFsInfoProvider struct {
@@ -35,15 +40,28 @@ type imageFsInfoProvider struct {
 // ImageFsInfoLabel returns the image fs label for the configured runtime.
 // For remote runtimes, it handles additional runtimes natively understood by cAdvisor.
 func (i *imageFsInfoProvider) ImageFsInfoLabel() (string, error) {
-	// This is a temporary workaround to get stats for cri-o from cadvisor
-	// and should be removed.
-	// Related to https://github.com/kubernetes/kubernetes/issues/51798
-	if strings.HasSuffix(i.runtimeEndpoint, CrioSocketSuffix) {
+	if detectCrioWorkaround(i) {
 		return cadvisorfs.LabelCrioImages, nil
 	}
 	return "", fmt.Errorf("no imagefs label for configured runtime")
 }
 
+// ContainerFsInfoLabel returns the container fs label for the configured runtime.
+// For remote runtimes, it handles additional runtimes natively understood by cAdvisor.
+func (i *imageFsInfoProvider) ContainerFsInfoLabel() (string, error) {
+	if detectCrioWorkaround(i) {
+		return LabelCrioContainers, nil
+	}
+	return "", fmt.Errorf("no containerfs label for configured runtime")
+}
+
+// This is a temporary workaround to get stats for cri-o from cadvisor
+// and should be removed.
+// Related to https://github.com/kubernetes/kubernetes/issues/51798
+func detectCrioWorkaround(i *imageFsInfoProvider) bool {
+	return strings.HasSuffix(i.runtimeEndpoint, CrioSocketSuffix)
+}
+
 // NewImageFsInfoProvider returns a provider for the specified runtime configuration.
 func NewImageFsInfoProvider(runtimeEndpoint string) ImageFsInfoProvider {
 	return &imageFsInfoProvider{runtimeEndpoint: runtimeEndpoint}
diff --git a/pkg/kubelet/cadvisor/helpers_unsupported.go b/pkg/kubelet/cadvisor/helpers_unsupported.go
index 2379ef1f188..7d4dd4f7c48 100644
--- a/pkg/kubelet/cadvisor/helpers_unsupported.go
+++ b/pkg/kubelet/cadvisor/helpers_unsupported.go
@@ -29,6 +29,10 @@ func (i *unsupportedImageFsInfoProvider) ImageFsInfoLabel() (string, error) {
 	return "", errors.New("unsupported")
 }
 
+func (i *unsupportedImageFsInfoProvider) ContainerFsInfoLabel() (string, error) {
+	return "", errors.New("unsupported")
+}
+
 // NewImageFsInfoProvider returns a provider for the specified runtime configuration.
 func NewImageFsInfoProvider(runtimeEndpoint string) ImageFsInfoProvider {
 	return &unsupportedImageFsInfoProvider{}
diff --git a/pkg/kubelet/cadvisor/testing/cadvisor_fake.go b/pkg/kubelet/cadvisor/testing/cadvisor_fake.go
index 6d4f4fb2b9c..e2cb92853ff 100644
--- a/pkg/kubelet/cadvisor/testing/cadvisor_fake.go
+++ b/pkg/kubelet/cadvisor/testing/cadvisor_fake.go
@@ -101,6 +101,11 @@ func (c *Fake) RootFsInfo() (cadvisorapiv2.FsInfo, error) {
 	return cadvisorapiv2.FsInfo{}, nil
 }
 
+// ContainerFsInfo is a fake implementation of Interface.ContainerFsInfo.
+func (c *Fake) ContainerFsInfo() (cadvisorapiv2.FsInfo, error) {
+	return cadvisorapiv2.FsInfo{}, nil
+}
+
 // WatchEvents is a fake implementation of Interface.WatchEvents.
 func (c *Fake) WatchEvents(request *events.Request) (*events.EventChannel, error) {
 	return new(events.EventChannel), nil
diff --git a/pkg/kubelet/cadvisor/testing/cadvisor_mock.go b/pkg/kubelet/cadvisor/testing/cadvisor_mock.go
index 7a20c1de997..b0e94d55f71 100644
--- a/pkg/kubelet/cadvisor/testing/cadvisor_mock.go
+++ b/pkg/kubelet/cadvisor/testing/cadvisor_mock.go
@@ -52,6 +52,21 @@ func (m *MockInterface) EXPECT() *MockInterfaceMockRecorder {
 	return m.recorder
 }
 
+// ContainerFsInfo mocks base method.
+func (m *MockInterface) ContainerFsInfo() (v2.FsInfo, error) {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "ContainerFsInfo")
+	ret0, _ := ret[0].(v2.FsInfo)
+	ret1, _ := ret[1].(error)
+	return ret0, ret1
+}
+
+// ContainerFsInfo indicates an expected call of ContainerFsInfo.
+func (mr *MockInterfaceMockRecorder) ContainerFsInfo() *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ContainerFsInfo", reflect.TypeOf((*MockInterface)(nil).ContainerFsInfo))
+}
+
 // ContainerInfo mocks base method.
 func (m *MockInterface) ContainerInfo(name string, req *v1.ContainerInfoRequest) (*v1.ContainerInfo, error) {
 	m.ctrl.T.Helper()
@@ -254,6 +269,21 @@ func (m *MockImageFsInfoProvider) EXPECT() *MockImageFsInfoProviderMockRecorder
 	return m.recorder
 }
 
+// ContainerFsInfoLabel mocks base method.
+func (m *MockImageFsInfoProvider) ContainerFsInfoLabel() (string, error) {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "ContainerFsInfoLabel")
+	ret0, _ := ret[0].(string)
+	ret1, _ := ret[1].(error)
+	return ret0, ret1
+}
+
+// ContainerFsInfoLabel indicates an expected call of ContainerFsInfoLabel.
+func (mr *MockImageFsInfoProviderMockRecorder) ContainerFsInfoLabel() *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ContainerFsInfoLabel", reflect.TypeOf((*MockImageFsInfoProvider)(nil).ContainerFsInfoLabel))
+}
+
 // ImageFsInfoLabel mocks base method.
 func (m *MockImageFsInfoProvider) ImageFsInfoLabel() (string, error) {
 	m.ctrl.T.Helper()
diff --git a/pkg/kubelet/cadvisor/types.go b/pkg/kubelet/cadvisor/types.go
index 6c1d0d39ac4..9a0c3d49d1a 100644
--- a/pkg/kubelet/cadvisor/types.go
+++ b/pkg/kubelet/cadvisor/types.go
@@ -41,6 +41,10 @@ type Interface interface {
 	// Returns usage information about the root filesystem.
 	RootFsInfo() (cadvisorapiv2.FsInfo, error)
 
+	// Returns usage information about the filesystem holding the writeable layer.
+	// With KEP 4191, this filesystem can be separate from the image filesystem.
+	ContainerFsInfo() (cadvisorapiv2.FsInfo, error)
+
 	// Get events streamed through passedChannel that fit the request.
 	WatchEvents(request *events.Request) (*events.EventChannel, error)
@@ -52,4 +56,6 @@ type ImageFsInfoProvider interface {
 	// ImageFsInfoLabel returns the label cAdvisor should use to find the filesystem holding container images.
 	ImageFsInfoLabel() (string, error)
+	// ContainerFsInfoLabel returns the label for the writeable layer; with a split image filesystem this differs from ImageFsInfoLabel.
+	ContainerFsInfoLabel() (string, error)
 }
diff --git a/pkg/kubelet/container/container_gc.go b/pkg/kubelet/container/container_gc.go
index b0a25d50058..ae1c6eaaeea 100644
--- a/pkg/kubelet/container/container_gc.go
+++ b/pkg/kubelet/container/container_gc.go
@@ -45,6 +45,8 @@ type GC interface {
 	GarbageCollect(ctx context.Context) error
 	// Deletes all unused containers, including containers belonging to pods that are terminated but not deleted
 	DeleteAllUnusedContainers(ctx context.Context) error
+	// IsContainerFsSeparateFromImageFs reports whether the writeable layer and read-only layer are on separate filesystems.
+	IsContainerFsSeparateFromImageFs(ctx context.Context) bool
 }
 
 // SourcesReadyProvider knows how to determine if configuration sources are ready
@@ -86,3 +88,22 @@ func (cgc *realContainerGC) DeleteAllUnusedContainers(ctx context.Context) error
 	klog.InfoS("Attempting to delete unused containers")
 	return cgc.runtime.GarbageCollect(ctx, cgc.policy, cgc.sourcesReadyProvider.AllReady(), true)
 }
+
+func (cgc *realContainerGC) IsContainerFsSeparateFromImageFs(ctx context.Context) bool {
+	resp, err := cgc.runtime.ImageFsInfo(ctx)
+	if err != nil {
+		return false
+	}
+	// These fields can be empty if the CRI implementation did not populate them.
+	if len(resp.ContainerFilesystems) == 0 || len(resp.ImageFilesystems) == 0 {
+		return false
+	}
+	// KEP 4191 explains that multiple filesystems for images and containers are not
+	// supported at the moment.
+	// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/4191-split-image-filesystem#comment-on-future-extensions
+	// for the work needed to support multiple filesystems.
+	if resp.ContainerFilesystems[0].FsId != nil && resp.ImageFilesystems[0].FsId != nil {
+		return resp.ContainerFilesystems[0].FsId.Mountpoint != resp.ImageFilesystems[0].FsId.Mountpoint
+	}
+	return false
+}
diff --git a/pkg/kubelet/container/container_gc_test.go b/pkg/kubelet/container/container_gc_test.go
new file mode 100644
index 00000000000..7f30883ce0e
--- /dev/null
+++ b/pkg/kubelet/container/container_gc_test.go
@@ -0,0 +1,96 @@
+/*
+Copyright 2023 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package container_test
+
+import (
+	"context"
+	"reflect"
+	"testing"
+
+	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
+	. "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/container" + ctest "k8s.io/kubernetes/pkg/kubelet/container/testing" +) + +func TestIsContainerFsSeparateFromImageFs(t *testing.T) { + runtime := &ctest.FakeRuntime{} + fakeSources := ctest.NewFakeReadyProvider() + + gcContainer, err := NewContainerGC(runtime, GCPolicy{}, fakeSources) + if err != nil { + t.Errorf("unexpected error") + } + + cases := []struct { + name string + containerFs []*runtimeapi.FilesystemUsage + imageFs []*runtimeapi.FilesystemUsage + writeableSeparateFromReadOnly bool + }{ + { + name: "Only images", + imageFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}}, + writeableSeparateFromReadOnly: false, + }, + { + name: "images and containers", + imageFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}}, + containerFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "container"}}}, + writeableSeparateFromReadOnly: true, + }, + { + name: "same filesystem", + imageFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}}, + containerFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}}, + writeableSeparateFromReadOnly: false, + }, + + { + name: "Only containers", + containerFs: []*runtimeapi.FilesystemUsage{{FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "image"}}}, + writeableSeparateFromReadOnly: false, + }, + { + name: "neither are specified", + writeableSeparateFromReadOnly: false, + }, + { + name: "both are empty arrays", + writeableSeparateFromReadOnly: false, + containerFs: []*runtimeapi.FilesystemUsage{}, + imageFs: []*runtimeapi.FilesystemUsage{}, + }, + { + name: "FsId does not exist", + writeableSeparateFromReadOnly: false, + containerFs: []*runtimeapi.FilesystemUsage{{UsedBytes: &runtimeapi.UInt64Value{Value: 10}}}, + imageFs: []*runtimeapi.FilesystemUsage{{UsedBytes: &runtimeapi.UInt64Value{Value: 10}}}, + }, + } + + for _, tc := range cases { + runtime.SetContainerFsStats(tc.containerFs) + runtime.SetImageFsStats(tc.imageFs) + actualCommand := gcContainer.IsContainerFsSeparateFromImageFs(context.TODO()) + + if e, a := tc.writeableSeparateFromReadOnly, actualCommand; !reflect.DeepEqual(e, a) { + t.Errorf("%v: unexpected value; expected %v, got %v", tc.name, e, a) + } + runtime.SetContainerFsStats(nil) + runtime.SetImageFsStats(nil) + } +} diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go index c12ad5fc717..e1395a206be 100644 --- a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -160,6 +160,8 @@ type ImageService interface { RemoveImage(ctx context.Context, image ImageSpec) error // ImageStats returns Image statistics. ImageStats(ctx context.Context) (*ImageStats, error) + // ImageFsInfo returns a list of file systems for containers/images + ImageFsInfo(ctx context.Context) (*runtimeapi.ImageFsInfoResponse, error) } // Attacher interface allows to attach a container. diff --git a/pkg/kubelet/container/testing/fake_ready_provider.go b/pkg/kubelet/container/testing/fake_ready_provider.go new file mode 100644 index 00000000000..51780dec9e7 --- /dev/null +++ b/pkg/kubelet/container/testing/fake_ready_provider.go @@ -0,0 +1,36 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testing + +import ( + kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" +) + +// FakeReadyProvider implements a fake ready provider +type FakeReadyProvider struct { + kubecontainer.SourcesReadyProvider +} + +// AllReady notifies caller that the Fake Provider is ready. +func (frp *FakeReadyProvider) AllReady() bool { + return true +} + +// NewFakeReadyProvider creates a FakeReadyProvider object +func NewFakeReadyProvider() kubecontainer.SourcesReadyProvider { + return &FakeReadyProvider{} +} diff --git a/pkg/kubelet/container/testing/fake_runtime.go b/pkg/kubelet/container/testing/fake_runtime.go index bf82205303c..e9424c2c017 100644 --- a/pkg/kubelet/container/testing/fake_runtime.go +++ b/pkg/kubelet/container/testing/fake_runtime.go @@ -45,6 +45,8 @@ type FakeRuntime struct { PodList []*FakePod AllPodList []*FakePod ImageList []kubecontainer.Image + ImageFsStats []*runtimeapi.FilesystemUsage + ContainerFsStats []*runtimeapi.FilesystemUsage APIPodStatus v1.PodStatus PodStatus kubecontainer.PodStatus StartedPods []string @@ -422,6 +424,16 @@ func (f *FakeRuntime) ListPodSandboxMetrics(_ context.Context) ([]*runtimeapi.Po return nil, f.Err } +// SetContainerFsStats sets the containerFsStats for dependency injection. +func (f *FakeRuntime) SetContainerFsStats(val []*runtimeapi.FilesystemUsage) { + f.ContainerFsStats = val +} + +// SetImageFsStats sets the ImageFsStats for dependency injection. +func (f *FakeRuntime) SetImageFsStats(val []*runtimeapi.FilesystemUsage) { + f.ImageFsStats = val +} + func (f *FakeRuntime) ImageStats(_ context.Context) (*kubecontainer.ImageStats, error) { f.Lock() defer f.Unlock() @@ -430,6 +442,20 @@ func (f *FakeRuntime) ImageStats(_ context.Context) (*kubecontainer.ImageStats, return nil, f.Err } +// ImageFsInfo returns a ImageFsInfoResponse given the DI injected values of ImageFsStats +// and ContainerFsStats. +func (f *FakeRuntime) ImageFsInfo(_ context.Context) (*runtimeapi.ImageFsInfoResponse, error) { + f.Lock() + defer f.Unlock() + + f.CalledFunctions = append(f.CalledFunctions, "ImageFsInfo") + resp := &runtimeapi.ImageFsInfoResponse{ + ImageFilesystems: f.ImageFsStats, + ContainerFilesystems: f.ContainerFsStats, + } + return resp, f.Err +} + func (f *FakeStreamingRuntime) GetExec(_ context.Context, id kubecontainer.ContainerID, cmd []string, stdin, stdout, stderr, tty bool) (*url.URL, error) { f.Lock() defer f.Unlock() diff --git a/pkg/kubelet/container/testing/runtime_mock.go b/pkg/kubelet/container/testing/runtime_mock.go index 1ad1d0431ef..aae37826120 100644 --- a/pkg/kubelet/container/testing/runtime_mock.go +++ b/pkg/kubelet/container/testing/runtime_mock.go @@ -242,6 +242,21 @@ func (mr *MockRuntimeMockRecorder) GetPods(ctx, all interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPods", reflect.TypeOf((*MockRuntime)(nil).GetPods), ctx, all) } +// ImageFsInfo mocks base method. 
+func (m *MockRuntime) ImageFsInfo(ctx context.Context) (*v10.ImageFsInfoResponse, error) {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "ImageFsInfo", ctx)
+	ret0, _ := ret[0].(*v10.ImageFsInfoResponse)
+	ret1, _ := ret[1].(error)
+	return ret0, ret1
+}
+
+// ImageFsInfo indicates an expected call of ImageFsInfo.
+func (mr *MockRuntimeMockRecorder) ImageFsInfo(ctx interface{}) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ImageFsInfo", reflect.TypeOf((*MockRuntime)(nil).ImageFsInfo), ctx)
+}
+
 // ImageStats mocks base method.
 func (m *MockRuntime) ImageStats(ctx context.Context) (*container.ImageStats, error) {
 	m.ctrl.T.Helper()
@@ -523,6 +538,21 @@ func (mr *MockImageServiceMockRecorder) GetImageRef(ctx, image interface{}) *gom
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetImageRef", reflect.TypeOf((*MockImageService)(nil).GetImageRef), ctx, image)
 }
 
+// ImageFsInfo mocks base method.
+func (m *MockImageService) ImageFsInfo(ctx context.Context) (*v10.ImageFsInfoResponse, error) {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "ImageFsInfo", ctx)
+	ret0, _ := ret[0].(*v10.ImageFsInfoResponse)
+	ret1, _ := ret[1].(error)
+	return ret0, ret1
+}
+
+// ImageFsInfo indicates an expected call of ImageFsInfo.
+func (mr *MockImageServiceMockRecorder) ImageFsInfo(ctx interface{}) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ImageFsInfo", reflect.TypeOf((*MockImageService)(nil).ImageFsInfo), ctx)
+}
+
 // ImageStats mocks base method.
 func (m *MockImageService) ImageStats(ctx context.Context) (*container.ImageStats, error) {
 	m.ctrl.T.Helper()
diff --git a/pkg/kubelet/eviction/api/types.go b/pkg/kubelet/eviction/api/types.go
index b5a6d36a431..ce0e2bd7a0b 100644
--- a/pkg/kubelet/eviction/api/types.go
+++ b/pkg/kubelet/eviction/api/types.go
@@ -32,10 +32,25 @@ const (
 	SignalNodeFsAvailable Signal = "nodefs.available"
 	// SignalNodeFsInodesFree is amount of inodes available on filesystem that kubelet uses for volumes, daemon logs, etc.
 	SignalNodeFsInodesFree Signal = "nodefs.inodesFree"
-	// SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing images and container writable layers.
+	// SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing image layers.
+	// If the container filesystem and image filesystem are not separate,
+	// then imagefs can store both image layers and writeable layers.
 	SignalImageFsAvailable Signal = "imagefs.available"
-	// SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing images and container writable layers.
+	// SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing image layers.
+	// If the container filesystem and image filesystem are not separate,
+	// then imagefs can store both image layers and writeable layers.
 	SignalImageFsInodesFree Signal = "imagefs.inodesFree"
+	// SignalContainerFsAvailable is amount of storage available on filesystem that container runtime uses for container writable layers.
+	// In case of a single filesystem, containerfs=nodefs.
+	// In case of an image filesystem, containerfs=imagefs.
+	// We will override user settings and set to either imagefs or nodefs depending on configuration.
+	SignalContainerFsAvailable Signal = "containerfs.available"
+	// SignalContainerFsInodesFree is amount of inodes available on filesystem that container runtime uses for container writable layers.
+	// In case of a single filesystem, containerfs=nodefs.
+	// In case of an image filesystem, containerfs=imagefs.
+	// We will override user settings and set to either imagefs or nodefs depending on configuration.
+	SignalContainerFsInodesFree Signal = "containerfs.inodesFree"
 	// SignalAllocatableMemoryAvailable is amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods), in bytes.
 	SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available"
 	// SignalPIDAvailable is amount of PID available for pod allocation
@@ -63,6 +78,8 @@ var OpForSignal = map[Signal]ThresholdOperator{
 	SignalNodeFsInodesFree:           OpLessThan,
 	SignalImageFsAvailable:           OpLessThan,
 	SignalImageFsInodesFree:          OpLessThan,
+	SignalContainerFsAvailable:       OpLessThan,
+	SignalContainerFsInodesFree:      OpLessThan,
 	SignalAllocatableMemoryAvailable: OpLessThan,
 	SignalPIDAvailable:               OpLessThan,
 }
diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index e47b37a0d05..fdb6c72e6ec 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -94,6 +94,8 @@ type managerImpl struct {
 	lastObservations signalObservations
 	// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
 	dedicatedImageFs *bool
+	// splitContainerImageFs indicates if containerfs is on a separate device from imagefs
+	splitContainerImageFs *bool
 	// thresholdNotifiers is a list of memory threshold notifiers which each notify for a memory eviction threshold
 	thresholdNotifiers []ThresholdNotifier
 	// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
@@ -129,6 +131,7 @@ func NewManager(
 		nodeConditionsLastObservedAt:  nodeConditionsObservedAt{},
 		thresholdsFirstObservedAt:     thresholdsObservedAt{},
 		dedicatedImageFs:              nil,
+		splitContainerImageFs:         nil,
 		thresholdNotifiers:            []ThresholdNotifier{},
 		localStorageCapacityIsolation: localStorageCapacityIsolation,
 	}
@@ -197,10 +200,14 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod
 	// start the eviction manager monitoring
 	go func() {
 		for {
-			if evictedPods := m.synchronize(diskInfoProvider, podFunc); evictedPods != nil {
+			evictedPods, err := m.synchronize(diskInfoProvider, podFunc)
+			if evictedPods != nil && err == nil {
 				klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods))
 				m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
 			} else {
+				if err != nil {
+					klog.ErrorS(err, "Eviction manager: failed to synchronize")
+				}
 				time.Sleep(monitoringInterval)
 			}
 		}
@@ -230,33 +237,50 @@ func (m *managerImpl) IsUnderPIDPressure() bool {
 // synchronize is the main control loop that enforces eviction thresholds.
-// Returns the pod that was killed, or nil if no pod was killed.
+// Returns the pods that were killed, or nil if no pod was killed, along with any error.
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) []*v1.Pod {
+func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) {
 	ctx := context.Background()
 	// if we have nothing to do, just return
 	thresholds := m.config.Thresholds
 	if len(thresholds) == 0 && !m.localStorageCapacityIsolation {
-		return nil
+		return nil, nil
 	}
 
 	klog.V(3).InfoS("Eviction manager: synchronize housekeeping")
 	// build the ranking functions (if not yet known)
 	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
 	if m.dedicatedImageFs == nil {
-		hasImageFs, ok := diskInfoProvider.HasDedicatedImageFs(ctx)
-		if ok != nil {
-			return nil
+		hasImageFs, splitDiskError := diskInfoProvider.HasDedicatedImageFs(ctx)
+		if splitDiskError != nil {
+			klog.ErrorS(splitDiskError, "Eviction manager: failed to get HasDedicatedImageFs")
+			return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedImageFs: %w", splitDiskError)
 		}
 		m.dedicatedImageFs = &hasImageFs
-		m.signalToRankFunc = buildSignalToRankFunc(hasImageFs)
-		m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
+		splitContainerImageFs := m.containerGC.IsContainerFsSeparateFromImageFs(ctx)
+
+		// If the filesystems are split but the feature gate is turned off,
+		// we should return an error: this is an unsupported configuration.
+		if !utilfeature.DefaultFeatureGate.Enabled(features.KubeletSeparateDiskGC) && splitContainerImageFs {
+			splitDiskError := fmt.Errorf("KubeletSeparateDiskGC is turned off but we still have a split filesystem")
+			return nil, splitDiskError
+		}
+		thresholds, err := UpdateContainerFsThresholds(m.config.Thresholds, hasImageFs, splitContainerImageFs)
+		m.config.Thresholds = thresholds
+		if err != nil {
+			klog.ErrorS(err, "Eviction manager: found conflicting containerfs eviction thresholds, ignoring them")
+		}
+		m.splitContainerImageFs = &splitContainerImageFs
+		m.signalToRankFunc = buildSignalToRankFunc(hasImageFs, splitContainerImageFs)
+		m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs, splitContainerImageFs)
 	}
+	klog.V(3).InfoS("FileSystem detection", "DedicatedImageFs", m.dedicatedImageFs, "SplitImageFs", m.splitContainerImageFs)
 	activePods := podFunc()
 	updateStats := true
 	summary, err := m.summaryProvider.Get(ctx, updateStats)
 	if err != nil {
 		klog.ErrorS(err, "Eviction manager: failed to get summary stats")
-		return nil
+		return nil, nil
 	}
 
 	if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
@@ -324,20 +348,20 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// If eviction happens in localStorageEviction function, skip the rest of eviction action
 	if m.localStorageCapacityIsolation {
 		if evictedPods := m.localStorageEviction(activePods, statsFunc); len(evictedPods) > 0 {
-			return evictedPods
+			return evictedPods, nil
 		}
 	}
 
 	if len(thresholds) == 0 {
 		klog.V(3).InfoS("Eviction manager: no resources are starved")
-		return nil
+		return nil, nil
 	}
 
 	// rank the thresholds by eviction priority
 	sort.Sort(byEvictionPriority(thresholds))
 	thresholdToReclaim, resourceToReclaim, foundAny := getReclaimableThreshold(thresholds)
 	if !foundAny {
-		return nil
+		return nil, nil
 	}
 	klog.InfoS("Eviction manager: attempting to reclaim", "resourceName", resourceToReclaim)
 
@@ -347,7 +371,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
 	if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) {
 		klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim)
-		return nil
+		return nil, nil
 	}
 
 	klog.InfoS("Eviction manager: must evict pod(s) to reclaim", "resourceName", resourceToReclaim)
@@ -356,13 +380,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
 	if !ok {
 		klog.ErrorS(nil, "Eviction manager: no ranking function for signal", "threshold", thresholdToReclaim.Signal)
-		return nil
+		return nil, nil
 	}
 	// the only candidates viable for eviction are those pods that had anything running.
if len(activePods) == 0 { klog.ErrorS(nil, "Eviction manager: eviction thresholds have been met, but no pods are active to evict") - return nil + return nil, nil } // rank the running pods for eviction for the specified resource @@ -397,11 +421,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act } if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) { metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc() - return []*v1.Pod{pod} + return []*v1.Pod{pod}, nil } } klog.InfoS("Eviction manager: unable to evict any pods from the node") - return nil + return nil, nil } func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) { diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 75fe9fb9263..cae32993ed3 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -39,6 +39,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/lifecycle" kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" testingclock "k8s.io/utils/clock/testing" + "k8s.io/utils/ptr" ) const ( @@ -66,21 +67,22 @@ func (m *mockPodKiller) killPodNow(pod *v1.Pod, evict bool, gracePeriodOverride // mockDiskInfoProvider is used to simulate testing. type mockDiskInfoProvider struct { - dedicatedImageFs bool + dedicatedImageFs *bool } // HasDedicatedImageFs returns the mocked value func (m *mockDiskInfoProvider) HasDedicatedImageFs(_ context.Context) (bool, error) { - return m.dedicatedImageFs, nil + return ptr.Deref(m.dedicatedImageFs, false), nil } // mockDiskGC is used to simulate invoking image and container garbage collection. type mockDiskGC struct { - err error - imageGCInvoked bool - containerGCInvoked bool - fakeSummaryProvider *fakeSummaryProvider - summaryAfterGC *statsapi.Summary + err error + imageGCInvoked bool + containerGCInvoked bool + readAndWriteSeparate bool + fakeSummaryProvider *fakeSummaryProvider + summaryAfterGC *statsapi.Summary } // DeleteUnusedImages returns the mocked values. 
@@ -101,6 +103,10 @@ func (m *mockDiskGC) DeleteAllUnusedContainers(_ context.Context) error {
 	return m.err
 }
 
+func (m *mockDiskGC) IsContainerFsSeparateFromImageFs(_ context.Context) bool {
+	return m.readAndWriteSeparate
+}
+
 func makePodWithMemoryStats(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, memoryWorkingSet string) (*v1.Pod, statsapi.PodStats) {
 	pod := newPod(name, priority, []v1.Container{
 		newContainer(name, requests, limits),
@@ -171,13 +177,28 @@ func makeMemoryStats(nodeAvailableBytes string, podStats map[*v1.Pod]statsapi.Po
 	return result
 }
 
-func makeDiskStats(rootFsAvailableBytes, imageFsAvailableBytes string, podStats map[*v1.Pod]statsapi.PodStats) *statsapi.Summary {
-	rootFsVal := resource.MustParse(rootFsAvailableBytes)
+type diskStats struct {
+	rootFsAvailableBytes  string
+	imageFsAvailableBytes string
+	// containerFsAvailableBytes is optional;
+	// if not specified, then we assume imagefs=containerfs
+	containerFsAvailableBytes string
+	podStats                  map[*v1.Pod]statsapi.PodStats
+}
+
+func makeDiskStats(diskStats diskStats) *statsapi.Summary {
+	rootFsVal := resource.MustParse(diskStats.rootFsAvailableBytes)
 	rootFsBytes := uint64(rootFsVal.Value())
 	rootFsCapacityBytes := uint64(rootFsVal.Value() * 2)
-	imageFsVal := resource.MustParse(imageFsAvailableBytes)
+	imageFsVal := resource.MustParse(diskStats.imageFsAvailableBytes)
 	imageFsBytes := uint64(imageFsVal.Value())
 	imageFsCapacityBytes := uint64(imageFsVal.Value() * 2)
+	if diskStats.containerFsAvailableBytes == "" {
+		diskStats.containerFsAvailableBytes = diskStats.imageFsAvailableBytes
+	}
+	containerFsVal := resource.MustParse(diskStats.containerFsAvailableBytes)
+	containerFsBytes := uint64(containerFsVal.Value())
+	containerFsCapacityBytes := uint64(containerFsVal.Value() * 2)
 	result := &statsapi.Summary{
 		Node: statsapi.NodeStats{
 			Fs: &statsapi.FsStats{
@@ -189,11 +210,15 @@ func makeDiskStats(rootFsAvailableBytes, imageFsAvailableBytes string, podStats
 				AvailableBytes: &imageFsBytes,
 				CapacityBytes:  &imageFsCapacityBytes,
 			},
+			ContainerFs: &statsapi.FsStats{
+				AvailableBytes: &containerFsBytes,
+				CapacityBytes:  &containerFsCapacityBytes,
+			},
 			},
 		},
 		Pods: []statsapi.PodStats{},
 	}
-	for _, podStat := range podStats {
+	for _, podStat := range diskStats.podStats {
 		result.Pods = append(result.Pods, podStat)
 	}
 	return result
}
@@ -217,7 +242,14 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) {
 	testCases := map[string]struct {
 		wantPodStatus v1.PodStatus
 	}{
-		"eviction due to memory pressure": {
+		"eviction due to memory pressure; no image fs": {
+			wantPodStatus: v1.PodStatus{
+				Phase:   v1.PodFailed,
+				Reason:  "Evicted",
+				Message: "The node was low on resource: memory. Threshold quantity: 2Gi, available: 1500Mi. 
", + }, + }, + "eviction due to memory pressure; image fs": { wantPodStatus: v1.PodStatus{ Phase: v1.PodFailed, Reason: "Evicted", @@ -249,7 +281,7 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) { fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -280,8 +312,11 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) { } // synchronize to detect the memory pressure - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } // verify memory pressure is detected if !manager.IsUnderMemoryPressure() { t.Fatalf("Manager should have detected memory pressure") @@ -314,26 +349,108 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) { func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) { testCases := map[string]struct { - wantPodStatus v1.PodStatus + nodeFsStats string + imageFsStats string + containerFsStats string + evictionMessage string + kubeletSeparateDiskFeature bool + writeableSeparateFromReadOnly bool + thresholdToMonitor evictionapi.Threshold + podToMakes []podToMake + dedicatedImageFs *bool + expectErr string }{ - "eviction due to disk pressure": { - wantPodStatus: v1.PodStatus{ - Phase: v1.PodFailed, - Reason: "Evicted", - Message: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. ", + "eviction due to disk pressure; no image fs": { + dedicatedImageFs: ptr.To(false), + nodeFsStats: "1.5Gi", + imageFsStats: "10Gi", + containerFsStats: "10Gi", + thresholdToMonitor: evictionapi.Threshold{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. ", + podToMakes: []podToMake{ + {name: "below-requests", requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "900Mi"}, + {name: "above-requests", requests: newResourceList("", "", "100Mi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "700Mi"}, + }, + }, + "eviction due to image disk pressure; image fs": { + dedicatedImageFs: ptr.To(true), + nodeFsStats: "1Gi", + imageFsStats: "10Gi", + containerFsStats: "10Gi", + evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. 
", + thresholdToMonitor: evictionapi.Threshold{ + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("50Gi"), + }, + }, + podToMakes: []podToMake{ + {name: "below-requests", requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "900Mi"}, + {name: "above-requests", requests: newResourceList("", "", "50Gi"), limits: newResourceList("", "", "50Gi"), rootFsUsed: "80Gi"}, + }, + }, + "eviction due to container disk pressure; feature off; error; container fs": { + dedicatedImageFs: ptr.To(true), + kubeletSeparateDiskFeature: false, + writeableSeparateFromReadOnly: true, + expectErr: "KubeletSeparateDiskGC is turned off but we still have a split filesystem", + nodeFsStats: "1Gi", + imageFsStats: "100Gi", + containerFsStats: "10Gi", + evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ", + thresholdToMonitor: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("50Gi"), + }, + }, + podToMakes: []podToMake{ + {name: "below-requests", requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "900Mi"}, + {name: "above-requests", requests: newResourceList("", "", "50Gi"), limits: newResourceList("", "", "50Gi"), rootFsUsed: "80Gi"}, + }, + }, + "eviction due to container disk pressure; container fs": { + dedicatedImageFs: ptr.To(true), + kubeletSeparateDiskFeature: true, + writeableSeparateFromReadOnly: true, + nodeFsStats: "10Gi", + imageFsStats: "100Gi", + containerFsStats: "10Gi", + evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. 
", + thresholdToMonitor: evictionapi.Threshold{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("50Gi"), + }, + }, + podToMakes: []podToMake{ + {name: "below-requests", requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "900Mi"}, + {name: "above-requests", requests: newResourceList("", "", "50Gi"), limits: newResourceList("", "", "50Gi"), rootFsUsed: "80Gi"}, }, }, } for name, tc := range testCases { for _, enablePodDisruptionConditions := range []bool{false, true} { t.Run(fmt.Sprintf("%s;PodDisruptionConditions=%v", name, enablePodDisruptionConditions), func(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, tc.kubeletSeparateDiskFeature)() defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodDisruptionConditions, enablePodDisruptionConditions)() podMaker := makePodWithDiskStats summaryStatsMaker := makeDiskStats - podsToMake := []podToMake{ - {name: "below-requests", requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "900Mi"}, - {name: "above-requests", requests: newResourceList("", "", "100Mi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "700Mi"}, + podsToMake := tc.podToMakes + wantPodStatus := v1.PodStatus{ + Phase: v1.PodFailed, + Reason: "Evicted", + Message: tc.evictionMessage, } pods := []*v1.Pod{} podStats := map[*v1.Pod]statsapi.PodStats{} @@ -348,23 +465,21 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) { fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - diskGC := &mockDiskGC{err: nil} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs} + diskGC := &mockDiskGC{err: nil, readAndWriteSeparate: tc.writeableSeparateFromReadOnly} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} config := Config{ PressureTransitionPeriod: time.Minute * 5, - Thresholds: []evictionapi.Threshold{ - { - Signal: evictionapi.SignalNodeFsAvailable, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("2Gi"), - }, - }, - }, + Thresholds: []evictionapi.Threshold{tc.thresholdToMonitor}, } - summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("1.5Gi", "200Gi", podStats)} + diskStat := diskStats{ + rootFsAvailableBytes: tc.nodeFsStats, + imageFsAvailableBytes: tc.imageFsStats, + containerFsAvailableBytes: tc.containerFsStats, + podStats: podStats, + } + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStat)} manager := &managerImpl{ clock: fakeClock, killPodFunc: podKiller.killPodNow, @@ -379,32 +494,39 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) { } // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + pods, synchErr := manager.synchronize(diskInfoProvider, activePodsFunc) - // verify menager detected disk pressure - if !manager.IsUnderDiskPressure() { - t.Fatalf("Manager should report disk pressure") - } + if synchErr == nil && tc.expectErr != "" { + t.Fatalf("Manager should report error but did not") + } else if tc.expectErr != "" && synchErr != nil { + if diff := cmp.Diff(tc.expectErr, synchErr.Error()); diff != "" { + t.Errorf("Unexpected error (-want,+got):\n%s", diff) + } + } 
else { + // verify manager detected disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure") + } - // verify a pod is selected for eviction - if podKiller.pod == nil { - t.Fatalf("Manager should have selected a pod for eviction") - } + // verify a pod is selected for eviction + if podKiller.pod == nil { + t.Fatalf("Manager should have selected a pod for eviction") + } - wantPodStatus := tc.wantPodStatus.DeepCopy() - if enablePodDisruptionConditions { - wantPodStatus.Conditions = append(wantPodStatus.Conditions, v1.PodCondition{ - Type: "DisruptionTarget", - Status: "True", - Reason: "TerminationByKubelet", - Message: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. ", - }) - } + if enablePodDisruptionConditions { + wantPodStatus.Conditions = append(wantPodStatus.Conditions, v1.PodCondition{ + Type: "DisruptionTarget", + Status: "True", + Reason: "TerminationByKubelet", + Message: tc.evictionMessage, + }) + } - // verify the pod status after applying the status update function - podKiller.statusFn(&podKiller.pod.Status) - if diff := cmp.Diff(*wantPodStatus, podKiller.pod.Status, cmpopts.IgnoreFields(v1.PodCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { - t.Errorf("Unexpected pod status of the evicted pod (-want,+got):\n%s", diff) + // verify the pod status after applying the status update function + podKiller.statusFn(&podKiller.pod.Status) + if diff := cmp.Diff(wantPodStatus, podKiller.pod.Status, cmpopts.IgnoreFields(v1.PodCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { + t.Errorf("Unexpected pod status of the evicted pod (-want,+got):\n%s", diff) + } } }) } @@ -436,7 +558,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -480,7 +602,11 @@ func TestMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newResourceList("100m", "100Mi", ""), newResourceList("200m", "200Mi", ""), "0Gi") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -498,7 +624,11 @@ func TestMemoryPressure(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -513,7 +643,11 @@ func TestMemoryPressure(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } // we should have memory pressure if 
!manager.IsUnderMemoryPressure() { @@ -538,7 +672,11 @@ func TestMemoryPressure(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -548,7 +686,11 @@ func TestMemoryPressure(t *testing.T) { // induce memory pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -576,7 +718,11 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -600,7 +746,11 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } // we should not have memory pressure (because transition period met) if manager.IsUnderMemoryPressure() { @@ -676,198 +826,351 @@ func parseQuantity(value string) resource.Quantity { } func TestDiskPressureNodeFs(t *testing.T) { - podMaker := makePodWithDiskStats - summaryStatsMaker := makeDiskStats - podsToMake := []podToMake{ - {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, - {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, - {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, - {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, - {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, - } - pods := []*v1.Pod{} - podStats := map[*v1.Pod]statsapi.PodStats{} - for _, podToMake := range podsToMake { - pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed) - pods = append(pods, pod) - podStats[pod] = podStat - } - podToEvict := pods[0] - activePodsFunc := func() []*v1.Pod { - return pods - } - fakeClock := testingclock.NewFakeClock(time.Now()) - podKiller := &mockPodKiller{} - diskInfoProvider := 
&mockDiskInfoProvider{dedicatedImageFs: false} - diskGC := &mockDiskGC{err: nil} - nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - - config := Config{ - MaxPodGracePeriodSeconds: 5, - PressureTransitionPeriod: time.Minute * 5, - Thresholds: []evictionapi.Threshold{ - { - Signal: evictionapi.SignalNodeFsAvailable, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("1Gi"), + testCases := map[string]struct { + nodeFsStats string + imageFsStats string + containerFsStats string + kubeletSeparateDiskFeature bool + writeableSeparateFromReadOnly bool + thresholdToMonitor []evictionapi.Threshold + podToMakes []podToMake + dedicatedImageFs *bool + expectErr string + inducePressureOnWhichFs string + softDiskPressure string + hardDiskPressure string + }{ + "eviction due to disk pressure; no image fs": { + dedicatedImageFs: ptr.To(false), + nodeFsStats: "16Gi", + imageFsStats: "16Gi", + containerFsStats: "16Gi", + inducePressureOnWhichFs: "nodefs", + softDiskPressure: "1.5Gi", + hardDiskPressure: "750Mi", + thresholdToMonitor: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: time.Minute * 2, }, }, - { - Signal: evictionapi.SignalNodeFsAvailable, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("2Gi"), + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, + }, + }, + "eviction due to image disk pressure; image fs": { + dedicatedImageFs: ptr.To(true), + nodeFsStats: "16Gi", + imageFsStats: "16Gi", + containerFsStats: "16Gi", + softDiskPressure: "1.5Gi", + hardDiskPressure: "750Mi", + inducePressureOnWhichFs: "imagefs", + thresholdToMonitor: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, }, - GracePeriod: time.Minute * 2, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: time.Minute * 2, + }, + }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: 
newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, + }, + }, + "eviction due to container disk pressure; container fs": { + dedicatedImageFs: ptr.To(true), + kubeletSeparateDiskFeature: true, + writeableSeparateFromReadOnly: true, + nodeFsStats: "16Gi", + imageFsStats: "16Gi", + containerFsStats: "16Gi", + softDiskPressure: "1.5Gi", + hardDiskPressure: "750Mi", + inducePressureOnWhichFs: "containerfs", + thresholdToMonitor: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: time.Minute * 2, + }, + }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, }, }, } - summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("16Gi", "200Gi", podStats)} - manager := &managerImpl{ - clock: fakeClock, - killPodFunc: podKiller.killPodNow, - imageGC: diskGC, - containerGC: diskGC, - config: config, - recorder: &record.FakeRecorder{}, - summaryProvider: summaryProvider, - nodeRef: nodeRef, - nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, - thresholdsFirstObservedAt: thresholdsObservedAt{}, - } - // create a best effort pod to test admission - podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi") + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, tc.kubeletSeparateDiskFeature)() - // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + podMaker := makePodWithDiskStats + summaryStatsMaker := makeDiskStats + podsToMake := tc.podToMakes + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, 
podToMake.perLocalVolumeUsed) + pods = append(pods, pod) + podStats[pod] = podStat + } + podToEvict := pods[0] + activePodsFunc := func() []*v1.Pod { + return pods + } - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } + fakeClock := testingclock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs} + diskGC := &mockDiskGC{err: nil, readAndWriteSeparate: tc.writeableSeparateFromReadOnly} + nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - // try to admit our pod (should succeed) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { - t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) - } + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: tc.thresholdToMonitor, + } - // induce soft threshold - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + diskStatStart := diskStats{ + rootFsAvailableBytes: tc.nodeFsStats, + imageFsAvailableBytes: tc.imageFsStats, + containerFsAvailableBytes: tc.containerFsStats, + podStats: podStats, + } + diskStatConst := diskStatStart + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStatStart)} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: diskGC, + containerGC: diskGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure since soft threshold was met") - } + // create a best effort pod to test admission + podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi") - // verify no pod was yet killed because there has not yet been enough time passed. - if podKiller.pod != nil { - t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name) - } + // synchronize + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) - // step forward in time pass the grace period - fakeClock.Step(3 * time.Minute) - summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure since soft threshold was met") - } + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } - // verify the right pod was killed with the right grace period. 
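These refactored tests now capture both return values of synchronize; the ([]*v1.Pod, error) signature is inferred from the `_, err :=` usages in this diff, not stated elsewhere in it. A hypothetical helper sketching the call pattern the tests repeat (mustSynchronize is illustrative and not part of the change):

func mustSynchronize(t *testing.T, m *managerImpl, dip DiskInfoProvider, pods ActivePodsFunc) []*v1.Pod {
	t.Helper()
	// synchronize reports the pods it evicted plus any error; the tests
	// treat an error as fatal and usually discard the evicted slice.
	evicted, err := m.synchronize(dip, pods)
	if err != nil {
		t.Fatalf("Manager should not have an error %v", err)
	}
	return evicted
}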
- if podKiller.pod != podToEvict { - t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) - } - if podKiller.gracePeriodOverride == nil { - t.Errorf("Manager chose to kill pod but should have had a grace period override.") - } - observedGracePeriod := *podKiller.gracePeriodOverride - if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { - t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) - } - // reset state - podKiller.pod = nil - podKiller.gracePeriodOverride = nil + // try to admit our pod (should succeed) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + } - // remove disk pressure - fakeClock.Step(20 * time.Minute) - summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + // induce soft threshold + fakeClock.Step(1 * time.Minute) - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } + if tc.inducePressureOnWhichFs == "nodefs" { + diskStatStart.rootFsAvailableBytes = tc.softDiskPressure + } else if tc.inducePressureOnWhichFs == "imagefs" { + diskStatStart.imageFsAvailableBytes = tc.softDiskPressure + } else if tc.inducePressureOnWhichFs == "containerfs" { + diskStatStart.containerFsAvailableBytes = tc.softDiskPressure + } + summaryProvider.result = summaryStatsMaker(diskStatStart) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // induce disk pressure! - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("500Mi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure") - } + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure since soft threshold was met") + } - // check the right pod was killed - if podKiller.pod != podToEvict { - t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) - } - observedGracePeriod = *podKiller.gracePeriodOverride - if observedGracePeriod != int64(0) { - t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) - } + // verify no pod was yet killed because enough time has not yet passed.
+ if podKiller.pod != nil { + t.Fatalf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name) + } - // try to admit our pod (should fail) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { - t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) - } + // step forward in time past the grace period + fakeClock.Step(3 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatStart) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // reduce disk pressure - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - // we should have disk pressure (because transition period not yet met) - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure") - } + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure since soft threshold was met") + } - // no pod should have been killed - if podKiller.pod != nil { - t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) - } + // verify the right pod was killed with the right grace period. + if podKiller.pod != podToEvict { + t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } + if podKiller.gracePeriodOverride == nil { + t.Fatalf("Manager chose to kill pod but should have had a grace period override.") + } + observedGracePeriod := *podKiller.gracePeriodOverride + if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { + t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) + } + // reset state + podKiller.pod = nil + podKiller.gracePeriodOverride = nil - // try to admit our pod (should fail) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { - t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) - } + // remove disk pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatConst) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // move the clock past transition period to ensure that we stop reporting pressure - fakeClock.Step(5 * time.Minute) - summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - // we should not have disk pressure (because transition period met) - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } - // no pod should have been killed - if podKiller.pod != nil { - t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) - } + // induce disk pressure!
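The clock arithmetic driving the soft-threshold flow above, in miniature: the 2m grace period from the thresholds table means the synchronize after the first 1m step observes pressure but may not evict yet, while the synchronize after 3 more minutes may. A sketch under those assumptions, using the same fake clock package as the tests (imports of fmt, time, and testingclock "k8s.io/utils/clock/testing" assumed; variable names illustrative):

fakeClock := testingclock.NewFakeClock(time.Now())
observedAt := fakeClock.Now()  // soft threshold first observed
gracePeriod := 2 * time.Minute // from the thresholds table above

fakeClock.Step(1 * time.Minute)
fmt.Println(fakeClock.Since(observedAt) < gracePeriod) // true: pressure reported, no eviction yet

fakeClock.Step(3 * time.Minute)
fmt.Println(fakeClock.Since(observedAt) >= gracePeriod) // true: eviction proceeds,
// with MaxPodGracePeriodSeconds applied as the grace period override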
+ fakeClock.Step(1 * time.Minute) + if tc.inducePressureOnWhichFs == "nodefs" { + diskStatStart.rootFsAvailableBytes = tc.hardDiskPressure + } else if tc.inducePressureOnWhichFs == "imagefs" { + diskStatStart.imageFsAvailableBytes = tc.hardDiskPressure + } else if tc.inducePressureOnWhichFs == "containerfs" { + diskStatStart.containerFsAvailableBytes = tc.hardDiskPressure + } + summaryProvider.result = summaryStatsMaker(diskStatStart) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // try to admit our pod (should succeed) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { - t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure") + } + + // check the right pod was killed + if podKiller.pod != podToEvict { + t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } + observedGracePeriod = *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) + } + + // try to admit our pod (should fail) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) + } + + // reduce disk pressure + fakeClock.Step(1 * time.Minute) + + summaryProvider.result = summaryStatsMaker(diskStatConst) + podKiller.pod = nil // reset state + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + // we should have disk pressure (because transition period not yet met) + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // try to admit our pod (should fail) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatConst) + podKiller.pod = nil // reset state + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should not have disk pressure (because transition period met) + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // try to admit our pod (should succeed) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + } + }) } } @@ -896,7 +1199,7 @@ func TestMinReclaim(t *testing.T) { fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} - diskInfoProvider := 
&mockDiskInfoProvider{dedicatedImageFs: false} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -931,8 +1234,10 @@ func TestMinReclaim(t *testing.T) { } // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) - + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Errorf("Manager should not report any errors") + } // we should not have memory pressure if manager.IsUnderMemoryPressure() { t.Errorf("Manager should not report memory pressure") @@ -941,7 +1246,11 @@ func TestMinReclaim(t *testing.T) { // induce memory pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -961,7 +1270,11 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -981,7 +1294,11 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -997,7 +1314,11 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should not have memory pressure (because transition period met) if manager.IsUnderMemoryPressure() { @@ -1011,37 +1332,33 @@ func TestMinReclaim(t *testing.T) { } func TestNodeReclaimFuncs(t *testing.T) { - podMaker := makePodWithDiskStats - summaryStatsMaker := makeDiskStats - podsToMake := []podToMake{ - {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, - {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, - {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, - {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, - {name: "low-priority-low-usage", priority: lowPriority, requests: 
newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, - } - pods := []*v1.Pod{} - podStats := map[*v1.Pod]statsapi.PodStats{} - for _, podToMake := range podsToMake { - pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed) - pods = append(pods, pod) - podStats[pod] = podStat - } - podToEvict := pods[0] - activePodsFunc := func() []*v1.Pod { - return pods - } - - fakeClock := testingclock.NewFakeClock(time.Now()) - podKiller := &mockPodKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - - config := Config{ - MaxPodGracePeriodSeconds: 5, - PressureTransitionPeriod: time.Minute * 5, - Thresholds: []evictionapi.Threshold{ - { + testCases := map[string]struct { + nodeFsStats string + imageFsStats string + containerFsStats string + kubeletSeparateDiskFeature bool + writeableSeparateFromReadOnly bool + expectContainerGcCall bool + expectImageGcCall bool + thresholdToMonitor evictionapi.Threshold + podToMakes []podToMake + dedicatedImageFs *bool + expectErr string + inducePressureOnWhichFs string + softDiskPressure string + hardDiskPressure string + }{ + "eviction due to disk pressure; no image fs": { + dedicatedImageFs: ptr.To(false), + nodeFsStats: "16Gi", + imageFsStats: "16Gi", + containerFsStats: "16Gi", + inducePressureOnWhichFs: "nodefs", + softDiskPressure: "1.5Gi", + hardDiskPressure: "750Mi", + expectContainerGcCall: true, + expectImageGcCall: true, + thresholdToMonitor: evictionapi.Threshold{ Signal: evictionapi.SignalNodeFsAvailable, Operator: evictionapi.OpLessThan, Value: evictionapi.ThresholdValue{ @@ -1051,186 +1368,376 @@ func TestNodeReclaimFuncs(t *testing.T) { Quantity: quantityMustParse("500Mi"), }, }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, + }, + }, + "eviction due to image disk pressure; image fs": { + dedicatedImageFs: ptr.To(true), + nodeFsStats: "16Gi", + imageFsStats: "16Gi", + containerFsStats: "16Gi", + softDiskPressure: "1.5Gi", + hardDiskPressure: "750Mi", + inducePressureOnWhichFs: "imagefs", + expectContainerGcCall: true, + expectImageGcCall: true, + thresholdToMonitor: evictionapi.Threshold{ + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: 
newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, + }, + }, + "eviction due to container disk pressure; container fs": { + dedicatedImageFs: ptr.To(true), + kubeletSeparateDiskFeature: true, + writeableSeparateFromReadOnly: true, + nodeFsStats: "16Gi", + imageFsStats: "16Gi", + containerFsStats: "16Gi", + softDiskPressure: "1.5Gi", + hardDiskPressure: "750Mi", + inducePressureOnWhichFs: "nodefs", + expectContainerGcCall: true, + expectImageGcCall: false, + thresholdToMonitor: evictionapi.Threshold{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, + }, + }, + "eviction due to image disk pressure; container fs": { + dedicatedImageFs: ptr.To(true), + kubeletSeparateDiskFeature: true, + writeableSeparateFromReadOnly: true, + nodeFsStats: "16Gi", + imageFsStats: "16Gi", + containerFsStats: "16Gi", + softDiskPressure: "1.5Gi", + hardDiskPressure: "750Mi", + inducePressureOnWhichFs: "imagefs", + expectContainerGcCall: false, + expectImageGcCall: true, + thresholdToMonitor: evictionapi.Threshold{ + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, + {name: 
"high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, + }, }, } - summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("16Gi", "200Gi", podStats)} - diskGC := &mockDiskGC{fakeSummaryProvider: summaryProvider, err: nil} - manager := &managerImpl{ - clock: fakeClock, - killPodFunc: podKiller.killPodNow, - imageGC: diskGC, - containerGC: diskGC, - config: config, - recorder: &record.FakeRecorder{}, - summaryProvider: summaryProvider, - nodeRef: nodeRef, - nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, - thresholdsFirstObservedAt: thresholdsObservedAt{}, - } - // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, tc.kubeletSeparateDiskFeature)() - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } + podMaker := makePodWithDiskStats + summaryStatsMaker := makeDiskStats + podsToMake := tc.podToMakes + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed) + pods = append(pods, pod) + podStats[pod] = podStat + } + podToEvict := pods[0] + activePodsFunc := func() []*v1.Pod { + return pods + } - // induce hard threshold - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats) - // make GC successfully return disk usage to previous levels - diskGC.summaryAfterGC = summaryStatsMaker("16Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + fakeClock := testingclock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs} + nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure since soft threshold was met") - } + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: []evictionapi.Threshold{tc.thresholdToMonitor}, + } + diskStatStart := diskStats{ + rootFsAvailableBytes: tc.nodeFsStats, + imageFsAvailableBytes: tc.imageFsStats, + containerFsAvailableBytes: tc.containerFsStats, + podStats: podStats, + } + // This is a constant that we use to test that disk pressure is over. Don't change! 
+ diskStatConst := diskStatStart + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStatStart)} + diskGC := &mockDiskGC{fakeSummaryProvider: summaryProvider, err: nil, readAndWriteSeparate: tc.writeableSeparateFromReadOnly} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: diskGC, + containerGC: diskGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } - // verify image gc was invoked - if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked { - t.Errorf("Manager should have invoked image gc") - } + // synchronize + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) - // verify no pod was killed because image gc was sufficient - if podKiller.pod != nil { - t.Errorf("Manager should not have killed a pod, but killed: %v", podKiller.pod.Name) - } + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // reset state - diskGC.imageGCInvoked = false - diskGC.containerGCInvoked = false + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Errorf("Manager should not report disk pressure") + } - // remove disk pressure - fakeClock.Step(20 * time.Minute) - summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + // induce hard threshold + fakeClock.Step(1 * time.Minute) - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } + setDiskStatsBasedOnFs := func(whichFs string, diskPressure string, diskStat diskStats) diskStats { + if tc.inducePressureOnWhichFs == "nodefs" { + diskStat.rootFsAvailableBytes = diskPressure + } else if tc.inducePressureOnWhichFs == "imagefs" { + diskStat.imageFsAvailableBytes = diskPressure + } else if tc.inducePressureOnWhichFs == "containerfs" { + diskStat.containerFsAvailableBytes = diskPressure + } + return diskStat + } + newDiskAfterHardEviction := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) + summaryProvider.result = summaryStatsMaker(newDiskAfterHardEviction) + // make GC successfully return disk usage to previous levels + diskGC.summaryAfterGC = summaryStatsMaker(diskStatConst) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure since soft threshold was met") + } - // induce hard threshold - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats) - // make GC return disk usage bellow the threshold, but not satisfying minReclaim - diskGC.summaryAfterGC = summaryStatsMaker("1.1Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + // verify image, container or both gc were called. + // split filesystem can have container gc called without image. + // same filesystem should have both. 
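One caveat in the check that follows: with &&, the Fatalf fires only when both invocation flags disagree with the test case, so a single unexpected (or missing) GC call still passes. A stricter per-flag variant, offered only as an illustrative sketch using the same fields:

if diskGC.imageGCInvoked != tc.expectImageGcCall {
	t.Fatalf("image GC invoked = %v, want %v", diskGC.imageGCInvoked, tc.expectImageGcCall)
}
if diskGC.containerGCInvoked != tc.expectContainerGcCall {
	t.Fatalf("container GC invoked = %v, want %v", diskGC.containerGCInvoked, tc.expectContainerGcCall)
}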
+ if diskGC.imageGCInvoked != tc.expectImageGcCall && diskGC.containerGCInvoked != tc.expectContainerGcCall { + t.Fatalf("Manager should have invoked image gc") + } - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure since soft threshold was met") - } + // verify no pod was killed because image gc was sufficient + if podKiller.pod != nil { + t.Fatalf("Manager should not have killed a pod, but killed: %v", podKiller.pod.Name) + } - // verify image gc was invoked - if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked { - t.Errorf("Manager should have invoked image gc") - } + // reset state + diskGC.imageGCInvoked = false + diskGC.containerGCInvoked = false - // verify a pod was killed because image gc was not enough to satisfy minReclaim - if podKiller.pod == nil { - t.Errorf("Manager should have killed a pod, but didn't") - } + // remove disk pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatConst) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // reset state - diskGC.imageGCInvoked = false - diskGC.containerGCInvoked = false - podKiller.pod = nil + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // remove disk pressure - fakeClock.Step(20 * time.Minute) - summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } + // synchronize + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // induce disk pressure! - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats) - // Don't reclaim any disk - diskGC.summaryAfterGC = summaryStatsMaker("400Mi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure") - } + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } - // ensure disk gc was invoked - if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked { - t.Errorf("Manager should have invoked image gc") - } + // induce hard threshold + fakeClock.Step(1 * time.Minute) + newDiskAfterHardEviction = setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) + summaryProvider.result = summaryStatsMaker(newDiskAfterHardEviction) + // make GC return disk usage below the threshold, but not satisfying minReclaim + gcBelowThreshold := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, "1.1G", newDiskAfterHardEviction) + diskGC.summaryAfterGC = summaryStatsMaker(gcBelowThreshold) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // check the right pod was killed - if podKiller.pod != podToEvict { - t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) - } - observedGracePeriod := *podKiller.gracePeriodOverride - if observedGracePeriod != int64(0) { - t.Errorf("Manager chose to kill pod with incorrect grace period.
Expected: %d, actual: %d", 0, observedGracePeriod) - } + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // reduce disk pressure - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - diskGC.imageGCInvoked = false // reset state - diskGC.containerGCInvoked = false // reset state - podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure since soft threshold was met") + } - // we should have disk pressure (because transition period not yet met) - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure") - } + // verify image, container or both gc were called. + // split filesystem can have container gc called without image. + // same filesystem should have both. + if diskGC.imageGCInvoked != tc.expectImageGcCall && diskGC.containerGCInvoked != tc.expectContainerGcCall { + t.Fatalf("Manager should have invoked image gc") + } - // no image gc should have occurred - if diskGC.imageGCInvoked || diskGC.containerGCInvoked { - t.Errorf("Manager chose to perform image gc when it was not needed") - } + // verify a pod was killed because image gc was not enough to satisfy minReclaim + if podKiller.pod == nil { + t.Fatalf("Manager should have killed a pod, but didn't") + } - // no pod should have been killed - if podKiller.pod != nil { - t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) - } + // reset state + diskGC.imageGCInvoked = false + diskGC.containerGCInvoked = false + podKiller.pod = nil - // move the clock past transition period to ensure that we stop reporting pressure - fakeClock.Step(5 * time.Minute) - summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - diskGC.imageGCInvoked = false // reset state - diskGC.containerGCInvoked = false // reset state - podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + // remove disk pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatConst) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // we should not have disk pressure (because transition period met) - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // no image gc should have occurred - if diskGC.imageGCInvoked || diskGC.containerGCInvoked { - t.Errorf("Manager chose to perform image gc when it was not needed") - } + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } - // no pod should have been killed - if podKiller.pod != nil { - t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + // induce disk pressure! 
+ fakeClock.Step(1 * time.Minute) + softDiskPressure := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) + summaryProvider.result = summaryStatsMaker(softDiskPressure) + // Don't reclaim any disk + diskGC.summaryAfterGC = summaryStatsMaker(softDiskPressure) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure") + } + + // verify image, container or both gc were called. + // split filesystem can have container gc called without image. + // same filesystem should have both. + if diskGC.imageGCInvoked != tc.expectImageGcCall && diskGC.containerGCInvoked != tc.expectContainerGcCall { + t.Fatalf("Manager should have invoked image gc") + } + + // check the right pod was killed + if podKiller.pod != podToEvict { + t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } + observedGracePeriod := *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) + } + + // reduce disk pressure + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatConst) + diskGC.imageGCInvoked = false // reset state + diskGC.containerGCInvoked = false // reset state + podKiller.pod = nil // reset state + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should have disk pressure (because transition period not yet met) + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure") + } + + if diskGC.imageGCInvoked || diskGC.containerGCInvoked { + t.Errorf("Manager chose to perform image gc when it was not needed") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatConst) + diskGC.imageGCInvoked = false // reset state + diskGC.containerGCInvoked = false // reset state + podKiller.pod = nil // reset state + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should not have disk pressure (because transition period met) + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } + + if diskGC.imageGCInvoked || diskGC.containerGCInvoked { + t.Errorf("Manager chose to perform image gc when it was not needed") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + }) } } -func TestInodePressureNodeFsInodes(t *testing.T) { +func TestInodePressureFsInodes(t *testing.T) { podMaker := func(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, rootInodes, logInodes, volumeInodes string) (*v1.Pod, statsapi.PodStats) { pod := newPod(name, priority, []v1.Container{ newContainer(name, requests, limits), @@ -1238,17 +1745,38 @@ func TestInodePressureNodeFsInodes(t *testing.T) { podStats := 
newPodInodeStats(pod, parseQuantity(rootInodes), parseQuantity(logInodes), parseQuantity(volumeInodes)) return pod, podStats } - summaryStatsMaker := func(rootFsInodesFree, rootFsInodes string, podStats map[*v1.Pod]statsapi.PodStats) *statsapi.Summary { + summaryStatsMaker := func(rootFsInodesFree, rootFsInodes, imageFsInodesFree, imageFsInodes, containerFsInodesFree, containerFsInodes string, podStats map[*v1.Pod]statsapi.PodStats) *statsapi.Summary { rootFsInodesFreeVal := resource.MustParse(rootFsInodesFree) internalRootFsInodesFree := uint64(rootFsInodesFreeVal.Value()) rootFsInodesVal := resource.MustParse(rootFsInodes) internalRootFsInodes := uint64(rootFsInodesVal.Value()) + + imageFsInodesFreeVal := resource.MustParse(imageFsInodesFree) + internalImageFsInodesFree := uint64(imageFsInodesFreeVal.Value()) + imageFsInodesVal := resource.MustParse(imageFsInodes) + internalImageFsInodes := uint64(imageFsInodesVal.Value()) + + containerFsInodesFreeVal := resource.MustParse(containerFsInodesFree) + internalContainerFsInodesFree := uint64(containerFsInodesFreeVal.Value()) + containerFsInodesVal := resource.MustParse(containerFsInodes) + internalContainerFsInodes := uint64(containerFsInodesVal.Value()) + result := &statsapi.Summary{ Node: statsapi.NodeStats{ Fs: &statsapi.FsStats{ InodesFree: &internalRootFsInodesFree, Inodes: &internalRootFsInodes, }, + Runtime: &statsapi.RuntimeStats{ + ImageFs: &statsapi.FsStats{ + InodesFree: &internalImageFsInodesFree, + Inodes: &internalImageFsInodes, + }, + ContainerFs: &statsapi.FsStats{ + InodesFree: &internalContainerFsInodesFree, + Inodes: &internalContainerFsInodes, + }, + }, }, Pods: []statsapi.PodStats{}, } @@ -1257,196 +1785,356 @@ func TestInodePressureNodeFsInodes(t *testing.T) { } return result } - podsToMake := []podToMake{ - {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsInodesUsed: "900Mi"}, - {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsInodesUsed: "50Mi"}, - {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsInodesUsed: "400Mi"}, - {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsInodesUsed: "400Mi"}, - {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsInodesUsed: "100Mi"}, - } - pods := []*v1.Pod{} - podStats := map[*v1.Pod]statsapi.PodStats{} - for _, podToMake := range podsToMake { - pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsInodesUsed, podToMake.logsFsInodesUsed, podToMake.perLocalVolumeInodesUsed) - pods = append(pods, pod) - podStats[pod] = podStat - } - podToEvict := pods[0] - activePodsFunc := func() []*v1.Pod { - return pods + + setINodesFreeBasedOnFs := func(whichFs string, inodesFree string, diskStat *statsapi.Summary) *statsapi.Summary { + inodesFreeVal := resource.MustParse(inodesFree) + internalFsInodesFree := uint64(inodesFreeVal.Value()) + + if whichFs == "nodefs" { + diskStat.Node.Fs.InodesFree = &internalFsInodesFree + } else if whichFs == "imagefs" { + diskStat.Node.Runtime.ImageFs.InodesFree = &internalFsInodesFree + } else if whichFs == "containerfs" { + 
diskStat.Node.Runtime.ContainerFs.InodesFree = &internalFsInodesFree + } + return diskStat } - fakeClock := testingclock.NewFakeClock(time.Now()) - podKiller := &mockPodKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - diskGC := &mockDiskGC{err: nil} - nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - - config := Config{ - MaxPodGracePeriodSeconds: 5, - PressureTransitionPeriod: time.Minute * 5, - Thresholds: []evictionapi.Threshold{ - { - Signal: evictionapi.SignalNodeFsInodesFree, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("1Mi"), + testCases := map[string]struct { + nodeFsInodesFree string + nodeFsInodes string + imageFsInodesFree string + imageFsInodes string + containerFsInodesFree string + containerFsInodes string + kubeletSeparateDiskFeature bool + writeableSeparateFromReadOnly bool + thresholdToMonitor []evictionapi.Threshold + podToMakes []podToMake + dedicatedImageFs *bool + expectErr string + inducePressureOnWhichFs string + softINodePressure string + hardINodePressure string + }{ + "eviction due to disk pressure; no image fs": { + dedicatedImageFs: ptr.To(false), + nodeFsInodesFree: "3Mi", + nodeFsInodes: "4Mi", + imageFsInodesFree: "3Mi", + imageFsInodes: "4Mi", + containerFsInodesFree: "3Mi", + containerFsInodes: "4Mi", + inducePressureOnWhichFs: "nodefs", + softINodePressure: "1.5Mi", + hardINodePressure: "0.5Mi", + thresholdToMonitor: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Mi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Mi"), + }, + GracePeriod: time.Minute * 2, }, }, - { - Signal: evictionapi.SignalNodeFsInodesFree, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("2Mi"), + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsInodesUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsInodesUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsInodesUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsInodesUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsInodesUsed: "100Mi"}, + }, + }, + "eviction due to image disk pressure; image fs": { + dedicatedImageFs: ptr.To(true), + nodeFsInodesFree: "3Mi", + nodeFsInodes: "4Mi", + imageFsInodesFree: "3Mi", + imageFsInodes: "4Mi", + containerFsInodesFree: "3Mi", + containerFsInodes: "4Mi", + softINodePressure: "1.5Mi", + hardINodePressure: "0.5Mi", + inducePressureOnWhichFs: "imagefs", + thresholdToMonitor: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Mi"), + }, }, - GracePeriod: time.Minute * 
2, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Mi"), + }, + GracePeriod: time.Minute * 2, + }, + }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsInodesUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsInodesUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsInodesUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsInodesUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsInodesUsed: "100Mi"}, + }, + }, + "eviction due to container disk pressure; container fs": { + dedicatedImageFs: ptr.To(true), + kubeletSeparateDiskFeature: true, + writeableSeparateFromReadOnly: true, + nodeFsInodesFree: "3Mi", + nodeFsInodes: "4Mi", + imageFsInodesFree: "3Mi", + imageFsInodes: "4Mi", + containerFsInodesFree: "3Mi", + containerFsInodes: "4Mi", + softINodePressure: "1.5Mi", + hardINodePressure: "0.5Mi", + inducePressureOnWhichFs: "nodefs", + thresholdToMonitor: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Mi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Mi"), + }, + GracePeriod: time.Minute * 2, + }, + }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsInodesUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsInodesUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsInodesUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsInodesUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsInodesUsed: "100Mi"}, }, }, } - summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("3Mi", "4Mi", podStats)} - manager := &managerImpl{ - clock: fakeClock, - killPodFunc: podKiller.killPodNow, - imageGC: diskGC, - containerGC: diskGC, - config: config, - recorder: &record.FakeRecorder{}, - summaryProvider: summaryProvider, - nodeRef: nodeRef, - nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, - thresholdsFirstObservedAt: thresholdsObservedAt{}, - } - // create a best effort pod to test admission - podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0", "0", "0") + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + defer 
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, tc.kubeletSeparateDiskFeature)() - // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + podMaker := podMaker + summaryStatsMaker := summaryStatsMaker + podsToMake := tc.podToMakes + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsInodesUsed, podToMake.logsFsInodesUsed, podToMake.perLocalVolumeInodesUsed) + pods = append(pods, pod) + podStats[pod] = podStat + } + podToEvict := pods[0] + activePodsFunc := func() []*v1.Pod { + return pods + } - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report inode pressure") - } + fakeClock := testingclock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs} + diskGC := &mockDiskGC{err: nil, readAndWriteSeparate: tc.writeableSeparateFromReadOnly} + nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - // try to admit our pod (should succeed) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { - t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) - } + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: tc.thresholdToMonitor, + } + startingStatsConst := summaryStatsMaker(tc.nodeFsInodesFree, tc.nodeFsInodes, tc.imageFsInodesFree, tc.imageFsInodes, tc.containerFsInodesFree, tc.containerFsInodes, podStats) + startingStatsModified := summaryStatsMaker(tc.nodeFsInodesFree, tc.nodeFsInodes, tc.imageFsInodesFree, tc.imageFsInodes, tc.containerFsInodesFree, tc.containerFsInodes, podStats) + summaryProvider := &fakeSummaryProvider{result: startingStatsModified} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: diskGC, + containerGC: diskGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } - // induce soft threshold - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + // create a best effort pod to test admission + podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0", "0", "0") - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report inode pressure since soft threshold was met") - } + // synchronize + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) - // verify no pod was yet killed because there has not yet been enough time passed. 
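Why this test builds two byte-identical summaries: setINodesFreeBasedOnFs mutates the *statsapi.Summary it is handed and returns that same pointer, so startingStatsModified serves as the working copy while startingStatsConst stays pristine for the "pressure removed" phases. Aliasing a single summary would silently lose the baseline, as this sketch shows (values from the first test case; illustrative only):

base := summaryStatsMaker("3Mi", "4Mi", "3Mi", "4Mi", "3Mi", "4Mi", podStats)
alias := base // copies the pointer, not the Summary
_ = setINodesFreeBasedOnFs("nodefs", "0.5Mi", base)
// alias.Node.Fs.InodesFree now reflects 0.5Mi as well; a second
// summaryStatsMaker call is what keeps an untouched baseline around.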
- if podKiller.pod != nil { - t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name) - } + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // step forward in time pass the grace period - fakeClock.Step(3 * time.Minute) - summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report inode pressure") + } - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report inode pressure since soft threshold was met") - } + // try to admit our pod (should succeed) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + } - // verify the right pod was killed with the right grace period. - if podKiller.pod != podToEvict { - t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) - } - if podKiller.gracePeriodOverride == nil { - t.Errorf("Manager chose to kill pod but should have had a grace period override.") - } - observedGracePeriod := *podKiller.gracePeriodOverride - if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { - t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) - } - // reset state - podKiller.pod = nil - podKiller.gracePeriodOverride = nil + // induce soft threshold + fakeClock.Step(1 * time.Minute) + summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, tc.softINodePressure, startingStatsModified) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // remove inode pressure - fakeClock.Step(20 * time.Minute) - summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report inode pressure") - } + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report inode pressure since soft threshold was met") + } - // induce inode pressure! - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + // verify no pod was yet killed because enough time has not yet passed.
+ if podKiller.pod != nil { + t.Fatalf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name) + } - // step forward in time pass the grace period - fakeClock.Step(3 * time.Minute) - summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + // step forward in time past the grace period + fakeClock.Step(3 * time.Minute) + summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, tc.softINodePressure, startingStatsModified) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report inode pressure since soft threshold was met") - } + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // check the right pod was killed - if podKiller.pod != podToEvict { - t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) - } - observedGracePeriod = *podKiller.gracePeriodOverride - if observedGracePeriod != int64(0) { - t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) - } + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report inode pressure since soft threshold was met") + } - // try to admit our pod (should fail) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { - t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) - } + // verify the right pod was killed with the right grace period. + if podKiller.pod != podToEvict { + t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } + if podKiller.gracePeriodOverride == nil { + t.Fatalf("Manager chose to kill pod but should have had a grace period override.") + } + observedGracePeriod := *podKiller.gracePeriodOverride + if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { + t.Fatalf("Manager chose to kill pod with incorrect grace period.
Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) + } + // reset state + podKiller.pod = nil + podKiller.gracePeriodOverride = nil - // we should have disk pressure (because transition period not yet met) - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report inode pressure") - } + // remove inode pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = startingStatsConst + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // no pod should have been killed - if podKiller.pod != nil { - t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) - } + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // try to admit our pod (should fail) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { - t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) - } + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report inode pressure") + } - // move the clock past transition period to ensure that we stop reporting pressure - fakeClock.Step(5 * time.Minute) - summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) - podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + // induce inode pressure! + fakeClock.Step(1 * time.Minute) + summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, tc.hardINodePressure, startingStatsModified) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // we should not have disk pressure (because transition period met) - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report inode pressure") - } + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } - // no pod should have been killed - if podKiller.pod != nil { - t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) - } + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report inode pressure") + } - // try to admit our pod (should succeed) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { - t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + // check the right pod was killed + if podKiller.pod != podToEvict { + t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } + observedGracePeriod = *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Fatalf("Manager chose to kill pod with incorrect grace period. 
Expected: %d, actual: %d", 0, observedGracePeriod) + } + + // try to admit our pod (should fail) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) + } + + // reduce inode pressure + fakeClock.Step(1 * time.Minute) + summaryProvider.result = startingStatsConst + podKiller.pod = nil // reset state + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should have disk pressure (because transition period not yet met) + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report inode pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // try to admit our pod (should fail) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + summaryProvider.result = startingStatsConst + podKiller.pod = nil // reset state + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should not have disk pressure (because transition period met) + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report inode pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Fatalf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // try to admit our pod (should succeed) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + } + }) } } @@ -1480,7 +2168,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{ Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "", @@ -1523,7 +2211,11 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1538,7 +2230,11 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1556,7 +2252,11 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // remove memory pressure 
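+ // 20 minutes is well past the soft grace period and the pressure transition period, + // so the healthier stats below should clear the memory pressure condition.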
fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -1572,7 +2272,11 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // induce memory pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1605,7 +2309,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -1641,7 +2345,11 @@ func TestAllocatableMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newResourceList("100m", "100Mi", ""), newResourceList("200m", "200Mi", ""), "0Gi") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -1661,7 +2369,11 @@ func TestAllocatableMemoryPressure(t *testing.T) { pod, podStat := podMaker("guaranteed-high-2", defaultPriority, newResourceList("100m", "1Gi", ""), newResourceList("100m", "1Gi", ""), "1Gi") podStats[pod] = podStat summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1697,7 +2409,11 @@ func TestAllocatableMemoryPressure(t *testing.T) { } summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -1721,7 +2437,11 @@ func TestAllocatableMemoryPressure(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // we should not have memory pressure (because transition period met) if manager.IsUnderMemoryPressure() { @@ -1749,7 +2469,7 @@ func TestUpdateMemcgThreshold(t *testing.T) { fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} - diskInfoProvider := 
&mockDiskInfoProvider{dedicatedImageFs: false} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -1790,14 +2510,26 @@ func TestUpdateMemcgThreshold(t *testing.T) { } // The UpdateThreshold method should have been called once, since this is the first run. - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // The UpdateThreshold method should not have been called again, since not enough time has passed - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // The UpdateThreshold method should be called again since enough time has passed fakeClock.Step(2 * notifierRefreshInterval) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } // new memory threshold notifier that returns an error thresholdNotifier = NewMockThresholdNotifier(mockCtrl) @@ -1808,7 +2540,11 @@ // The UpdateThreshold method should be called because at least notifierRefreshInterval time has passed. // The Description method should be called because UpdateThreshold returned an error fakeClock.Step(2 * notifierRefreshInterval) - manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } } func TestManagerWithLocalStorageCapacityIsolationOpen(t *testing.T) { @@ -1828,7 +2564,12 @@ podStats[pod] = podStat } - summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("1Gi", "200Mi", podStats)} + diskStat := diskStats{ + rootFsAvailableBytes: "1Gi", + imageFsAvailableBytes: "200Mi", + podStats: podStats, + } + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStat)} config := Config{ MaxPodGracePeriodSeconds: 5, @@ -1848,7 +2589,7 @@ diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} fakeClock := testingclock.NewFakeClock(time.Now()) - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} mgr := &managerImpl{ clock: fakeClock, @@ -1860,15 +2601,18 @@ summaryProvider: summaryProvider, nodeRef: nodeRef, localStorageCapacityIsolation: true, - dedicatedImageFs: &diskInfoProvider.dedicatedImageFs, + dedicatedImageFs: diskInfoProvider.dedicatedImageFs, } activePodsFunc := func() []*v1.Pod { return pods } - evictedPods := mgr.synchronize(diskInfoProvider, activePodsFunc) + evictedPods, err := mgr.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager should not have an error but got %v", err) + } if podKiller.pod == nil { t.Fatalf("Manager should have selected a pod for eviction") } diff --git a/pkg/kubelet/eviction/helpers.go 
b/pkg/kubelet/eviction/helpers.go index cb593352356..19d09ec9da5 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -17,6 +17,7 @@ limitations under the License. package eviction import ( + "errors" "fmt" "sort" "strconv" @@ -80,9 +81,11 @@ func init() { signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure + signalToNodeCondition[evictionapi.SignalContainerFsAvailable] = v1.NodeDiskPressure signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure + signalToNodeCondition[evictionapi.SignalContainerFsInodesFree] = v1.NodeDiskPressure signalToNodeCondition[evictionapi.SignalPIDAvailable] = v1.NodePIDPressure // map signals to resources (and vice-versa) @@ -91,6 +94,8 @@ func init() { signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory signalToResource[evictionapi.SignalImageFsAvailable] = v1.ResourceEphemeralStorage signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes + signalToResource[evictionapi.SignalContainerFsAvailable] = v1.ResourceEphemeralStorage + signalToResource[evictionapi.SignalContainerFsInodesFree] = resourceInodes signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes signalToResource[evictionapi.SignalPIDAvailable] = resourcePids @@ -172,6 +177,181 @@ func addAllocatableThresholds(thresholds []evictionapi.Threshold) []evictionapi. return append(append([]evictionapi.Threshold{}, thresholds...), additionalThresholds...) } +// UpdateContainerFsThresholds will add containerfs eviction hard/soft +// settings based on container runtime settings. +// Thresholds are parsed from the evictionHard and evictionSoft limits, so any +// containerfs settings supplied there will be overridden. +// If there is a single filesystem, then the containerfs settings are the same as nodefs. +// If there is a separate image filesystem for both containers and images, then the containerfs settings are the same as imagefs.
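+// For example (values mirrored from the tests below, illustrative only): on a single +// filesystem, a nodefs.available hard threshold of 100Mi produces a matching +// containerfs.available hard threshold of 100Mi; with a dedicated imagefs shared by +// images and containers, an imagefs.available hard threshold of 150Mi is copied into +// containerfs.available instead.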
+func UpdateContainerFsThresholds(thresholds []evictionapi.Threshold, imageFs, separateContainerImageFs bool) ([]evictionapi.Threshold, error) { + hardNodeFsDisk := evictionapi.Threshold{} + softNodeFsDisk := evictionapi.Threshold{} + hardNodeINodeDisk := evictionapi.Threshold{} + softNodeINodeDisk := evictionapi.Threshold{} + hardImageFsDisk := evictionapi.Threshold{} + softImageFsDisk := evictionapi.Threshold{} + hardImageINodeDisk := evictionapi.Threshold{} + softImageINodeDisk := evictionapi.Threshold{} + + hardContainerFsDisk := -1 + softContainerFsDisk := -1 + hardContainerFsINodes := -1 + softContainerFsINodes := -1 + // Find the imagefs and nodefs thresholds + var err error + for idx, threshold := range thresholds { + if threshold.Signal == evictionapi.SignalImageFsAvailable && isHardEvictionThreshold(threshold) { + hardImageFsDisk = threshold + } + if threshold.Signal == evictionapi.SignalImageFsAvailable && !isHardEvictionThreshold(threshold) { + softImageFsDisk = threshold + } + if threshold.Signal == evictionapi.SignalImageFsInodesFree && isHardEvictionThreshold(threshold) { + hardImageINodeDisk = threshold + } + if threshold.Signal == evictionapi.SignalImageFsInodesFree && !isHardEvictionThreshold(threshold) { + softImageINodeDisk = threshold + } + if threshold.Signal == evictionapi.SignalNodeFsAvailable && isHardEvictionThreshold(threshold) { + hardNodeFsDisk = threshold + } + if threshold.Signal == evictionapi.SignalNodeFsAvailable && !isHardEvictionThreshold(threshold) { + softNodeFsDisk = threshold + } + if threshold.Signal == evictionapi.SignalNodeFsInodesFree && isHardEvictionThreshold(threshold) { + hardNodeINodeDisk = threshold + } + if threshold.Signal == evictionapi.SignalNodeFsInodesFree && !isHardEvictionThreshold(threshold) { + softNodeINodeDisk = threshold + } + // We log a warning and override the settings. + // This is safe because we do not support user-configured containerfs thresholds, + // so the limits must be the same as either nodefs or imagefs. + if threshold.Signal == evictionapi.SignalContainerFsAvailable && isHardEvictionThreshold(threshold) { + err = errors.Join(err, fmt.Errorf("found containerfs.available for hard eviction. ignoring")) + hardContainerFsDisk = idx + } + if threshold.Signal == evictionapi.SignalContainerFsAvailable && !isHardEvictionThreshold(threshold) { + err = errors.Join(err, fmt.Errorf("found containerfs.available for soft eviction. ignoring")) + softContainerFsDisk = idx + } + if threshold.Signal == evictionapi.SignalContainerFsInodesFree && isHardEvictionThreshold(threshold) { + err = errors.Join(err, fmt.Errorf("found containerfs.inodesFree for hard eviction. ignoring")) + hardContainerFsINodes = idx + } + if threshold.Signal == evictionapi.SignalContainerFsInodesFree && !isHardEvictionThreshold(threshold) { + err = errors.Join(err, fmt.Errorf("found containerfs.inodesFree for soft eviction. 
ignoring")) + softContainerFsINodes = idx + } + } + // Either split disk case (containerfs=nodefs) or single filesystem + if (imageFs && separateContainerImageFs) || (!imageFs && !separateContainerImageFs) { + if hardContainerFsDisk != -1 { + thresholds[hardContainerFsDisk] = evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, Operator: hardNodeFsDisk.Operator, Value: hardNodeFsDisk.Value, MinReclaim: hardNodeFsDisk.MinReclaim, + } + } else { + thresholds = append(thresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: hardNodeFsDisk.Operator, + Value: hardNodeFsDisk.Value, + MinReclaim: hardNodeFsDisk.MinReclaim, + }) + } + if softContainerFsDisk != -1 { + thresholds[softContainerFsDisk] = evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, GracePeriod: softNodeFsDisk.GracePeriod, Operator: softNodeFsDisk.Operator, Value: softNodeFsDisk.Value, MinReclaim: softNodeFsDisk.MinReclaim, + } + } else { + thresholds = append(thresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: softNodeFsDisk.Operator, + Value: softNodeFsDisk.Value, + MinReclaim: softNodeFsDisk.MinReclaim, + GracePeriod: softNodeFsDisk.GracePeriod, + }) + } + if hardContainerFsINodes != -1 { + thresholds[hardContainerFsINodes] = evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, Operator: hardNodeINodeDisk.Operator, Value: hardNodeINodeDisk.Value, MinReclaim: hardNodeINodeDisk.MinReclaim, + } + } else { + thresholds = append(thresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: hardNodeINodeDisk.Operator, + Value: hardNodeINodeDisk.Value, + MinReclaim: hardNodeINodeDisk.MinReclaim, + }) + } + if softContainerFsINodes != -1 { + thresholds[softContainerFsINodes] = evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, GracePeriod: softNodeINodeDisk.GracePeriod, Operator: softNodeINodeDisk.Operator, Value: softNodeINodeDisk.Value, MinReclaim: softNodeINodeDisk.MinReclaim, + } + } else { + thresholds = append(thresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: softNodeINodeDisk.Operator, + Value: softNodeINodeDisk.Value, + MinReclaim: softNodeINodeDisk.MinReclaim, + GracePeriod: softNodeINodeDisk.GracePeriod, + }) + } + } + // Separate image filesystem case + if imageFs && !separateContainerImageFs { + if hardContainerFsDisk != -1 { + thresholds[hardContainerFsDisk] = evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, Operator: hardImageFsDisk.Operator, Value: hardImageFsDisk.Value, MinReclaim: hardImageFsDisk.MinReclaim, + } + } else { + thresholds = append(thresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: hardImageFsDisk.Operator, + Value: hardImageFsDisk.Value, + MinReclaim: hardImageFsDisk.MinReclaim, + }) + } + if softContainerFsDisk != -1 { + thresholds[softContainerFsDisk] = evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, GracePeriod: softImageFsDisk.GracePeriod, Operator: softImageFsDisk.Operator, Value: softImageFsDisk.Value, MinReclaim: softImageFsDisk.MinReclaim, + } + } else { + thresholds = append(thresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: softImageFsDisk.Operator, + Value: softImageFsDisk.Value, + MinReclaim: softImageFsDisk.MinReclaim, + GracePeriod: softImageFsDisk.GracePeriod, + }) + } + if hardContainerFsINodes != -1 
{ + thresholds[hardContainerFsINodes] = evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, Operator: hardImageINodeDisk.Operator, Value: hardImageINodeDisk.Value, MinReclaim: hardImageINodeDisk.MinReclaim, + } + } else { + thresholds = append(thresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: hardImageINodeDisk.Operator, + Value: hardImageINodeDisk.Value, + MinReclaim: hardImageINodeDisk.MinReclaim, + }) + } + if softContainerFsINodes != -1 { + thresholds[softContainerFsINodes] = evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, GracePeriod: softImageINodeDisk.GracePeriod, Operator: softImageINodeDisk.Operator, Value: softImageINodeDisk.Value, MinReclaim: softImageINodeDisk.MinReclaim, + } + } else { + thresholds = append(thresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: softImageINodeDisk.Operator, + Value: softImageINodeDisk.Value, + MinReclaim: softImageINodeDisk.MinReclaim, + GracePeriod: softImageINodeDisk.GracePeriod, + }) + } + } + return thresholds, err +} + // parseThresholdStatements parses the input statements into a list of Threshold objects. func parseThresholdStatements(statements map[string]string) ([]evictionapi.Threshold, error) { if len(statements) == 0 { @@ -708,12 +888,28 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI), time: imageFs.Time, } - if imageFs.InodesFree != nil && imageFs.Inodes != nil { - result[evictionapi.SignalImageFsInodesFree] = signalObservation{ - available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI), - capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI), - time: imageFs.Time, - } + } + if imageFs.InodesFree != nil && imageFs.Inodes != nil { + result[evictionapi.SignalImageFsInodesFree] = signalObservation{ + available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI), + capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI), + time: imageFs.Time, + } + } + } + if containerFs := summary.Node.Runtime.ContainerFs; containerFs != nil { + if containerFs.AvailableBytes != nil && containerFs.CapacityBytes != nil { + result[evictionapi.SignalContainerFsAvailable] = signalObservation{ + available: resource.NewQuantity(int64(*containerFs.AvailableBytes), resource.BinarySI), + capacity: resource.NewQuantity(int64(*containerFs.CapacityBytes), resource.BinarySI), + time: containerFs.Time, + } + } + if containerFs.InodesFree != nil && containerFs.Inodes != nil { + result[evictionapi.SignalContainerFsInodesFree] = signalObservation{ + available: resource.NewQuantity(int64(*containerFs.InodesFree), resource.DecimalSI), + capacity: resource.NewQuantity(int64(*containerFs.Inodes), resource.DecimalSI), + time: containerFs.Time, } } } @@ -951,27 +1147,47 @@ func isAllocatableEvictionThreshold(threshold evictionapi.Threshold) bool { } // buildSignalToRankFunc returns ranking functions associated with resources -func buildSignalToRankFunc(withImageFs bool) map[evictionapi.Signal]rankFunc { +func buildSignalToRankFunc(withImageFs bool, imageContainerSplitFs bool) map[evictionapi.Signal]rankFunc { signalToRankFunc := map[evictionapi.Signal]rankFunc{ evictionapi.SignalMemoryAvailable: rankMemoryPressure, evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure, 
evictionapi.SignalPIDAvailable: rankPIDPressure, } // usage of an imagefs is optional - if withImageFs { + // If we have a dedicated image filesystem (images and containers on the same disk), + // then we treat it as a plain separate imagefs + if withImageFs && !imageContainerSplitFs { // with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage) signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) - // with an imagefs, imagefs pod rank func for eviction only includes rootfs - signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, v1.ResourceEphemeralStorage) - signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes) + // with an imagefs, imagefs pod rank func for eviction includes rootfs and images + signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsImages}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsImages}, resourceInodes) + signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalImageFsAvailable] + signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalImageFsInodesFree] + + // If both imagefs and containerfs are on separate disks + // we want to track the writeable layer in the containerfs signals. + } else if withImageFs && imageContainerSplitFs { + // with a split imagefs and containerfs, nodefs pod rank func for eviction includes logs, local volumes and rootfs + signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot}, resourceInodes) + signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalNodeFsAvailable] + signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalNodeFsInodesFree] + // with a split imagefs and containerfs, imagefs pod rank func for eviction only includes images + + signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages}, resourceInodes) + // Otherwise the image filesystem is not on a separate disk from root } else { // without an imagefs, nodefs pod rank func for eviction looks at all fs stats. // since imagefs and nodefs share a common device, they share common ranking functions.
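+ // i.e. every disk signal ranks pods by the same combined usage of images, rootfs, + // logs and local volumes, since they all live on one device.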
- signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage) - signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) - signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage) - signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) + signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) + signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) + signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalNodeFsAvailable] + signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalNodeFsInodesFree] } return signalToRankFunc } @@ -982,19 +1198,29 @@ func PodIsEvicted(podStatus v1.PodStatus) bool { } // buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources. 
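+// For example, when images and containers live on separate disks, imagefs pressure +// triggers only image garbage collection and nodefs/containerfs pressure triggers +// only container garbage collection; on a shared device every disk signal runs both +// collectors.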
-func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs { +func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool, splitContainerImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs { signalToReclaimFunc := map[evictionapi.Signal]nodeReclaimFuncs{} // usage of an imagefs is optional - if withImageFs { + if withImageFs && !splitContainerImageFs { // with an imagefs, nodefs pressure should just delete logs signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{} signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{} // with an imagefs, imagefs pressure should delete unused images signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} + // when imagefs and containerfs are on separate disks: + // container GC runs on containerfs pressure, + // image GC runs on imagefs pressure + } else if withImageFs && splitContainerImageFs { + // with an imagefs, imagefs pressure should delete unused images + signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{imageGC.DeleteUnusedImages} + signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{imageGC.DeleteUnusedImages} + // with a split fs and imagefs, containerfs pressure should delete unused containers + signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers} + signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers} } else { // without an imagefs, nodefs pressure should delete logs, and unused images - // since imagefs and nodefs share a common device, they share common reclaim functions + // since imagefs, containerfs and nodefs share a common device, they share common reclaim functions signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index 0a7b3c51f8a..e3a8f8a4c42 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -90,6 +90,16 @@ func TestGetReclaimableThreshold(t *testing.T) { Quantity: quantityMustParse("1Gi"), }, }, + { + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, }, }, } @@ -1232,6 +1242,12 @@ func TestMakeSignalObservations(t *testing.T) { imageFsInodes := uint64(1024 * 1024) nodeFsInodesFree := uint64(1024) nodeFsInodes := uint64(1024 * 1024) + containerFsAvailableBytes := uint64(1024 * 1024 * 2) + containerFsCapacityBytes := uint64(1024 * 1024 * 8) + containerFsInodesFree := uint64(1024 * 2) + containerFsInodes := uint64(1024 * 2) + maxPID := int64(255816) + numberOfRunningProcesses := int64(1000) 
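+ // available PIDs are observed as MaxPID - NumOfRunningProcesses + // (255816 - 1000 = 254816), which the PID assertions below rely on.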
fakeStats := &statsapi.Summary{ Node: statsapi.NodeStats{ Memory: &statsapi.MemoryStats{ @@ -1245,6 +1261,16 @@ func TestMakeSignalObservations(t *testing.T) { InodesFree: &imageFsInodesFree, Inodes: &imageFsInodes, }, + ContainerFs: &statsapi.FsStats{ + AvailableBytes: &containerFsAvailableBytes, + CapacityBytes: &containerFsCapacityBytes, + InodesFree: &containerFsInodesFree, + Inodes: &containerFsInodes, + }, + }, + Rlimit: &statsapi.RlimitStats{ + MaxPID: &maxPID, + NumOfRunningProcesses: &numberOfRunningProcesses, }, Fs: &statsapi.FsStats{ AvailableBytes: &nodeFsAvailableBytes, @@ -1329,6 +1355,16 @@ func TestMakeSignalObservations(t *testing.T) { if expectedBytes := int64(imageFsCapacityBytes); imageFsQuantity.capacity.Value() != expectedBytes { t.Errorf("Expected %v, actual: %v", expectedBytes, imageFsQuantity.capacity.Value()) } + containerFsQuantity, found := actualObservations[evictionapi.SignalContainerFsAvailable] + if !found { + t.Error("Expected available containerfs observation") + } + if expectedBytes := int64(containerFsAvailableBytes); containerFsQuantity.available.Value() != expectedBytes { + t.Errorf("Expected %v, actual: %v", expectedBytes, containerFsQuantity.available.Value()) + } + if expectedBytes := int64(containerFsCapacityBytes); containerFsQuantity.capacity.Value() != expectedBytes { + t.Errorf("Expected %v, actual: %v", expectedBytes, containerFsQuantity.capacity.Value()) + } imageFsInodesQuantity, found := actualObservations[evictionapi.SignalImageFsInodesFree] if !found { t.Error("Expected inodes free imagefs observation") @@ -1339,6 +1375,27 @@ func TestMakeSignalObservations(t *testing.T) { if expected := int64(imageFsInodes); imageFsInodesQuantity.capacity.Value() != expected { t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.capacity.Value()) } + containerFsInodesQuantity, found := actualObservations[evictionapi.SignalContainerFsInodesFree] + if !found { + t.Error("Expected inodes free containerfs observation") + } + if expected := int64(containerFsInodesFree); containerFsInodesQuantity.available.Value() != expected { + t.Errorf("Expected %v, actual: %v", expected, containerFsInodesQuantity.available.Value()) + } + if expected := int64(containerFsInodes); containerFsInodesQuantity.capacity.Value() != expected { + t.Errorf("Expected %v, actual: %v", expected, containerFsInodesQuantity.capacity.Value()) + } + + pidQuantity, found := actualObservations[evictionapi.SignalPIDAvailable] + if !found { + t.Error("Expected available PID observation") + } + if expectedBytes := int64(maxPID); pidQuantity.capacity.Value() != expectedBytes { + t.Errorf("Expected %v, actual: %v", expectedBytes, pidQuantity.capacity.Value()) + } + if expectedBytes := int64(maxPID - numberOfRunningProcesses); pidQuantity.available.Value() != expectedBytes { + t.Errorf("Expected %v, actual: %v", expectedBytes, pidQuantity.available.Value()) + } for _, pod := range pods { podStats, found := statsFunc(pod) if !found { @@ -1953,6 +2010,1006 @@ func TestCompareThresholdValue(t *testing.T) { } } } +func TestAddContainerFsThresholds(t *testing.T) { + gracePeriod := time.Duration(1) + testCases := []struct { + description string + imageFs bool + containerFs bool + expectedContainerFsHard evictionapi.Threshold + expectedContainerFsSoft evictionapi.Threshold + expectedContainerFsINodesHard evictionapi.Threshold + expectedContainerFsINodesSoft evictionapi.Threshold + expectErr bool + thresholdList []evictionapi.Threshold + }{ + { + description: "single filesystem", + imageFs: 
false, + containerFs: false, + expectedContainerFsHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsINodesHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsINodesSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + + thresholdList: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), 
+ }, + }, + }, + }, + { + description: "image filesystem", + imageFs: true, + containerFs: false, + expectedContainerFsHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1.5Gi"), + }, + }, + expectedContainerFsSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("3Gi"), + }, + }, + expectedContainerFsINodesHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + expectedContainerFsINodesSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + thresholdList: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1.5Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1.5Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("3Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + 
MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + }, + }, + { + description: "container and image are separate", + imageFs: true, + containerFs: true, + expectedContainerFsHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsINodesHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsINodesSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + thresholdList: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1.5Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("3Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + 
Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + }, + }, + { + description: "single filesystem; existing containerfsstats", + imageFs: false, + containerFs: false, + expectErr: true, + expectedContainerFsHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsINodesHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsINodesSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + + thresholdList: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + 
GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + { + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + + { + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + { + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + }, + }, + { + description: "image filesystem; expect error", + imageFs: true, + containerFs: false, + expectErr: true, + expectedContainerFsHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1.5Gi"), + }, + }, + expectedContainerFsSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("3Gi"), + }, + }, + expectedContainerFsINodesHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + expectedContainerFsINodesSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + thresholdList: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: 
quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1.5Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1.5Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("3Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + { + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + + { + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + { + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + }, + }, + { + description: "container and image are separate; expect error", + imageFs: true, + containerFs: true, + expectErr: true, + expectedContainerFsHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsINodesHard: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: 
evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + expectedContainerFsINodesSoft: evictionapi.Threshold{ + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + thresholdList: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1.5Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("3Gi"), + }, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("2Gi"), + }, + }, + { + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + + { + Signal: evictionapi.SignalContainerFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + { + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + 
GracePeriod: gracePeriod, + }, + { + Signal: evictionapi.SignalContainerFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("500Mi"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("5Gi"), + }, + }, + }, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { + updatedThresholds, err := UpdateContainerFsThresholds(testCase.thresholdList, testCase.imageFs, testCase.containerFs) + if err != nil && !testCase.expectErr { + t.Fatalf("got error %v but did not expect any", err) + } + if err == nil && testCase.expectErr { + t.Fatalf("expected an error but got none") + } + hardContainerFsMatch := -1 + softContainerFsMatch := -1 + hardContainerFsINodesMatch := -1 + softContainerFsINodesMatch := -1 + for idx, val := range updatedThresholds { + if val.Signal == evictionapi.SignalContainerFsAvailable && isHardEvictionThreshold(val) { + if !reflect.DeepEqual(val, testCase.expectedContainerFsHard) { + t.Fatalf("want %v got %v", testCase.expectedContainerFsHard, val) + } + hardContainerFsMatch = idx + } + if val.Signal == evictionapi.SignalContainerFsAvailable && !isHardEvictionThreshold(val) { + if !reflect.DeepEqual(val, testCase.expectedContainerFsSoft) { + t.Fatalf("want %v got %v", testCase.expectedContainerFsSoft, val) + } + softContainerFsMatch = idx + } + if val.Signal == evictionapi.SignalContainerFsInodesFree && isHardEvictionThreshold(val) { + if !reflect.DeepEqual(val, testCase.expectedContainerFsINodesHard) { + t.Fatalf("want %v got %v", testCase.expectedContainerFsINodesHard, val) + } + hardContainerFsINodesMatch = idx + } + if val.Signal == evictionapi.SignalContainerFsInodesFree && !isHardEvictionThreshold(val) { + if !reflect.DeepEqual(val, testCase.expectedContainerFsINodesSoft) { + t.Fatalf("want %v got %v", testCase.expectedContainerFsINodesSoft, val) + } + softContainerFsINodesMatch = idx + } + } + if hardContainerFsMatch == -1 { + t.Fatalf("did not find hard containerfs.available") + } + if softContainerFsMatch == -1 { + t.Fatalf("did not find soft containerfs.available") + } + if hardContainerFsINodesMatch == -1 { + t.Fatalf("did not find hard containerfs.inodesfree") + } + if softContainerFsINodesMatch == -1 { + t.Fatalf("did not find soft containerfs.inodesfree") + } + }) + } +} // newPodInodeStats returns stats with specified usage amounts. func newPodInodeStats(pod *v1.Pod, rootFsInodesUsed, logsInodesUsed, perLocalVolumeInodesUsed resource.Quantity) statsapi.PodStats { diff --git a/pkg/kubelet/eviction/mock_threshold_notifier_test.go b/pkg/kubelet/eviction/mock_threshold_notifier_test.go index 341b7c8b1f4..02258b4f3a3 100644 --- a/pkg/kubelet/eviction/mock_threshold_notifier_test.go +++ b/pkg/kubelet/eviction/mock_threshold_notifier_test.go @@ -218,6 +218,20 @@ func (mr *MockContainerGCMockRecorder) DeleteAllUnusedContainers(ctx interface{} return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteAllUnusedContainers", reflect.TypeOf((*MockContainerGC)(nil).DeleteAllUnusedContainers), ctx) } +// IsContainerFsSeparateFromImageFs mocks base method. +func (m *MockContainerGC) IsContainerFsSeparateFromImageFs(ctx context.Context) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsContainerFsSeparateFromImageFs", ctx) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsContainerFsSeparateFromImageFs indicates an expected call of IsContainerFsSeparateFromImageFs.
+func (mr *MockContainerGCMockRecorder) IsContainerFsSeparateFromImageFs(ctx interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsContainerFsSeparateFromImageFs", reflect.TypeOf((*MockContainerGC)(nil).IsContainerFsSeparateFromImageFs), ctx) +} + // MockCgroupNotifier is a mock of CgroupNotifier interface. type MockCgroupNotifier struct { ctrl *gomock.Controller diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index d0f7a7403d5..3a83a3e31ef 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -38,6 +38,8 @@ const ( fsStatsLogs fsStatsType = "logs" // fsStatsRoot identifies stats for pod container writable layers. fsStatsRoot fsStatsType = "root" + // fsStatsImages identifies stats for pod container read-only layers. + fsStatsImages fsStatsType = "images" ) // Config holds information about how eviction is configured. @@ -85,6 +87,8 @@ type ImageGC interface { type ContainerGC interface { // DeleteAllUnusedContainers deletes all unused containers, even those that belong to pods that are terminated, but not deleted. DeleteAllUnusedContainers(ctx context.Context) error + // IsContainerFsSeparateFromImageFs checks whether the container filesystem is split from the image filesystem. + IsContainerFsSeparateFromImageFs(ctx context.Context) bool } // KillPodFunc kills a pod. diff --git a/pkg/kubelet/images/image_gc_manager.go b/pkg/kubelet/images/image_gc_manager.go index b8129ef2ada..30d49fe450b 100644 --- a/pkg/kubelet/images/image_gc_manager.go +++ b/pkg/kubelet/images/image_gc_manager.go @@ -54,7 +54,7 @@ const imageIndexTupleFormat = "%s,%s" // collection. type StatsProvider interface { // ImageFsStats returns the stats of the image filesystem. - ImageFsStats(ctx context.Context) (*statsapi.FsStats, error) + ImageFsStats(ctx context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) } // ImageGCManager is an interface for managing lifecycle of all images. @@ -328,7 +328,7 @@ func (im *realImageGCManager) GarbageCollect(ctx context.Context) error { } // Get disk usage on disk holding images. - fsStats, err := im.statsProvider.ImageFsStats(ctx) + fsStats, _, err := im.statsProvider.ImageFsStats(ctx) if err != nil { return err } diff --git a/pkg/kubelet/images/image_gc_manager_test.go b/pkg/kubelet/images/image_gc_manager_test.go index 769031f9730..3465ca8456c 100644 --- a/pkg/kubelet/images/image_gc_manager_test.go +++ b/pkg/kubelet/images/image_gc_manager_test.go @@ -618,10 +618,11 @@ func TestGarbageCollectBelowLowThreshold(t *testing.T) { manager, _ := newRealImageGCManager(policy, mockStatsProvider) // Expect 40% usage.
- mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(&statsapi.FsStats{ + imageStats := &statsapi.FsStats{ AvailableBytes: uint64Ptr(600), CapacityBytes: uint64Ptr(1000), - }, nil) + } + mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(imageStats, imageStats, nil) assert.NoError(t, manager.GarbageCollect(ctx)) } @@ -637,7 +638,7 @@ func TestGarbageCollectCadvisorFailure(t *testing.T) { mockStatsProvider := statstest.NewMockProvider(mockCtrl) manager, _ := newRealImageGCManager(policy, mockStatsProvider) - mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(&statsapi.FsStats{}, fmt.Errorf("error")) + mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(&statsapi.FsStats{}, &statsapi.FsStats{}, fmt.Errorf("error")) assert.NotNil(t, manager.GarbageCollect(ctx)) } @@ -654,10 +655,11 @@ func TestGarbageCollectBelowSuccess(t *testing.T) { manager, fakeRuntime := newRealImageGCManager(policy, mockStatsProvider) // Expect 95% usage and most of it gets freed. - mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(&statsapi.FsStats{ + imageFs := &statsapi.FsStats{ AvailableBytes: uint64Ptr(50), CapacityBytes: uint64Ptr(1000), - }, nil) + } + mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(imageFs, imageFs, nil) fakeRuntime.ImageList = []container.Image{ makeImage(0, 450), } @@ -677,10 +679,11 @@ func TestGarbageCollectNotEnoughFreed(t *testing.T) { manager, fakeRuntime := newRealImageGCManager(policy, mockStatsProvider) // Expect 95% usage and little of it gets freed. - mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(&statsapi.FsStats{ + imageFs := &statsapi.FsStats{ AvailableBytes: uint64Ptr(50), CapacityBytes: uint64Ptr(1000), - }, nil) + } + mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(imageFs, imageFs, nil) fakeRuntime.ImageList = []container.Image{ makeImage(0, 50), } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index f5735b83502..3b494893005 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1326,7 +1326,7 @@ func (kl *Kubelet) ListPodStatsAndUpdateCPUNanoCoreUsage(ctx context.Context) ([ } // ImageFsStats is delegated to StatsProvider, which implements stats.Provider interface -func (kl *Kubelet) ImageFsStats(ctx context.Context) (*statsapi.FsStats, error) { +func (kl *Kubelet) ImageFsStats(ctx context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) { return kl.StatsProvider.ImageFsStats(ctx) } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_image.go b/pkg/kubelet/kuberuntime/kuberuntime_image.go index f4d9d364f9c..9c4520062cc 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_image.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_image.go @@ -158,3 +158,12 @@ func (m *kubeGenericRuntimeManager) ImageStats(ctx context.Context) (*kubecontai } return stats, nil } + +func (m *kubeGenericRuntimeManager) ImageFsInfo(ctx context.Context) (*runtimeapi.ImageFsInfoResponse, error) { + allImages, err := m.imageService.ImageFsInfo(ctx) + if err != nil { + klog.ErrorS(err, "Failed to get image filesystem") + return nil, err + } + return allImages, nil +} diff --git a/pkg/kubelet/server/server_test.go b/pkg/kubelet/server/server_test.go index 8faeb231408..8adabf6fe79 100644 --- a/pkg/kubelet/server/server_test.go +++ b/pkg/kubelet/server/server_test.go @@ -289,8 +289,10 @@ func (*fakeKubelet) ListPodStatsAndUpdateCPUNanoCoreUsage(_ context.Context) ([] func (*fakeKubelet) ListPodCPUAndMemoryStats(_ context.Context) ([]statsapi.PodStats, error) { return nil, nil } -func 
(*fakeKubelet) ImageFsStats(_ context.Context) (*statsapi.FsStats, error) { return nil, nil } -func (*fakeKubelet) RlimitStats() (*statsapi.RlimitStats, error) { return nil, nil } +func (*fakeKubelet) ImageFsStats(_ context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) { + return nil, nil, nil +} +func (*fakeKubelet) RlimitStats() (*statsapi.RlimitStats, error) { return nil, nil } func (*fakeKubelet) GetCgroupStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, *statsapi.NetworkStats, error) { return nil, nil, nil } diff --git a/pkg/kubelet/server/stats/handler.go b/pkg/kubelet/server/stats/handler.go index 24acdbcb093..ba3a946217c 100644 --- a/pkg/kubelet/server/stats/handler.go +++ b/pkg/kubelet/server/stats/handler.go @@ -51,8 +51,13 @@ type Provider interface { // for more details. ListPodStatsAndUpdateCPUNanoCoreUsage(ctx context.Context) ([]statsapi.PodStats, error) // ImageFsStats returns the stats of the image filesystem. - ImageFsStats(ctx context.Context) (*statsapi.FsStats, error) - + // The kubelet supports three layouts for container filesystems: + // everything on the node filesystem (no dedicated image filesystem), + // container storage on a dedicated disk (imageFs separate from root), + // or the container filesystem on root with images stored on a separate imageFs. + // The first return value is the image filesystem and + // the second is the container filesystem. + ImageFsStats(ctx context.Context) (imageFs *statsapi.FsStats, containerFs *statsapi.FsStats, callErr error) // The following stats are provided by cAdvisor. // // GetCgroupStats returns the stats and the networking usage of the cgroup diff --git a/pkg/kubelet/server/stats/summary.go b/pkg/kubelet/server/stats/summary.go index cb130e6007c..dacf3bf7e81 100644 --- a/pkg/kubelet/server/stats/summary.go +++ b/pkg/kubelet/server/stats/summary.go @@ -82,7 +82,7 @@ func (sp *summaryProviderImpl) Get(ctx context.Context, updateStats bool) (*stat if err != nil { return nil, fmt.Errorf("failed to get rootFs stats: %v", err) } - imageFsStats, err := sp.provider.ImageFsStats(ctx) + imageFsStats, containerFsStats, err := sp.provider.ImageFsStats(ctx) if err != nil { return nil, fmt.Errorf("failed to get imageFs stats: %v", err) } @@ -109,7 +109,7 @@ func (sp *summaryProviderImpl) Get(ctx context.Context, updateStats bool) (*stat Network: networkStats, StartTime: sp.systemBootTime, Fs: rootFsStats, - Runtime: &statsapi.RuntimeStats{ImageFs: imageFsStats}, + Runtime: &statsapi.RuntimeStats{ContainerFs: containerFsStats, ImageFs: imageFsStats}, Rlimit: rlimit, SystemContainers: sp.GetSystemContainersStats(nodeConfig, podStats, updateStats), } diff --git a/pkg/kubelet/server/stats/summary_test.go b/pkg/kubelet/server/stats/summary_test.go index e8d0e98beff..9e2241bc35c 100644 --- a/pkg/kubelet/server/stats/summary_test.go +++ b/pkg/kubelet/server/stats/summary_test.go @@ -48,7 +48,7 @@ var ( rlimitStats = getRlimitStats() ) -func TestSummaryProviderGetStats(t *testing.T) { +func TestSummaryProviderGetStatsNoSplitFileSystem(t *testing.T) { ctx := context.Background() assert := assert.New(t) @@ -81,7 +81,7 @@ func TestSummaryProviderGetStats(t *testing.T) { mockStatsProvider.EXPECT().GetPodCgroupRoot().Return(cgroupRoot) mockStatsProvider.EXPECT().ListPodStats(ctx).Return(podStats, nil).AnyTimes() mockStatsProvider.EXPECT().ListPodStatsAndUpdateCPUNanoCoreUsage(ctx).Return(podStats, nil) - mockStatsProvider.EXPECT().ImageFsStats(ctx).Return(imageFsStats, nil) + mockStatsProvider.EXPECT().ImageFsStats(ctx).Return(imageFsStats, 
imageFsStats, nil) mockStatsProvider.EXPECT().RootFsStats().Return(rootFsStats, nil) mockStatsProvider.EXPECT().RlimitStats().Return(rlimitStats, nil) mockStatsProvider.EXPECT().GetCgroupStats("/", true).Return(cgroupStatsMap["/"].cs, cgroupStatsMap["/"].ns, nil) @@ -103,7 +103,106 @@ func TestSummaryProviderGetStats(t *testing.T) { assert.Equal(summary.Node.Swap, cgroupStatsMap["/"].cs.Swap) assert.Equal(summary.Node.Network, cgroupStatsMap["/"].ns) assert.Equal(summary.Node.Fs, rootFsStats) - assert.Equal(summary.Node.Runtime, &statsapi.RuntimeStats{ImageFs: imageFsStats}) + assert.Equal(summary.Node.Runtime, &statsapi.RuntimeStats{ContainerFs: imageFsStats, ImageFs: imageFsStats}) + + assert.Equal(len(summary.Node.SystemContainers), 4) + assert.Contains(summary.Node.SystemContainers, statsapi.ContainerStats{ + Name: "kubelet", + StartTime: kubeletCreationTime, + CPU: cgroupStatsMap["/kubelet"].cs.CPU, + Memory: cgroupStatsMap["/kubelet"].cs.Memory, + Accelerators: cgroupStatsMap["/kubelet"].cs.Accelerators, + UserDefinedMetrics: cgroupStatsMap["/kubelet"].cs.UserDefinedMetrics, + Swap: cgroupStatsMap["/kubelet"].cs.Swap, + }) + assert.Contains(summary.Node.SystemContainers, statsapi.ContainerStats{ + Name: "misc", + StartTime: cgroupStatsMap["/misc"].cs.StartTime, + CPU: cgroupStatsMap["/misc"].cs.CPU, + Memory: cgroupStatsMap["/misc"].cs.Memory, + Accelerators: cgroupStatsMap["/misc"].cs.Accelerators, + UserDefinedMetrics: cgroupStatsMap["/misc"].cs.UserDefinedMetrics, + Swap: cgroupStatsMap["/misc"].cs.Swap, + }) + assert.Contains(summary.Node.SystemContainers, statsapi.ContainerStats{ + Name: "runtime", + StartTime: cgroupStatsMap["/runtime"].cs.StartTime, + CPU: cgroupStatsMap["/runtime"].cs.CPU, + Memory: cgroupStatsMap["/runtime"].cs.Memory, + Accelerators: cgroupStatsMap["/runtime"].cs.Accelerators, + UserDefinedMetrics: cgroupStatsMap["/runtime"].cs.UserDefinedMetrics, + Swap: cgroupStatsMap["/runtime"].cs.Swap, + }) + assert.Contains(summary.Node.SystemContainers, statsapi.ContainerStats{ + Name: "pods", + StartTime: cgroupStatsMap["/pods"].cs.StartTime, + CPU: cgroupStatsMap["/pods"].cs.CPU, + Memory: cgroupStatsMap["/pods"].cs.Memory, + Accelerators: cgroupStatsMap["/pods"].cs.Accelerators, + UserDefinedMetrics: cgroupStatsMap["/pods"].cs.UserDefinedMetrics, + Swap: cgroupStatsMap["/pods"].cs.Swap, + }) + assert.Equal(summary.Pods, podStats) +} + +func TestSummaryProviderGetStatsSplitImageFs(t *testing.T) { + ctx := context.Background() + assert := assert.New(t) + + podStats := []statsapi.PodStats{ + { + PodRef: statsapi.PodReference{Name: "test-pod", Namespace: "test-namespace", UID: "UID_test-pod"}, + StartTime: metav1.NewTime(time.Now()), + Containers: []statsapi.ContainerStats{*getContainerStats()}, + Network: getNetworkStats(), + VolumeStats: []statsapi.VolumeStats{*getVolumeStats()}, + }, + } + cgroupStatsMap := map[string]struct { + cs *statsapi.ContainerStats + ns *statsapi.NetworkStats + }{ + "/": {cs: getContainerStats(), ns: getNetworkStats()}, + "/runtime": {cs: getContainerStats(), ns: getNetworkStats()}, + "/misc": {cs: getContainerStats(), ns: getNetworkStats()}, + "/kubelet": {cs: getContainerStats(), ns: getNetworkStats()}, + "/pods": {cs: getContainerStats(), ns: getNetworkStats()}, + } + + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mockStatsProvider := statstest.NewMockProvider(mockCtrl) + + mockStatsProvider.EXPECT().GetNode().Return(node, nil) + mockStatsProvider.EXPECT().GetNodeConfig().Return(nodeConfig) + 
mockStatsProvider.EXPECT().GetPodCgroupRoot().Return(cgroupRoot) + mockStatsProvider.EXPECT().ListPodStats(ctx).Return(podStats, nil).AnyTimes() + mockStatsProvider.EXPECT().ListPodStatsAndUpdateCPUNanoCoreUsage(ctx).Return(podStats, nil) + mockStatsProvider.EXPECT().RootFsStats().Return(rootFsStats, nil) + mockStatsProvider.EXPECT().RlimitStats().Return(rlimitStats, nil) + mockStatsProvider.EXPECT().GetCgroupStats("/", true).Return(cgroupStatsMap["/"].cs, cgroupStatsMap["/"].ns, nil) + mockStatsProvider.EXPECT().GetCgroupStats("/runtime", false).Return(cgroupStatsMap["/runtime"].cs, cgroupStatsMap["/runtime"].ns, nil) + mockStatsProvider.EXPECT().GetCgroupStats("/misc", false).Return(cgroupStatsMap["/misc"].cs, cgroupStatsMap["/misc"].ns, nil) + mockStatsProvider.EXPECT().GetCgroupStats("/kubelet", false).Return(cgroupStatsMap["/kubelet"].cs, cgroupStatsMap["/kubelet"].ns, nil) + mockStatsProvider.EXPECT().GetCgroupStats("/kubepods", true).Return(cgroupStatsMap["/pods"].cs, cgroupStatsMap["/pods"].ns, nil) + + mockStatsProvider.EXPECT().ImageFsStats(ctx).Return(imageFsStats, rootFsStats, nil) + + kubeletCreationTime := metav1.Now() + systemBootTime := metav1.Now() + provider := summaryProviderImpl{kubeletCreationTime: kubeletCreationTime, systemBootTime: systemBootTime, provider: mockStatsProvider} + summary, err := provider.Get(ctx, true) + assert.NoError(err) + + assert.Equal(summary.Node.NodeName, "test-node") + assert.Equal(summary.Node.StartTime, systemBootTime) + assert.Equal(summary.Node.CPU, cgroupStatsMap["/"].cs.CPU) + assert.Equal(summary.Node.Memory, cgroupStatsMap["/"].cs.Memory) + assert.Equal(summary.Node.Swap, cgroupStatsMap["/"].cs.Swap) + assert.Equal(summary.Node.Network, cgroupStatsMap["/"].ns) + assert.Equal(summary.Node.Fs, rootFsStats) + // With a split filesystem, ContainerFs should report the root filesystem stats and ImageFs the image filesystem stats. + assert.Equal(summary.Node.Runtime, &statsapi.RuntimeStats{ContainerFs: rootFsStats, ImageFs: imageFsStats}) assert.Equal(len(summary.Node.SystemContainers), 4) assert.Contains(summary.Node.SystemContainers, statsapi.ContainerStats{ diff --git a/pkg/kubelet/server/stats/testing/mock_stats_provider.go b/pkg/kubelet/server/stats/testing/mock_stats_provider.go index 2dc4ab58c4f..4c22d5ec76e 100644 --- a/pkg/kubelet/server/stats/testing/mock_stats_provider.go +++ b/pkg/kubelet/server/stats/testing/mock_stats_provider.go @@ -221,12 +221,13 @@ func (mr *MockProviderMockRecorder) GetRequestedContainersInfo(containerName, op } // ImageFsStats mocks base method. -func (m *MockProvider) ImageFsStats(ctx context.Context) (*v1alpha1.FsStats, error) { +func (m *MockProvider) ImageFsStats(ctx context.Context) (*v1alpha1.FsStats, *v1alpha1.FsStats, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "ImageFsStats", ctx) ret0, _ := ret[0].(*v1alpha1.FsStats) - ret1, _ := ret[1].(error) - return ret0, ret1 + ret1, _ := ret[1].(*v1alpha1.FsStats) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 } // ImageFsStats indicates an expected call of ImageFsStats.
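A minimal sketch of the new contract, not part of this PR: every ImageFsStats caller now receives two FsStats values, the image filesystem first and the writable container filesystem second, with both describing the same disk in the shared case. The reduced interface, package name, and helper below are invented for illustration only, and the DeepEqual comparison is a simplification of the field-by-field equalFileSystems check added later in this diff.

package fsexample

import (
	"context"
	"fmt"
	"reflect"

	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
)

// imageFsStatsProvider is a reduced, illustrative view of the providers
// changed in this PR.
type imageFsStatsProvider interface {
	ImageFsStats(ctx context.Context) (*statsapi.FsStats, *statsapi.FsStats, error)
}

// hasSplitFilesystem reports whether images and writable layers live on
// different filesystems. Providers return identical stats for both values
// when the disk is shared, so comparing the two results is sufficient.
func hasSplitFilesystem(ctx context.Context, p imageFsStatsProvider) (bool, error) {
	imageFs, containerFs, err := p.ImageFsStats(ctx)
	if err != nil {
		return false, err
	}
	if imageFs == nil || containerFs == nil {
		return false, fmt.Errorf("incomplete filesystem stats")
	}
	return !reflect.DeepEqual(imageFs, containerFs), nil
}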
diff --git a/pkg/kubelet/stats/cadvisor_stats_provider.go b/pkg/kubelet/stats/cadvisor_stats_provider.go index f6251e1a7f4..4f4c7c6225b 100644 --- a/pkg/kubelet/stats/cadvisor_stats_provider.go +++ b/pkg/kubelet/stats/cadvisor_stats_provider.go @@ -28,8 +28,10 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + utilfeature "k8s.io/apiserver/pkg/util/feature" statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" kubetypes "k8s.io/kubelet/pkg/types" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/cadvisor" "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" @@ -237,31 +239,86 @@ func (p *cadvisorStatsProvider) ListPodCPUAndMemoryStats(_ context.Context) ([]s } // ImageFsStats returns the stats of the filesystem for storing images. -func (p *cadvisorStatsProvider) ImageFsStats(ctx context.Context) (*statsapi.FsStats, error) { +func (p *cadvisorStatsProvider) ImageFsStats(ctx context.Context) (imageFsRet *statsapi.FsStats, containerFsRet *statsapi.FsStats, errCall error) { imageFsInfo, err := p.cadvisor.ImagesFsInfo() if err != nil { - return nil, fmt.Errorf("failed to get imageFs info: %v", err) - } - imageStats, err := p.imageService.ImageStats(ctx) - if err != nil || imageStats == nil { - return nil, fmt.Errorf("failed to get image stats: %v", err) + return nil, nil, fmt.Errorf("failed to get imageFs info: %v", err) } + if !utilfeature.DefaultFeatureGate.Enabled(features.KubeletSeparateDiskGC) { + imageStats, err := p.imageService.ImageStats(ctx) + if err != nil || imageStats == nil { + return nil, nil, fmt.Errorf("failed to get image stats: %v", err) + } + + var imageFsInodesUsed *uint64 + if imageFsInfo.Inodes != nil && imageFsInfo.InodesFree != nil { + imageFsIU := *imageFsInfo.Inodes - *imageFsInfo.InodesFree + imageFsInodesUsed = &imageFsIU + } + + imageFs := &statsapi.FsStats{ + Time: metav1.NewTime(imageFsInfo.Timestamp), + AvailableBytes: &imageFsInfo.Available, + CapacityBytes: &imageFsInfo.Capacity, + UsedBytes: &imageStats.TotalStorageBytes, + InodesFree: imageFsInfo.InodesFree, + Inodes: imageFsInfo.Inodes, + InodesUsed: imageFsInodesUsed, + } + return imageFs, imageFs, nil + } + containerFsInfo, err := p.cadvisor.ContainerFsInfo() + if err != nil { + return nil, nil, fmt.Errorf("failed to get container fs info: %v", err) + } + imageStats, err := p.imageService.ImageFsInfo(ctx) + if err != nil || imageStats == nil { + return nil, nil, fmt.Errorf("failed to get image stats: %v", err) + } + // Guard against an empty CRI response before indexing into the filesystem lists. + if len(imageStats.GetImageFilesystems()) == 0 || len(imageStats.GetContainerFilesystems()) == 0 { + return nil, nil, fmt.Errorf("image stats did not contain any filesystem information") + } + splitFileSystem := false + if imageStats.ImageFilesystems[0].FsId.Mountpoint != imageStats.ContainerFilesystems[0].FsId.Mountpoint { + klog.InfoS("Detected split filesystem", "ImageFilesystems", imageStats.ImageFilesystems[0], "ContainerFilesystems", imageStats.ContainerFilesystems[0]) + splitFileSystem = true + } + imageFs := imageStats.ImageFilesystems[0] var imageFsInodesUsed *uint64 if imageFsInfo.Inodes != nil && imageFsInfo.InodesFree != nil { imageFsIU := *imageFsInfo.Inodes - *imageFsInfo.InodesFree imageFsInodesUsed = &imageFsIU } - return &statsapi.FsStats{ + fsStats := &statsapi.FsStats{ Time: metav1.NewTime(imageFsInfo.Timestamp), AvailableBytes: &imageFsInfo.Available, CapacityBytes: &imageFsInfo.Capacity, - UsedBytes: &imageStats.TotalStorageBytes, + UsedBytes: &imageFs.UsedBytes.Value, InodesFree: imageFsInfo.InodesFree, Inodes: imageFsInfo.Inodes, InodesUsed: imageFsInodesUsed, - }, nil + } + if !splitFileSystem { + return fsStats, fsStats, nil + } + + containerFs := 
imageStats.ContainerFilesystems[0] + var containerFsInodesUsed *uint64 + if containerFsInfo.Inodes != nil && containerFsInfo.InodesFree != nil { + containerFsIU := *containerFsInfo.Inodes - *containerFsInfo.InodesFree + containerFsInodesUsed = &containerFsIU + } + + fsContainerStats := &statsapi.FsStats{ + Time: metav1.NewTime(containerFsInfo.Timestamp), + AvailableBytes: &containerFsInfo.Available, + CapacityBytes: &containerFsInfo.Capacity, + UsedBytes: &containerFs.UsedBytes.Value, + InodesFree: containerFsInfo.InodesFree, + Inodes: containerFsInfo.Inodes, + InodesUsed: containerFsInodesUsed, + } + + return fsStats, fsContainerStats, nil } // ImageFsDevice returns name of the device where the image filesystem locates, diff --git a/pkg/kubelet/stats/cadvisor_stats_provider_test.go b/pkg/kubelet/stats/cadvisor_stats_provider_test.go index b6dddd2b8b0..17f7beebcf7 100644 --- a/pkg/kubelet/stats/cadvisor_stats_provider_test.go +++ b/pkg/kubelet/stats/cadvisor_stats_provider_test.go @@ -28,7 +28,11 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + utilfeature "k8s.io/apiserver/pkg/util/feature" + featuregatetesting "k8s.io/component-base/featuregate/testing" + runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" + "k8s.io/kubernetes/pkg/features" cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" @@ -250,7 +254,6 @@ func TestCadvisorListPodStats(t *testing.T) { mockCadvisor.EXPECT().ImagesFsInfo().Return(imagefs, nil) mockRuntime := containertest.NewMockRuntime(mockCtrl) - mockRuntime.EXPECT().ImageStats(ctx).Return(&kubecontainer.ImageStats{TotalStorageBytes: 123}, nil).AnyTimes() ephemeralVolumes := []statsapi.VolumeStats{getPodVolumeStats(seedEphemeralVolume1, "ephemeralVolume1"), getPodVolumeStats(seedEphemeralVolume2, "ephemeralVolume2")} @@ -520,7 +523,7 @@ func TestCadvisorListPodCPUAndMemoryStats(t *testing.T) { assert.Nil(t, ps.Network) } -func TestCadvisorImagesFsStats(t *testing.T) { +func TestCadvisorImagesFsStatsKubeletSeparateDiskOff(t *testing.T) { ctx := context.Background() mockCtrl := gomock.NewController(t) defer mockCtrl.Finish() @@ -534,11 +537,13 @@ func TestCadvisorImagesFsStats(t *testing.T) { imageStats = &kubecontainer.ImageStats{TotalStorageBytes: 100} ) + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, false)() + mockCadvisor.EXPECT().ImagesFsInfo().Return(imageFsInfo, nil) mockRuntime.EXPECT().ImageStats(ctx).Return(imageStats, nil) provider := newCadvisorStatsProvider(mockCadvisor, &fakeResourceAnalyzer{}, mockRuntime, nil, NewFakeHostStatsProvider()) - stats, err := provider.ImageFsStats(ctx) + stats, _, err := provider.ImageFsStats(ctx) assert.NoError(err) assert.Equal(imageFsInfo.Timestamp, stats.Time.Time) @@ -550,6 +555,110 @@ func TestCadvisorImagesFsStats(t *testing.T) { assert.Equal(*imageFsInfo.Inodes-*imageFsInfo.InodesFree, *stats.InodesUsed) } +func TestCadvisorImagesFsStats(t *testing.T) { + ctx := context.Background() + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + var ( + assert = assert.New(t) + mockCadvisor = cadvisortest.NewMockInterface(mockCtrl) + mockRuntime = containertest.NewMockRuntime(mockCtrl) + + seed = 1000 + imageFsInfo = getTestFsInfo(seed) + ) + imageFsInfoCRI := &runtimeapi.FilesystemUsage{ + Timestamp: 
imageFsInfo.Timestamp.Unix(), + FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "images"}, + UsedBytes: &runtimeapi.UInt64Value{Value: imageFsInfo.Usage}, + InodesUsed: &runtimeapi.UInt64Value{Value: *imageFsInfo.Inodes}, + } + imageFsInfoResponse := &runtimeapi.ImageFsInfoResponse{ + ImageFilesystems: []*runtimeapi.FilesystemUsage{imageFsInfoCRI}, + ContainerFilesystems: []*runtimeapi.FilesystemUsage{imageFsInfoCRI}, + } + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, true)() + + mockCadvisor.EXPECT().ImagesFsInfo().Return(imageFsInfo, nil) + mockCadvisor.EXPECT().ContainerFsInfo().Return(imageFsInfo, nil) + mockRuntime.EXPECT().ImageFsInfo(ctx).Return(imageFsInfoResponse, nil) + + provider := newCadvisorStatsProvider(mockCadvisor, &fakeResourceAnalyzer{}, mockRuntime, nil, NewFakeHostStatsProvider()) + stats, containerfs, err := provider.ImageFsStats(ctx) + assert.NoError(err) + + assert.Equal(imageFsInfo.Timestamp, stats.Time.Time) + assert.Equal(imageFsInfo.Available, *stats.AvailableBytes) + assert.Equal(imageFsInfo.Capacity, *stats.CapacityBytes) + assert.Equal(imageFsInfo.InodesFree, stats.InodesFree) + assert.Equal(imageFsInfo.Inodes, stats.Inodes) + assert.Equal(*imageFsInfo.Inodes-*imageFsInfo.InodesFree, *stats.InodesUsed) + + assert.Equal(imageFsInfo.Timestamp, containerfs.Time.Time) + assert.Equal(imageFsInfo.Available, *containerfs.AvailableBytes) + assert.Equal(imageFsInfo.Capacity, *containerfs.CapacityBytes) + assert.Equal(imageFsInfo.InodesFree, containerfs.InodesFree) + assert.Equal(imageFsInfo.Inodes, containerfs.Inodes) + assert.Equal(*imageFsInfo.Inodes-*imageFsInfo.InodesFree, *containerfs.InodesUsed) + +} + +func TestCadvisorSplitImagesFsStats(t *testing.T) { + ctx := context.Background() + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + var ( + assert = assert.New(t) + mockCadvisor = cadvisortest.NewMockInterface(mockCtrl) + mockRuntime = containertest.NewMockRuntime(mockCtrl) + + seed = 1000 + imageFsInfo = getTestFsInfo(seed) + containerSeed = 1001 + containerFsInfo = getTestFsInfo(containerSeed) + ) + imageFsInfoCRI := &runtimeapi.FilesystemUsage{ + Timestamp: imageFsInfo.Timestamp.Unix(), + FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "images"}, + UsedBytes: &runtimeapi.UInt64Value{Value: imageFsInfo.Usage}, + InodesUsed: &runtimeapi.UInt64Value{Value: *imageFsInfo.Inodes}, + } + containerFsInfoCRI := &runtimeapi.FilesystemUsage{ + Timestamp: containerFsInfo.Timestamp.Unix(), + FsId: &runtimeapi.FilesystemIdentifier{Mountpoint: "containers"}, + UsedBytes: &runtimeapi.UInt64Value{Value: containerFsInfo.Usage}, + InodesUsed: &runtimeapi.UInt64Value{Value: *containerFsInfo.Inodes}, + } + imageFsInfoResponse := &runtimeapi.ImageFsInfoResponse{ + ImageFilesystems: []*runtimeapi.FilesystemUsage{imageFsInfoCRI}, + ContainerFilesystems: []*runtimeapi.FilesystemUsage{containerFsInfoCRI}, + } + + mockCadvisor.EXPECT().ImagesFsInfo().Return(imageFsInfo, nil) + mockCadvisor.EXPECT().ContainerFsInfo().Return(containerFsInfo, nil) + mockRuntime.EXPECT().ImageFsInfo(ctx).Return(imageFsInfoResponse, nil) + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, true)() + + provider := newCadvisorStatsProvider(mockCadvisor, &fakeResourceAnalyzer{}, mockRuntime, nil, NewFakeHostStatsProvider()) + stats, containerfs, err := provider.ImageFsStats(ctx) + assert.NoError(err) + + assert.Equal(imageFsInfo.Timestamp, 
stats.Time.Time) + assert.Equal(imageFsInfo.Available, *stats.AvailableBytes) + assert.Equal(imageFsInfo.Capacity, *stats.CapacityBytes) + assert.Equal(imageFsInfo.InodesFree, stats.InodesFree) + assert.Equal(imageFsInfo.Inodes, stats.Inodes) + assert.Equal(*imageFsInfo.Inodes-*imageFsInfo.InodesFree, *stats.InodesUsed) + + assert.Equal(containerFsInfo.Timestamp, containerfs.Time.Time) + assert.Equal(containerFsInfo.Available, *containerfs.AvailableBytes) + assert.Equal(containerFsInfo.Capacity, *containerfs.CapacityBytes) + assert.Equal(containerFsInfo.InodesFree, containerfs.InodesFree) + assert.Equal(containerFsInfo.Inodes, containerfs.Inodes) + assert.Equal(*containerFsInfo.Inodes-*containerFsInfo.InodesFree, *containerfs.InodesUsed) + +} + func TestCadvisorListPodStatsWhenContainerLogFound(t *testing.T) { ctx := context.Background() const ( diff --git a/pkg/kubelet/stats/cri_stats_provider.go b/pkg/kubelet/stats/cri_stats_provider.go index 3d442fdb02b..243e8eeb590 100644 --- a/pkg/kubelet/stats/cri_stats_provider.go +++ b/pkg/kubelet/stats/cri_stats_provider.go @@ -385,10 +385,10 @@ func (p *criStatsProvider) getPodAndContainerMaps(ctx context.Context) (map[stri } // ImageFsStats returns the stats of the image filesystem. -func (p *criStatsProvider) ImageFsStats(ctx context.Context) (*statsapi.FsStats, error) { +func (p *criStatsProvider) ImageFsStats(ctx context.Context) (imageFsRet *statsapi.FsStats, containerFsRet *statsapi.FsStats, errRet error) { resp, err := p.imageService.ImageFsInfo(ctx) if err != nil { - return nil, err + return nil, nil, err } // CRI may return the stats of multiple image filesystems but we only @@ -396,31 +396,32 @@ func (p *criStatsProvider) ImageFsStats(ctx context.Context, // // TODO(yguo0905): Support returning stats of multiple image filesystems. if len(resp.GetImageFilesystems()) == 0 { - return nil, fmt.Errorf("imageFs information is unavailable") + return nil, nil, fmt.Errorf("imageFs information is unavailable") } fs := resp.GetImageFilesystems()[0] - s := &statsapi.FsStats{ + imageFsRet = &statsapi.FsStats{ Time: metav1.NewTime(time.Unix(0, fs.Timestamp)), UsedBytes: &fs.UsedBytes.Value, } if fs.InodesUsed != nil { - s.InodesUsed = &fs.InodesUsed.Value + imageFsRet.InodesUsed = &fs.InodesUsed.Value } imageFsInfo, err := p.getFsInfo(fs.GetFsId()) if err != nil { - return nil, fmt.Errorf("get filesystem info: %w", err) + return nil, nil, fmt.Errorf("get filesystem info: %w", err) } if imageFsInfo != nil { // The image filesystem id is unknown to the local node or there's // an error on retrieving the stats. In these cases, we omit those // stats and return the best-effort partial result. See // https://github.com/kubernetes/heapster/issues/1793. - s.AvailableBytes = &imageFsInfo.Available - s.CapacityBytes = &imageFsInfo.Capacity - s.InodesFree = imageFsInfo.InodesFree - s.Inodes = imageFsInfo.Inodes + imageFsRet.AvailableBytes = &imageFsInfo.Available + imageFsRet.CapacityBytes = &imageFsInfo.Capacity + imageFsRet.InodesFree = imageFsInfo.InodesFree + imageFsRet.Inodes = imageFsInfo.Inodes } - return s, nil + // TODO: The CRI stats provider does not support split filesystems yet, so the image filesystem stats are returned for both values. 
+ return imageFsRet, imageFsRet, nil } // ImageFsDevice returns name of the device where the image filesystem locates, diff --git a/pkg/kubelet/stats/cri_stats_provider_test.go b/pkg/kubelet/stats/cri_stats_provider_test.go index f24b1a025ae..943c528a314 100644 --- a/pkg/kubelet/stats/cri_stats_provider_test.go +++ b/pkg/kubelet/stats/cri_stats_provider_test.go @@ -773,7 +773,7 @@ func TestCRIImagesFsStats(t *testing.T) { false, ) - stats, err := provider.ImageFsStats(ctx) + stats, containerStats, err := provider.ImageFsStats(ctx) assert := assert.New(t) assert.NoError(err) @@ -784,6 +784,15 @@ func TestCRIImagesFsStats(t *testing.T) { assert.Equal(imageFsInfo.Inodes, stats.Inodes) assert.Equal(imageFsUsage.UsedBytes.Value, *stats.UsedBytes) assert.Equal(imageFsUsage.InodesUsed.Value, *stats.InodesUsed) + + assert.Equal(imageFsUsage.Timestamp, containerStats.Time.UnixNano()) + assert.Equal(imageFsInfo.Available, *containerStats.AvailableBytes) + assert.Equal(imageFsInfo.Capacity, *containerStats.CapacityBytes) + assert.Equal(imageFsInfo.InodesFree, containerStats.InodesFree) + assert.Equal(imageFsInfo.Inodes, containerStats.Inodes) + assert.Equal(imageFsUsage.UsedBytes.Value, *containerStats.UsedBytes) + assert.Equal(imageFsUsage.InodesUsed.Value, *containerStats.InodesUsed) + } func makeFakePodSandbox(name, uid, namespace string, terminated bool) *critest.FakePodSandbox { diff --git a/pkg/kubelet/stats/provider.go b/pkg/kubelet/stats/provider.go index 09f82c609f5..b4f61feebf7 100644 --- a/pkg/kubelet/stats/provider.go +++ b/pkg/kubelet/stats/provider.go @@ -31,6 +31,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/stats/pidlimit" "k8s.io/kubernetes/pkg/kubelet/status" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" + "k8s.io/utils/ptr" ) // PodManager is the subset of methods the manager needs to observe the actual state of the kubelet. @@ -99,7 +100,7 @@ type containerStatsProvider interface { ListPodStats(ctx context.Context) ([]statsapi.PodStats, error) ListPodStatsAndUpdateCPUNanoCoreUsage(ctx context.Context) ([]statsapi.PodStats, error) ListPodCPUAndMemoryStats(ctx context.Context) ([]statsapi.PodStats, error) - ImageFsStats(ctx context.Context) (*statsapi.FsStats, error) + ImageFsStats(ctx context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) ImageFsDevice(ctx context.Context) (string, error) } @@ -203,6 +204,7 @@ func (p *Provider) GetRawContainerInfo(containerName string, req *cadvisorapiv1. } // HasDedicatedImageFs returns true if a dedicated image filesystem exists for storing images. +// KEP 4191 extended this: a container (writable-layer) filesystem that is separate from the image filesystem also counts as dedicated. 
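+// The check below reduces to three cases: +// (1) the imageFs device differs from the root device: dedicated image filesystem (pre-KEP behavior); +// (2) the devices match but the image and container filesystem stats differ: the writable layers are split out, which also counts as dedicated; +// (3) the devices match and the stats are equal: no dedicated image filesystem.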
func (p *Provider) HasDedicatedImageFs(ctx context.Context) (bool, error) { device, err := p.containerStatsProvider.ImageFsDevice(ctx) if err != nil { @@ -212,5 +214,38 @@ func (p *Provider) HasDedicatedImageFs(ctx context.Context) (bool, error) { if err != nil { return false, err } + // KEP 4191: a dedicated imageFs now means either the container or the image filesystem is separate from root. + // cAdvisor reports this slightly differently than container runtimes do. + if device == rootFsInfo.Device { + imageFs, containerFs, err := p.ImageFsStats(ctx) + if err != nil { + return false, err + } + if !equalFileSystems(imageFs, containerFs) { + return true, nil + } + } return device != rootFsInfo.Device, nil } + +func equalFileSystems(a, b *statsapi.FsStats) bool { + if a == nil || b == nil { + return false + } + if !ptr.Equal(a.AvailableBytes, b.AvailableBytes) { + return false + } + if !ptr.Equal(a.CapacityBytes, b.CapacityBytes) { + return false + } + if !ptr.Equal(a.InodesUsed, b.InodesUsed) { + return false + } + if !ptr.Equal(a.InodesFree, b.InodesFree) { + return false + } + if !ptr.Equal(a.Inodes, b.Inodes) { + return false + } + return true +} diff --git a/pkg/kubelet/stats/provider_test.go b/pkg/kubelet/stats/provider_test.go index 4923f53de9a..897414ead19 100644 --- a/pkg/kubelet/stats/provider_test.go +++ b/pkg/kubelet/stats/provider_test.go @@ -417,21 +417,34 @@ func TestHasDedicatedImageFs(t *testing.T) { ctx := context.Background() mockCtrl := gomock.NewController(t) defer mockCtrl.Finish() + imageStatsExpected := &statsapi.FsStats{AvailableBytes: uint64Ptr(1)} for desc, test := range map[string]struct { - rootfsDevice string - imagefsDevice string - dedicated bool + rootfsDevice string + imagefsDevice string + dedicated bool + imageFsStats *statsapi.FsStats + containerFsStats *statsapi.FsStats }{ "dedicated device for image filesystem": { rootfsDevice: "root/device", imagefsDevice: "image/device", dedicated: true, + imageFsStats: imageStatsExpected, }, "shared device for image filesystem": { - rootfsDevice: "share/device", - imagefsDevice: "share/device", - dedicated: false, + rootfsDevice: "share/device", + imagefsDevice: "share/device", + dedicated: false, + imageFsStats: imageStatsExpected, + containerFsStats: imageStatsExpected, + }, + "split filesystem for images": { + rootfsDevice: "root/device", + imagefsDevice: "root/device", + dedicated: true, + imageFsStats: &statsapi.FsStats{AvailableBytes: uint64Ptr(1)}, + containerFsStats: &statsapi.FsStats{AvailableBytes: uint64Ptr(2)}, }, } { t.Logf("TestCase %q", desc) @@ -441,10 +454,12 @@ func TestHasDedicatedImageFs(t *testing.T) { mockRuntimeCache = new(kubecontainertest.MockRuntimeCache) ) mockCadvisor.EXPECT().RootFsInfo().Return(cadvisorapiv2.FsInfo{Device: test.rootfsDevice}, nil) - provider := newStatsProvider(mockCadvisor, mockPodManager, mockRuntimeCache, fakeContainerStatsProvider{ - device: test.imagefsDevice, + device: test.imagefsDevice, + imageFs: test.imageFsStats, + containerFs: test.containerFsStats, }) + dedicated, err := provider.HasDedicatedImageFs(ctx) assert.NoError(t, err) assert.Equal(t, test.dedicated, dedicated) @@ -760,7 +775,9 @@ func (o *fakeResourceAnalyzer) GetPodVolumeStats(uid types.UID) (serverstats.Pod } type fakeContainerStatsProvider struct { - device string + device string + imageFs *statsapi.FsStats + containerFs *statsapi.FsStats } func (p fakeContainerStatsProvider) ListPodStats(context.Context) ([]statsapi.PodStats, error) { @@ -775,8 +792,8 @@ func (p fakeContainerStatsProvider) 
ListPodCPUAndMemoryStats(context.Context) ([ return nil, fmt.Errorf("not implemented") } -func (p fakeContainerStatsProvider) ImageFsStats(context.Context) (*statsapi.FsStats, error) { - return nil, fmt.Errorf("not implemented") +func (p fakeContainerStatsProvider) ImageFsStats(context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) { + return p.imageFs, p.containerFs, nil } func (p fakeContainerStatsProvider) ImageFsDevice(context.Context) (string, error) { diff --git a/test/e2e_node/summary_test.go b/test/e2e_node/summary_test.go index 0a7306292a2..9820af9425a 100644 --- a/test/e2e_node/summary_test.go +++ b/test/e2e_node/summary_test.go @@ -317,7 +317,16 @@ var _ = SIGDescribe("Summary API [NodeConformance]", func() { "Inodes": bounded(1e4, 1e8), "InodesUsed": bounded(0, 1e8), }), - "ContainerFs": gomega.BeNil(), + "ContainerFs": ptrMatchAllFields(gstruct.Fields{ + "Time": recent(maxStatsAge), + "AvailableBytes": fsCapacityBounds, + "CapacityBytes": fsCapacityBounds, + // we assume we are not running tests on machines with more than 10TB of disk + "UsedBytes": bounded(e2evolume.Kb, 10*e2evolume.Tb), + "InodesFree": bounded(1e4, 1e8), + "Inodes": bounded(1e4, 1e8), + "InodesUsed": bounded(0, 1e8), + }), }), "Rlimit": ptrMatchAllFields(gstruct.Fields{ "Time": recent(maxStatsAge),
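As a closing illustration, not part of this PR: once the KubeletSeparateDiskGC gate is on, the new ContainerFs block can be observed directly from the kubelet summary endpoint the e2e test above exercises. A rough sketch, assuming the legacy read-only port is enabled; most clusters only expose the authenticated :10250, which requires a bearer token.

package main

import (
	"encoding/json"
	"fmt"
	"net/http"

	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
)

func main() {
	// The read-only port (10255) is assumed here for brevity and is
	// often disabled; adjust host, port, and auth for real clusters.
	resp, err := http.Get("http://localhost:10255/stats/summary")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var summary statsapi.Summary
	if err := json.NewDecoder(resp.Body).Decode(&summary); err != nil {
		panic(err)
	}
	if rt := summary.Node.Runtime; rt != nil {
		// ImageFs is the read-only image store; ContainerFs (new in
		// this PR) covers the writable container layers.
		fmt.Printf("imageFs:     %+v\n", rt.ImageFs)
		fmt.Printf("containerFs: %+v\n", rt.ContainerFs)
	}
}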