From 0411a3d5651ed17f9a7dda4bb5dc0f97d154dc81 Mon Sep 17 00:00:00 2001 From: Mark Rossetti Date: Wed, 10 Jan 2024 19:28:56 -0800 Subject: [PATCH 1/2] Add support for memory pressure evictiong on Windows Signed-off-by: Mark Rossetti --- pkg/kubelet/eviction/defaults_others.go | 4 +- pkg/kubelet/eviction/defaults_windows.go | 30 +++++ pkg/kubelet/eviction/eviction_manager.go | 4 +- pkg/kubelet/eviction/helpers.go | 10 +- pkg/kubelet/eviction/helpers_others.go | 37 ++++++ pkg/kubelet/eviction/helpers_test.go | 11 +- pkg/kubelet/eviction/helpers_windows.go | 48 +++++++ .../eviction/memory_threshold_notifier.go | 106 +-------------- .../memory_threshold_notifier_others.go | 123 ++++++++++++++++++ .../memory_threshold_notifier_test.go | 9 +- .../memory_threshold_notifier_windows.go | 101 ++++++++++++++ .../stats/summary_sys_containers_windows.go | 28 ++++ .../server/stats/summary_windows_test.go | 7 +- ...ts.go => perfcounter_nodestats_windows.go} | 33 +++++ .../kubelet/pkg/apis/stats/v1alpha1/types.go | 3 + 15 files changed, 437 insertions(+), 117 deletions(-) create mode 100644 pkg/kubelet/eviction/defaults_windows.go create mode 100644 pkg/kubelet/eviction/helpers_others.go create mode 100644 pkg/kubelet/eviction/helpers_windows.go create mode 100644 pkg/kubelet/eviction/memory_threshold_notifier_others.go create mode 100644 pkg/kubelet/eviction/memory_threshold_notifier_windows.go rename pkg/kubelet/winstats/{perfcounter_nodestats.go => perfcounter_nodestats_windows.go} (89%) diff --git a/pkg/kubelet/eviction/defaults_others.go b/pkg/kubelet/eviction/defaults_others.go index b226a210f45..912acf178db 100644 --- a/pkg/kubelet/eviction/defaults_others.go +++ b/pkg/kubelet/eviction/defaults_others.go @@ -1,5 +1,5 @@ -//go:build !linux -// +build !linux +//go:build !linux && !windows +// +build !linux,!windows /* Copyright 2018 The Kubernetes Authors. diff --git a/pkg/kubelet/eviction/defaults_windows.go b/pkg/kubelet/eviction/defaults_windows.go new file mode 100644 index 00000000000..de6517f6540 --- /dev/null +++ b/pkg/kubelet/eviction/defaults_windows.go @@ -0,0 +1,30 @@ +//go:build windows +// +build windows + +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package eviction + +// DefaultEvictionHard includes default options for hard eviction for Windows. +// Note: On Linux, the default eviction hard threshold is 100Mi for memory.available +// but Windows generally requires more memory reserved for system processes so we'll +// set the default threshold to 500Mi. +var DefaultEvictionHard = map[string]string{ + "memory.available": "500Mi", + "nodefs.available": "10%", + "imagefs.available": "15%", +} diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index ed946a0e056..00df1a773c6 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -19,6 +19,7 @@ package eviction import ( "context" "fmt" + "runtime" "sort" "sync" "time" @@ -185,7 +186,8 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod klog.InfoS(message) m.synchronize(diskInfoProvider, podFunc) } - if m.config.KernelMemcgNotification { + klog.InfoS("Eviction manager: starting control loop") + if m.config.KernelMemcgNotification || runtime.GOOS == "windows" { for _, threshold := range m.config.Thresholds { if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable { notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler) diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index 19d09ec9da5..8050613e672 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -846,13 +846,11 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat // build an evaluation context for current eviction signals result := signalObservations{} - if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil { - result[evictionapi.SignalMemoryAvailable] = signalObservation{ - available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI), - capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI), - time: memory.Time, - } + memoryAvailableSignal := makeMemoryAvailableSignalObservation(summary) + if memoryAvailableSignal != nil { + result[evictionapi.SignalMemoryAvailable] = *memoryAvailableSignal } + if allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods); err != nil { klog.ErrorS(err, "Eviction manager: failed to construct signal", "signal", evictionapi.SignalAllocatableMemoryAvailable) } else { diff --git a/pkg/kubelet/eviction/helpers_others.go b/pkg/kubelet/eviction/helpers_others.go new file mode 100644 index 00000000000..6b424bac474 --- /dev/null +++ b/pkg/kubelet/eviction/helpers_others.go @@ -0,0 +1,37 @@ +//go:build !windows +// +build !windows + +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package eviction + +import ( + "k8s.io/apimachinery/pkg/api/resource" + statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" +) + +func makeMemoryAvailableSignalObservation(summary *statsapi.Summary) *signalObservation { + if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil { + return &signalObservation{ + available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI), + capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI), + time: memory.Time, + } + } + + return nil +} diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index 5df209b2801..93c039c4ee3 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -1279,6 +1279,7 @@ func TestMakeSignalObservations(t *testing.T) { Inodes: &nodeFsInodes, }, SystemContainers: []statsapi.ContainerStats{ + // Used for memory signal observations on linux { Name: statsapi.SystemContainerPods, Memory: &statsapi.MemoryStats{ @@ -1286,6 +1287,14 @@ func TestMakeSignalObservations(t *testing.T) { WorkingSetBytes: &nodeWorkingSetBytes, }, }, + // Used for memory signal observations on windows + { + Name: statsapi.SystemContainerWindowsGlobalCommitMemory, + Memory: &statsapi.MemoryStats{ + AvailableBytes: &nodeAvailableBytes, + UsageBytes: &nodeWorkingSetBytes, + }, + }, }, }, Pods: []statsapi.PodStats{}, @@ -1307,7 +1316,7 @@ func TestMakeSignalObservations(t *testing.T) { actualObservations, statsFunc := makeSignalObservations(fakeStats) allocatableMemQuantity, found := actualObservations[evictionapi.SignalAllocatableMemoryAvailable] if !found { - t.Errorf("Expected allocatable memory observation, but didnt find one") + t.Errorf("Expected allocatable memory observation, but didn't find one") } if expectedBytes := int64(nodeAvailableBytes); allocatableMemQuantity.available.Value() != expectedBytes { t.Errorf("Expected %v, actual: %v", expectedBytes, allocatableMemQuantity.available.Value()) diff --git a/pkg/kubelet/eviction/helpers_windows.go b/pkg/kubelet/eviction/helpers_windows.go new file mode 100644 index 00000000000..ac3ea7f5302 --- /dev/null +++ b/pkg/kubelet/eviction/helpers_windows.go @@ -0,0 +1,48 @@ +//go:build windows +// +build windows + +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package eviction + +import ( + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/klog/v2" + statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" +) + +func makeMemoryAvailableSignalObservation(summary *statsapi.Summary) *signalObservation { + klog.V(4).InfoS("Eviction manager: building memory signal observations for windows") + sysContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerWindowsGlobalCommitMemory) + if err != nil { + klog.ErrorS(err, "Eviction manager: failed to construct signal", "signal", evictionapi.SignalMemoryAvailable) + } + if memory := sysContainer.Memory; memory != nil && memory.AvailableBytes != nil && memory.UsageBytes != nil { + klog.V(4).InfoS( + "Eviction manager: memory signal observations for windows", + "Available", *memory.AvailableBytes, + "Usage", *memory.UsageBytes) + return &signalObservation{ + available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI), + capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.UsageBytes), resource.BinarySI), + time: memory.Time, + } + } else { + return nil + } +} diff --git a/pkg/kubelet/eviction/memory_threshold_notifier.go b/pkg/kubelet/eviction/memory_threshold_notifier.go index 3ce5fb811e1..6bc1bc6ab1d 100644 --- a/pkg/kubelet/eviction/memory_threshold_notifier.go +++ b/pkg/kubelet/eviction/memory_threshold_notifier.go @@ -16,119 +16,19 @@ limitations under the License. package eviction -import ( - "fmt" - "time" - - "k8s.io/klog/v2" - - "k8s.io/apimachinery/pkg/api/resource" - statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" - "k8s.io/kubernetes/pkg/kubelet/cm" - evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" -) +import "time" const ( - memoryUsageAttribute = "memory.usage_in_bytes" // this prevents constantly updating the memcg notifier if synchronize // is run frequently. notifierRefreshInterval = 10 * time.Second ) -type memoryThresholdNotifier struct { - threshold evictionapi.Threshold - cgroupPath string - events chan struct{} - factory NotifierFactory - handler func(string) - notifier CgroupNotifier -} - -var _ ThresholdNotifier = &memoryThresholdNotifier{} - -// NewMemoryThresholdNotifier creates a ThresholdNotifier which is designed to respond to the given threshold. -// UpdateThreshold must be called once before the threshold will be active. -func NewMemoryThresholdNotifier(threshold evictionapi.Threshold, cgroupRoot string, factory NotifierFactory, handler func(string)) (ThresholdNotifier, error) { - cgroups, err := cm.GetCgroupSubsystems() - if err != nil { - return nil, err - } - cgpath, found := cgroups.MountPoints["memory"] - if !found || len(cgpath) == 0 { - return nil, fmt.Errorf("memory cgroup mount point not found") - } - if isAllocatableEvictionThreshold(threshold) { - // for allocatable thresholds, point the cgroup notifier at the allocatable cgroup - cgpath += cgroupRoot - } - return &memoryThresholdNotifier{ - threshold: threshold, - cgroupPath: cgpath, - events: make(chan struct{}), - handler: handler, - factory: factory, - }, nil -} - -func (m *memoryThresholdNotifier) Start() { - klog.InfoS("Eviction manager: created memoryThresholdNotifier", "notifier", m.Description()) - for range m.events { - m.handler(fmt.Sprintf("eviction manager: %s crossed", m.Description())) - } -} - -func (m *memoryThresholdNotifier) UpdateThreshold(summary *statsapi.Summary) error { - memoryStats := summary.Node.Memory - if isAllocatableEvictionThreshold(m.threshold) { - allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods) - if err != nil { - return err - } - memoryStats = allocatableContainer.Memory - } - if memoryStats == nil || memoryStats.UsageBytes == nil || memoryStats.WorkingSetBytes == nil || memoryStats.AvailableBytes == nil { - return fmt.Errorf("summary was incomplete. Expected MemoryStats and all subfields to be non-nil, but got %+v", memoryStats) - } - // Set threshold on usage to capacity - eviction_hard + inactive_file, - // since we want to be notified when working_set = capacity - eviction_hard - inactiveFile := resource.NewQuantity(int64(*memoryStats.UsageBytes-*memoryStats.WorkingSetBytes), resource.BinarySI) - capacity := resource.NewQuantity(int64(*memoryStats.AvailableBytes+*memoryStats.WorkingSetBytes), resource.BinarySI) - evictionThresholdQuantity := evictionapi.GetThresholdQuantity(m.threshold.Value, capacity) - memcgThreshold := capacity.DeepCopy() - memcgThreshold.Sub(*evictionThresholdQuantity) - memcgThreshold.Add(*inactiveFile) - - klog.V(3).InfoS("Eviction manager: setting notifier to capacity", "notifier", m.Description(), "capacity", memcgThreshold.String()) - if m.notifier != nil { - m.notifier.Stop() - } - newNotifier, err := m.factory.NewCgroupNotifier(m.cgroupPath, memoryUsageAttribute, memcgThreshold.Value()) - if err != nil { - return err - } - m.notifier = newNotifier - go m.notifier.Start(m.events) - return nil -} - -func (m *memoryThresholdNotifier) Description() string { - var hard, allocatable string - if isHardEvictionThreshold(m.threshold) { - hard = "hard " - } else { - hard = "soft " - } - if isAllocatableEvictionThreshold(m.threshold) { - allocatable = "allocatable " - } - return fmt.Sprintf("%s%smemory eviction threshold", hard, allocatable) -} - -var _ NotifierFactory = &CgroupNotifierFactory{} - // CgroupNotifierFactory knows how to make CgroupNotifiers which integrate with the kernel type CgroupNotifierFactory struct{} +var _ NotifierFactory = &CgroupNotifierFactory{} + // NewCgroupNotifier implements the NotifierFactory interface func (n *CgroupNotifierFactory) NewCgroupNotifier(path, attribute string, threshold int64) (CgroupNotifier, error) { return NewCgroupNotifier(path, attribute, threshold) diff --git a/pkg/kubelet/eviction/memory_threshold_notifier_others.go b/pkg/kubelet/eviction/memory_threshold_notifier_others.go new file mode 100644 index 00000000000..1b7bc21e05d --- /dev/null +++ b/pkg/kubelet/eviction/memory_threshold_notifier_others.go @@ -0,0 +1,123 @@ +//go:build !windows +// +build !windows + +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package eviction + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/klog/v2" + statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" + "k8s.io/kubernetes/pkg/kubelet/cm" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" +) + +const ( + memoryUsageAttribute = "memory.usage_in_bytes" +) + +type linuxMemoryThresholdNotifier struct { + threshold evictionapi.Threshold + cgroupPath string + events chan struct{} + factory NotifierFactory + handler func(string) + notifier CgroupNotifier +} + +var _ ThresholdNotifier = &linuxMemoryThresholdNotifier{} + +// NewMemoryThresholdNotifier creates a ThresholdNotifier which is designed to respond to the given threshold. +// UpdateThreshold must be called once before the threshold will be active. +func NewMemoryThresholdNotifier(threshold evictionapi.Threshold, cgroupRoot string, factory NotifierFactory, handler func(string)) (ThresholdNotifier, error) { + cgroups, err := cm.GetCgroupSubsystems() + if err != nil { + return nil, err + } + cgpath, found := cgroups.MountPoints["memory"] + if !found || len(cgpath) == 0 { + return nil, fmt.Errorf("memory cgroup mount point not found") + } + if isAllocatableEvictionThreshold(threshold) { + // for allocatable thresholds, point the cgroup notifier at the allocatable cgroup + cgpath += cgroupRoot + } + return &linuxMemoryThresholdNotifier{ + threshold: threshold, + cgroupPath: cgpath, + events: make(chan struct{}), + handler: handler, + factory: factory, + }, nil +} + +func (m *linuxMemoryThresholdNotifier) UpdateThreshold(summary *statsapi.Summary) error { + memoryStats := summary.Node.Memory + if isAllocatableEvictionThreshold(m.threshold) { + allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods) + if err != nil { + return err + } + memoryStats = allocatableContainer.Memory + } + if memoryStats == nil || memoryStats.UsageBytes == nil || memoryStats.WorkingSetBytes == nil || memoryStats.AvailableBytes == nil { + return fmt.Errorf("summary was incomplete. Expected MemoryStats and all subfields to be non-nil, but got %+v", memoryStats) + } + // Set threshold on usage to capacity - eviction_hard + inactive_file, + // since we want to be notified when working_set = capacity - eviction_hard + inactiveFile := resource.NewQuantity(int64(*memoryStats.UsageBytes-*memoryStats.WorkingSetBytes), resource.BinarySI) + capacity := resource.NewQuantity(int64(*memoryStats.AvailableBytes+*memoryStats.WorkingSetBytes), resource.BinarySI) + evictionThresholdQuantity := evictionapi.GetThresholdQuantity(m.threshold.Value, capacity) + memcgThreshold := capacity.DeepCopy() + memcgThreshold.Sub(*evictionThresholdQuantity) + memcgThreshold.Add(*inactiveFile) + + klog.V(3).InfoS("Eviction manager: setting notifier to capacity", "notifier", m.Description(), "capacity", memcgThreshold.String()) + if m.notifier != nil { + m.notifier.Stop() + } + newNotifier, err := m.factory.NewCgroupNotifier(m.cgroupPath, memoryUsageAttribute, memcgThreshold.Value()) + if err != nil { + return err + } + m.notifier = newNotifier + go m.notifier.Start(m.events) + return nil +} + +func (m *linuxMemoryThresholdNotifier) Start() { + klog.InfoS("Eviction manager: created memoryThresholdNotifier", "notifier", m.Description()) + for range m.events { + m.handler(fmt.Sprintf("eviction manager: %s crossed", m.Description())) + } +} + +func (m *linuxMemoryThresholdNotifier) Description() string { + var hard, allocatable string + if isHardEvictionThreshold(m.threshold) { + hard = "hard " + } else { + hard = "soft " + } + if isAllocatableEvictionThreshold(m.threshold) { + allocatable = "allocatable " + } + return fmt.Sprintf("%s%smemory eviction threshold", hard, allocatable) +} diff --git a/pkg/kubelet/eviction/memory_threshold_notifier_test.go b/pkg/kubelet/eviction/memory_threshold_notifier_test.go index 8e64988bf0c..4969db78385 100644 --- a/pkg/kubelet/eviction/memory_threshold_notifier_test.go +++ b/pkg/kubelet/eviction/memory_threshold_notifier_test.go @@ -1,3 +1,6 @@ +//go:build linux +// +build linux + /* Copyright 2018 The Kubernetes Authors. @@ -58,8 +61,8 @@ func nodeSummary(available, workingSet, usage resource.Quantity, allocatable boo } } -func newTestMemoryThresholdNotifier(threshold evictionapi.Threshold, factory NotifierFactory, handler func(string)) *memoryThresholdNotifier { - return &memoryThresholdNotifier{ +func newTestMemoryThresholdNotifier(threshold evictionapi.Threshold, factory NotifierFactory, handler func(string)) *linuxMemoryThresholdNotifier { + return &linuxMemoryThresholdNotifier{ threshold: threshold, cgroupPath: testCgroupPath, events: make(chan struct{}), @@ -245,7 +248,7 @@ func TestThresholdDescription(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - m := &memoryThresholdNotifier{ + m := &linuxMemoryThresholdNotifier{ notifier: &MockCgroupNotifier{}, threshold: tc.evictionThreshold, cgroupPath: testCgroupPath, diff --git a/pkg/kubelet/eviction/memory_threshold_notifier_windows.go b/pkg/kubelet/eviction/memory_threshold_notifier_windows.go new file mode 100644 index 00000000000..8c014fa972a --- /dev/null +++ b/pkg/kubelet/eviction/memory_threshold_notifier_windows.go @@ -0,0 +1,101 @@ +//go:build windows +// +build windows + +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package eviction + +import ( + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/klog/v2" + statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" + "k8s.io/kubernetes/pkg/kubelet/winstats" +) + +type windowsMemoryThresholdNotifier struct { + threshold evictionapi.Threshold + events chan struct{} + handler func(string) +} + +var _ ThresholdNotifier = &windowsMemoryThresholdNotifier{} + +func NewMemoryThresholdNotifier(threshold evictionapi.Threshold, cgroupRoot string, factory NotifierFactory, handler func(string)) (ThresholdNotifier, error) { + klog.InfoS("Eviction manager: creating new WindowsMemoryThresholdNotifier") + return &windowsMemoryThresholdNotifier{ + threshold: threshold, + events: make(chan struct{}), + handler: handler, + }, nil +} + +func (m *windowsMemoryThresholdNotifier) Start() { + klog.InfoS("Eviction manager: starting windowsMemoryThresholdNotifier", "notifier", m.Description()) + go func() { + for true { + time.Sleep(notifierRefreshInterval) + m.checkMemoryUsage() + } + }() + + for range m.events { + m.handler(fmt.Sprintf("eviction manager: %s crossed", m.Description())) + } +} + +func (m *windowsMemoryThresholdNotifier) checkMemoryUsage() { + // Get global commit limit + perfInfo, err := winstats.GetPerformanceInfo() + if err != nil { + klog.ErrorS(err, "Eviction manager: error getting global memory status for node") + } + + commmiLimitBytes := perfInfo.CommitLimitPages * perfInfo.PageSize + capacity := resource.NewQuantity(int64(commmiLimitBytes), resource.BinarySI) + evictionThresholdQuantity := evictionapi.GetThresholdQuantity(m.threshold.Value, capacity) + + commitTotalBytes := perfInfo.CommitTotalPages * perfInfo.PageSize + commitAvailableBytes := commmiLimitBytes - commitTotalBytes + + if commitAvailableBytes <= uint64(evictionThresholdQuantity.Value()) { + m.events <- struct{}{} + } +} + +func (m *windowsMemoryThresholdNotifier) UpdateThreshold(summary *statsapi.Summary) error { + // Windows doesn't use cgroup notifiers to trigger eviction, so this function is a no-op. + // Instead the go-routine set up in Start() will poll the system for memory usage and + // trigger eviction when the threshold is crossed. + return nil +} + +func (m *windowsMemoryThresholdNotifier) Description() string { + var hard, allocatable string + if isHardEvictionThreshold(m.threshold) { + hard = "hard" + } else { + hard = "soft" + } + if isAllocatableEvictionThreshold(m.threshold) { + allocatable = "allocatable" + } + return fmt.Sprintf("%s %s memory eviction threshold", hard, allocatable) +} diff --git a/pkg/kubelet/server/stats/summary_sys_containers_windows.go b/pkg/kubelet/server/stats/summary_sys_containers_windows.go index 520e0018b59..b651c943a1c 100644 --- a/pkg/kubelet/server/stats/summary_sys_containers_windows.go +++ b/pkg/kubelet/server/stats/summary_sys_containers_windows.go @@ -23,17 +23,21 @@ import ( "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" "k8s.io/kubernetes/pkg/kubelet/cm" + "k8s.io/kubernetes/pkg/kubelet/winstats" ) func (sp *summaryProviderImpl) GetSystemContainersStats(nodeConfig cm.NodeConfig, podStats []statsapi.PodStats, updateStats bool) (stats []statsapi.ContainerStats) { stats = append(stats, sp.getSystemPodsCPUAndMemoryStats(nodeConfig, podStats, updateStats)) + stats = append(stats, sp.getSystemWindowsGlobalmemoryStats()) return stats } func (sp *summaryProviderImpl) GetSystemContainersCPUAndMemoryStats(nodeConfig cm.NodeConfig, podStats []statsapi.PodStats, updateStats bool) (stats []statsapi.ContainerStats) { stats = append(stats, sp.getSystemPodsCPUAndMemoryStats(nodeConfig, podStats, updateStats)) + stats = append(stats, sp.getSystemWindowsGlobalmemoryStats()) return stats } @@ -96,3 +100,27 @@ func (sp *summaryProviderImpl) getSystemPodsCPUAndMemoryStats(nodeConfig cm.Node return podsSummary } + +func (sp *summaryProviderImpl) getSystemWindowsGlobalmemoryStats() statsapi.ContainerStats { + now := metav1.NewTime(time.Now()) + globalMemorySummary := statsapi.ContainerStats{ + StartTime: now, + Memory: &statsapi.MemoryStats{}, + Name: statsapi.SystemContainerWindowsGlobalCommitMemory, + } + + perfInfo, err := winstats.GetPerformanceInfo() + if err != nil { + klog.Errorf("Failed to get Windows performance info: %v", err) + return globalMemorySummary + } + + commitLimitBytes := perfInfo.CommitLimitPages * perfInfo.PageSize + commitTotalBytes := perfInfo.CommitTotalPages * perfInfo.PageSize + commitAvailableBytes := commitLimitBytes - commitTotalBytes + globalMemorySummary.Memory.Time = now + globalMemorySummary.Memory.AvailableBytes = &commitAvailableBytes + globalMemorySummary.Memory.UsageBytes = &commitTotalBytes + + return globalMemorySummary +} diff --git a/pkg/kubelet/server/stats/summary_windows_test.go b/pkg/kubelet/server/stats/summary_windows_test.go index 27e74f285ec..21f46d2654f 100644 --- a/pkg/kubelet/server/stats/summary_windows_test.go +++ b/pkg/kubelet/server/stats/summary_windows_test.go @@ -79,13 +79,18 @@ func TestSummaryProvider(t *testing.T) { assert.Equal(summary.Node.Fs, rootFsStats) assert.Equal(summary.Node.Runtime, &statsapi.RuntimeStats{ContainerFs: imageFsStats, ImageFs: imageFsStats}) - assert.Equal(len(summary.Node.SystemContainers), 1) + assert.NoError(err) + assert.Equal(len(summary.Node.SystemContainers), 2) assert.Equal(summary.Node.SystemContainers[0].Name, "pods") assert.Equal(summary.Node.SystemContainers[0].CPU.UsageCoreNanoSeconds, podStats[0].CPU.UsageCoreNanoSeconds) assert.Equal(summary.Node.SystemContainers[0].CPU.UsageNanoCores, podStats[0].CPU.UsageNanoCores) assert.Equal(summary.Node.SystemContainers[0].Memory.WorkingSetBytes, podStats[0].Memory.WorkingSetBytes) assert.Equal(summary.Node.SystemContainers[0].Memory.UsageBytes, podStats[0].Memory.UsageBytes) assert.Equal(summary.Node.SystemContainers[0].Memory.AvailableBytes, podStats[0].Memory.AvailableBytes) + assert.Equal(summary.Node.SystemContainers[1].Name, statsapi.SystemContainerWindowsGlobalCommitMemory) + assert.NotEqual(nil, summary.Node.SystemContainers[1].Memory) + assert.NotEqual(nil, summary.Node.SystemContainers[1].Memory.AvailableBytes) + assert.NotEqual(nil, summary.Node.SystemContainers[1].Memory.UsageBytes) assert.Equal(summary.Pods, podStats) } diff --git a/pkg/kubelet/winstats/perfcounter_nodestats.go b/pkg/kubelet/winstats/perfcounter_nodestats_windows.go similarity index 89% rename from pkg/kubelet/winstats/perfcounter_nodestats.go rename to pkg/kubelet/winstats/perfcounter_nodestats_windows.go index c8412f1af2f..c673c973fc0 100644 --- a/pkg/kubelet/winstats/perfcounter_nodestats.go +++ b/pkg/kubelet/winstats/perfcounter_nodestats_windows.go @@ -56,10 +56,33 @@ type MemoryStatusEx struct { AvailExtendedVirtual uint64 } +// PerformanceInfo is the same as Windows structure PERFORMANCE_INFORMATION +// https://learn.microsoft.com/en-us/windows/win32/api/psapi/ns-psapi-performance_information +type PerformanceInformation struct { + cb uint32 + CommitTotalPages uint64 + CommitLimitPages uint64 + CommitPeakPages uint64 + PhysicalTotalPages uint64 + PhysicalAvailablePages uint64 + SystemCachePages uint64 + KernelTotalPages uint64 + KernelPagesPages uint64 + KernelNonpagedPages uint64 + PageSize uint64 + HandleCount uint32 + ProcessCount uint32 + ThreadCount uint32 +} + var ( + // kernel32.dll system calls modkernel32 = windows.NewLazySystemDLL("kernel32.dll") procGlobalMemoryStatusEx = modkernel32.NewProc("GlobalMemoryStatusEx") procGetActiveProcessorCount = modkernel32.NewProc("GetActiveProcessorCount") + // psapi.dll system calls + modpsapi = windows.NewLazySystemDLL("psapi.dll") + procGetPerformanceInfo = modpsapi.NewProc("GetPerformanceInfo") ) const allProcessorGroups = 0xFFFF @@ -294,6 +317,16 @@ func getPhysicallyInstalledSystemMemoryBytes() (uint64, error) { return statex.TotalPhys, nil } +func GetPerformanceInfo() (*PerformanceInformation, error) { + var pi PerformanceInformation + pi.cb = uint32(unsafe.Sizeof(pi)) + ret, _, _ := procGetPerformanceInfo.Call(uintptr(unsafe.Pointer(&pi)), uintptr(pi.cb)) + if ret == 0 { + return nil, errors.New("unable to read Windows performance information") + } + return &pi, nil +} + func getBootID() (string, error) { regKey, err := registry.OpenKey(registry.LOCAL_MACHINE, bootIdRegistry, registry.READ) if err != nil { diff --git a/staging/src/k8s.io/kubelet/pkg/apis/stats/v1alpha1/types.go b/staging/src/k8s.io/kubelet/pkg/apis/stats/v1alpha1/types.go index 54355503f6a..b63a17ace5e 100644 --- a/staging/src/k8s.io/kubelet/pkg/apis/stats/v1alpha1/types.go +++ b/staging/src/k8s.io/kubelet/pkg/apis/stats/v1alpha1/types.go @@ -99,6 +99,9 @@ const ( SystemContainerMisc = "misc" // SystemContainerPods is the container name for the system container tracking user pods. SystemContainerPods = "pods" + // SystemContainerWindowsGlobalCommitMemory (only used on Windows) is the container name for the system container + // tracking global commit memory usage and is used for memory-pressure eviction. + SystemContainerWindowsGlobalCommitMemory = "windows-global-commit-memory" ) // ProcessStats are stats pertaining to processes. From 3683010a7c6a3dc5372e1688d7b2995b745f5bea Mon Sep 17 00:00:00 2001 From: Mark Rossetti Date: Mon, 22 Jan 2024 13:29:33 -0800 Subject: [PATCH 2/2] Adding e2e test to validate memory-pressure eviction on Windows Signed-off-by: Mark Rossetti --- test/e2e/windows/eviction.go | 199 ++++++++++++++++++++++++++++++ test/e2e/windows/memory_limits.go | 49 ++++---- 2 files changed, 224 insertions(+), 24 deletions(-) create mode 100644 test/e2e/windows/eviction.go diff --git a/test/e2e/windows/eviction.go b/test/e2e/windows/eviction.go new file mode 100644 index 00000000000..53bb4deff4a --- /dev/null +++ b/test/e2e/windows/eviction.go @@ -0,0 +1,199 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package windows + +import ( + "context" + "strconv" + "strings" + "time" + + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/kubernetes/test/e2e/feature" + "k8s.io/kubernetes/test/e2e/framework" + e2enode "k8s.io/kubernetes/test/e2e/framework/node" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" + imageutils "k8s.io/kubernetes/test/utils/image" + admissionapi "k8s.io/pod-security-admission/api" +) + +const ( + // It can take 10-15 seconds for node memory-pressure taint to show up on the node + // so we'll wait 45 seconds for the taint to show up so the e2e test case can catch + // it and the wait for the taint to be removed so other serial/slow tests can run + // against the same node. + waitForNodeMemoryPressureTaintDelayDuration = 45 * time.Second +) + +var _ = sigDescribe(feature.Windows, "Eviction", framework.WithSerial(), framework.WithSlow(), framework.WithDisruptive(), (func() { + ginkgo.BeforeEach(func() { + e2eskipper.SkipUnlessNodeOSDistroIs("windows") + }) + + f := framework.NewDefaultFramework("eviction-test-windows") + f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged + + // This test will first find a Windows node memory-pressure hard-eviction enabled. + // The test will then schedule a pod that requests and consumes 500Mi of memory and then + // another pod that will consume the rest of the node's memory. + // The test will then verify that the second pod gets evicted and then the node again becomes + // ready for schedule after the second pod gets evicted. + ginkgo.It("should evict a pod when a node experiences memory pressure", func(ctx context.Context) { + framework.Logf("Looking for a Windows node with memory-pressure eviction enabled") + selector := labels.Set{"kubernetes.io/os": "windows"}.AsSelector() + nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{ + LabelSelector: selector.String(), + }) + framework.ExpectNoError(err) + + var node *v1.Node + var nodeMem nodeMemory + for _, n := range nodeList.Items { + nm := getNodeMemory(ctx, f, n) + if nm.hardEviction.Value() != 0 { + framework.Logf("Using node %s", n.Name) + node = &n + nodeMem = nm + break + } + } + + if node == nil { + e2eskipper.Skipf("No Windows nodes with hard memory-pressure eviction found") + } + + // Delete img-puller pods if they exist because eviction manager keeps selecting them for eviction first + // Note we cannot just delete the namespace because a deferred cleanup task tries to delete the ns if + // image pre-pulling was enabled. + nsList, err := f.ClientSet.CoreV1().Namespaces().List(ctx, metav1.ListOptions{}) + framework.ExpectNoError(err) + for _, ns := range nsList.Items { + if strings.Contains(ns.Name, "img-puller") { + framework.Logf("Deleting pods in namespace %s", ns.Name) + podList, err := f.ClientSet.CoreV1().Pods(ns.Name).List(ctx, metav1.ListOptions{}) + framework.ExpectNoError(err) + for _, pod := range podList.Items { + framework.Logf(" Deleteing pod %s", pod.Name) + err = f.ClientSet.CoreV1().Pods(ns.Name).Delete(ctx, pod.Name, metav1.DeleteOptions{}) + framework.ExpectNoError(err) + } + break + } + } + + ginkgo.By("Scheduling a pod that requests and consumes 500Mi of Memory") + + pod1 := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "pod1", + Image: imageutils.GetE2EImage(imageutils.ResourceConsumer), + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceMemory: *resource.NewQuantity(500*1024*1024, resource.BinarySI), + }, + }, + Command: []string{ + "/bin/testlimit.exe", + "-accepteula", + "-d", + "100Mb", + "-e", + "5", + "20000s", + "-c", + "5"}, + }, + }, + NodeSelector: map[string]string{ + "kubernetes.io/os": "windows", + }, + NodeName: node.Name, + }, + } + _, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, pod1, metav1.CreateOptions{}) + framework.ExpectNoError(err) + + ginkgo.By("Scheduling another pod will consume the rest of the node's memory") + chunks := int((nodeMem.capacity.Value()-nodeMem.hardEviction.Value())/(300*1024*1024) + 3) + framework.Logf("Pod2 will consume %d chunks of 300Mi", chunks) + pod2 := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "pod2", + Image: imageutils.GetE2EImage(imageutils.ResourceConsumer), + Command: []string{ + "/bin/testlimit.exe", + "-accepteula", + "-d", + "300Mb", + "-e", + "1", + "20000s", + "-c", + strconv.Itoa(chunks)}, + }, + }, + NodeSelector: map[string]string{ + "kubernetes.io/os": "windows", + }, + NodeName: node.Name, + }, + } + _, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, pod2, metav1.CreateOptions{}) + framework.ExpectNoError(err) + + ginkgo.By("Waiting for pods to start running") + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 2, 3*time.Minute) + framework.ExpectNoError(err) + + framework.Logf("Waiting for pod2 to get evicted") + gomega.Eventually(ctx, func() bool { + eventList, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) + framework.ExpectNoError(err) + for _, e := range eventList.Items { + // Look for an event that shows FailedScheduling + if e.Type == "Warning" && e.Reason == "Evicted" && strings.Contains(e.Message, "pod2") { + framework.Logf("Found %+v event with message %+v", e.Reason, e.Message) + return true + } + } + return false + }, 10*time.Minute, 10*time.Second).Should(gomega.BeTrueBecause("Eviction Event was not found")) + + ginkgo.By("Waiting for node.kubernetes.io/memory-pressure taint to be removed") + // ensure e2e test framework catches the memory-pressure taint + time.Sleep(waitForNodeMemoryPressureTaintDelayDuration) + // wait for node.kubernetes.io/memory-pressure=NoSchedule to be removed so other tests can run + err = e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 10*time.Minute) + framework.ExpectNoError(err) + }) +})) diff --git a/test/e2e/windows/memory_limits.go b/test/e2e/windows/memory_limits.go index de978ea26ad..b19bac8343a 100644 --- a/test/e2e/windows/memory_limits.go +++ b/test/e2e/windows/memory_limits.go @@ -82,7 +82,7 @@ type nodeMemory struct { // checks that a calculated value for NodeAllocatable is equal to the reported value func checkNodeAllocatableTest(ctx context.Context, f *framework.Framework) { - nodeMem := getNodeMemory(ctx, f) + nodeMem := getFirstNodeMemory(ctx, f) framework.Logf("nodeMem says: %+v", nodeMem) // calculate the allocatable mem based on capacity - reserved amounts @@ -176,24 +176,9 @@ func overrideAllocatableMemoryTest(ctx context.Context, f *framework.Framework, }, 3*time.Minute, 10*time.Second).Should(gomega.BeTrue()) } -// getNodeMemory populates a nodeMemory struct with information from the first -func getNodeMemory(ctx context.Context, f *framework.Framework) nodeMemory { - selector := labels.Set{"kubernetes.io/os": "windows"}.AsSelector() - nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{ - LabelSelector: selector.String(), - }) - framework.ExpectNoError(err) - - // Assuming that agent nodes have the same config - // Make sure there is >0 agent nodes, then use the first one for info - gomega.Expect(nodeList.Items).ToNot(gomega.BeEmpty()) - - ginkgo.By("Getting memory details from node status and kubelet config") - status := nodeList.Items[0].Status - nodeName := nodeList.Items[0].ObjectMeta.Name - - framework.Logf("Getting configuration details for node %s", nodeName) - request := f.ClientSet.CoreV1().RESTClient().Get().Resource("nodes").Name(nodeName).SubResource("proxy").Suffix("configz") +func getNodeMemory(ctx context.Context, f *framework.Framework, node v1.Node) nodeMemory { + framework.Logf("Getting memory details for node %s", node.ObjectMeta.Name) + request := f.ClientSet.CoreV1().RESTClient().Get().Resource("nodes").Name(node.ObjectMeta.Name).SubResource("proxy").Suffix("configz") rawbytes, err := request.DoRaw(ctx) framework.ExpectNoError(err) kubeletConfig, err := decodeConfigz(rawbytes) @@ -217,18 +202,34 @@ func getNodeMemory(ctx context.Context, f *framework.Framework) nodeMemory { } nodeMem := nodeMemory{ - capacity: status.Capacity[v1.ResourceMemory], - allocatable: status.Allocatable[v1.ResourceMemory], + capacity: node.Status.Capacity[v1.ResourceMemory], + allocatable: node.Status.Allocatable[v1.ResourceMemory], systemReserve: systemReserve, hardEviction: hardEviction, - // these are not implemented and are here for future use - will always be 0 at the moment - kubeReserve: kubeReserve, - softEviction: softEviction, + kubeReserve: kubeReserve, + softEviction: softEviction, } return nodeMem } +// getNodeMemory populates a nodeMemory struct with information from the first Windows node +// that is found in the cluster. +func getFirstNodeMemory(ctx context.Context, f *framework.Framework) nodeMemory { + selector := labels.Set{"kubernetes.io/os": "windows"}.AsSelector() + nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{ + LabelSelector: selector.String(), + }) + framework.ExpectNoError(err) + + // Assuming that agent nodes have the same config + // Make sure there is >0 agent nodes, then use the first one for info + gomega.Expect(nodeList.Items).ToNot(gomega.BeEmpty()) + + ginkgo.By("Getting memory details from first Windows") + return getNodeMemory(ctx, f, nodeList.Items[0]) +} + // modified from https://github.com/kubernetes/kubernetes/blob/master/test/e2e/framework/kubelet/config.go#L110 // the proxy version was causing and non proxy used a value that isn't set by e2e func decodeConfigz(contentsBytes []byte) (*kubeletconfig.KubeletConfiguration, error) {