Merge pull request #122922 from marosset/windows-memory-eviction

Add support for Windows memory-pressure eviction
This commit is contained in:
Kubernetes Prow Robot 2024-07-18 10:39:06 -07:00 committed by GitHub
commit 601eb7e9cf
17 changed files with 661 additions and 141 deletions

View File

@ -1,5 +1,5 @@
//go:build !linux
// +build !linux
//go:build !linux && !windows
// +build !linux,!windows
/*
Copyright 2018 The Kubernetes Authors.

View File

@ -0,0 +1,30 @@
//go:build windows
// +build windows
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction
// DefaultEvictionHard includes default options for hard eviction for Windows.
// Note: On Linux, the default hard-eviction threshold for memory.available is 100Mi,
// but Windows generally requires more memory reserved for system processes, so we
// set the default threshold to 500Mi.
var DefaultEvictionHard = map[string]string{
"memory.available": "500Mi",
"nodefs.available": "10%",
"imagefs.available": "15%",
}
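
For context, a minimal sketch (editor's illustration, not part of this change) of how one of these hard-eviction strings behaves once parsed: the kubelet turns it into a resource.Quantity and treats the signal as crossed when the observed value drops below it. The observed reading here is hypothetical.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
    threshold := resource.MustParse("500Mi")
    observed := resource.MustParse("400Mi") // hypothetical memory.available reading

    // Cmp returns -1 when observed < threshold, i.e. the hard threshold is crossed.
    if observed.Cmp(threshold) < 0 {
        fmt.Println("memory.available below hard threshold; eviction would trigger")
    }
}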

View File

@ -19,6 +19,7 @@ package eviction
import (
"context"
"fmt"
"runtime"
"sort"
"sync"
"time"
@ -185,7 +186,8 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod
klog.InfoS(message)
m.synchronize(diskInfoProvider, podFunc)
}
if m.config.KernelMemcgNotification {
klog.InfoS("Eviction manager: starting control loop")
if m.config.KernelMemcgNotification || runtime.GOOS == "windows" {
for _, threshold := range m.config.Thresholds {
if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
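
Both platform-specific notifiers created here satisfy the same contract. For reference, the ThresholdNotifier interface in this package looks roughly like this (reproduced for context, comments paraphrased):

type ThresholdNotifier interface {
    // Start blocks, invoking the handler each time the threshold is crossed.
    Start()
    // UpdateThreshold refreshes the trigger point from recent node metrics.
    UpdateThreshold(summary *statsapi.Summary) error
    // Description returns a human-readable description of the notifier.
    Description() string
}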

View File

@ -846,13 +846,11 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat
// build an evaluation context for current eviction signals
result := signalObservations{}
if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
result[evictionapi.SignalMemoryAvailable] = signalObservation{
available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
time: memory.Time,
}
memoryAvailableSignal := makeMemoryAvailableSignalObservation(summary)
if memoryAvailableSignal != nil {
result[evictionapi.SignalMemoryAvailable] = *memoryAvailableSignal
}
if allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods); err != nil {
klog.ErrorS(err, "Eviction manager: failed to construct signal", "signal", evictionapi.SignalAllocatableMemoryAvailable)
} else {

View File

@ -0,0 +1,37 @@
//go:build !windows
// +build !windows
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction
import (
"k8s.io/apimachinery/pkg/api/resource"
statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
)
func makeMemoryAvailableSignalObservation(summary *statsapi.Summary) *signalObservation {
if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
return &signalObservation{
available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
time: memory.Time,
}
}
return nil
}

View File

@ -1279,6 +1279,7 @@ func TestMakeSignalObservations(t *testing.T) {
Inodes: &nodeFsInodes,
},
SystemContainers: []statsapi.ContainerStats{
// Used for memory signal observations on linux
{
Name: statsapi.SystemContainerPods,
Memory: &statsapi.MemoryStats{
@ -1286,6 +1287,14 @@ func TestMakeSignalObservations(t *testing.T) {
WorkingSetBytes: &nodeWorkingSetBytes,
},
},
// Used for memory signal observations on windows
{
Name: statsapi.SystemContainerWindowsGlobalCommitMemory,
Memory: &statsapi.MemoryStats{
AvailableBytes: &nodeAvailableBytes,
UsageBytes: &nodeWorkingSetBytes,
},
},
},
},
Pods: []statsapi.PodStats{},
@ -1307,7 +1316,7 @@ func TestMakeSignalObservations(t *testing.T) {
actualObservations, statsFunc := makeSignalObservations(fakeStats)
allocatableMemQuantity, found := actualObservations[evictionapi.SignalAllocatableMemoryAvailable]
if !found {
t.Errorf("Expected allocatable memory observation, but didnt find one")
t.Errorf("Expected allocatable memory observation, but didn't find one")
}
if expectedBytes := int64(nodeAvailableBytes); allocatableMemQuantity.available.Value() != expectedBytes {
t.Errorf("Expected %v, actual: %v", expectedBytes, allocatableMemQuantity.available.Value())

View File

@ -0,0 +1,48 @@
//go:build windows
// +build windows
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction
import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog/v2"
statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
func makeMemoryAvailableSignalObservation(summary *statsapi.Summary) *signalObservation {
klog.V(4).InfoS("Eviction manager: building memory signal observations for windows")
sysContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerWindowsGlobalCommitMemory)
if err != nil {
klog.ErrorS(err, "Eviction manager: failed to construct signal", "signal", evictionapi.SignalMemoryAvailable)
}
if memory := sysContainer.Memory; memory != nil && memory.AvailableBytes != nil && memory.UsageBytes != nil {
klog.V(4).InfoS(
"Eviction manager: memory signal observations for windows",
"Available", *memory.AvailableBytes,
"Usage", *memory.UsageBytes)
return &signalObservation{
available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.UsageBytes), resource.BinarySI),
time: memory.Time,
}
} else {
return nil
}
}
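
Note the capacity derivation differs by platform: the Linux observation treats capacity as AvailableBytes + WorkingSetBytes, while here capacity is the commit limit, reconstructed as AvailableBytes + UsageBytes of the global-commit pseudo container. A worked sketch with hypothetical numbers:

package main

import "fmt"

func main() {
    availableBytes := uint64(2 << 30) // commit charge still available, assumed
    usageBytes := uint64(6 << 30)     // commit charge in use, assumed

    capacity := availableBytes + usageBytes // 8 GiB commit limit
    fmt.Printf("capacity=%d available=%d\n", capacity, availableBytes)
}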

View File

@ -16,119 +16,19 @@ limitations under the License.
package eviction
import (
"fmt"
"time"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/api/resource"
statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/cm"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
import "time"
const (
memoryUsageAttribute = "memory.usage_in_bytes"
// this prevents constantly updating the memcg notifier if synchronize
// is run frequently.
notifierRefreshInterval = 10 * time.Second
)
type memoryThresholdNotifier struct {
threshold evictionapi.Threshold
cgroupPath string
events chan struct{}
factory NotifierFactory
handler func(string)
notifier CgroupNotifier
}
var _ ThresholdNotifier = &memoryThresholdNotifier{}
// NewMemoryThresholdNotifier creates a ThresholdNotifier which is designed to respond to the given threshold.
// UpdateThreshold must be called once before the threshold will be active.
func NewMemoryThresholdNotifier(threshold evictionapi.Threshold, cgroupRoot string, factory NotifierFactory, handler func(string)) (ThresholdNotifier, error) {
cgroups, err := cm.GetCgroupSubsystems()
if err != nil {
return nil, err
}
cgpath, found := cgroups.MountPoints["memory"]
if !found || len(cgpath) == 0 {
return nil, fmt.Errorf("memory cgroup mount point not found")
}
if isAllocatableEvictionThreshold(threshold) {
// for allocatable thresholds, point the cgroup notifier at the allocatable cgroup
cgpath += cgroupRoot
}
return &memoryThresholdNotifier{
threshold: threshold,
cgroupPath: cgpath,
events: make(chan struct{}),
handler: handler,
factory: factory,
}, nil
}
func (m *memoryThresholdNotifier) Start() {
klog.InfoS("Eviction manager: created memoryThresholdNotifier", "notifier", m.Description())
for range m.events {
m.handler(fmt.Sprintf("eviction manager: %s crossed", m.Description()))
}
}
func (m *memoryThresholdNotifier) UpdateThreshold(summary *statsapi.Summary) error {
memoryStats := summary.Node.Memory
if isAllocatableEvictionThreshold(m.threshold) {
allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods)
if err != nil {
return err
}
memoryStats = allocatableContainer.Memory
}
if memoryStats == nil || memoryStats.UsageBytes == nil || memoryStats.WorkingSetBytes == nil || memoryStats.AvailableBytes == nil {
return fmt.Errorf("summary was incomplete. Expected MemoryStats and all subfields to be non-nil, but got %+v", memoryStats)
}
// Set threshold on usage to capacity - eviction_hard + inactive_file,
// since we want to be notified when working_set = capacity - eviction_hard
inactiveFile := resource.NewQuantity(int64(*memoryStats.UsageBytes-*memoryStats.WorkingSetBytes), resource.BinarySI)
capacity := resource.NewQuantity(int64(*memoryStats.AvailableBytes+*memoryStats.WorkingSetBytes), resource.BinarySI)
evictionThresholdQuantity := evictionapi.GetThresholdQuantity(m.threshold.Value, capacity)
memcgThreshold := capacity.DeepCopy()
memcgThreshold.Sub(*evictionThresholdQuantity)
memcgThreshold.Add(*inactiveFile)
klog.V(3).InfoS("Eviction manager: setting notifier to capacity", "notifier", m.Description(), "capacity", memcgThreshold.String())
if m.notifier != nil {
m.notifier.Stop()
}
newNotifier, err := m.factory.NewCgroupNotifier(m.cgroupPath, memoryUsageAttribute, memcgThreshold.Value())
if err != nil {
return err
}
m.notifier = newNotifier
go m.notifier.Start(m.events)
return nil
}
func (m *memoryThresholdNotifier) Description() string {
var hard, allocatable string
if isHardEvictionThreshold(m.threshold) {
hard = "hard "
} else {
hard = "soft "
}
if isAllocatableEvictionThreshold(m.threshold) {
allocatable = "allocatable "
}
return fmt.Sprintf("%s%smemory eviction threshold", hard, allocatable)
}
var _ NotifierFactory = &CgroupNotifierFactory{}
// CgroupNotifierFactory knows how to make CgroupNotifiers which integrate with the kernel
type CgroupNotifierFactory struct{}
var _ NotifierFactory = &CgroupNotifierFactory{}
// NewCgroupNotifier implements the NotifierFactory interface
func (n *CgroupNotifierFactory) NewCgroupNotifier(path, attribute string, threshold int64) (CgroupNotifier, error) {
return NewCgroupNotifier(path, attribute, threshold)

View File

@ -0,0 +1,123 @@
//go:build !windows
// +build !windows
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction
import (
"fmt"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog/v2"
statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/cm"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
const (
memoryUsageAttribute = "memory.usage_in_bytes"
)
type linuxMemoryThresholdNotifier struct {
threshold evictionapi.Threshold
cgroupPath string
events chan struct{}
factory NotifierFactory
handler func(string)
notifier CgroupNotifier
}
var _ ThresholdNotifier = &linuxMemoryThresholdNotifier{}
// NewMemoryThresholdNotifier creates a ThresholdNotifier which is designed to respond to the given threshold.
// UpdateThreshold must be called once before the threshold will be active.
func NewMemoryThresholdNotifier(threshold evictionapi.Threshold, cgroupRoot string, factory NotifierFactory, handler func(string)) (ThresholdNotifier, error) {
cgroups, err := cm.GetCgroupSubsystems()
if err != nil {
return nil, err
}
cgpath, found := cgroups.MountPoints["memory"]
if !found || len(cgpath) == 0 {
return nil, fmt.Errorf("memory cgroup mount point not found")
}
if isAllocatableEvictionThreshold(threshold) {
// for allocatable thresholds, point the cgroup notifier at the allocatable cgroup
cgpath += cgroupRoot
}
return &linuxMemoryThresholdNotifier{
threshold: threshold,
cgroupPath: cgpath,
events: make(chan struct{}),
handler: handler,
factory: factory,
}, nil
}
func (m *linuxMemoryThresholdNotifier) UpdateThreshold(summary *statsapi.Summary) error {
memoryStats := summary.Node.Memory
if isAllocatableEvictionThreshold(m.threshold) {
allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods)
if err != nil {
return err
}
memoryStats = allocatableContainer.Memory
}
if memoryStats == nil || memoryStats.UsageBytes == nil || memoryStats.WorkingSetBytes == nil || memoryStats.AvailableBytes == nil {
return fmt.Errorf("summary was incomplete. Expected MemoryStats and all subfields to be non-nil, but got %+v", memoryStats)
}
// Set threshold on usage to capacity - eviction_hard + inactive_file,
// since we want to be notified when working_set = capacity - eviction_hard
inactiveFile := resource.NewQuantity(int64(*memoryStats.UsageBytes-*memoryStats.WorkingSetBytes), resource.BinarySI)
capacity := resource.NewQuantity(int64(*memoryStats.AvailableBytes+*memoryStats.WorkingSetBytes), resource.BinarySI)
evictionThresholdQuantity := evictionapi.GetThresholdQuantity(m.threshold.Value, capacity)
memcgThreshold := capacity.DeepCopy()
memcgThreshold.Sub(*evictionThresholdQuantity)
memcgThreshold.Add(*inactiveFile)
klog.V(3).InfoS("Eviction manager: setting notifier to capacity", "notifier", m.Description(), "capacity", memcgThreshold.String())
if m.notifier != nil {
m.notifier.Stop()
}
newNotifier, err := m.factory.NewCgroupNotifier(m.cgroupPath, memoryUsageAttribute, memcgThreshold.Value())
if err != nil {
return err
}
m.notifier = newNotifier
go m.notifier.Start(m.events)
return nil
}
func (m *linuxMemoryThresholdNotifier) Start() {
klog.InfoS("Eviction manager: created memoryThresholdNotifier", "notifier", m.Description())
for range m.events {
m.handler(fmt.Sprintf("eviction manager: %s crossed", m.Description()))
}
}
func (m *linuxMemoryThresholdNotifier) Description() string {
var hard, allocatable string
if isHardEvictionThreshold(m.threshold) {
hard = "hard "
} else {
hard = "soft "
}
if isAllocatableEvictionThreshold(m.threshold) {
allocatable = "allocatable "
}
return fmt.Sprintf("%s%smemory eviction threshold", hard, allocatable)
}
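
The memcg notifier fires on memory usage, which includes reclaimable inactive_file pages, hence the adjustment in UpdateThreshold above. A worked sketch of that math with hypothetical stats:

package main

import "fmt"

func main() {
    availableBytes := int64(2 << 30)  // AvailableBytes, assumed
    workingSetBytes := int64(5 << 30) // WorkingSetBytes, assumed
    usageBytes := int64(6 << 30)      // UsageBytes, assumed

    capacity := availableBytes + workingSetBytes // 7 GiB
    inactiveFile := usageBytes - workingSetBytes // 1 GiB of page cache
    evictionHard := int64(100 << 20)             // 100Mi Linux default

    // Notify when working_set = capacity - eviction_hard, i.e. when
    // usage = capacity - eviction_hard + inactive_file.
    memcgThreshold := capacity - evictionHard + inactiveFile
    fmt.Printf("notify when memory.usage_in_bytes >= %d\n", memcgThreshold)
}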

View File

@ -1,3 +1,6 @@
//go:build linux
// +build linux
/*
Copyright 2018 The Kubernetes Authors.
@ -58,8 +61,8 @@ func nodeSummary(available, workingSet, usage resource.Quantity, allocatable boo
}
}
func newTestMemoryThresholdNotifier(threshold evictionapi.Threshold, factory NotifierFactory, handler func(string)) *memoryThresholdNotifier {
return &memoryThresholdNotifier{
func newTestMemoryThresholdNotifier(threshold evictionapi.Threshold, factory NotifierFactory, handler func(string)) *linuxMemoryThresholdNotifier {
return &linuxMemoryThresholdNotifier{
threshold: threshold,
cgroupPath: testCgroupPath,
events: make(chan struct{}),
@ -245,7 +248,7 @@ func TestThresholdDescription(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
m := &memoryThresholdNotifier{
m := &linuxMemoryThresholdNotifier{
notifier: &MockCgroupNotifier{},
threshold: tc.evictionThreshold,
cgroupPath: testCgroupPath,

View File

@ -0,0 +1,101 @@
//go:build windows
// +build windows
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction
import (
"fmt"
"time"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog/v2"
statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/winstats"
)
type windowsMemoryThresholdNotifier struct {
threshold evictionapi.Threshold
events chan struct{}
handler func(string)
}
var _ ThresholdNotifier = &windowsMemoryThresholdNotifier{}
func NewMemoryThresholdNotifier(threshold evictionapi.Threshold, cgroupRoot string, factory NotifierFactory, handler func(string)) (ThresholdNotifier, error) {
klog.InfoS("Eviction manager: creating new WindowsMemoryThresholdNotifier")
return &windowsMemoryThresholdNotifier{
threshold: threshold,
events: make(chan struct{}),
handler: handler,
}, nil
}
func (m *windowsMemoryThresholdNotifier) Start() {
klog.InfoS("Eviction manager: starting windowsMemoryThresholdNotifier", "notifier", m.Description())
go func() {
for {
time.Sleep(notifierRefreshInterval)
m.checkMemoryUsage()
}
}()
for range m.events {
m.handler(fmt.Sprintf("eviction manager: %s crossed", m.Description()))
}
}
func (m *windowsMemoryThresholdNotifier) checkMemoryUsage() {
// Get the global commit limit
perfInfo, err := winstats.GetPerformanceInfo()
if err != nil {
klog.ErrorS(err, "Eviction manager: error getting global memory status for node")
return
}
commitLimitBytes := perfInfo.CommitLimitPages * perfInfo.PageSize
capacity := resource.NewQuantity(int64(commitLimitBytes), resource.BinarySI)
evictionThresholdQuantity := evictionapi.GetThresholdQuantity(m.threshold.Value, capacity)
commitTotalBytes := perfInfo.CommitTotalPages * perfInfo.PageSize
commitAvailableBytes := commitLimitBytes - commitTotalBytes
if commitAvailableBytes <= uint64(evictionThresholdQuantity.Value()) {
m.events <- struct{}{}
}
}
func (m *windowsMemoryThresholdNotifier) UpdateThreshold(summary *statsapi.Summary) error {
// Windows doesn't use cgroup notifiers to trigger eviction, so this function is a no-op.
// Instead, the goroutine set up in Start() polls the system for memory usage and
// trigger eviction when the threshold is crossed.
return nil
}
func (m *windowsMemoryThresholdNotifier) Description() string {
var hard, allocatable string
if isHardEvictionThreshold(m.threshold) {
hard = "hard"
} else {
hard = "soft"
}
if isAllocatableEvictionThreshold(m.threshold) {
allocatable = "allocatable"
}
return fmt.Sprintf("%s %s memory eviction threshold", hard, allocatable)
}
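
checkMemoryUsage relies on evictionapi.GetThresholdQuantity to treat absolute and percentage thresholds uniformly. A self-contained sketch of that resolution (re-implemented here for illustration, not the evictionapi code):

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/api/resource"
)

// thresholdBytes resolves a hard-eviction threshold, given either as an
// absolute quantity or as a fraction of capacity, to a byte count.
func thresholdBytes(quantity *resource.Quantity, percentage float32, capacityBytes int64) int64 {
    if quantity != nil {
        return quantity.Value()
    }
    return int64(float32(capacityBytes) * percentage)
}

func main() {
    capacity := int64(8 << 30) // 8 GiB commit limit, assumed
    abs := resource.MustParse("500Mi")
    fmt.Println(thresholdBytes(&abs, 0, capacity))  // 524288000 bytes
    fmt.Println(thresholdBytes(nil, 0.1, capacity)) // 10% of capacity
}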

View File

@ -23,17 +23,21 @@ import (
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/winstats"
)
func (sp *summaryProviderImpl) GetSystemContainersStats(nodeConfig cm.NodeConfig, podStats []statsapi.PodStats, updateStats bool) (stats []statsapi.ContainerStats) {
stats = append(stats, sp.getSystemPodsCPUAndMemoryStats(nodeConfig, podStats, updateStats))
stats = append(stats, sp.getSystemWindowsGlobalmemoryStats())
return stats
}
func (sp *summaryProviderImpl) GetSystemContainersCPUAndMemoryStats(nodeConfig cm.NodeConfig, podStats []statsapi.PodStats, updateStats bool) (stats []statsapi.ContainerStats) {
stats = append(stats, sp.getSystemPodsCPUAndMemoryStats(nodeConfig, podStats, updateStats))
stats = append(stats, sp.getSystemWindowsGlobalmemoryStats())
return stats
}
@ -96,3 +100,27 @@ func (sp *summaryProviderImpl) getSystemPodsCPUAndMemoryStats(nodeConfig cm.Node
return podsSummary
}
func (sp *summaryProviderImpl) getSystemWindowsGlobalmemoryStats() statsapi.ContainerStats {
now := metav1.NewTime(time.Now())
globalMemorySummary := statsapi.ContainerStats{
StartTime: now,
Memory: &statsapi.MemoryStats{},
Name: statsapi.SystemContainerWindowsGlobalCommitMemory,
}
perfInfo, err := winstats.GetPerformanceInfo()
if err != nil {
klog.Errorf("Failed to get Windows performance info: %v", err)
return globalMemorySummary
}
commitLimitBytes := perfInfo.CommitLimitPages * perfInfo.PageSize
commitTotalBytes := perfInfo.CommitTotalPages * perfInfo.PageSize
commitAvailableBytes := commitLimitBytes - commitTotalBytes
globalMemorySummary.Memory.Time = now
globalMemorySummary.Memory.AvailableBytes = &commitAvailableBytes
globalMemorySummary.Memory.UsageBytes = &commitTotalBytes
return globalMemorySummary
}
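
On the consuming side, the eviction manager looks this pseudo container up by name in the node summary; a simplified version of that lookup (the real code goes through the getSysContainer helper):

package eviction // illustrative placement

import statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"

// findGlobalCommitStats returns the memory stats of the Windows
// global-commit pseudo container, or nil if it is absent.
func findGlobalCommitStats(summary *statsapi.Summary) *statsapi.MemoryStats {
    for _, c := range summary.Node.SystemContainers {
        if c.Name == statsapi.SystemContainerWindowsGlobalCommitMemory {
            return c.Memory
        }
    }
    return nil
}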

View File

@ -79,13 +79,18 @@ func TestSummaryProvider(t *testing.T) {
assert.Equal(summary.Node.Fs, rootFsStats)
assert.Equal(summary.Node.Runtime, &statsapi.RuntimeStats{ContainerFs: imageFsStats, ImageFs: imageFsStats})
assert.Equal(len(summary.Node.SystemContainers), 1)
assert.NoError(err)
assert.Equal(len(summary.Node.SystemContainers), 2)
assert.Equal(summary.Node.SystemContainers[0].Name, "pods")
assert.Equal(summary.Node.SystemContainers[0].CPU.UsageCoreNanoSeconds, podStats[0].CPU.UsageCoreNanoSeconds)
assert.Equal(summary.Node.SystemContainers[0].CPU.UsageNanoCores, podStats[0].CPU.UsageNanoCores)
assert.Equal(summary.Node.SystemContainers[0].Memory.WorkingSetBytes, podStats[0].Memory.WorkingSetBytes)
assert.Equal(summary.Node.SystemContainers[0].Memory.UsageBytes, podStats[0].Memory.UsageBytes)
assert.Equal(summary.Node.SystemContainers[0].Memory.AvailableBytes, podStats[0].Memory.AvailableBytes)
assert.Equal(summary.Node.SystemContainers[1].Name, statsapi.SystemContainerWindowsGlobalCommitMemory)
assert.NotEqual(nil, summary.Node.SystemContainers[1].Memory)
assert.NotEqual(nil, summary.Node.SystemContainers[1].Memory.AvailableBytes)
assert.NotEqual(nil, summary.Node.SystemContainers[1].Memory.UsageBytes)
assert.Equal(summary.Pods, podStats)
}

View File

@ -56,10 +56,33 @@ type MemoryStatusEx struct {
AvailExtendedVirtual uint64
}
// PerformanceInformation is the same as the Windows structure PERFORMANCE_INFORMATION
// https://learn.microsoft.com/en-us/windows/win32/api/psapi/ns-psapi-performance_information
type PerformanceInformation struct {
cb uint32
CommitTotalPages uint64
CommitLimitPages uint64
CommitPeakPages uint64
PhysicalTotalPages uint64
PhysicalAvailablePages uint64
SystemCachePages uint64
KernelTotalPages uint64
KernelPagedPages uint64
KernelNonpagedPages uint64
PageSize uint64
HandleCount uint32
ProcessCount uint32
ThreadCount uint32
}
var (
// kernel32.dll system calls
modkernel32 = windows.NewLazySystemDLL("kernel32.dll")
procGlobalMemoryStatusEx = modkernel32.NewProc("GlobalMemoryStatusEx")
procGetActiveProcessorCount = modkernel32.NewProc("GetActiveProcessorCount")
// psapi.dll system calls
modpsapi = windows.NewLazySystemDLL("psapi.dll")
procGetPerformanceInfo = modpsapi.NewProc("GetPerformanceInfo")
)
const allProcessorGroups = 0xFFFF
@ -294,6 +317,16 @@ func getPhysicallyInstalledSystemMemoryBytes() (uint64, error) {
return statex.TotalPhys, nil
}
func GetPerformanceInfo() (*PerformanceInformation, error) {
var pi PerformanceInformation
pi.cb = uint32(unsafe.Sizeof(pi))
ret, _, _ := procGetPerformanceInfo.Call(uintptr(unsafe.Pointer(&pi)), uintptr(pi.cb))
if ret == 0 {
return nil, errors.New("unable to read Windows performance information")
}
return &pi, nil
}
func getBootID() (string, error) {
regKey, err := registry.OpenKey(registry.LOCAL_MACHINE, bootIdRegistry, registry.READ)
if err != nil {
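
A usage sketch for the new syscall wrapper, deriving the same commit-charge figures the summary provider reports (Windows-only; the main package here is hypothetical):

//go:build windows

package main

import (
    "fmt"

    "k8s.io/kubernetes/pkg/kubelet/winstats"
)

func main() {
    pi, err := winstats.GetPerformanceInfo()
    if err != nil {
        panic(err)
    }
    limitBytes := pi.CommitLimitPages * pi.PageSize
    totalBytes := pi.CommitTotalPages * pi.PageSize
    fmt.Printf("commit available: %d bytes\n", limitBytes-totalBytes)
}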

View File

@ -99,6 +99,9 @@ const (
SystemContainerMisc = "misc"
// SystemContainerPods is the container name for the system container tracking user pods.
SystemContainerPods = "pods"
// SystemContainerWindowsGlobalCommitMemory (only used on Windows) is the container name for the system container
// tracking global commit memory usage and is used for memory-pressure eviction.
SystemContainerWindowsGlobalCommitMemory = "windows-global-commit-memory"
)
// ProcessStats are stats pertaining to processes.

View File

@ -0,0 +1,199 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package windows
import (
"context"
"strconv"
"strings"
"time"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
imageutils "k8s.io/kubernetes/test/utils/image"
admissionapi "k8s.io/pod-security-admission/api"
)
const (
// It can take 10-15 seconds for the node memory-pressure taint to show up on the node,
// so we wait 45 seconds for the taint to appear so the e2e test case can catch it,
// and then wait for the taint to be removed so other serial/slow tests can run
// against the same node.
waitForNodeMemoryPressureTaintDelayDuration = 45 * time.Second
)
var _ = sigDescribe(feature.Windows, "Eviction", framework.WithSerial(), framework.WithSlow(), framework.WithDisruptive(), (func() {
ginkgo.BeforeEach(func() {
e2eskipper.SkipUnlessNodeOSDistroIs("windows")
})
f := framework.NewDefaultFramework("eviction-test-windows")
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
// This test first finds a Windows node with memory-pressure hard-eviction enabled.
// It then schedules a pod that requests and consumes 500Mi of memory, followed by
// another pod that consumes the rest of the node's memory.
// The test then verifies that the second pod gets evicted and that the node becomes
// schedulable again once the eviction completes.
ginkgo.It("should evict a pod when a node experiences memory pressure", func(ctx context.Context) {
framework.Logf("Looking for a Windows node with memory-pressure eviction enabled")
selector := labels.Set{"kubernetes.io/os": "windows"}.AsSelector()
nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{
LabelSelector: selector.String(),
})
framework.ExpectNoError(err)
var node *v1.Node
var nodeMem nodeMemory
for _, n := range nodeList.Items {
nm := getNodeMemory(ctx, f, n)
if nm.hardEviction.Value() != 0 {
framework.Logf("Using node %s", n.Name)
node = &n
nodeMem = nm
break
}
}
if node == nil {
e2eskipper.Skipf("No Windows nodes with hard memory-pressure eviction found")
}
// Delete img-puller pods if they exist because eviction manager keeps selecting them for eviction first
// Note we cannot just delete the namespace because a deferred cleanup task tries to delete the ns if
// image pre-pulling was enabled.
nsList, err := f.ClientSet.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
framework.ExpectNoError(err)
for _, ns := range nsList.Items {
if strings.Contains(ns.Name, "img-puller") {
framework.Logf("Deleting pods in namespace %s", ns.Name)
podList, err := f.ClientSet.CoreV1().Pods(ns.Name).List(ctx, metav1.ListOptions{})
framework.ExpectNoError(err)
for _, pod := range podList.Items {
framework.Logf(" Deleteing pod %s", pod.Name)
err = f.ClientSet.CoreV1().Pods(ns.Name).Delete(ctx, pod.Name, metav1.DeleteOptions{})
framework.ExpectNoError(err)
}
break
}
}
ginkgo.By("Scheduling a pod that requests and consumes 500Mi of Memory")
pod1 := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod1",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "pod1",
Image: imageutils.GetE2EImage(imageutils.ResourceConsumer),
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: *resource.NewQuantity(500*1024*1024, resource.BinarySI),
},
},
Command: []string{
"/bin/testlimit.exe",
"-accepteula",
"-d",
"100Mb",
"-e",
"5",
"20000s",
"-c",
"5"},
},
},
NodeSelector: map[string]string{
"kubernetes.io/os": "windows",
},
NodeName: node.Name,
},
}
_, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, pod1, metav1.CreateOptions{})
framework.ExpectNoError(err)
ginkgo.By("Scheduling another pod will consume the rest of the node's memory")
chunks := int((nodeMem.capacity.Value()-nodeMem.hardEviction.Value())/(300*1024*1024) + 3)
framework.Logf("Pod2 will consume %d chunks of 300Mi", chunks)
pod2 := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod2",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "pod2",
Image: imageutils.GetE2EImage(imageutils.ResourceConsumer),
Command: []string{
"/bin/testlimit.exe",
"-accepteula",
"-d",
"300Mb",
"-e",
"1",
"20000s",
"-c",
strconv.Itoa(chunks)},
},
},
NodeSelector: map[string]string{
"kubernetes.io/os": "windows",
},
NodeName: node.Name,
},
}
_, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, pod2, metav1.CreateOptions{})
framework.ExpectNoError(err)
ginkgo.By("Waiting for pods to start running")
err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 2, 3*time.Minute)
framework.ExpectNoError(err)
framework.Logf("Waiting for pod2 to get evicted")
gomega.Eventually(ctx, func() bool {
eventList, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{})
framework.ExpectNoError(err)
for _, e := range eventList.Items {
// Look for a Warning event indicating pod2 was Evicted
if e.Type == "Warning" && e.Reason == "Evicted" && strings.Contains(e.Message, "pod2") {
framework.Logf("Found %+v event with message %+v", e.Reason, e.Message)
return true
}
}
return false
}, 10*time.Minute, 10*time.Second).Should(gomega.BeTrueBecause("Eviction Event was not found"))
ginkgo.By("Waiting for node.kubernetes.io/memory-pressure taint to be removed")
// ensure e2e test framework catches the memory-pressure taint
time.Sleep(waitForNodeMemoryPressureTaintDelayDuration)
// wait for node.kubernetes.io/memory-pressure=NoSchedule to be removed so other tests can run
err = e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 10*time.Minute)
framework.ExpectNoError(err)
})
}))
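
The chunk count chosen for pod2 above deliberately overshoots the eviction point; with hypothetical node numbers:

package main

import "fmt"

func main() {
    capacity := int64(16 << 30)      // node memory capacity, assumed
    hardEviction := int64(500 << 20) // memory.available hard threshold
    chunk := int64(300 << 20)        // one 300Mb testlimit allocation chunk

    // +3 chunks pushes usage safely past capacity - hardEviction.
    chunks := int((capacity-hardEviction)/chunk + 3)
    fmt.Println(chunks) // 55 for these numbers
}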

View File

@ -82,7 +82,7 @@ type nodeMemory struct {
// checks that a calculated value for NodeAllocatable is equal to the reported value
func checkNodeAllocatableTest(ctx context.Context, f *framework.Framework) {
nodeMem := getNodeMemory(ctx, f)
nodeMem := getFirstNodeMemory(ctx, f)
framework.Logf("nodeMem says: %+v", nodeMem)
// calculate the allocatable mem based on capacity - reserved amounts
@ -176,24 +176,9 @@ func overrideAllocatableMemoryTest(ctx context.Context, f *framework.Framework,
}, 3*time.Minute, 10*time.Second).Should(gomega.BeTrue())
}
// getNodeMemory populates a nodeMemory struct with information from the first
func getNodeMemory(ctx context.Context, f *framework.Framework) nodeMemory {
selector := labels.Set{"kubernetes.io/os": "windows"}.AsSelector()
nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{
LabelSelector: selector.String(),
})
framework.ExpectNoError(err)
// Assuming that agent nodes have the same config
// Make sure there is >0 agent nodes, then use the first one for info
gomega.Expect(nodeList.Items).ToNot(gomega.BeEmpty())
ginkgo.By("Getting memory details from node status and kubelet config")
status := nodeList.Items[0].Status
nodeName := nodeList.Items[0].ObjectMeta.Name
framework.Logf("Getting configuration details for node %s", nodeName)
request := f.ClientSet.CoreV1().RESTClient().Get().Resource("nodes").Name(nodeName).SubResource("proxy").Suffix("configz")
func getNodeMemory(ctx context.Context, f *framework.Framework, node v1.Node) nodeMemory {
framework.Logf("Getting memory details for node %s", node.ObjectMeta.Name)
request := f.ClientSet.CoreV1().RESTClient().Get().Resource("nodes").Name(node.ObjectMeta.Name).SubResource("proxy").Suffix("configz")
rawbytes, err := request.DoRaw(ctx)
framework.ExpectNoError(err)
kubeletConfig, err := decodeConfigz(rawbytes)
@ -217,18 +202,34 @@ func getNodeMemory(ctx context.Context, f *framework.Framework) nodeMemory {
}
nodeMem := nodeMemory{
capacity: status.Capacity[v1.ResourceMemory],
allocatable: status.Allocatable[v1.ResourceMemory],
capacity: node.Status.Capacity[v1.ResourceMemory],
allocatable: node.Status.Allocatable[v1.ResourceMemory],
systemReserve: systemReserve,
hardEviction: hardEviction,
// these are not implemented and are here for future use - will always be 0 at the moment
kubeReserve: kubeReserve,
softEviction: softEviction,
kubeReserve: kubeReserve,
softEviction: softEviction,
}
return nodeMem
}
// getNodeMemory populates a nodeMemory struct with information from the first Windows node
// that is found in the cluster.
func getFirstNodeMemory(ctx context.Context, f *framework.Framework) nodeMemory {
selector := labels.Set{"kubernetes.io/os": "windows"}.AsSelector()
nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{
LabelSelector: selector.String(),
})
framework.ExpectNoError(err)
// Assuming that agent nodes have the same config
// Make sure there is at least one agent node, then use the first one for info
gomega.Expect(nodeList.Items).ToNot(gomega.BeEmpty())
ginkgo.By("Getting memory details from first Windows")
return getNodeMemory(ctx, f, nodeList.Items[0])
}
// modified from https://github.com/kubernetes/kubernetes/blob/master/test/e2e/framework/kubelet/config.go#L110
// the proxy version was causing issues and the non-proxy version used a value that isn't set by e2e
func decodeConfigz(contentsBytes []byte) (*kubeletconfig.KubeletConfiguration, error) {