mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-29 22:46:12 +00:00
Support metrics for node shutdown
This commit is contained in:
parent
94e494d9d7
commit
5eb3e88f6b
@ -832,6 +832,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
|
||||
ShutdownGracePeriodRequested: kubeCfg.ShutdownGracePeriod.Duration,
|
||||
ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration,
|
||||
ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority,
|
||||
StateDirectory: rootDirectory,
|
||||
})
|
||||
klet.shutdownManager = shutdownManager
|
||||
klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)
|
||||
|
@ -462,6 +462,26 @@ var (
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
|
||||
// GracefulShutdownStartTime is a gauge that records the time at which the kubelet started graceful shutdown.
|
||||
GracefulShutdownStartTime = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: "graceful_shutdown_start_time_seconds",
|
||||
Help: "Last graceful shutdown start time since unix epoch in seconds",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
|
||||
// GracefulShutdownEndTime is a gauge that records the time at which the kubelet completed graceful shutdown.
|
||||
GracefulShutdownEndTime = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: "graceful_shutdown_end_time_seconds",
|
||||
Help: "Last graceful shutdown start time since unix epoch in seconds",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
var registerMetrics sync.Once
|
||||
@ -504,6 +524,13 @@ func Register(collectors ...metrics.StableCollector) {
|
||||
for _, collector := range collectors {
|
||||
legacyregistry.CustomMustRegister(collector)
|
||||
}
|
||||
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) &&
|
||||
utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) {
|
||||
legacyregistry.MustRegister(GracefulShutdownStartTime)
|
||||
legacyregistry.MustRegister(GracefulShutdownEndTime)
|
||||
}
|
||||
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -46,6 +46,7 @@ type Config struct {
|
||||
ShutdownGracePeriodRequested time.Duration
|
||||
ShutdownGracePeriodCriticalPods time.Duration
|
||||
ShutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority
|
||||
StateDirectory string
|
||||
Clock clock.Clock
|
||||
}
|
||||
|
||||
|
@ -22,6 +22,7 @@ package nodeshutdown
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
@ -36,6 +37,7 @@ import (
|
||||
kubeletevents "k8s.io/kubernetes/pkg/kubelet/events"
|
||||
"k8s.io/kubernetes/pkg/kubelet/eviction"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
"k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd"
|
||||
"k8s.io/kubernetes/pkg/kubelet/prober"
|
||||
"k8s.io/utils/clock"
|
||||
@ -47,6 +49,7 @@ const (
|
||||
nodeShutdownNotAdmittedReason = "NodeShutdown"
|
||||
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
|
||||
dbusReconnectPeriod = 1 * time.Second
|
||||
localStorageStateFile = "graceful_node_shutdown_state"
|
||||
)
|
||||
|
||||
var systemDbus = func() (dbusInhibiter, error) {
|
||||
@ -81,6 +84,9 @@ type managerImpl struct {
|
||||
nodeShuttingDownNow bool
|
||||
|
||||
clock clock.Clock
|
||||
|
||||
enableMetrics bool
|
||||
storage storage
|
||||
}
|
||||
|
||||
// NewManager returns a new node shutdown manager.
|
||||
@ -120,6 +126,10 @@ func NewManager(conf *Config) (Manager, lifecycle.PodAdmitHandler) {
|
||||
syncNodeStatus: conf.SyncNodeStatusFunc,
|
||||
shutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority,
|
||||
clock: conf.Clock,
|
||||
enableMetrics: utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority),
|
||||
storage: localStorage{
|
||||
Path: filepath.Join(conf.StateDirectory, localStorageStateFile),
|
||||
},
|
||||
}
|
||||
klog.InfoS("Creating node shutdown manager",
|
||||
"shutdownGracePeriodRequested", conf.ShutdownGracePeriodRequested,
|
||||
@ -143,6 +153,24 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
|
||||
return lifecycle.PodAdmitResult{Admit: true}
|
||||
}
|
||||
|
||||
// setMetrics sets the metrics for the node shutdown manager.
|
||||
func (m *managerImpl) setMetrics() {
|
||||
if m.enableMetrics && m.storage != nil {
|
||||
sta := state{}
|
||||
err := m.storage.Load(&sta)
|
||||
if err != nil {
|
||||
klog.ErrorS(err, "Failed to load graceful shutdown state")
|
||||
} else {
|
||||
if !sta.StartTime.IsZero() {
|
||||
metrics.GracefulShutdownStartTime.Set(timestamp(sta.StartTime))
|
||||
}
|
||||
if !sta.EndTime.IsZero() {
|
||||
metrics.GracefulShutdownEndTime.Set(timestamp(sta.EndTime))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start starts the node shutdown manager and will start watching the node for shutdown events.
|
||||
func (m *managerImpl) Start() error {
|
||||
stop, err := m.start()
|
||||
@ -163,6 +191,8 @@ func (m *managerImpl) Start() error {
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
m.setMetrics()
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -289,6 +319,32 @@ func (m *managerImpl) processShutdownEvent() error {
|
||||
klog.V(1).InfoS("Shutdown manager processing shutdown event")
|
||||
activePods := m.getPods()
|
||||
|
||||
defer func() {
|
||||
m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
|
||||
klog.V(1).InfoS("Shutdown manager completed processing shutdown event, node will shutdown shortly")
|
||||
}()
|
||||
|
||||
if m.enableMetrics && m.storage != nil {
|
||||
startTime := time.Now()
|
||||
err := m.storage.Store(state{
|
||||
StartTime: startTime,
|
||||
})
|
||||
if err != nil {
|
||||
klog.ErrorS(err, "Failed to store graceful shutdown state")
|
||||
}
|
||||
|
||||
defer func() {
|
||||
endTime := time.Now()
|
||||
err := m.storage.Store(state{
|
||||
StartTime: startTime,
|
||||
EndTime: endTime,
|
||||
})
|
||||
if err != nil {
|
||||
klog.ErrorS(err, "Failed to store graceful shutdown state")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
groups := groupByPriority(m.shutdownGracePeriodByPodPriority, activePods)
|
||||
for _, group := range groups {
|
||||
// If there are no pods in a particular range,
|
||||
@ -347,9 +403,6 @@ func (m *managerImpl) processShutdownEvent() error {
|
||||
}
|
||||
}
|
||||
|
||||
m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
|
||||
klog.V(1).InfoS("Shutdown manager completed processing shutdown event, node will shutdown shortly")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
63
pkg/kubelet/nodeshutdown/storage.go
Normal file
63
pkg/kubelet/nodeshutdown/storage.go
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
Copyright 2022 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nodeshutdown
|
||||
|
||||
import (
	"encoding/json"
	"os"
	"path/filepath"
	"time"
)
|
||||
|
||||
// storage abstracts where the node shutdown manager keeps its state between
// kubelet runs.
type storage interface {
	// Store persists data, returning any error encountered while doing so.
	Store(data interface{}) error
	// Load reads previously stored content into data, which is expected to
	// be a pointer the implementation can populate.
	Load(data interface{}) error
}
|
||||
|
||||
// localStorage is a storage implementation that keeps its content as JSON in
// a single file on the local filesystem.
type localStorage struct {
	// Path is the location of the backing file.
	Path string
}

// Store marshals data to JSON and writes it to l.Path.
//
// The write is atomic: the content is written to a temporary file in the same
// directory, synced, and then renamed over the destination. State is stored
// while the node is shutting down, so an interrupted in-place write could
// otherwise leave a truncated file that Load would later fail to decode.
//
// It returns the marshalling error or the first filesystem error encountered.
func (l localStorage) Store(data interface{}) (err error) {
	b, err := json.Marshal(data)
	if err != nil {
		return err
	}
	return atomicWrite(l.Path, b, 0644)
}

// Load reads l.Path and unmarshals its JSON content into data, which must be
// a pointer accepted by json.Unmarshal.
//
// A missing file is not an error: data is left untouched and nil is returned.
// Any other read error, or a decoding error, is returned to the caller.
func (l localStorage) Load(data interface{}) (err error) {
	b, err := os.ReadFile(l.Path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}
	return json.Unmarshal(b, data)
}

// atomicWrite writes data to filename with permissions perm so that readers
// only ever observe the old content or the complete new content.
func atomicWrite(filename string, data []byte, perm os.FileMode) (err error) {
	// The temporary file must live in the destination directory so that the
	// final rename stays on one filesystem.
	tmp, err := os.CreateTemp(filepath.Dir(filename), ".tmp-"+filepath.Base(filename))
	if err != nil {
		return err
	}
	tmpName := tmp.Name()
	closed := false
	defer func() {
		if err == nil {
			return
		}
		// Best-effort cleanup; the original failure is what gets reported.
		if !closed {
			_ = tmp.Close()
		}
		_ = os.Remove(tmpName)
	}()

	// CreateTemp creates the file with mode 0600; apply the requested mode.
	if err = os.Chmod(tmpName, perm); err != nil {
		return err
	}
	if _, err = tmp.Write(data); err != nil {
		return err
	}
	// Flush to stable storage before the rename makes the file visible.
	if err = tmp.Sync(); err != nil {
		return err
	}
	closed = true
	if err = tmp.Close(); err != nil {
		return err
	}
	return os.Rename(tmpName, filename)
}
|
||||
|
||||
// timestamp converts t to whole seconds since the Unix epoch, expressed as a
// float64. The zero time.Time maps to 0.
func timestamp(t time.Time) float64 {
	var seconds float64
	if !t.IsZero() {
		seconds = float64(t.Unix())
	}
	return seconds
}
|
||||
|
||||
// state is the graceful shutdown record that the manager stores as JSON via
// its storage and reads back in setMetrics to populate the shutdown gauges.
type state struct {
|
||||
// StartTime is when processing of the shutdown event began.
StartTime time.Time `json:"startTime"`
|
||||
// EndTime is when processing of the shutdown event finished; it is left as
// the zero time in the record written at the start of processing.
EndTime time.Time `json:"endTime"`
|
||||
}
|
Loading…
Reference in New Issue
Block a user