Support metrics for node shutdown

This commit is contained in:
Shiming Zhang 2022-03-11 17:30:37 +08:00
parent 94e494d9d7
commit 5eb3e88f6b
5 changed files with 148 additions and 3 deletions

View File

@ -832,6 +832,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
ShutdownGracePeriodRequested: kubeCfg.ShutdownGracePeriod.Duration,
ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration,
ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority,
StateDirectory: rootDirectory,
})
klet.shutdownManager = shutdownManager
klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)

View File

@ -462,6 +462,26 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// GracefulShutdownStartTime is a gauge that records the time at which the
// kubelet last started a graceful node shutdown, expressed in seconds since
// the unix epoch. It is exposed as kubelet-subsystem metric
// "graceful_shutdown_start_time_seconds" at ALPHA stability.
GracefulShutdownStartTime = metrics.NewGauge(
&metrics.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: "graceful_shutdown_start_time_seconds",
Help: "Last graceful shutdown start time since unix epoch in seconds",
StabilityLevel: metrics.ALPHA,
},
)
// GracefulShutdownEndTime is a gauge that records the time at which the
// kubelet last completed a graceful node shutdown, expressed in seconds since
// the unix epoch. It is exposed as kubelet-subsystem metric
// "graceful_shutdown_end_time_seconds" at ALPHA stability.
GracefulShutdownEndTime = metrics.NewGauge(
	&metrics.GaugeOpts{
		Subsystem: KubeletSubsystem,
		Name:      "graceful_shutdown_end_time_seconds",
		// The help text must describe the END time; it previously repeated
		// the start-time wording, which mislabelled the metric for scrapers.
		Help:           "Last graceful shutdown end time since unix epoch in seconds",
		StabilityLevel: metrics.ALPHA,
	},
)
)
var registerMetrics sync.Once
@ -504,6 +524,13 @@ func Register(collectors ...metrics.StableCollector) {
for _, collector := range collectors {
legacyregistry.CustomMustRegister(collector)
}
if utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) &&
utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) {
legacyregistry.MustRegister(GracefulShutdownStartTime)
legacyregistry.MustRegister(GracefulShutdownEndTime)
}
})
}

View File

@ -46,6 +46,7 @@ type Config struct {
ShutdownGracePeriodRequested time.Duration
ShutdownGracePeriodCriticalPods time.Duration
ShutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority
StateDirectory string
Clock clock.Clock
}

View File

@ -22,6 +22,7 @@ package nodeshutdown
import (
"fmt"
"path/filepath"
"sort"
"sync"
"time"
@ -36,6 +37,7 @@ import (
kubeletevents "k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd"
"k8s.io/kubernetes/pkg/kubelet/prober"
"k8s.io/utils/clock"
@ -47,6 +49,7 @@ const (
nodeShutdownNotAdmittedReason = "NodeShutdown"
nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
dbusReconnectPeriod = 1 * time.Second
localStorageStateFile = "graceful_node_shutdown_state"
)
var systemDbus = func() (dbusInhibiter, error) {
@ -81,6 +84,9 @@ type managerImpl struct {
nodeShuttingDownNow bool
clock clock.Clock
enableMetrics bool
storage storage
}
// NewManager returns a new node shutdown manager.
@ -120,6 +126,10 @@ func NewManager(conf *Config) (Manager, lifecycle.PodAdmitHandler) {
syncNodeStatus: conf.SyncNodeStatusFunc,
shutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority,
clock: conf.Clock,
enableMetrics: utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority),
storage: localStorage{
Path: filepath.Join(conf.StateDirectory, localStorageStateFile),
},
}
klog.InfoS("Creating node shutdown manager",
"shutdownGracePeriodRequested", conf.ShutdownGracePeriodRequested,
@ -143,6 +153,24 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
return lifecycle.PodAdmitResult{Admit: true}
}
// setMetrics publishes the persisted graceful-shutdown timestamps as metrics.
//
// It does nothing unless metrics are enabled and a storage backend is set.
// A failure to load the stored state is logged and otherwise ignored. Each
// gauge is only updated when its corresponding timestamp is non-zero, so a
// missing value leaves the gauge untouched.
func (m *managerImpl) setMetrics() {
	if !m.enableMetrics || m.storage == nil {
		return
	}

	var persisted state
	if err := m.storage.Load(&persisted); err != nil {
		klog.ErrorS(err, "Failed to load graceful shutdown state")
		return
	}

	if start := persisted.StartTime; !start.IsZero() {
		metrics.GracefulShutdownStartTime.Set(timestamp(start))
	}
	if end := persisted.EndTime; !end.IsZero() {
		metrics.GracefulShutdownEndTime.Set(timestamp(end))
	}
}
// Start starts the node shutdown manager and will start watching the node for shutdown events.
func (m *managerImpl) Start() error {
stop, err := m.start()
@ -163,6 +191,8 @@ func (m *managerImpl) Start() error {
}
}
}()
m.setMetrics()
return nil
}
@ -289,6 +319,32 @@ func (m *managerImpl) processShutdownEvent() error {
klog.V(1).InfoS("Shutdown manager processing shutdown event")
activePods := m.getPods()
defer func() {
m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
klog.V(1).InfoS("Shutdown manager completed processing shutdown event, node will shutdown shortly")
}()
if m.enableMetrics && m.storage != nil {
startTime := time.Now()
err := m.storage.Store(state{
StartTime: startTime,
})
if err != nil {
klog.ErrorS(err, "Failed to store graceful shutdown state")
}
defer func() {
endTime := time.Now()
err := m.storage.Store(state{
StartTime: startTime,
EndTime: endTime,
})
if err != nil {
klog.ErrorS(err, "Failed to store graceful shutdown state")
}
}()
}
groups := groupByPriority(m.shutdownGracePeriodByPodPriority, activePods)
for _, group := range groups {
// If there are no pods in a particular range,
@ -347,9 +403,6 @@ func (m *managerImpl) processShutdownEvent() error {
}
}
m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
klog.V(1).InfoS("Shutdown manager completed processing shutdown event, node will shutdown shortly")
return nil
}

View File

@ -0,0 +1,63 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeshutdown
import (
"encoding/json"
"os"
"time"
)
// storage abstracts persistence of a single value.
type storage interface {
	// Store persists v and reports any failure to do so.
	Store(v interface{}) error
	// Load reads the persisted value into out and reports any failure to do so.
	Load(out interface{}) error
}
// localStorage is a storage implementation that keeps a single JSON document
// in a file on the local filesystem.
type localStorage struct {
	// Path is the location of the state file.
	Path string
}

// Store serializes data as JSON and atomically replaces the file at l.Path.
//
// The payload is first written to a sibling temporary file (l.Path + ".tmp"),
// synced to disk, and then renamed over the destination. This is used while
// the node is shutting down, so an interrupted write must never leave a
// truncated state file that would fail to parse on the next start; readers
// see either the previous content or the complete new content.
//
// It returns the marshalling or filesystem error, if any. On failure the
// temporary file is removed and the existing state file is left untouched.
func (l localStorage) Store(data interface{}) (err error) {
	b, err := json.Marshal(data)
	if err != nil {
		return err
	}

	tmpPath := l.Path + ".tmp"
	f, err := os.OpenFile(tmpPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	// Best-effort cleanup of the temporary file on any failure below.
	defer func() {
		if err != nil {
			_ = os.Remove(tmpPath)
		}
	}()

	if _, err = f.Write(b); err != nil {
		_ = f.Close()
		return err
	}
	// Flush to stable storage before the rename so the new name never
	// points at data that has not reached the disk.
	if err = f.Sync(); err != nil {
		_ = f.Close()
		return err
	}
	if err = f.Close(); err != nil {
		return err
	}
	return os.Rename(tmpPath, l.Path)
}

// Load reads the file at l.Path and decodes its JSON content into data.
//
// A missing file is not an error: Load returns nil and leaves data unchanged.
// Any other read error or a JSON decoding error is returned.
func (l localStorage) Load(data interface{}) (err error) {
	b, err := os.ReadFile(l.Path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}
	return json.Unmarshal(b, data)
}
// timestamp converts t to whole seconds since the unix epoch as a float64.
// The zero time.Time maps to 0 rather than to its (large negative) unix value.
func timestamp(t time.Time) float64 {
	var seconds float64
	if !t.IsZero() {
		seconds = float64(t.Unix())
	}
	return seconds
}
// state is the graceful node shutdown record that is persisted through
// storage and serialized as JSON.
type state struct {
// StartTime is when processing of the shutdown event began.
StartTime time.Time `json:"startTime"`
// EndTime is when processing of the shutdown event finished; it is left
// as the zero time in the record written at the start of processing.
EndTime time.Time `json:"endTime"`
}