mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-06 02:34:03 +00:00
Merge pull request #30686 from gmarek/metrics
Automatic merge from submit-queue Add cluster health metrics to NodeController Follow up of #28832 This adds metrics to monitor cluster/zone status. cc @alex-mohr @fabioy @wojtek-t @Q-Lee
This commit is contained in:
commit
6b20896fea
201
pkg/controller/node/metrics.go
Normal file
201
pkg/controller/node/metrics.go
Normal file
@ -0,0 +1,201 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2016 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package node
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"k8s.io/kubernetes/pkg/api/unversioned"
|
||||||
|
|
||||||
|
"github.com/golang/glog"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Names used to build the node controller's Prometheus metrics. Each *Key
// constant is used as the Name of one gauge declared in this file, and
// NodeControllerSubsystem is used as the Subsystem of all of them.
const (
	// NodeControllerSubsystem is the Prometheus subsystem shared by all node controller metrics.
	NodeControllerSubsystem = "node_collector"

	// ZoneHealthStatisticKey names the gauge reporting the percentage of healthy nodes per zone.
	ZoneHealthStatisticKey = "zone_health"

	// ZoneSizeKey names the gauge reporting the number of registered nodes per zone.
	ZoneSizeKey = "zone_size"

	// ZoneNoUnhealthyNodesKey names the gauge reporting the number of not Ready nodes per zone.
	// The value was previously misspelled "unhealty_nodes_in_zone".
	ZoneNoUnhealthyNodesKey = "unhealthy_nodes_in_zone"

	// EvictionsIn10MinutesKey names the gauge reporting node evictions in the previous 10 minutes per zone.
	EvictionsIn10MinutesKey = "10_minute_evictions"

	// EvictionsIn1HourKey names the gauge reporting node evictions in the previous hour per zone.
	EvictionsIn1HourKey = "1_hour_evictions"
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
ZoneHealth = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Subsystem: NodeControllerSubsystem,
|
||||||
|
Name: ZoneHealthStatisticKey,
|
||||||
|
Help: "Gauge measuring percentage of healty nodes per zone.",
|
||||||
|
},
|
||||||
|
[]string{"zone"},
|
||||||
|
)
|
||||||
|
ZoneSize = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Subsystem: NodeControllerSubsystem,
|
||||||
|
Name: ZoneSizeKey,
|
||||||
|
Help: "Gauge measuring number of registered Nodes per zones.",
|
||||||
|
},
|
||||||
|
[]string{"zone"},
|
||||||
|
)
|
||||||
|
UnhealthyNodes = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Subsystem: NodeControllerSubsystem,
|
||||||
|
Name: ZoneNoUnhealthyNodesKey,
|
||||||
|
Help: "Gauge measuring number of not Ready Nodes per zones.",
|
||||||
|
},
|
||||||
|
[]string{"zone"},
|
||||||
|
)
|
||||||
|
Evictions10Minutes = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Subsystem: NodeControllerSubsystem,
|
||||||
|
Name: EvictionsIn10MinutesKey,
|
||||||
|
Help: "Gauge measuring number of Node evictions that happened in previous 10 minutes per zone.",
|
||||||
|
},
|
||||||
|
[]string{"zone"},
|
||||||
|
)
|
||||||
|
Evictions1Hour = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Subsystem: NodeControllerSubsystem,
|
||||||
|
Name: EvictionsIn1HourKey,
|
||||||
|
Help: "Gauge measuring number of Node evictions that happened in previous hour per zone.",
|
||||||
|
},
|
||||||
|
[]string{"zone"},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
var registerMetrics sync.Once
|
||||||
|
|
||||||
|
func Register() {
|
||||||
|
registerMetrics.Do(func() {
|
||||||
|
prometheus.MustRegister(ZoneHealth)
|
||||||
|
prometheus.MustRegister(ZoneSize)
|
||||||
|
prometheus.MustRegister(UnhealthyNodes)
|
||||||
|
prometheus.MustRegister(Evictions10Minutes)
|
||||||
|
prometheus.MustRegister(Evictions1Hour)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
type eviction struct {
|
||||||
|
node string
|
||||||
|
time unversioned.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
type evictionData struct {
|
||||||
|
sync.Mutex
|
||||||
|
nodeEvictionCount map[string]map[string]int
|
||||||
|
nodeEvictionList []eviction
|
||||||
|
now func() unversioned.Time
|
||||||
|
windowSize time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
|
func newEvictionData(windowSize time.Duration) *evictionData {
|
||||||
|
return &evictionData{
|
||||||
|
nodeEvictionCount: make(map[string]map[string]int),
|
||||||
|
nodeEvictionList: make([]eviction, 0),
|
||||||
|
now: unversioned.Now,
|
||||||
|
windowSize: windowSize,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *evictionData) slideWindow() {
|
||||||
|
e.Lock()
|
||||||
|
defer e.Unlock()
|
||||||
|
now := e.now()
|
||||||
|
firstInside := 0
|
||||||
|
for _, v := range e.nodeEvictionList {
|
||||||
|
if v.time.Add(e.windowSize).Before(now.Time) {
|
||||||
|
firstInside++
|
||||||
|
zone := ""
|
||||||
|
for z := range e.nodeEvictionCount {
|
||||||
|
if _, ok := e.nodeEvictionCount[z][v.node]; ok {
|
||||||
|
zone = z
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if zone == "" {
|
||||||
|
glog.Warningf("EvictionData corruption - unknown zone for node %v", v.node)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if e.nodeEvictionCount[zone][v.node] > 1 {
|
||||||
|
e.nodeEvictionCount[zone][v.node] = e.nodeEvictionCount[zone][v.node] - 1
|
||||||
|
} else {
|
||||||
|
delete(e.nodeEvictionCount[zone], v.node)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
e.nodeEvictionList = e.nodeEvictionList[firstInside:]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *evictionData) registerEviction(node, zone string) {
|
||||||
|
e.Lock()
|
||||||
|
defer e.Unlock()
|
||||||
|
|
||||||
|
e.nodeEvictionList = append(e.nodeEvictionList, eviction{node: node, time: e.now()})
|
||||||
|
if _, ok := e.nodeEvictionCount[zone]; !ok {
|
||||||
|
e.nodeEvictionCount[zone] = make(map[string]int)
|
||||||
|
}
|
||||||
|
if _, ok := e.nodeEvictionCount[zone][node]; !ok {
|
||||||
|
e.nodeEvictionCount[zone][node] = 1
|
||||||
|
} else {
|
||||||
|
e.nodeEvictionCount[zone][node] = e.nodeEvictionCount[zone][node] + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *evictionData) removeEviction(node, zone string) {
|
||||||
|
e.Lock()
|
||||||
|
defer e.Unlock()
|
||||||
|
|
||||||
|
// TODO: This may be inefficient, but hopefully will be rarely called. Verify that this is true.
|
||||||
|
for i := len(e.nodeEvictionList) - 1; i >= 0; i-- {
|
||||||
|
if e.nodeEvictionList[i].node == node {
|
||||||
|
e.nodeEvictionList = append(e.nodeEvictionList[:i], e.nodeEvictionList[i+1:]...)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if e.nodeEvictionCount[zone][node] > 1 {
|
||||||
|
e.nodeEvictionCount[zone][node] = e.nodeEvictionCount[zone][node] - 1
|
||||||
|
} else {
|
||||||
|
delete(e.nodeEvictionCount[zone], node)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *evictionData) countEvictions(zone string) int {
|
||||||
|
e.Lock()
|
||||||
|
defer e.Unlock()
|
||||||
|
return len(e.nodeEvictionCount[zone])
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *evictionData) getZones() []string {
|
||||||
|
e.Lock()
|
||||||
|
defer e.Unlock()
|
||||||
|
|
||||||
|
zones := make([]string, 0, len(e.nodeEvictionCount))
|
||||||
|
for k := range e.nodeEvictionCount {
|
||||||
|
zones = append(zones, k)
|
||||||
|
}
|
||||||
|
return zones
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *evictionData) initZone(zone string) {
|
||||||
|
e.Lock()
|
||||||
|
defer e.Unlock()
|
||||||
|
|
||||||
|
e.nodeEvictionCount[zone] = make(map[string]int)
|
||||||
|
}
|
129
pkg/controller/node/metrics_test.go
Normal file
129
pkg/controller/node/metrics_test.go
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2016 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package node
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"k8s.io/kubernetes/pkg/api/unversioned"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEvictionData(t *testing.T) {
|
||||||
|
evictionData := newEvictionData(time.Hour)
|
||||||
|
now := unversioned.Now()
|
||||||
|
evictionData.now = func() unversioned.Time {
|
||||||
|
return *(&now)
|
||||||
|
}
|
||||||
|
if evictionData.countEvictions("zone1") != 0 {
|
||||||
|
t.Fatalf("Invalid eviction count before doing anything")
|
||||||
|
}
|
||||||
|
evictionData.initZone("zone1")
|
||||||
|
if evictionData.countEvictions("zone1") != 0 {
|
||||||
|
t.Fatalf("Invalid eviction after zone initialization")
|
||||||
|
}
|
||||||
|
|
||||||
|
evictionData.registerEviction("first", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 1 {
|
||||||
|
t.Fatalf("Invalid eviction count after adding first Node")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.registerEviction("second", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 2 {
|
||||||
|
t.Fatalf("Invalid eviction count after adding second Node")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.registerEviction("second", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 2 {
|
||||||
|
t.Fatalf("Invalid eviction count after adding second Node second time")
|
||||||
|
}
|
||||||
|
if evictionData.countEvictions("zone2") != 0 {
|
||||||
|
t.Fatalf("Invalid eviction in nonexistent zone")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.registerEviction("third", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 3 {
|
||||||
|
t.Fatalf("Invalid eviction count after adding third Node first time")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.removeEviction("third", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 2 {
|
||||||
|
t.Fatalf("Invalid eviction count after remove third Node")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.removeEviction("third", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 2 {
|
||||||
|
t.Fatalf("Invalid eviction count after remove third Node second time")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.registerEviction("fourth", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 3 {
|
||||||
|
t.Fatalf("Invalid eviction count after adding fourth Node first time")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.registerEviction("fourth", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 3 {
|
||||||
|
t.Fatalf("Invalid eviction count after adding fourth Node second time")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.removeEviction("fourth", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 3 {
|
||||||
|
t.Fatalf("Invalid eviction count after remove fourth Node first time")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.removeEviction("fourth", "zone1")
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 2 {
|
||||||
|
t.Fatalf("Invalid eviction count after remove fourth Node second time")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(52 * time.Minute))
|
||||||
|
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 1 {
|
||||||
|
t.Fatalf("Invalid eviction count after first Node went out of scope")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Minute))
|
||||||
|
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 1 {
|
||||||
|
t.Fatalf("Invalid eviction count after first occurence of the second Node went out of scope")
|
||||||
|
}
|
||||||
|
now = unversioned.NewTime(now.Add(time.Second))
|
||||||
|
|
||||||
|
evictionData.slideWindow()
|
||||||
|
if evictionData.countEvictions("zone1") != 0 {
|
||||||
|
t.Fatalf("Invalid eviction count after second occurence of the second Node went out of scope")
|
||||||
|
}
|
||||||
|
}
|
@ -46,8 +46,15 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/util/wait"
|
"k8s.io/kubernetes/pkg/util/wait"
|
||||||
"k8s.io/kubernetes/pkg/version"
|
"k8s.io/kubernetes/pkg/version"
|
||||||
"k8s.io/kubernetes/pkg/watch"
|
"k8s.io/kubernetes/pkg/watch"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
// Register prometheus metrics
|
||||||
|
Register()
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
ErrCloudInstance = errors.New("cloud provider doesn't support instances.")
|
ErrCloudInstance = errors.New("cloud provider doesn't support instances.")
|
||||||
gracefulDeletionVersion = version.MustParse("v1.1.0")
|
gracefulDeletionVersion = version.MustParse("v1.1.0")
|
||||||
@ -142,7 +149,7 @@ type NodeController struct {
|
|||||||
|
|
||||||
forcefullyDeletePod func(*api.Pod) error
|
forcefullyDeletePod func(*api.Pod) error
|
||||||
nodeExistsInCloudProvider func(string) (bool, error)
|
nodeExistsInCloudProvider func(string) (bool, error)
|
||||||
computeZoneStateFunc func(nodeConditions []*api.NodeCondition) zoneState
|
computeZoneStateFunc func(nodeConditions []*api.NodeCondition) (int, zoneState)
|
||||||
enterPartialDisruptionFunc func(nodeNum int) float32
|
enterPartialDisruptionFunc func(nodeNum int) float32
|
||||||
enterFullDisruptionFunc func(nodeNum int) float32
|
enterFullDisruptionFunc func(nodeNum int) float32
|
||||||
|
|
||||||
@ -158,6 +165,9 @@ type NodeController struct {
|
|||||||
// the controller using NewDaemonSetsController(passing SharedInformer), this
|
// the controller using NewDaemonSetsController(passing SharedInformer), this
|
||||||
// will be null
|
// will be null
|
||||||
internalPodInformer framework.SharedIndexInformer
|
internalPodInformer framework.SharedIndexInformer
|
||||||
|
|
||||||
|
evictions10Minutes *evictionData
|
||||||
|
evictions1Hour *evictionData
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewNodeController returns a new node controller to sync instances from cloudprovider.
|
// NewNodeController returns a new node controller to sync instances from cloudprovider.
|
||||||
@ -229,6 +239,8 @@ func NewNodeController(
|
|||||||
largeClusterThreshold: largeClusterThreshold,
|
largeClusterThreshold: largeClusterThreshold,
|
||||||
unhealthyZoneThreshold: unhealthyZoneThreshold,
|
unhealthyZoneThreshold: unhealthyZoneThreshold,
|
||||||
zoneStates: make(map[string]zoneState),
|
zoneStates: make(map[string]zoneState),
|
||||||
|
evictions10Minutes: newEvictionData(10 * time.Minute),
|
||||||
|
evictions1Hour: newEvictionData(time.Hour),
|
||||||
}
|
}
|
||||||
nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
|
nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
|
||||||
nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
|
nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
|
||||||
@ -403,6 +415,18 @@ func (nc *NodeController) Run(period time.Duration) {
|
|||||||
defer nc.evictorLock.Unlock()
|
defer nc.evictorLock.Unlock()
|
||||||
for k := range nc.zonePodEvictor {
|
for k := range nc.zonePodEvictor {
|
||||||
nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
|
nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
|
||||||
|
obj, exists, err := nc.nodeStore.Get(value.Value)
|
||||||
|
if err != nil {
|
||||||
|
glog.Warningf("Failed to get Node %v from the nodeStore: %v", value.Value, err)
|
||||||
|
} else if !exists {
|
||||||
|
glog.Warningf("Node %v no longer present in nodeStore!", value.Value)
|
||||||
|
} else {
|
||||||
|
node, _ := obj.(*api.Node)
|
||||||
|
zone := utilnode.GetZoneKey(node)
|
||||||
|
nc.evictions10Minutes.registerEviction(zone, value.Value)
|
||||||
|
nc.evictions1Hour.registerEviction(zone, value.Value)
|
||||||
|
}
|
||||||
|
|
||||||
nodeUid, _ := value.UID.(string)
|
nodeUid, _ := value.UID.(string)
|
||||||
remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
|
remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -477,6 +501,8 @@ func (nc *NodeController) monitorNodeStatus() error {
|
|||||||
nc.zonePodEvictor[zone] =
|
nc.zonePodEvictor[zone] =
|
||||||
NewRateLimitedTimedQueue(
|
NewRateLimitedTimedQueue(
|
||||||
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
|
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
|
||||||
|
nc.evictions10Minutes.initZone(zone)
|
||||||
|
nc.evictions1Hour.initZone(zone)
|
||||||
}
|
}
|
||||||
if _, found := nc.zoneTerminationEvictor[zone]; !found {
|
if _, found := nc.zoneTerminationEvictor[zone]; !found {
|
||||||
nc.zoneTerminationEvictor[zone] = NewRateLimitedTimedQueue(
|
nc.zoneTerminationEvictor[zone] = NewRateLimitedTimedQueue(
|
||||||
@ -575,15 +601,28 @@ func (nc *NodeController) monitorNodeStatus() error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
nc.handleDisruption(zoneToNodeConditions, nodes)
|
nc.handleDisruption(zoneToNodeConditions, nodes)
|
||||||
|
nc.updateEvictionMetric(Evictions10Minutes, nc.evictions10Minutes)
|
||||||
|
nc.updateEvictionMetric(Evictions1Hour, nc.evictions1Hour)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (nc *NodeController) updateEvictionMetric(metric *prometheus.GaugeVec, data *evictionData) {
|
||||||
|
data.slideWindow()
|
||||||
|
zones := data.getZones()
|
||||||
|
for _, z := range zones {
|
||||||
|
metric.WithLabelValues(z).Set(float64(data.countEvictions(z)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*api.NodeCondition, nodes *api.NodeList) {
|
func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*api.NodeCondition, nodes *api.NodeList) {
|
||||||
newZoneStates := map[string]zoneState{}
|
newZoneStates := map[string]zoneState{}
|
||||||
allAreFullyDisrupted := true
|
allAreFullyDisrupted := true
|
||||||
for k, v := range zoneToNodeConditions {
|
for k, v := range zoneToNodeConditions {
|
||||||
newState := nc.computeZoneStateFunc(v)
|
ZoneSize.WithLabelValues(k).Set(float64(len(v)))
|
||||||
|
unhealthy, newState := nc.computeZoneStateFunc(v)
|
||||||
|
ZoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v)))
|
||||||
|
UnhealthyNodes.WithLabelValues(k).Set(float64(unhealthy))
|
||||||
if newState != stateFullDisruption {
|
if newState != stateFullDisruption {
|
||||||
allAreFullyDisrupted = false
|
allAreFullyDisrupted = false
|
||||||
}
|
}
|
||||||
@ -596,6 +635,9 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*ap
|
|||||||
allWasFullyDisrupted := true
|
allWasFullyDisrupted := true
|
||||||
for k, v := range nc.zoneStates {
|
for k, v := range nc.zoneStates {
|
||||||
if _, have := zoneToNodeConditions[k]; !have {
|
if _, have := zoneToNodeConditions[k]; !have {
|
||||||
|
ZoneSize.WithLabelValues(k).Set(0)
|
||||||
|
ZoneHealth.WithLabelValues(k).Set(100)
|
||||||
|
UnhealthyNodes.WithLabelValues(k).Set(0)
|
||||||
delete(nc.zoneStates, k)
|
delete(nc.zoneStates, k)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -876,6 +918,8 @@ func (nc *NodeController) cancelPodEviction(node *api.Node) bool {
|
|||||||
wasTerminating := nc.zoneTerminationEvictor[zone].Remove(node.Name)
|
wasTerminating := nc.zoneTerminationEvictor[zone].Remove(node.Name)
|
||||||
if wasDeleting || wasTerminating {
|
if wasDeleting || wasTerminating {
|
||||||
glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
|
glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
|
||||||
|
nc.evictions10Minutes.removeEviction(zone, node.Name)
|
||||||
|
nc.evictions1Hour.removeEviction(zone, node.Name)
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
@ -907,7 +951,7 @@ func (nc *NodeController) ReducedQPSFunc(nodeNum int) float32 {
|
|||||||
// - fullyDisrupted if there're no Ready Nodes,
|
// - fullyDisrupted if there're no Ready Nodes,
|
||||||
// - partiallyDisrupted if at least nc.unhealthyZoneThreshold percent of Nodes are not Ready,
|
// - partiallyDisrupted if at least nc.unhealthyZoneThreshold percent of Nodes are not Ready,
|
||||||
// - normal otherwise
|
// - normal otherwise
|
||||||
func (nc *NodeController) ComputeZoneState(nodeReadyConditions []*api.NodeCondition) zoneState {
|
func (nc *NodeController) ComputeZoneState(nodeReadyConditions []*api.NodeCondition) (int, zoneState) {
|
||||||
readyNodes := 0
|
readyNodes := 0
|
||||||
notReadyNodes := 0
|
notReadyNodes := 0
|
||||||
for i := range nodeReadyConditions {
|
for i := range nodeReadyConditions {
|
||||||
@ -919,10 +963,10 @@ func (nc *NodeController) ComputeZoneState(nodeReadyConditions []*api.NodeCondit
|
|||||||
}
|
}
|
||||||
switch {
|
switch {
|
||||||
case readyNodes == 0 && notReadyNodes > 0:
|
case readyNodes == 0 && notReadyNodes > 0:
|
||||||
return stateFullDisruption
|
return notReadyNodes, stateFullDisruption
|
||||||
case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
|
case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
|
||||||
return statePartialDisruption
|
return notReadyNodes, statePartialDisruption
|
||||||
default:
|
default:
|
||||||
return stateNormal
|
return notReadyNodes, stateNormal
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user