mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-03 17:30:00 +00:00
add metrics to the nodeport allocator repair loop
The repair loop is great for saving us from leaks, but the side effect is that bugs can go unnoticed for a long time, so we need some signal to be able to identify those errors proactively. Add two new metrics to identify: - errors in the reconcile loop - errors per nodeport
This commit is contained in:
parent
46dea3015f
commit
4eff70dcf9
@ -0,0 +1,65 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2023 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package controller
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"k8s.io/component-base/metrics"
|
||||||
|
"k8s.io/component-base/metrics/legacyregistry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
namespace = "apiserver"
|
||||||
|
subsystem = "nodeport_repair"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
// nodePortRepairPortErrors indicates the number of errors found by the repair loop
|
||||||
|
// divided by the type of error:
|
||||||
|
// leak, repair, full, outOfRange, duplicate, unknown
|
||||||
|
nodePortRepairPortErrors = metrics.NewCounterVec(
|
||||||
|
&metrics.CounterOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: subsystem,
|
||||||
|
Name: "port_errors_total",
|
||||||
|
Help: "Number of errors detected on ports by the repair loop broken down by type of error: leak, repair, full, outOfRange, duplicate, unknown",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
},
|
||||||
|
[]string{"type"},
|
||||||
|
)
|
||||||
|
// nodePortRepairReconcileErrors indicates the number of times the repair loop has failed to repair
|
||||||
|
// the errors it detected.
|
||||||
|
nodePortRepairReconcileErrors = metrics.NewCounter(
|
||||||
|
&metrics.CounterOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: subsystem,
|
||||||
|
Name: "reconcile_errors_total",
|
||||||
|
Help: "Number of reconciliation failures on the nodeport repair reconcile loop",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
var registerMetricsOnce sync.Once
|
||||||
|
|
||||||
|
func registerMetrics() {
|
||||||
|
registerMetricsOnce.Do(func() {
|
||||||
|
legacyregistry.MustRegister(nodePortRepairPortErrors)
|
||||||
|
legacyregistry.MustRegister(nodePortRepairReconcileErrors)
|
||||||
|
})
|
||||||
|
}
|
@ -61,6 +61,8 @@ func NewRepair(interval time.Duration, serviceClient corev1client.ServicesGetter
|
|||||||
eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: eventClient})
|
eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: eventClient})
|
||||||
recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, "portallocator-repair-controller")
|
recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, "portallocator-repair-controller")
|
||||||
|
|
||||||
|
registerMetrics()
|
||||||
|
|
||||||
return &Repair{
|
return &Repair{
|
||||||
interval: interval,
|
interval: interval,
|
||||||
serviceClient: serviceClient,
|
serviceClient: serviceClient,
|
||||||
@ -89,7 +91,13 @@ func (c *Repair) RunUntil(onFirstSuccess func(), stopCh chan struct{}) {
|
|||||||
|
|
||||||
// runOnce verifies the state of the port allocations and returns an error if an unrecoverable problem occurs.
|
// runOnce verifies the state of the port allocations and returns an error if an unrecoverable problem occurs.
|
||||||
func (c *Repair) runOnce() error {
|
func (c *Repair) runOnce() error {
|
||||||
return retry.RetryOnConflict(retry.DefaultBackoff, c.doRunOnce)
|
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
|
||||||
|
err := c.doRunOnce()
|
||||||
|
if err != nil {
|
||||||
|
nodePortRepairReconcileErrors.Inc()
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// doRunOnce verifies the state of the port allocations and returns an error if an unrecoverable problem occurs.
|
// doRunOnce verifies the state of the port allocations and returns an error if an unrecoverable problem occurs.
|
||||||
@ -153,23 +161,28 @@ func (c *Repair) doRunOnce() error {
|
|||||||
stored.Release(port)
|
stored.Release(port)
|
||||||
} else {
|
} else {
|
||||||
// doesn't seem to be allocated
|
// doesn't seem to be allocated
|
||||||
|
nodePortRepairPortErrors.WithLabelValues("repair").Inc()
|
||||||
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "PortNotAllocated", "PortAllocation", "Port %d is not allocated; repairing", port)
|
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "PortNotAllocated", "PortAllocation", "Port %d is not allocated; repairing", port)
|
||||||
runtime.HandleError(fmt.Errorf("the node port %d for service %s/%s is not allocated; repairing", port, svc.Name, svc.Namespace))
|
runtime.HandleError(fmt.Errorf("the node port %d for service %s/%s is not allocated; repairing", port, svc.Name, svc.Namespace))
|
||||||
}
|
}
|
||||||
delete(c.leaks, port) // it is used, so it can't be leaked
|
delete(c.leaks, port) // it is used, so it can't be leaked
|
||||||
case portallocator.ErrAllocated:
|
case portallocator.ErrAllocated:
|
||||||
// port is duplicate, reallocate
|
// port is duplicate, reallocate
|
||||||
|
nodePortRepairPortErrors.WithLabelValues("duplicate").Inc()
|
||||||
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "PortAlreadyAllocated", "PortAllocation", "Port %d was assigned to multiple services; please recreate service", port)
|
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "PortAlreadyAllocated", "PortAllocation", "Port %d was assigned to multiple services; please recreate service", port)
|
||||||
runtime.HandleError(fmt.Errorf("the node port %d for service %s/%s was assigned to multiple services; please recreate", port, svc.Name, svc.Namespace))
|
runtime.HandleError(fmt.Errorf("the node port %d for service %s/%s was assigned to multiple services; please recreate", port, svc.Name, svc.Namespace))
|
||||||
case err.(*portallocator.ErrNotInRange):
|
case err.(*portallocator.ErrNotInRange):
|
||||||
// port is out of range, reallocate
|
// port is out of range, reallocate
|
||||||
|
nodePortRepairPortErrors.WithLabelValues("outOfRange").Inc()
|
||||||
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "PortOutOfRange", "PortAllocation", "Port %d is not within the port range %s; please recreate service", port, c.portRange)
|
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "PortOutOfRange", "PortAllocation", "Port %d is not within the port range %s; please recreate service", port, c.portRange)
|
||||||
runtime.HandleError(fmt.Errorf("the port %d for service %s/%s is not within the port range %s; please recreate", port, svc.Name, svc.Namespace, c.portRange))
|
runtime.HandleError(fmt.Errorf("the port %d for service %s/%s is not within the port range %s; please recreate", port, svc.Name, svc.Namespace, c.portRange))
|
||||||
case portallocator.ErrFull:
|
case portallocator.ErrFull:
|
||||||
// somehow we are out of ports
|
// somehow we are out of ports
|
||||||
|
nodePortRepairPortErrors.WithLabelValues("full").Inc()
|
||||||
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "PortRangeFull", "PortAllocation", "Port range %s is full; you must widen the port range in order to create new services", c.portRange)
|
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "PortRangeFull", "PortAllocation", "Port range %s is full; you must widen the port range in order to create new services", c.portRange)
|
||||||
return fmt.Errorf("the port range %s is full; you must widen the port range in order to create new services", c.portRange)
|
return fmt.Errorf("the port range %s is full; you must widen the port range in order to create new services", c.portRange)
|
||||||
default:
|
default:
|
||||||
|
nodePortRepairPortErrors.WithLabelValues("unknown").Inc()
|
||||||
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "UnknownError", "PortAllocation", "Unable to allocate port %d due to an unknown error", port)
|
c.recorder.Eventf(svc, nil, corev1.EventTypeWarning, "UnknownError", "PortAllocation", "Unable to allocate port %d due to an unknown error", port)
|
||||||
return fmt.Errorf("unable to allocate port %d for service %s/%s due to an unknown error, exiting: %v", port, svc.Name, svc.Namespace, err)
|
return fmt.Errorf("unable to allocate port %d for service %s/%s due to an unknown error, exiting: %v", port, svc.Name, svc.Namespace, err)
|
||||||
}
|
}
|
||||||
@ -189,9 +202,11 @@ func (c *Repair) doRunOnce() error {
|
|||||||
// pretend it is still in use until count expires
|
// pretend it is still in use until count expires
|
||||||
c.leaks[port] = count - 1
|
c.leaks[port] = count - 1
|
||||||
if err := rebuilt.Allocate(port); err != nil {
|
if err := rebuilt.Allocate(port); err != nil {
|
||||||
|
// do not increment the metric here, if it is a leak it will be detected once the counter gets to 0
|
||||||
runtime.HandleError(fmt.Errorf("the node port %d may have leaked, but can not be allocated: %v", port, err))
|
runtime.HandleError(fmt.Errorf("the node port %d may have leaked, but can not be allocated: %v", port, err))
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
nodePortRepairPortErrors.WithLabelValues("leak").Inc()
|
||||||
// do not add it to the rebuilt set, which means it will be available for reuse
|
// do not add it to the rebuilt set, which means it will be available for reuse
|
||||||
runtime.HandleError(fmt.Errorf("the node port %d appears to have leaked: cleaning up", port))
|
runtime.HandleError(fmt.Errorf("the node port %d appears to have leaked: cleaning up", port))
|
||||||
}
|
}
|
||||||
|
@ -27,6 +27,7 @@ import (
|
|||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
"k8s.io/apimachinery/pkg/util/net"
|
"k8s.io/apimachinery/pkg/util/net"
|
||||||
"k8s.io/client-go/kubernetes/fake"
|
"k8s.io/client-go/kubernetes/fake"
|
||||||
|
"k8s.io/component-base/metrics/testutil"
|
||||||
api "k8s.io/kubernetes/pkg/apis/core"
|
api "k8s.io/kubernetes/pkg/apis/core"
|
||||||
"k8s.io/kubernetes/pkg/registry/core/service/portallocator"
|
"k8s.io/kubernetes/pkg/registry/core/service/portallocator"
|
||||||
)
|
)
|
||||||
@ -53,6 +54,7 @@ func (r *mockRangeRegistry) CreateOrUpdate(alloc *api.RangeAllocation) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRepair(t *testing.T) {
|
func TestRepair(t *testing.T) {
|
||||||
|
clearMetrics()
|
||||||
fakeClient := fake.NewSimpleClientset()
|
fakeClient := fake.NewSimpleClientset()
|
||||||
registry := &mockRangeRegistry{
|
registry := &mockRangeRegistry{
|
||||||
item: &api.RangeAllocation{Range: "100-200"},
|
item: &api.RangeAllocation{Range: "100-200"},
|
||||||
@ -66,6 +68,13 @@ func TestRepair(t *testing.T) {
|
|||||||
if !registry.updateCalled || registry.updated == nil || registry.updated.Range != pr.String() || registry.updated != registry.item {
|
if !registry.updateCalled || registry.updated == nil || registry.updated.Range != pr.String() || registry.updated != registry.item {
|
||||||
t.Errorf("unexpected registry: %#v", registry)
|
t.Errorf("unexpected registry: %#v", registry)
|
||||||
}
|
}
|
||||||
|
repairErrors, err := testutil.GetCounterMetricValue(nodePortRepairReconcileErrors)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("failed to get %s value, err: %v", nodePortRepairReconcileErrors.Name, err)
|
||||||
|
}
|
||||||
|
if repairErrors != 0 {
|
||||||
|
t.Fatalf("0 error expected, got %v", repairErrors)
|
||||||
|
}
|
||||||
|
|
||||||
registry = &mockRangeRegistry{
|
registry = &mockRangeRegistry{
|
||||||
item: &api.RangeAllocation{Range: "100-200"},
|
item: &api.RangeAllocation{Range: "100-200"},
|
||||||
@ -75,9 +84,18 @@ func TestRepair(t *testing.T) {
|
|||||||
if err := r.runOnce(); !strings.Contains(err.Error(), ": test error") {
|
if err := r.runOnce(); !strings.Contains(err.Error(), ": test error") {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
repairErrors, err = testutil.GetCounterMetricValue(nodePortRepairReconcileErrors)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("failed to get %s value, err: %v", nodePortRepairReconcileErrors.Name, err)
|
||||||
|
}
|
||||||
|
if repairErrors != 1 {
|
||||||
|
t.Fatalf("1 error expected, got %v", repairErrors)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRepairLeak(t *testing.T) {
|
func TestRepairLeak(t *testing.T) {
|
||||||
|
clearMetrics()
|
||||||
|
|
||||||
pr, _ := net.ParsePortRange("100-200")
|
pr, _ := net.ParsePortRange("100-200")
|
||||||
previous, err := portallocator.NewInMemory(*pr)
|
previous, err := portallocator.NewInMemory(*pr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -127,9 +145,18 @@ func TestRepairLeak(t *testing.T) {
|
|||||||
if after.Has(111) {
|
if after.Has(111) {
|
||||||
t.Errorf("expected portallocator to not have leaked port")
|
t.Errorf("expected portallocator to not have leaked port")
|
||||||
}
|
}
|
||||||
|
em := testMetrics{
|
||||||
|
leak: 1,
|
||||||
|
repair: 0,
|
||||||
|
outOfRange: 0,
|
||||||
|
duplicate: 0,
|
||||||
|
unknown: 0,
|
||||||
|
}
|
||||||
|
expectMetrics(t, em)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRepairWithExisting(t *testing.T) {
|
func TestRepairWithExisting(t *testing.T) {
|
||||||
|
clearMetrics()
|
||||||
pr, _ := net.ParsePortRange("100-200")
|
pr, _ := net.ParsePortRange("100-200")
|
||||||
previous, err := portallocator.NewInMemory(*pr)
|
previous, err := portallocator.NewInMemory(*pr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -204,6 +231,14 @@ func TestRepairWithExisting(t *testing.T) {
|
|||||||
if free := after.Free(); free != 97 {
|
if free := after.Free(); free != 97 {
|
||||||
t.Errorf("unexpected portallocator state: %d free", free)
|
t.Errorf("unexpected portallocator state: %d free", free)
|
||||||
}
|
}
|
||||||
|
em := testMetrics{
|
||||||
|
leak: 0,
|
||||||
|
repair: 4,
|
||||||
|
outOfRange: 1,
|
||||||
|
duplicate: 1,
|
||||||
|
unknown: 0,
|
||||||
|
}
|
||||||
|
expectMetrics(t, em)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCollectServiceNodePorts(t *testing.T) {
|
func TestCollectServiceNodePorts(t *testing.T) {
|
||||||
@ -303,3 +338,51 @@ func TestCollectServiceNodePorts(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Metrics helpers
|
||||||
|
func clearMetrics() {
|
||||||
|
nodePortRepairPortErrors.Reset()
|
||||||
|
nodePortRepairReconcileErrors.Reset()
|
||||||
|
}
|
||||||
|
|
||||||
|
type testMetrics struct {
|
||||||
|
leak float64
|
||||||
|
repair float64
|
||||||
|
outOfRange float64
|
||||||
|
duplicate float64
|
||||||
|
unknown float64
|
||||||
|
full float64
|
||||||
|
}
|
||||||
|
|
||||||
|
func expectMetrics(t *testing.T, em testMetrics) {
|
||||||
|
var m testMetrics
|
||||||
|
var err error
|
||||||
|
|
||||||
|
m.leak, err = testutil.GetCounterMetricValue(nodePortRepairPortErrors.WithLabelValues("leak"))
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("failed to get %s value, err: %v", nodePortRepairPortErrors.Name, err)
|
||||||
|
}
|
||||||
|
m.repair, err = testutil.GetCounterMetricValue(nodePortRepairPortErrors.WithLabelValues("repair"))
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("failed to get %s value, err: %v", nodePortRepairPortErrors.Name, err)
|
||||||
|
}
|
||||||
|
m.outOfRange, err = testutil.GetCounterMetricValue(nodePortRepairPortErrors.WithLabelValues("outOfRange"))
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("failed to get %s value, err: %v", nodePortRepairPortErrors.Name, err)
|
||||||
|
}
|
||||||
|
m.duplicate, err = testutil.GetCounterMetricValue(nodePortRepairPortErrors.WithLabelValues("duplicate"))
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("failed to get %s value, err: %v", nodePortRepairPortErrors.Name, err)
|
||||||
|
}
|
||||||
|
m.unknown, err = testutil.GetCounterMetricValue(nodePortRepairPortErrors.WithLabelValues("unknown"))
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("failed to get %s value, err: %v", nodePortRepairPortErrors.Name, err)
|
||||||
|
}
|
||||||
|
m.full, err = testutil.GetCounterMetricValue(nodePortRepairPortErrors.WithLabelValues("full"))
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("failed to get %s value, err: %v", nodePortRepairPortErrors.Name, err)
|
||||||
|
}
|
||||||
|
if m != em {
|
||||||
|
t.Fatalf("metrics error: expected %v, received %v", em, m)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user