kube-proxy network programming latency on restarts

kube-proxy exposes the metric network_programming_duration_seconds, which is defined as the time it takes to program the network after a service or pod has changed. It uses an annotation on the Endpoints/EndpointSlices to calculate when the endpoint was created. However, on restarts, kube-proxy processes all the endpoints again, no matter when they were generated, polluting the metric. To be safe, kube-proxy will estimate the latency only for those endpoints that were generated after it started.
2026-01-04 23:17:50 +00:00 · 2021-04-07 19:09:59 +02:00
parent b0abe89ae2
commit ef76be37de
3 changed files with 42 additions and 16 deletions
--- a/pkg/proxy/endpoints.go
+++ b/pkg/proxy/endpoints.go
@@ -164,6 +164,11 @@ type EndpointChangeTracker struct {
 	// Map from the Endpoints namespaced-name to the times of the triggers that caused the endpoints
 	// object to change. Used to calculate the network-programming-latency.
 	lastChangeTriggerTimes map[types.NamespacedName][]time.Time
+	// Record the time when the EndpointChangeTracker was created so we can ignore endpoints
+	// that were generated before it, because we can't estimate the network-programming-latency for those.
+	// This is especially problematic on restarts, because we process all the endpoints, which may have been
+	// created hours or days earlier.
+	trackerStartTime time.Time
 }

 // NewEndpointChangeTracker initializes an EndpointsChangeMap
@@ -175,6 +180,7 @@ func NewEndpointChangeTracker(hostname string, makeEndpointInfo makeEndpointFunc
 		ipFamily:                  ipFamily,
 		recorder:                  recorder,
 		lastChangeTriggerTimes:    make(map[types.NamespacedName][]time.Time),
+		trackerStartTime:          time.Now(),
 		processEndpointsMapChange: processEndpointsMapChange,
 	}
 	if endpointSlicesEnabled {
@@ -216,7 +222,7 @@ func (ect *EndpointChangeTracker) Update(previous, current *v1.Endpoints) bool {
 	// In case of Endpoints deletion, the LastChangeTriggerTime annotation is
 	// by-definition coming from the time of last update, which is not what
 	// we want to measure. So we simply ignore it in this cases.
-	if t := getLastChangeTriggerTime(endpoints.Annotations); !t.IsZero() && current != nil {
+	if t := getLastChangeTriggerTime(endpoints.Annotations); !t.IsZero() && current != nil && t.After(ect.trackerStartTime) {
 		ect.lastChangeTriggerTimes[namespacedName] = append(ect.lastChangeTriggerTimes[namespacedName], t)
 	}

@@ -276,7 +282,7 @@ func (ect *EndpointChangeTracker) EndpointSliceUpdate(endpointSlice *discovery.E
 		// we want to measure. So we simply ignore it in this cases.
 		// TODO(wojtek-t, robscott): Address the problem for EndpointSlice deletion
 		// when other EndpointSlice for that service still exist.
-		if t := getLastChangeTriggerTime(endpointSlice.Annotations); !t.IsZero() && !removeSlice {
+		if t := getLastChangeTriggerTime(endpointSlice.Annotations); !t.IsZero() && !removeSlice && t.After(ect.trackerStartTime) {
 			ect.lastChangeTriggerTimes[namespacedName] =
 				append(ect.lastChangeTriggerTimes[namespacedName], t)
 		}