Reworking kube-proxy to only compute endpointChanges on apply.

Computing EndpointChanges is a relatively expensive operation for
kube-proxy when EndpointSlices are used. Previously this was computed on
every EndpointSlice update, which became quite inefficient at high levels
of scale when multiple EndpointSlice update events were triggered
before a syncProxyRules call.

Profiling results showed that computing this on each update could
consume ~80% of total kube-proxy CPU utilization at high levels of
scale. This change reduced that to as little as 3% of total kube-proxy
CPU utilization.

It's worth noting that the difference is minimal when there is a 1:1
relationship between EndpointSlice updates and proxier syncs. This is
primarily beneficial when there are many EndpointSlice updates between
proxier sync loops.
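
The optimization is easiest to see as a batching pattern: record the latest slice state as "pending" on each watch event, and compute the expensive previous/current diff only once per sync. Below is a minimal, self-contained sketch of that idea; pendingTracker and its string keys are illustrative stand-ins, not kube-proxy's actual EndpointSliceCache or EndpointChangeTracker types.

package main

import "fmt"

// pendingTracker coalesces many updates to the same key and computes the
// expensive "change" only once, when Checkout is called at sync time.
type pendingTracker struct {
	applied map[string][]string // state the sync loop has already consumed
	pending map[string][]string // latest state seen from update events
}

func newPendingTracker() *pendingTracker {
	return &pendingTracker{applied: map[string][]string{}, pending: map[string][]string{}}
}

// Update just records the newest state for key; no diffing happens here.
func (t *pendingTracker) Update(key string, endpoints []string) {
	t.pending[key] = endpoints
}

// Checkout computes the (previous, current) pair for every pending key and
// promotes pending state to applied. This is the only expensive step.
func (t *pendingTracker) Checkout() map[string][2][]string {
	changes := map[string][2][]string{}
	for key, current := range t.pending {
		changes[key] = [2][]string{t.applied[key], current}
		t.applied[key] = current
		delete(t.pending, key)
	}
	return changes
}

func main() {
	t := newPendingTracker()
	// Three updates for one Service arrive between proxier syncs...
	t.Update("default/web", []string{"10.0.0.1"})
	t.Update("default/web", []string{"10.0.0.1", "10.0.0.2"})
	t.Update("default/web", []string{"10.0.0.2"})
	// ...but the change is computed a single time, at sync.
	fmt.Println(t.Checkout())
}

The sketch coalesces three updates to one Service into a single computed change, which is the same effect the commit achieves for EndpointSlice events that arrive between syncProxyRules calls.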
Author: Rob Scott
Date:   2019-09-19 12:58:18 -07:00
Parent: f1bb6089ce
Commit: 8e7de45034
4 changed files with 524 additions and 119 deletions


@@ -92,7 +92,7 @@ type EndpointChangeTracker struct {
 	items map[types.NamespacedName]*endpointsChange
 	// makeEndpointInfo allows proxier to inject customized information when processing endpoint.
 	makeEndpointInfo makeEndpointFunc
-	// endpointSliceCache holds a simplified version of endpoint slices
+	// endpointSliceCache holds a simplified version of endpoint slices.
 	endpointSliceCache *EndpointSliceCache
 	// isIPv6Mode indicates if change tracker is under IPv6/IPv4 mode. Nil means not applicable.
 	isIPv6Mode *bool
@@ -190,39 +190,54 @@ func (ect *EndpointChangeTracker) EndpointSliceUpdate(endpointSlice *discovery.E
 	ect.lock.Lock()
 	defer ect.lock.Unlock()
 
-	change, ok := ect.items[namespacedName]
-	if !ok {
-		change = &endpointsChange{}
-		change.previous = ect.endpointSliceCache.EndpointsMap(namespacedName)
-		ect.items[namespacedName] = change
-	}
-
-	if removeSlice {
-		ect.endpointSliceCache.Delete(endpointSlice)
-	} else {
-		ect.endpointSliceCache.Update(endpointSlice)
-	}
+	changeNeeded := ect.endpointSliceCache.updatePending(endpointSlice, removeSlice)
 
-	if t := getLastChangeTriggerTime(endpointSlice.Annotations); !t.IsZero() {
-		ect.lastChangeTriggerTimes[namespacedName] =
-			append(ect.lastChangeTriggerTimes[namespacedName], t)
+	if changeNeeded {
+		metrics.EndpointChangesPending.Inc()
+		if t := getLastChangeTriggerTime(endpointSlice.Annotations); !t.IsZero() {
+			ect.lastChangeTriggerTimes[namespacedName] =
+				append(ect.lastChangeTriggerTimes[namespacedName], t)
+		}
 	}
 
-	change.current = ect.endpointSliceCache.EndpointsMap(namespacedName)
-	// if change.previous equal to change.current, it means no change
-	if reflect.DeepEqual(change.previous, change.current) {
-		delete(ect.items, namespacedName)
-		// Reset the lastChangeTriggerTimes for this service. Given that the network programming
-		// SLI is defined as the duration between a time of an event and a time when the network was
-		// programmed to incorporate that event, if there are events that happened between two
-		// consecutive syncs and that canceled each other out, e.g. pod A added -> pod A deleted,
-		// there will be no network programming for them and thus no network programming latency metric
-		// should be exported.
-		delete(ect.lastChangeTriggerTimes, namespacedName)
+	return changeNeeded
+}
+
+// checkoutChanges returns a list of pending endpointsChanges and marks them as
+// applied.
+func (ect *EndpointChangeTracker) checkoutChanges() []*endpointsChange {
+	ect.lock.Lock()
+	defer ect.lock.Unlock()
+
+	metrics.EndpointChangesPending.Set(0)
+
+	if ect.endpointSliceCache != nil {
+		return ect.endpointSliceCache.checkoutChanges()
 	}
 
-	metrics.EndpointChangesPending.Set(float64(len(ect.items)))
-	return len(ect.items) > 0
+	changes := []*endpointsChange{}
+	for _, change := range ect.items {
+		changes = append(changes, change)
+	}
+	ect.items = make(map[types.NamespacedName]*endpointsChange)
+	return changes
+}
+
+// checkoutTriggerTimes applies the locally cached trigger times to a map of
+// trigger times that have been passed in and empties the local cache.
+func (ect *EndpointChangeTracker) checkoutTriggerTimes(lastChangeTriggerTimes *map[types.NamespacedName][]time.Time) {
+	ect.lock.Lock()
+	defer ect.lock.Unlock()
+
+	for k, v := range ect.lastChangeTriggerTimes {
+		prev, ok := (*lastChangeTriggerTimes)[k]
+		if !ok {
+			(*lastChangeTriggerTimes)[k] = v
+		} else {
+			(*lastChangeTriggerTimes)[k] = append(prev, v...)
+		}
+	}
+	ect.lastChangeTriggerTimes = make(map[types.NamespacedName][]time.Time)
 }
 
 // getLastChangeTriggerTime returns the time.Time value of the
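
Both new checkout methods take the tracker's lock, hand back the accumulated state, and reset it, so informer event handlers can keep calling EndpointSliceUpdate while the sync goroutine drains everything in one step. A rough sketch of that drain-and-reset pattern is below; triggerTimeTracker and its string timestamps are hypothetical stand-ins for the lastChangeTriggerTimes bookkeeping, not the real kube-proxy code.

package main

import (
	"fmt"
	"sync"
)

// triggerTimeTracker is a stand-in for the lastChangeTriggerTimes bookkeeping:
// writers append trigger times per service, and the sync loop checks them all
// out at once, leaving an empty map behind.
type triggerTimeTracker struct {
	mu    sync.Mutex
	times map[string][]string
}

// Record appends a trigger time for a service under the lock.
func (t *triggerTimeTracker) Record(service, triggerTime string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.times[service] = append(t.times[service], triggerTime)
}

// Checkout merges the cached times into out and resets the local cache,
// mirroring what checkoutTriggerTimes does with its map argument.
func (t *triggerTimeTracker) Checkout(out map[string][]string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	for k, v := range t.times {
		out[k] = append(out[k], v...)
	}
	t.times = map[string][]string{}
}

func main() {
	tracker := &triggerTimeTracker{times: map[string][]string{}}
	tracker.Record("default/web", "t1")
	tracker.Record("default/web", "t2")

	out := map[string][]string{"default/web": {"t0"}}
	tracker.Checkout(out)
	fmt.Println(out)           // merged: t0, t1, t2
	fmt.Println(tracker.times) // empty after checkout
}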
@@ -351,29 +366,19 @@ func (ect *EndpointChangeTracker) endpointsToEndpointsMap(endpoints *v1.Endpoint
 // The changes map is cleared after applying them.
 // In addition it returns (via argument) and resets the lastChangeTriggerTimes for all endpoints
 // that were changed and will result in syncing the proxy rules.
-func (em EndpointsMap) apply(changes *EndpointChangeTracker, staleEndpoints *[]ServiceEndpoint,
+func (em EndpointsMap) apply(ect *EndpointChangeTracker, staleEndpoints *[]ServiceEndpoint,
 	staleServiceNames *[]ServicePortName, lastChangeTriggerTimes *map[types.NamespacedName][]time.Time) {
-	if changes == nil {
+	if ect == nil {
 		return
 	}
-	changes.lock.Lock()
-	defer changes.lock.Unlock()
-	for _, change := range changes.items {
+
+	changes := ect.checkoutChanges()
+	for _, change := range changes {
 		em.unmerge(change.previous)
 		em.merge(change.current)
 		detectStaleConnections(change.previous, change.current, staleEndpoints, staleServiceNames)
 	}
-	changes.items = make(map[types.NamespacedName]*endpointsChange)
-	metrics.EndpointChangesPending.Set(0)
-	for k, v := range changes.lastChangeTriggerTimes {
-		prev, ok := (*lastChangeTriggerTimes)[k]
-		if !ok {
-			(*lastChangeTriggerTimes)[k] = v
-		} else {
-			(*lastChangeTriggerTimes)[k] = append(prev, v...)
-		}
-	}
-	changes.lastChangeTriggerTimes = make(map[types.NamespacedName][]time.Time)
+	ect.checkoutTriggerTimes(lastChangeTriggerTimes)
 }
 
 // Merge ensures that the current EndpointsMap contains all <service, endpoints> pairs from the EndpointsMap passed in.
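
For context on the loop body that survives the change: each endpointsChange carries a service's previous and current endpoints, and apply removes the old entries before installing the new ones. A toy illustration of that replace-by-key behaviour on a plain map follows; the real EndpointsMap is keyed by ServicePortName and holds Endpoint objects rather than strings.

package main

import "fmt"

type endpointsMap map[string][]string

// merge copies every <service, endpoints> pair from other into em,
// overwriting any existing entry for the same key.
func (em endpointsMap) merge(other endpointsMap) {
	for svc, eps := range other {
		em[svc] = eps
	}
}

// unmerge deletes every key present in other from em.
func (em endpointsMap) unmerge(other endpointsMap) {
	for svc := range other {
		delete(em, svc)
	}
}

func main() {
	em := endpointsMap{"default/web:http": {"10.0.0.1:80"}}

	previous := endpointsMap{"default/web:http": {"10.0.0.1:80"}}
	current := endpointsMap{"default/web:http": {"10.0.0.2:80", "10.0.0.3:80"}}

	// Mirrors the loop in apply: drop the stale entries, install the new ones.
	em.unmerge(previous)
	em.merge(current)
	fmt.Println(em) // map[default/web:http:[10.0.0.2:80 10.0.0.3:80]]
}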