Cleanup service sync path

It dawned on me that `needFullSync` can never be false. `needFullSync` was used
to compare the set of nodes that existed the last time the node event handler was
triggered with the current set of nodes for this run. However, if `triggerNodeSync`
gets called it is always because the set of nodes has changed: either a condition
changed on one node, or a node was added or removed. If `needFullSync` can never
be false, then a lot of things in the service sync path were just spurious, e.g.
`servicesToUpdate` and `knownHosts`. Essentially: if we ever need to `triggerNodeSync`,
then the set of nodes has somehow changed and we always need to re-sync all services.
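To see why, consider how `triggerNodeSync` gets invoked in the first place. The sketch
below is my reading of the informer wiring in `New()`; `registerNodeHandlers` is a
hypothetical helper name and the handler shapes are illustrative, not code from this
patch. The point: every path into `triggerNodeSync` starts from a node add, delete, or
sync-relevant update, so the node set has always changed by the time a sync is triggered.

```go
import (
	"time"

	v1 "k8s.io/api/core/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	"k8s.io/client-go/tools/cache"
)

// registerNodeHandlers (hypothetical name) sketches the only callers of
// triggerNodeSync: node informer event handlers that fire solely when the
// node set, or a sync-relevant field on a node, changes.
func registerNodeHandlers(c *Controller, nodeInformer coreinformers.NodeInformer, resync time.Duration) {
	nodeInformer.Informer().AddEventHandlerWithResyncPeriod(cache.ResourceEventHandlerFuncs{
		// A node appeared or disappeared: the set trivially changed.
		AddFunc:    func(obj interface{}) { c.triggerNodeSync() },
		DeleteFunc: func(obj interface{}) { c.triggerNodeSync() },
		UpdateFunc: func(old, cur interface{}) {
			oldNode, ok := old.(*v1.Node)
			if !ok {
				return
			}
			curNode, ok := cur.(*v1.Node)
			if !ok {
				return
			}
			// Drop updates that are irrelevant for LB backends; only
			// real changes reach triggerNodeSync.
			if !shouldSyncUpdatedNode(oldNode, curNode) {
				return
			}
			c.triggerNodeSync()
		},
	}, resync)
}
```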

Before this patch series there was one possibility for `needFullSync` to end up false:
`shouldSyncNode` and the predicates used to list nodes were not aligned, specifically
for unschedulable nodes. That means we could be triggered by a change to a node's
schedulable state without actually computing any diff between the old and new node
sets, in which case we would only retry the service updates that had failed on the
previous sync instead of re-syncing everything. I believe this to be an overlooked
coincidence rather than something actually intended.
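To make the coincidence concrete: `nodeSlicesEqualForLB` (removed below) compared only
node names, so a schedulability flip on an existing node was invisible to it whenever
the listing predicates let unschedulable nodes through, which is the misalignment
described above. A self-contained toy, reproducing the removed helper under that
assumption:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
)

func nodeNames(nodes []*v1.Node) sets.String {
	ret := sets.NewString()
	for _, node := range nodes {
		ret.Insert(node.Name)
	}
	return ret
}

// The helper removed by this patch, reproduced verbatim for the demo.
func nodeSlicesEqualForLB(x, y []*v1.Node) bool {
	if len(x) != len(y) {
		return false
	}
	return nodeNames(x).Equal(nodeNames(y))
}

func main() {
	before := []*v1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node1"}}}
	after := []*v1.Node{{
		ObjectMeta: metav1.ObjectMeta{Name: "node1"},
		Spec:       v1.NodeSpec{Unschedulable: true},
	}}

	// Prints "true": only names are compared, so the schedulability flip
	// goes unnoticed and needFullSync would have stayed false.
	fmt.Println(nodeSlicesEqualForLB(before, after))
}
```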
Author: Alexander Constantinescu
Date:   2022-07-21 10:35:45 +02:00
commit: 72c1c6559e
parent: bd9444c1cf

2 changed files with 7 additions and 117 deletions

`controller.go`

```diff
@@ -74,12 +74,10 @@ type serviceCache struct {
 // Controller keeps cloud provider service resources
 // (like load balancers) in sync with the registry.
 type Controller struct {
-	cloud            cloudprovider.Interface
-	knownHosts       []*v1.Node
-	servicesToUpdate sets.String
-	kubeClient       clientset.Interface
-	clusterName      string
-	balancer         cloudprovider.LoadBalancer
+	cloud       cloudprovider.Interface
+	kubeClient  clientset.Interface
+	clusterName string
+	balancer    cloudprovider.LoadBalancer
 	// TODO(#85155): Stop relying on this and remove the cache completely.
 	cache         *serviceCache
 	serviceLister corelisters.ServiceLister
@@ -96,8 +94,6 @@ type Controller struct {
 	nodeSyncLock sync.Mutex
 	// nodeSyncCh triggers nodeSyncLoop to run
 	nodeSyncCh chan interface{}
-	// needFullSync indicates if the nodeSyncInternal will do a full node sync on all LB services.
-	needFullSync bool
 	// lastSyncedNodes is used when reconciling node state and keeps track of the last synced set of
 	// nodes. Access to this attribute by multiple go-routines is protected by nodeSyncLock
 	lastSyncedNodes []*v1.Node
@@ -125,7 +121,6 @@ func New(
 	registerMetrics()
 	s := &Controller{
 		cloud:            cloud,
-		knownHosts:       []*v1.Node{},
 		kubeClient:       kubeClient,
 		clusterName:      clusterName,
 		cache:            &serviceCache{serviceMap: make(map[string]*cachedService)},
@@ -200,15 +195,6 @@ func New(
 	return s, nil
 }
 
-// needFullSyncAndUnmark returns the value and needFullSync and marks the field to false.
-func (c *Controller) needFullSyncAndUnmark() bool {
-	c.nodeSyncLock.Lock()
-	defer c.nodeSyncLock.Unlock()
-	ret := c.needFullSync
-	c.needFullSync = false
-	return ret
-}
-
 // obj could be an *v1.Service, or a DeletionFinalStateUnknown marker item.
 func (c *Controller) enqueueService(obj interface{}) {
 	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
@@ -261,22 +247,6 @@ func (c *Controller) Run(ctx context.Context, workers int, controllerManagerMetrics *controllersmetrics.ControllerManagerMetrics) {
 func (c *Controller) triggerNodeSync() {
 	c.nodeSyncLock.Lock()
 	defer c.nodeSyncLock.Unlock()
-	newHosts, err := listWithPredicates(c.nodeLister, allNodePredicates...)
-	if err != nil {
-		runtime.HandleError(fmt.Errorf("Failed to retrieve current set of nodes from node lister: %v", err))
-		// if node list cannot be retrieve, trigger full node sync to be safe.
-		c.needFullSync = true
-	} else if !nodeSlicesEqualForLB(newHosts, c.knownHosts) {
-		// Here the last known state is recorded as knownHosts. For each
-		// LB update, the latest node list is retrieved. This is to prevent
-		// a stale set of nodes were used to be update loadbalancers when
-		// there are many loadbalancers in the clusters. nodeSyncInternal
-		// would be triggered until all loadbalancers are updated to the new state.
-		klog.V(2).Infof("Node changes detected, triggering a full node sync on all loadbalancer services")
-		c.needFullSync = true
-		c.knownHosts = newHosts
-	}
-
 	select {
 	case c.nodeSyncCh <- struct{}{}:
 		klog.V(4).Info("Triggering nodeSync")
```
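After this hunk, `triggerNodeSync` reduces to a guarded, non-blocking send on
`nodeSyncCh`. A sketch of the resulting function; the `default` branch and its log
line are assumed from surrounding code, since they are not part of the hunk:

```go
func (c *Controller) triggerNodeSync() {
	c.nodeSyncLock.Lock()
	defer c.nodeSyncLock.Unlock()
	select {
	case c.nodeSyncCh <- struct{}{}:
		klog.V(4).Info("Triggering nodeSync")
	default:
		// A sync is already pending; nodeSyncLoop lists the latest nodes
		// when it runs, so nothing is lost by dropping this trigger.
		klog.V(4).Info("A pending nodeSync is already in queue")
	}
}
```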
```diff
@@ -688,13 +658,6 @@ func nodeNames(nodes []*v1.Node) sets.String {
 	return ret
 }
 
-func nodeSlicesEqualForLB(x, y []*v1.Node) bool {
-	if len(x) != len(y) {
-		return false
-	}
-	return nodeNames(x).Equal(nodeNames(y))
-}
-
 func shouldSyncUpdatedNode(oldNode, newNode *v1.Node) bool {
 	// Evaluate the individual node exclusion predicate before evaluating the
 	// compounded result of all predicates. We don't sync ETP=local services
@@ -722,33 +685,12 @@ func (c *Controller) nodeSyncInternal(ctx context.Context, workers int) {
 		nodeSyncLatency.Observe(latency)
 	}()
 
-	if !c.needFullSyncAndUnmark() {
-		// The set of nodes in the cluster hasn't changed, but we can retry
-		// updating any services that we failed to update last time around.
-		// It is required to call `c.cache.get()` on each Service in case there was
-		// an update event that occurred between retries.
-		var servicesToUpdate []*v1.Service
-		for key := range c.servicesToUpdate {
-			cachedService, exist := c.cache.get(key)
-			if !exist {
-				klog.Errorf("Service %q should be in the cache but not", key)
-				continue
-			}
-			servicesToUpdate = append(servicesToUpdate, cachedService.state)
-		}
-		c.servicesToUpdate = c.updateLoadBalancerHosts(ctx, servicesToUpdate, workers)
-		return
-	}
-
 	klog.V(2).Infof("Syncing backends for all LB services.")
-
 	// Try updating all services, and save the failed ones to try again next
 	// round.
 	servicesToUpdate := c.cache.allServices()
 	numServices := len(servicesToUpdate)
-	c.servicesToUpdate = c.updateLoadBalancerHosts(ctx, servicesToUpdate, workers)
+	servicesToRetry := c.updateLoadBalancerHosts(ctx, servicesToUpdate, workers)
 	klog.V(2).Infof("Successfully updated %d out of %d load balancers to direct traffic to the updated set of nodes",
-		numServices-len(c.servicesToUpdate), numServices)
+		numServices-len(servicesToRetry), numServices)
 }
```
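Pieced together, the surviving `nodeSyncInternal` unconditionally re-syncs every LB
service; the failed set is now only counted for the log line instead of being stashed
on the controller. A reconstruction of the end state (the timing preamble is assumed
from the context above the hunk):

```go
func (c *Controller) nodeSyncInternal(ctx context.Context, workers int) {
	startTime := time.Now()
	defer func() {
		latency := time.Since(startTime).Seconds()
		klog.V(4).Infof("It took %v seconds to finish nodeSyncInternal", latency)
		nodeSyncLatency.Observe(latency)
	}()

	klog.V(2).Infof("Syncing backends for all LB services.")
	// Try updating all services, and save the failed ones to try again next
	// round.
	servicesToUpdate := c.cache.allServices()
	numServices := len(servicesToUpdate)
	servicesToRetry := c.updateLoadBalancerHosts(ctx, servicesToUpdate, workers)
	klog.V(2).Infof("Successfully updated %d out of %d load balancers to direct traffic to the updated set of nodes",
		numServices-len(servicesToRetry), numServices)
}
```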
```diff
@@ -803,7 +745,6 @@ func (c *Controller) updateLoadBalancerHosts(ctx context.Context, services []*v1.Service, workers int) (servicesToRetry sets.String) {
 		key := fmt.Sprintf("%s/%s", services[piece].Namespace, services[piece].Name)
 		servicesToRetry.Insert(key)
 	}
-
 	workqueue.ParallelizeUntil(ctx, workers, len(services), doWork)
 	c.lastSyncedNodes = nodes
 	klog.V(4).Infof("Finished updateLoadBalancerHosts")
```

`controller_test.go`

```diff
@@ -26,6 +26,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/stretchr/testify/assert"
 	v1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -46,8 +47,6 @@ import (
 	servicehelper "k8s.io/cloud-provider/service/helpers"
 	utilpointer "k8s.io/utils/pointer"
-
-	"github.com/stretchr/testify/assert"
 )
 
 const region = "us-central"
@@ -101,7 +100,6 @@ func newController() (*Controller, *fakecloud.Cloud, *fake.Clientset) {
 	controller := &Controller{
 		cloud:            cloud,
-		knownHosts:       []*v1.Node{},
 		kubeClient:       kubeClient,
 		clusterName:      "test-cluster",
 		cache:            &serviceCache{serviceMap: make(map[string]*cachedService)},
@@ -566,7 +564,6 @@ func TestUpdateNodesInExternalLoadBalancer(t *testing.T) {
 			defer cancel()
 			controller, cloud, _ := newController()
 			controller.nodeLister = newFakeNodeLister(nil, nodes...)
-
 			if servicesToRetry := controller.updateLoadBalancerHosts(ctx, item.services, item.workers); len(servicesToRetry) != 0 {
 				t.Errorf("for case %q, unexpected servicesToRetry: %v", item.desc, servicesToRetry)
 			}
@@ -1516,28 +1513,6 @@ func TestServiceCache(t *testing.T) {
 	}
 }
 
-// Test a utility functions as it's not easy to unit test nodeSyncInternal directly
-func TestNodeSlicesEqualForLB(t *testing.T) {
-	numNodes := 10
-	nArray := make([]*v1.Node, numNodes)
-	mArray := make([]*v1.Node, numNodes)
-	for i := 0; i < numNodes; i++ {
-		nArray[i] = &v1.Node{}
-		nArray[i].Name = fmt.Sprintf("node%d", i)
-	}
-	for i := 0; i < numNodes; i++ {
-		mArray[i] = &v1.Node{}
-		mArray[i].Name = fmt.Sprintf("node%d", i+1)
-	}
-	if !nodeSlicesEqualForLB(nArray, nArray) {
-		t.Errorf("nodeSlicesEqualForLB() Expected=true Obtained=false")
-	}
-	if nodeSlicesEqualForLB(nArray, mArray) {
-		t.Errorf("nodeSlicesEqualForLB() Expected=false Obtained=true")
-	}
-}
-
 // TODO(@MrHohn): Verify the end state when below issue is resolved:
 // https://github.com/kubernetes/client-go/issues/607
 func TestAddFinalizer(t *testing.T) {
@@ -3227,32 +3202,6 @@ func TestTriggerNodeSync(t *testing.T) {
 	tryReadFromChannel(t, controller.nodeSyncCh, false)
 }
 
-func TestMarkAndUnmarkFullSync(t *testing.T) {
-	controller, _, _ := newController()
-	if controller.needFullSync != false {
-		t.Errorf("expect controller.needFullSync to be false, but got true")
-	}
-	ret := controller.needFullSyncAndUnmark()
-	if ret != false {
-		t.Errorf("expect ret == false, but got true")
-	}
-	ret = controller.needFullSyncAndUnmark()
-	if ret != false {
-		t.Errorf("expect ret == false, but got true")
-	}
-	controller.needFullSync = true
-	ret = controller.needFullSyncAndUnmark()
-	if ret != true {
-		t.Errorf("expect ret == true, but got false")
-	}
-	ret = controller.needFullSyncAndUnmark()
-	if ret != false {
-		t.Errorf("expect ret == false, but got true")
-	}
-}
-
 func tryReadFromChannel(t *testing.T, ch chan interface{}, expectValue bool) {
 	select {
 	case _, ok := <-ch:
```