Service controller: use a workqueue for node updates

Services which fail to sync as a result of a triggered node event are currently
never retried. The previous commit gave an example of when such services used to
be retried, but that approach was neither efficient nor fully correct. Moving to
a workqueue for node events is a more modern approach to syncing nodes, and
placing any service keys that fail during a node sync back on the service
workqueue fixes the re-sync problem.

Also, now that node events go through a workqueue serviced by a single
go-routine, the `nodeSyncLock` is no longer needed, so remove it from the
controller.
Alexander Constantinescu 2022-07-21 11:14:26 +02:00
parent 72c1c6559e
commit c44ab14e20
2 changed files with 64 additions and 95 deletions
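
The two-queue pattern this commit introduces can be sketched in isolation. The following is a simplified, hypothetical illustration using client-go's workqueue package, not the controller's actual code; names such as `controller`, `syncNodes`, `nodeWorker`, and the retry delays here are stand-ins:

// A minimal, self-contained sketch (assumed names, not the real controller) of
// the pattern: node events land on a rate-limited workqueue drained by a single
// worker, and service keys that fail during a node sync are re-added to the
// service queue so they get retried.
package main

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/util/workqueue"
)

const (
	minRetryDelay = 5 * time.Second
	maxRetryDelay = 300 * time.Second
)

type controller struct {
	serviceQueue workqueue.RateLimitingInterface
	nodeQueue    workqueue.RateLimitingInterface
}

// syncNodes stands in for the real node sync; it would return the set of
// service keys whose load balancers could not be updated.
func (c *controller) syncNodes(ctx context.Context) sets.String {
	return sets.NewString() // hypothetical: nothing failed in this sketch
}

// nodeWorker drains the node queue. Running exactly one of these goroutines
// guarantees node syncs never run concurrently, so no mutex is needed.
func (c *controller) nodeWorker(ctx context.Context) {
	for {
		key, quit := c.nodeQueue.Get()
		if quit {
			return
		}
		// Hand every service that failed to sync back to the service queue,
		// where the service workers will retry it.
		for serviceKey := range c.syncNodes(ctx) {
			c.serviceQueue.Add(serviceKey)
		}
		c.nodeQueue.Forget(key)
		c.nodeQueue.Done(key)
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	c := &controller{
		serviceQueue: workqueue.NewNamedRateLimitingQueue(
			workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "service"),
		nodeQueue: workqueue.NewNamedRateLimitingQueue(
			workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "node"),
	}
	defer c.serviceQueue.ShutDown()
	defer c.nodeQueue.ShutDown()

	// A single goroutine services node events, mirroring the commit's nodeWorker.
	go wait.UntilWithContext(ctx, c.nodeWorker, time.Second)

	c.nodeQueue.Add("node-event") // e.g. enqueued from an informer event handler
	<-ctx.Done()
}

The commit itself wires this up in `Run` via `wait.UntilWithContext` with a single `nodeWorker` goroutine, which is what makes `nodeSyncLock` unnecessary.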


@@ -86,16 +86,13 @@ type Controller struct {
 	eventRecorder    record.EventRecorder
 	nodeLister       corelisters.NodeLister
 	nodeListerSynced cache.InformerSynced
-	// services that need to be synced
-	queue workqueue.RateLimitingInterface
-	// nodeSyncLock ensures there is only one instance of triggerNodeSync getting executed at one time
-	// and protects internal states (needFullSync) of nodeSync
-	nodeSyncLock sync.Mutex
-	// nodeSyncCh triggers nodeSyncLoop to run
-	nodeSyncCh chan interface{}
-	// lastSyncedNodes is used when reconciling node state and keeps track of the last synced set of
-	// nodes. Access to this attribute by multiple go-routines is protected by nodeSyncLock
+	// services and nodes that need to be synced
+	serviceQueue workqueue.RateLimitingInterface
+	nodeQueue    workqueue.RateLimitingInterface
+	// lastSyncedNodes is used when reconciling node state and keeps track of
+	// the last synced set of nodes. This field is concurrently safe because the
+	// nodeQueue is serviced by only one go-routine, so node events are not
+	// processed concurrently.
 	lastSyncedNodes []*v1.Node
 }
@@ -128,10 +125,9 @@ func New(
 		eventRecorder:    recorder,
 		nodeLister:       nodeInformer.Lister(),
 		nodeListerSynced: nodeInformer.Informer().HasSynced,
-		queue:            workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "service"),
-		// nodeSyncCh has a size 1 buffer. Only one pending sync signal would be cached.
-		nodeSyncCh:      make(chan interface{}, 1),
-		lastSyncedNodes: []*v1.Node{},
+		serviceQueue:    workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "service"),
+		nodeQueue:       workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "node"),
+		lastSyncedNodes: []*v1.Node{},
 	}
 
 	serviceInformer.Informer().AddEventHandlerWithResyncPeriod(
@@ -162,7 +158,7 @@ func New(
 	nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
 		cache.ResourceEventHandlerFuncs{
 			AddFunc: func(cur interface{}) {
-				s.triggerNodeSync()
+				s.enqueueNode(cur)
 			},
 			UpdateFunc: func(old, cur interface{}) {
 				oldNode, ok := old.(*v1.Node)
@@ -179,13 +175,13 @@ func New(
 					return
 				}
-				s.triggerNodeSync()
+				s.enqueueNode(curNode)
 			},
 			DeleteFunc: func(old interface{}) {
-				s.triggerNodeSync()
+				s.enqueueNode(old)
 			},
 		},
-		time.Duration(0),
+		nodeSyncPeriod,
 	)
 
 	if err := s.init(); err != nil {
@@ -202,7 +198,17 @@ func (c *Controller) enqueueService(obj interface{}) {
 		runtime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", obj, err))
 		return
 	}
-	c.queue.Add(key)
+	c.serviceQueue.Add(key)
+}
+
+// obj could be an *v1.Service, or a DeletionFinalStateUnknown marker item.
+func (c *Controller) enqueueNode(obj interface{}) {
+	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
+	if err != nil {
+		runtime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", obj, err))
+		return
+	}
+	c.nodeQueue.Add(key)
 }
 
 // Run starts a background goroutine that watches for changes to services that
@@ -217,7 +223,8 @@ func (c *Controller) enqueueService(obj interface{}) {
 // object.
 func (c *Controller) Run(ctx context.Context, workers int, controllerManagerMetrics *controllersmetrics.ControllerManagerMetrics) {
 	defer runtime.HandleCrash()
-	defer c.queue.ShutDown()
+	defer c.serviceQueue.ShutDown()
+	defer c.nodeQueue.ShutDown()
 
 	// Start event processing pipeline.
 	c.eventBroadcaster.StartStructuredLogging(0)
@@ -234,65 +241,60 @@ func (c *Controller) Run(ctx context.Context, workers int, controllerManagerMetrics *controllersmetrics.ControllerManagerMetrics) {
 	}
 
 	for i := 0; i < workers; i++ {
-		go wait.UntilWithContext(ctx, c.worker, time.Second)
+		go wait.UntilWithContext(ctx, c.serviceWorker, time.Second)
 	}
 
-	go c.nodeSyncLoop(ctx, workers)
-	go wait.Until(c.triggerNodeSync, nodeSyncPeriod, ctx.Done())
+	// Initialize one go-routine servicing node events. This ensure we only
+	// process one node at any given moment in time
+	go wait.UntilWithContext(ctx, func(ctx context.Context) { c.nodeWorker(ctx, workers) }, time.Second)
 
 	<-ctx.Done()
 }
 
-// triggerNodeSync triggers a nodeSync asynchronously
-func (c *Controller) triggerNodeSync() {
-	c.nodeSyncLock.Lock()
-	defer c.nodeSyncLock.Unlock()
-	select {
-	case c.nodeSyncCh <- struct{}{}:
-		klog.V(4).Info("Triggering nodeSync")
-		return
-	default:
-		klog.V(4).Info("A pending nodeSync is already in queue")
-		return
+// worker runs a worker thread that just dequeues items, processes them, and marks them done.
+// It enforces that the syncHandler is never invoked concurrently with the same key.
+func (c *Controller) serviceWorker(ctx context.Context) {
+	for c.processNextServiceItem(ctx) {
 	}
 }
 
 // worker runs a worker thread that just dequeues items, processes them, and marks them done.
 // It enforces that the syncHandler is never invoked concurrently with the same key.
-func (c *Controller) worker(ctx context.Context) {
-	for c.processNextWorkItem(ctx) {
+func (c *Controller) nodeWorker(ctx context.Context, workers int) {
+	for c.processNextNodeItem(ctx, workers) {
 	}
 }
 
-// nodeSyncLoop takes nodeSync signal and triggers nodeSync
-func (c *Controller) nodeSyncLoop(ctx context.Context, workers int) {
-	klog.V(4).Info("nodeSyncLoop Started")
-	for {
-		select {
-		case <-c.nodeSyncCh:
-			klog.V(4).Info("nodeSync has been triggered")
-			c.nodeSyncInternal(ctx, workers)
-		case <-ctx.Done():
-			return
-		}
-	}
-}
-
-func (c *Controller) processNextWorkItem(ctx context.Context) bool {
-	key, quit := c.queue.Get()
+func (c *Controller) processNextNodeItem(ctx context.Context, workers int) bool {
+	key, quit := c.nodeQueue.Get()
 	if quit {
 		return false
 	}
-	defer c.queue.Done(key)
+	defer c.nodeQueue.Done(key)
+
+	for serviceToRetry := range c.syncNodes(ctx, workers) {
+		c.serviceQueue.Add(serviceToRetry)
+	}
+
+	c.nodeQueue.Forget(key)
+	return true
+}
+
+func (c *Controller) processNextServiceItem(ctx context.Context) bool {
+	key, quit := c.serviceQueue.Get()
+	if quit {
+		return false
+	}
+	defer c.serviceQueue.Done(key)
 
 	err := c.syncService(ctx, key.(string))
 	if err == nil {
-		c.queue.Forget(key)
+		c.serviceQueue.Forget(key)
 		return true
 	}
 
 	runtime.HandleError(fmt.Errorf("error processing service %v (will retry): %v", key, err))
-	c.queue.AddRateLimited(key)
+	c.serviceQueue.AddRateLimited(key)
 
 	return true
 }
@@ -675,13 +677,13 @@ func shouldSyncUpdatedNode(oldNode, newNode *v1.Node) bool {
 	return respectsPredicates(oldNode, allNodePredicates...) != respectsPredicates(newNode, allNodePredicates...)
 }
 
-// nodeSyncInternal handles updating the hosts pointed to by all load
+// syncNodes handles updating the hosts pointed to by all load
 // balancers whenever the set of nodes in the cluster changes.
-func (c *Controller) nodeSyncInternal(ctx context.Context, workers int) {
+func (c *Controller) syncNodes(ctx context.Context, workers int) sets.String {
 	startTime := time.Now()
 	defer func() {
 		latency := time.Since(startTime).Seconds()
-		klog.V(4).Infof("It took %v seconds to finish nodeSyncInternal", latency)
+		klog.V(4).Infof("It took %v seconds to finish syncNodes", latency)
 		nodeSyncLatency.Observe(latency)
 	}()
@@ -691,6 +693,7 @@ func (c *Controller) nodeSyncInternal(ctx context.Context, workers int) {
 	servicesToRetry := c.updateLoadBalancerHosts(ctx, servicesToUpdate, workers)
 	klog.V(2).Infof("Successfully updated %d out of %d load balancers to direct traffic to the updated set of nodes",
 		numServices-len(servicesToRetry), numServices)
+	return servicesToRetry
 }
 
 // nodeSyncService syncs the nodes for one load balancer type service. The return value


@@ -107,8 +107,8 @@ func newController() (*Controller, *fakecloud.Cloud, *fake.Clientset) {
 		eventRecorder:    recorder,
 		nodeLister:       newFakeNodeLister(nil),
 		nodeListerSynced: nodeInformer.Informer().HasSynced,
-		queue:            workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "service"),
-		nodeSyncCh:       make(chan interface{}, 1),
+		serviceQueue:    workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "service"),
+		nodeQueue:       workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "node"),
 		lastSyncedNodes: []*v1.Node{},
 	}
@@ -905,7 +905,7 @@ func TestProcessServiceCreateOrUpdate(t *testing.T) {
 		cachedServiceTest.state = svc
 		controller.cache.set(keyExpected, cachedServiceTest)
 
-		keyGot, quit := controller.queue.Get()
+		keyGot, quit := controller.serviceQueue.Get()
 		if quit {
 			t.Fatalf("get no queue element")
 		}
@@ -3184,40 +3184,6 @@ func Test_shouldSyncUpdatedNode_compoundedPredicates(t *testing.T) {
 	}
 }
 
-func TestTriggerNodeSync(t *testing.T) {
-	controller, _, _ := newController()
-
-	tryReadFromChannel(t, controller.nodeSyncCh, false)
-	controller.triggerNodeSync()
-	tryReadFromChannel(t, controller.nodeSyncCh, true)
-	tryReadFromChannel(t, controller.nodeSyncCh, false)
-	tryReadFromChannel(t, controller.nodeSyncCh, false)
-	tryReadFromChannel(t, controller.nodeSyncCh, false)
-
-	controller.triggerNodeSync()
-	controller.triggerNodeSync()
-	controller.triggerNodeSync()
-	tryReadFromChannel(t, controller.nodeSyncCh, true)
-	tryReadFromChannel(t, controller.nodeSyncCh, false)
-	tryReadFromChannel(t, controller.nodeSyncCh, false)
-	tryReadFromChannel(t, controller.nodeSyncCh, false)
-}
-
-func tryReadFromChannel(t *testing.T, ch chan interface{}, expectValue bool) {
-	select {
-	case _, ok := <-ch:
-		if !ok {
-			t.Errorf("The channel is closed")
-		}
-		if !expectValue {
-			t.Errorf("Does not expect value from the channel, but got a value")
-		}
-	default:
-		if expectValue {
-			t.Errorf("Expect value from the channel, but got none")
-		}
-	}
-}
-
 type fakeNodeLister struct {
 	cache []*v1.Node
 	err   error