Merge pull request #123238 from MirrorShih/master

Fix infinite loop and replace channel with queue
This commit is contained in:
Kubernetes Prow Robot 2024-04-18 14:30:07 -07:00 committed by GitHub
commit fd40d68a39
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 135 additions and 106 deletions

View File

@ -106,13 +106,6 @@ type CIDRAllocatorParams struct {
NodeCIDRMaskSizes []int NodeCIDRMaskSizes []int
} }
// CIDRs are reserved, then node resource is patched with them.
// nodeReservedCIDRs holds the reservation info for a node.
type nodeReservedCIDRs struct {
allocatedCIDRs []*net.IPNet
nodeName string
}
// New creates a new CIDR range allocator. // New creates a new CIDR range allocator.
func New(ctx context.Context, kubeClient clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer, allocatorType CIDRAllocatorType, allocatorParams CIDRAllocatorParams) (CIDRAllocator, error) { func New(ctx context.Context, kubeClient clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer, allocatorType CIDRAllocatorType, allocatorParams CIDRAllocatorParams) (CIDRAllocator, error) {
nodeList, err := listNodes(ctx, kubeClient) nodeList, err := listNodes(ctx, kubeClient)

View File

@ -20,16 +20,16 @@ import (
"context" "context"
"fmt" "fmt"
"net" "net"
"sync" "time"
v1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2" "k8s.io/klog/v2"
netutils "k8s.io/utils/net" netutils "k8s.io/utils/net"
apierrors "k8s.io/apimachinery/pkg/api/errors" apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
informers "k8s.io/client-go/informers/core/v1" informers "k8s.io/client-go/informers/core/v1"
clientset "k8s.io/client-go/kubernetes" clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/kubernetes/scheme"
@ -37,6 +37,7 @@ import (
corelisters "k8s.io/client-go/listers/core/v1" corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record" "k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
nodeutil "k8s.io/component-helpers/node/util" nodeutil "k8s.io/component-helpers/node/util"
"k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset" "k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset"
controllerutil "k8s.io/kubernetes/pkg/controller/util/node" controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
@ -52,14 +53,12 @@ type rangeAllocator struct {
nodeLister corelisters.NodeLister nodeLister corelisters.NodeLister
// nodesSynced returns true if the node shared informer has been synced at least once. // nodesSynced returns true if the node shared informer has been synced at least once.
nodesSynced cache.InformerSynced nodesSynced cache.InformerSynced
// Channel that is used to pass updating Nodes and their reserved CIDRs to the background broadcaster record.EventBroadcaster
// This increases a throughput of CIDR assignment by not blocking on long operations. recorder record.EventRecorder
nodeCIDRUpdateChannel chan nodeReservedCIDRs
broadcaster record.EventBroadcaster // queues are where incoming work is placed to de-dup and to allow "easy"
recorder record.EventRecorder // rate limited requeues on errors
// Keep a set of nodes that are currently being processed to avoid races in CIDR allocation queue workqueue.RateLimitingInterface
lock sync.Mutex
nodesInProcessing sets.String
} }
// NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDRs for node (one from each of clusterCIDRs) // NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDRs for node (one from each of clusterCIDRs)
@ -89,15 +88,14 @@ func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, node
} }
ra := &rangeAllocator{ ra := &rangeAllocator{
client: client, client: client,
clusterCIDRs: allocatorParams.ClusterCIDRs, clusterCIDRs: allocatorParams.ClusterCIDRs,
cidrSets: cidrSets, cidrSets: cidrSets,
nodeLister: nodeInformer.Lister(), nodeLister: nodeInformer.Lister(),
nodesSynced: nodeInformer.Informer().HasSynced, nodesSynced: nodeInformer.Informer().HasSynced,
nodeCIDRUpdateChannel: make(chan nodeReservedCIDRs, cidrUpdateQueueSize), broadcaster: eventBroadcaster,
broadcaster: eventBroadcaster, recorder: recorder,
recorder: recorder, queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cidrallocator_node"),
nodesInProcessing: sets.NewString(),
} }
if allocatorParams.ServiceCIDR != nil { if allocatorParams.ServiceCIDR != nil {
@ -130,37 +128,33 @@ func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, node
} }
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { AddFunc: func(obj interface{}) {
return ra.AllocateOrOccupyCIDR(logger, node) key, err := cache.MetaNamespaceKeyFunc(obj)
}), if err == nil {
UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error { ra.queue.Add(key)
// If the PodCIDRs list is not empty we either:
// - already processed a Node that already had CIDRs after NC restarted
// (cidr is marked as used),
// - already processed a Node successfully and allocated CIDRs for it
// (cidr is marked as used),
// - already processed a Node but we did saw a "timeout" response and
// request eventually got through in this case we haven't released
// the allocated CIDRs (cidr is still marked as used).
// There's a possible error here:
// - NC sees a new Node and assigns CIDRs X,Y.. to it,
// - Update Node call fails with a timeout,
// - Node is updated by some other component, NC sees an update and
// assigns CIDRs A,B.. to the Node,
// - Both CIDR X,Y.. and CIDR A,B.. are marked as used in the local cache,
// even though Node sees only CIDR A,B..
// The problem here is that in in-memory cache we see CIDR X,Y.. as marked,
// which prevents it from being assigned to any new node. The cluster
// state is correct.
// Restart of NC fixes the issue.
if len(newNode.Spec.PodCIDRs) == 0 {
return ra.AllocateOrOccupyCIDR(logger, newNode)
} }
return nil },
}), UpdateFunc: func(old, new interface{}) {
DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error { key, err := cache.MetaNamespaceKeyFunc(new)
return ra.ReleaseCIDR(logger, node) if err == nil {
}), ra.queue.Add(key)
}
},
DeleteFunc: func(obj interface{}) {
// The informer cache no longer has the object, and since Node doesn't have a finalizer,
// we don't see the Update with DeletionTimestamp != 0.
// TODO: instead of executing the operation directly in the handler, build a small cache with key node.Name
// and value PodCIDRs, and call ReleaseCIDR from the reconcile loop so we can retry on `ReleaseCIDR` failures.
if err := ra.ReleaseCIDR(logger, obj.(*v1.Node)); err != nil {
utilruntime.HandleError(fmt.Errorf("error while processing CIDR Release: %w", err))
}
// IndexerInformer uses a delta nodeQueue, therefore for deletes we have to use this
// key function.
key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
if err == nil {
ra.queue.Add(key)
}
},
}) })
return ra, nil return ra, nil
@ -176,6 +170,8 @@ func (r *rangeAllocator) Run(ctx context.Context) {
r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")}) r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")})
defer r.broadcaster.Shutdown() defer r.broadcaster.Shutdown()
defer r.queue.ShutDown()
logger.Info("Starting range CIDR allocator") logger.Info("Starting range CIDR allocator")
defer logger.Info("Shutting down range CIDR allocator") defer logger.Info("Shutting down range CIDR allocator")
@ -184,50 +180,100 @@ func (r *rangeAllocator) Run(ctx context.Context) {
} }
for i := 0; i < cidrUpdateWorkers; i++ { for i := 0; i < cidrUpdateWorkers; i++ {
go r.worker(ctx) go wait.UntilWithContext(ctx, r.runWorker, time.Second)
} }
<-ctx.Done() <-ctx.Done()
} }
func (r *rangeAllocator) worker(ctx context.Context) { // runWorker is a long-running function that will continually call the
logger := klog.FromContext(ctx) // processNextWorkItem function in order to read and process a message on the
for { // queue.
select { func (r *rangeAllocator) runWorker(ctx context.Context) {
case workItem, ok := <-r.nodeCIDRUpdateChannel: for r.processNextNodeWorkItem(ctx) {
if !ok {
logger.Info("Channel nodeCIDRUpdateChannel was unexpectedly closed")
return
}
if err := r.updateCIDRsAllocation(logger, workItem); err != nil {
// Requeue the failed node for update again.
r.nodeCIDRUpdateChannel <- workItem
}
case <-ctx.Done():
return
}
} }
} }
func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool { // processNextWorkItem will read a single work item off the queue and
r.lock.Lock() // attempt to process it, by calling the syncHandler.
defer r.lock.Unlock() func (r *rangeAllocator) processNextNodeWorkItem(ctx context.Context) bool {
if r.nodesInProcessing.Has(nodeName) { obj, shutdown := r.queue.Get()
if shutdown {
return false return false
} }
r.nodesInProcessing.Insert(nodeName)
// We wrap this block in a func so we can defer r.queue.Done.
err := func(logger klog.Logger, obj interface{}) error {
// We call Done here so the workNodeQueue knows we have finished
// processing this item. We also must remember to call Forget if we
// do not want this work item being re-queued. For example, we do
// not call Forget if a transient error occurs, instead the item is
// put back on the queue and attempted again after a back-off
// period.
defer r.queue.Done(obj)
var key string
var ok bool
// We expect strings to come off the workNodeQueue. These are of the
// form namespace/name. We do this as the delayed nature of the
// workNodeQueue means the items in the informer cache may actually be
// more up to date than when the item was initially put onto the
// workNodeQueue.
if key, ok = obj.(string); !ok {
// As the item in the workNodeQueue is actually invalid, we call
// Forget here else we'd go into a loop of attempting to
// process a work item that is invalid.
r.queue.Forget(obj)
utilruntime.HandleError(fmt.Errorf("expected string in workNodeQueue but got %#v", obj))
return nil
}
// Run the sync handler (syncNode), passing it the key string of the
// Node resource to be synced.
if err := r.syncNode(logger, key); err != nil {
// Put the item back on the queue to handle any transient errors.
r.queue.AddRateLimited(key)
return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error())
}
// Finally, if no error occurs we Forget this item so it does not
// get queued again until another change happens.
r.queue.Forget(obj)
logger.Info("Successfully synced", "key", key)
return nil
}(klog.FromContext(ctx), obj)
if err != nil {
utilruntime.HandleError(err)
return true
}
return true return true
} }
func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) { func (r *rangeAllocator) syncNode(logger klog.Logger, key string) error {
r.lock.Lock() startTime := time.Now()
defer r.lock.Unlock() defer func() {
r.nodesInProcessing.Delete(nodeName) logger.V(4).Info("Finished syncing Node request", "node", key, "elapsed", time.Since(startTime))
}()
node, err := r.nodeLister.Get(key)
if apierrors.IsNotFound(err) {
logger.V(3).Info("node has been deleted", "node", key)
// TODO: obtain the node object information to call ReleaseCIDR from here
// and retry if there is an error.
return nil
}
if err != nil {
return err
}
// Check the DeletionTimestamp to determine if object is under deletion.
if !node.DeletionTimestamp.IsZero() {
logger.V(3).Info("node is being deleted", "node", key)
return r.ReleaseCIDR(logger, node)
}
return r.AllocateOrOccupyCIDR(logger, node)
} }
// marks node.PodCIDRs[...] as used in allocator's tracked cidrSet // marks node.PodCIDRs[...] as used in allocator's tracked cidrSet
func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error { func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error {
defer r.removeNodeFromProcessing(node.Name)
if len(node.Spec.PodCIDRs) == 0 { if len(node.Spec.PodCIDRs) == 0 {
return nil return nil
} }
@ -257,34 +303,25 @@ func (r *rangeAllocator) AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node)
if node == nil { if node == nil {
return nil return nil
} }
if !r.insertNodeToProcessing(node.Name) {
logger.V(2).Info("Node is already in a process of CIDR assignment", "node", klog.KObj(node))
return nil
}
if len(node.Spec.PodCIDRs) > 0 { if len(node.Spec.PodCIDRs) > 0 {
return r.occupyCIDRs(node) return r.occupyCIDRs(node)
} }
// allocate and queue the assignment
allocated := nodeReservedCIDRs{ allocatedCIDRs := make([]*net.IPNet, len(r.cidrSets))
nodeName: node.Name,
allocatedCIDRs: make([]*net.IPNet, len(r.cidrSets)),
}
for idx := range r.cidrSets { for idx := range r.cidrSets {
podCIDR, err := r.cidrSets[idx].AllocateNext() podCIDR, err := r.cidrSets[idx].AllocateNext()
if err != nil { if err != nil {
r.removeNodeFromProcessing(node.Name)
controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRNotAvailable") controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRNotAvailable")
return fmt.Errorf("failed to allocate cidr from cluster cidr at idx:%v: %v", idx, err) return fmt.Errorf("failed to allocate cidr from cluster cidr at idx:%v: %v", idx, err)
} }
allocated.allocatedCIDRs[idx] = podCIDR allocatedCIDRs[idx] = podCIDR
} }
//queue the assignment //queue the assignment
logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocated.allocatedCIDRs) logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocatedCIDRs)
r.nodeCIDRUpdateChannel <- allocated return r.updateCIDRsAllocation(logger, node.Name, allocatedCIDRs)
return nil
} }
// ReleaseCIDR marks node.podCIDRs[...] as unused in our tracked cidrSets // ReleaseCIDR marks node.podCIDRs[...] as unused in our tracked cidrSets
@ -336,21 +373,20 @@ func (r *rangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR *
} }
// updateCIDRsAllocation assigns CIDR to Node and sends an update to the API server. // updateCIDRsAllocation assigns CIDR to Node and sends an update to the API server.
func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeReservedCIDRs) error { func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, nodeName string, allocatedCIDRs []*net.IPNet) error {
var err error var err error
var node *v1.Node var node *v1.Node
defer r.removeNodeFromProcessing(data.nodeName) cidrsString := ipnetToStringList(allocatedCIDRs)
cidrsString := ipnetToStringList(data.allocatedCIDRs) node, err = r.nodeLister.Get(nodeName)
node, err = r.nodeLister.Get(data.nodeName)
if err != nil { if err != nil {
logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", data.nodeName)) logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", nodeName))
return err return err
} }
// if cidr list matches the proposed. // if cidr list matches the proposed.
// then we possibly updated this node // then we possibly updated this node
// and just failed to ack the success. // and just failed to ack the success.
if len(node.Spec.PodCIDRs) == len(data.allocatedCIDRs) { if len(node.Spec.PodCIDRs) == len(allocatedCIDRs) {
match := true match := true
for idx, cidr := range cidrsString { for idx, cidr := range cidrsString {
if node.Spec.PodCIDRs[idx] != cidr { if node.Spec.PodCIDRs[idx] != cidr {
@ -359,7 +395,7 @@ func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeRese
} }
} }
if match { if match {
logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", data.allocatedCIDRs) logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", allocatedCIDRs)
return nil return nil
} }
} }
@ -367,7 +403,7 @@ func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeRese
// node has cidrs, release the reserved // node has cidrs, release the reserved
if len(node.Spec.PodCIDRs) != 0 { if len(node.Spec.PodCIDRs) != 0 {
logger.Error(nil, "Node already has a CIDR allocated. Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs) logger.Error(nil, "Node already has a CIDR allocated. Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs)
for idx, cidr := range data.allocatedCIDRs { for idx, cidr := range allocatedCIDRs {
if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil { if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr) logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr)
} }
@ -390,7 +426,7 @@ func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeRese
// NodeController restart will return all falsely allocated CIDRs to the pool. // NodeController restart will return all falsely allocated CIDRs to the pool.
if !apierrors.IsServerTimeout(err) { if !apierrors.IsServerTimeout(err) {
logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node)) logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node))
for idx, cidr := range data.allocatedCIDRs { for idx, cidr := range allocatedCIDRs {
if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil { if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node)) logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node))
} }