From ea299acf97d3bfae80a807d3dea77365928b3c9b Mon Sep 17 00:00:00 2001
From: Adrian Moisey
Date: Wed, 23 Oct 2024 21:51:29 +0200
Subject: [PATCH] Ensure that a node's CIDR isn't released until the node is
 deleted

Fixes https://github.com/kubernetes/kubernetes/issues/127792

Fixes a bug where a node's PodCIDR was released as soon as the node was
given a deletion timestamp, even though the node was still present
because a finalizer had not yet been removed.
---
 .../nodeipam/ipam/range_allocator.go      |  26 +++--
 .../nodeipam/ipam/range_allocator_test.go | 106 ++++++++++++++++++
 2 files changed, 122 insertions(+), 10 deletions(-)

diff --git a/pkg/controller/nodeipam/ipam/range_allocator.go b/pkg/controller/nodeipam/ipam/range_allocator.go
index a79982a6db2..036030db516 100644
--- a/pkg/controller/nodeipam/ipam/range_allocator.go
+++ b/pkg/controller/nodeipam/ipam/range_allocator.go
@@ -145,16 +145,22 @@ func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, node
 		DeleteFunc: func(obj interface{}) {
 			// The informer cache no longer has the object, and since Node doesn't have a finalizer,
 			// we don't see the Update with DeletionTimestamp != 0.
-			// TODO: instead of executing the operation directly in the handler, build a small cache with key node.Name
-			// and value PodCIDRs use ReleaseCIDR on the reconcile loop so we can retry on `ReleaseCIDR` failures.
-			if err := ra.ReleaseCIDR(logger, obj.(*v1.Node)); err != nil {
-				utilruntime.HandleError(fmt.Errorf("error while processing CIDR Release: %w", err))
+			node, ok := obj.(*v1.Node)
+			if !ok {
+				tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
+				if !ok {
+					utilruntime.HandleError(fmt.Errorf("unexpected object type: %v", obj))
+					return
+				}
+				node, ok = tombstone.Obj.(*v1.Node)
+				if !ok {
+					utilruntime.HandleError(fmt.Errorf("unexpected object types: %v", obj))
+					return
+				}
 			}
-			// IndexerInformer uses a delta nodeQueue, therefore for deletes we have to use this
-			// key function.
-			key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
-			if err == nil {
-				ra.queue.Add(key)
+
+			if err := ra.ReleaseCIDR(logger, node); err != nil {
+				utilruntime.HandleError(fmt.Errorf("error while processing CIDR Release: %w", err))
 			}
 		},
 	})
@@ -270,7 +276,7 @@ func (r *rangeAllocator) syncNode(ctx context.Context, key string) error {
 	// Check the DeletionTimestamp to determine if object is under deletion.
 	if !node.DeletionTimestamp.IsZero() {
 		logger.V(3).Info("node is being deleted", "node", key)
-		return r.ReleaseCIDR(logger, node)
+		return nil
 	}
 	return r.AllocateOrOccupyCIDR(ctx, node)
 }
diff --git a/pkg/controller/nodeipam/ipam/range_allocator_test.go b/pkg/controller/nodeipam/ipam/range_allocator_test.go
index 109da26ea2c..bc8ebcdaf2a 100644
--- a/pkg/controller/nodeipam/ipam/range_allocator_test.go
+++ b/pkg/controller/nodeipam/ipam/range_allocator_test.go
@@ -841,3 +841,109 @@ func TestReleaseCIDRSuccess(t *testing.T) {
 		testFunc(tc)
 	}
 }
+
+func TestNodeDeletionReleaseCIDR(t *testing.T) {
+	_, clusterCIDRv4, err := netutils.ParseCIDRSloppy("10.10.0.0/16")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	_, allocatedCIDR, err := netutils.ParseCIDRSloppy("10.10.0.0/24")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	testCases := []struct {
+		description       string
+		nodeKey           string
+		existingNodes     []*v1.Node
+		shouldReleaseCIDR bool
+	}{
+		{
+			description: "Regular node not under deletion",
+			nodeKey:     "node0",
+			existingNodes: []*v1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "node0",
+					},
+					Spec: v1.NodeSpec{
+						PodCIDR:  allocatedCIDR.String(),
+						PodCIDRs: []string{allocatedCIDR.String()}},
+				},
+			},
+			shouldReleaseCIDR: false,
+		},
+		{
+			description: "Node under deletion",
+			nodeKey:     "node0",
+			existingNodes: []*v1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:              "node0",
+						DeletionTimestamp: &metav1.Time{Time: time.Now()},
+					},
+					Spec: v1.NodeSpec{
+						PodCIDR:  allocatedCIDR.String(),
+						PodCIDRs: []string{allocatedCIDR.String()}},
+				},
+			},
+			shouldReleaseCIDR: false,
+		},
+		{
+			description:       "Node deleted",
+			nodeKey:           "node0",
+			existingNodes:     []*v1.Node{},
+			shouldReleaseCIDR: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			allocatorParams := CIDRAllocatorParams{
+				ClusterCIDRs: []*net.IPNet{clusterCIDRv4}, ServiceCIDR: nil,
+				SecondaryServiceCIDR: nil,
+				NodeCIDRMaskSizes:    []int{24},
+			}
+
+			fakeNodeHandler := &testutil.FakeNodeHandler{
+				Existing:  tc.existingNodes,
+				Clientset: fake.NewSimpleClientset(),
+			}
+			_, tCtx := ktesting.NewTestContext(t)
+
+			fakeNodeInformer := test.FakeNodeInformer(fakeNodeHandler)
+			nodeList, err := fakeNodeHandler.List(tCtx, metav1.ListOptions{})
+			if err != nil {
+				t.Fatalf("Failed to get list of nodes %v", err)
+			}
+			allocator, err := NewCIDRRangeAllocator(tCtx, fakeNodeHandler, fakeNodeInformer, allocatorParams, nodeList)
+			if err != nil {
+				t.Fatalf("failed to create CIDRRangeAllocator: %v", err)
+			}
+			rangeAllocator, ok := allocator.(*rangeAllocator)
+			if !ok {
+				t.Fatalf("found non-default implementation of CIDRAllocator")
+			}
+			rangeAllocator.nodesSynced = test.AlwaysReady
+			rangeAllocator.recorder = testutil.NewFakeRecorder()
+
+			rangeAllocator.syncNode(tCtx, tc.nodeKey)
+			if len(rangeAllocator.cidrSets) != 1 {
+				t.Fatalf("Expected a single cidrSet, but got %d", len(rangeAllocator.cidrSets))
+			}
+
+			// if the allocated CIDR was released, we expect the next allocated CIDR to be the same
+			nextCIDR, err := rangeAllocator.cidrSets[0].AllocateNext()
+			if err != nil {
+				t.Fatalf("unexpected error trying to allocate next CIDR: %v", err)
+			}
+			expectedCIDR := "10.10.1.0/24" // existing allocated is 10.10.0.0/24
+			if tc.shouldReleaseCIDR {
+				expectedCIDR = allocatedCIDR.String() // if cidr was released we expect to reuse it, i.e. 10.10.0.0/24
+			}
+			if nextCIDR.String() != expectedCIDR {
+				t.Fatalf("Expected CIDR %s to be allocated next, but got: %v", expectedCIDR, nextCIDR.String())
+			}
+		})
+	}
+}
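
Note (illustration only, not part of the patch above): the new DeleteFunc relies on client-go's
tombstone convention. When the informer misses the actual delete event, the handler receives a
cache.DeletedFinalStateUnknown wrapping the last known copy of the object rather than the *v1.Node
itself. The minimal, self-contained sketch below uses a hypothetical helper, nodeFromDeleteEvent,
to show how either form is unwrapped; it mirrors the type assertions added in range_allocator.go
but is not code from the patch.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/tools/cache"
)

// nodeFromDeleteEvent (hypothetical helper) returns the deleted *v1.Node whether the delete
// handler received the object directly or via a DeletedFinalStateUnknown tombstone.
func nodeFromDeleteEvent(obj interface{}) (*v1.Node, error) {
	if node, ok := obj.(*v1.Node); ok {
		return node, nil
	}
	tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
	if !ok {
		return nil, fmt.Errorf("unexpected object type: %T", obj)
	}
	node, ok := tombstone.Obj.(*v1.Node)
	if !ok {
		return nil, fmt.Errorf("tombstone contained unexpected object type: %T", tombstone.Obj)
	}
	return node, nil
}

func main() {
	// Normal case: the delete handler receives the node itself.
	if node, err := nodeFromDeleteEvent(&v1.Node{}); err == nil {
		fmt.Printf("direct delete delivered a %T\n", node)
	}

	// Missed-delete case: the handler receives a tombstone wrapping the last known state.
	tomb := cache.DeletedFinalStateUnknown{Key: "node0", Obj: &v1.Node{}}
	if node, err := nodeFromDeleteEvent(tomb); err == nil {
		fmt.Printf("tombstone delete still yields a %T\n", node)
	}
}

Releasing the CIDR only from the delete handler, while syncNode returns nil for a node that merely
carries a DeletionTimestamp, works in both paths because the node object (or its last known state
inside the tombstone) still carries spec.podCIDR/spec.podCIDRs at that point.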