Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-25 12:43:23 +00:00
Fix #61363, Bounded retries for cloud allocator.

commit adc71ff034
parent d0f8f41890
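At its core, the commit replaces the unconditional requeue of a failed node with a bounded, delayed requeue: a failed work item is put back on the update channel via time.AfterFunc after updateRetryTimeout, at most updateMaxRetries times, and is then dropped from the set of nodes in processing. The standalone Go sketch below illustrates that pattern in isolation; the names (processor, retryDelay, retryLimit, "node-a") are illustrative and are not taken from the Kubernetes sources.

// bounded_retry_sketch.go - a minimal illustration of the requeue-with-bounded-retries
// pattern introduced by this commit; all names here are illustrative, not Kubernetes APIs.
package main

import (
    "fmt"
    "sync"
    "time"
)

const (
    retryDelay = 100 * time.Millisecond // plays the role of updateRetryTimeout
    retryLimit = 10                     // plays the role of updateMaxRetries
)

type processor struct {
    mu       sync.Mutex
    inFlight map[string]int // item name -> retries consumed so far
    work     chan string
}

// insert marks an item as in flight; false means it is already being processed.
func (p *processor) insert(item string) bool {
    p.mu.Lock()
    defer p.mu.Unlock()
    if _, found := p.inFlight[item]; found {
        return false
    }
    p.inFlight[item] = 0
    return true
}

// canRetry consumes one retry and reports whether the item is still under the limit.
func (p *processor) canRetry(item string) bool {
    p.mu.Lock()
    defer p.mu.Unlock()
    if p.inFlight[item]+1 > retryLimit {
        return false
    }
    p.inFlight[item]++
    return true
}

// remove drops an item from the in-flight set so later events can enqueue it again.
func (p *processor) remove(item string) {
    p.mu.Lock()
    defer p.mu.Unlock()
    delete(p.inFlight, item)
}

// worker drains the work channel; a failed item is requeued after retryDelay,
// at most retryLimit times, and is then dropped from the in-flight set.
func (p *processor) worker(stop <-chan struct{}, update func(string) error) {
    for {
        select {
        case item := <-p.work:
            if err := update(item); err != nil {
                if p.canRetry(item) {
                    time.AfterFunc(retryDelay, func() { p.work <- item })
                    continue
                }
            }
            p.remove(item)
        case <-stop:
            return
        }
    }
}

func main() {
    p := &processor{inFlight: map[string]int{}, work: make(chan string, 1)}
    stop := make(chan struct{})
    defer close(stop)

    // An update function that always fails, to exercise the bounded-retry path.
    go p.worker(stop, func(item string) error { return fmt.Errorf("update of %s failed", item) })

    if p.insert("node-a") {
        p.work <- "node-a"
    }

    // Once the retries are exhausted, the worker removes the item and this loop ends.
    for {
        p.mu.Lock()
        _, busy := p.inFlight["node-a"]
        p.mu.Unlock()
        if !busy {
            break
        }
        time.Sleep(10 * time.Millisecond)
    }
    fmt.Println("node-a was dropped after bounded retries")
}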
@@ -9,6 +9,7 @@ load(
 go_test(
     name = "go_default_test",
     srcs = [
+        "cloud_cidr_allocator_test.go",
         "controller_test.go",
         "range_allocator_test.go",
         "timeout_test.go",
@@ -69,6 +69,12 @@ const (
     // cidrUpdateRetries is the no. of times a NodeSpec update will be retried before dropping it.
     cidrUpdateRetries = 3
+
+    // updateRetryTimeout is the time to wait before requeuing a failed node for retry
+    updateRetryTimeout = 100 * time.Millisecond
+
+    // updateMaxRetries is the max retries for a failed node
+    updateMaxRetries = 10
 )
 
 // CIDRAllocator is an interface implemented by things that know how
@@ -20,13 +20,14 @@ import (
     "fmt"
     "net"
     "sync"
+    "time"
 
     "github.com/golang/glog"
 
+    "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/types"
     utilruntime "k8s.io/apimachinery/pkg/util/runtime"
-    "k8s.io/apimachinery/pkg/util/sets"
     informers "k8s.io/client-go/informers/core/v1"
     corelisters "k8s.io/client-go/listers/core/v1"
     "k8s.io/client-go/tools/cache"
@@ -44,6 +45,11 @@ import (
     utilnode "k8s.io/kubernetes/pkg/util/node"
 )
 
+// nodeProcessingInfo tracks information related to current nodes in processing
+type nodeProcessingInfo struct {
+    retries int
+}
+
 // cloudCIDRAllocator allocates node CIDRs according to IP address aliases
 // assigned by the cloud provider. In this case, the allocation and
 // deallocation is delegated to the external provider, and the controller
@@ -67,7 +73,7 @@ type cloudCIDRAllocator struct {
 
     // Keep a set of nodes that are currently being processed to avoid races in CIDR allocation
     lock              sync.Mutex
-    nodesInProcessing sets.String
+    nodesInProcessing map[string]*nodeProcessingInfo
 }
 
 var _ CIDRAllocator = (*cloudCIDRAllocator)(nil)
@@ -97,7 +103,7 @@ func NewCloudCIDRAllocator(client clientset.Interface, cloud cloudprovider.Inter
         nodesSynced:       nodeInformer.Informer().HasSynced,
         nodeUpdateChannel: make(chan string, cidrUpdateQueueSize),
         recorder:          recorder,
-        nodesInProcessing: sets.NewString(),
+        nodesInProcessing: map[string]*nodeProcessingInfo{},
     }
 
     nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
@@ -147,9 +153,15 @@ func (ca *cloudCIDRAllocator) worker(stopChan <-chan struct{}) {
                 return
             }
             if err := ca.updateCIDRAllocation(workItem); err != nil {
-                // Requeue the failed node for update again.
-                ca.nodeUpdateChannel <- workItem
+                if ca.canRetry(workItem) {
+                    time.AfterFunc(updateRetryTimeout, func() {
+                        // Requeue the failed node for update again.
+                        ca.nodeUpdateChannel <- workItem
+                    })
+                    continue
+                }
             }
+            ca.removeNodeFromProcessing(workItem)
         case <-stopChan:
             return
         }
@@ -159,17 +171,28 @@ func (ca *cloudCIDRAllocator) worker(stopChan <-chan struct{}) {
 func (ca *cloudCIDRAllocator) insertNodeToProcessing(nodeName string) bool {
     ca.lock.Lock()
     defer ca.lock.Unlock()
-    if ca.nodesInProcessing.Has(nodeName) {
+    if _, found := ca.nodesInProcessing[nodeName]; found {
         return false
     }
-    ca.nodesInProcessing.Insert(nodeName)
+    ca.nodesInProcessing[nodeName] = &nodeProcessingInfo{}
+    return true
+}
+
+func (ca *cloudCIDRAllocator) canRetry(nodeName string) bool {
+    ca.lock.Lock()
+    defer ca.lock.Unlock()
+    count := ca.nodesInProcessing[nodeName].retries + 1
+    if count > updateMaxRetries {
+        return false
+    }
+    ca.nodesInProcessing[nodeName].retries = count
     return true
 }
 
 func (ca *cloudCIDRAllocator) removeNodeFromProcessing(nodeName string) {
     ca.lock.Lock()
     defer ca.lock.Unlock()
-    ca.nodesInProcessing.Delete(nodeName)
+    delete(ca.nodesInProcessing, nodeName)
 }
 
 // WARNING: If you're adding any return calls or defer any more work from this
@@ -191,10 +214,11 @@ func (ca *cloudCIDRAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
 
 // updateCIDRAllocation assigns CIDR to Node and sends an update to the API server.
 func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error {
-    defer ca.removeNodeFromProcessing(nodeName)
-
     node, err := ca.nodeLister.Get(nodeName)
     if err != nil {
+        if errors.IsNotFound(err) {
+            return nil // node no longer available, skip processing
+        }
         glog.Errorf("Failed while getting node %v for updating Node.Spec.PodCIDR: %v", nodeName, err)
         return err
     }
pkg/controller/nodeipam/ipam/cloud_cidr_allocator_test.go (new file, 59 lines)
@@ -0,0 +1,59 @@
+/*
+Copyright 2018 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package ipam
+
+import (
+    "testing"
+    "time"
+
+    "k8s.io/api/core/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/client-go/informers"
+    "k8s.io/client-go/kubernetes/fake"
+)
+
+func hasNodeInProcessing(ca *cloudCIDRAllocator, name string) bool {
+    ca.lock.Lock()
+    defer ca.lock.Unlock()
+
+    _, found := ca.nodesInProcessing[name]
+    return found
+}
+
+func TestBoundedRetries(t *testing.T) {
+    clientSet := fake.NewSimpleClientset()
+    updateChan := make(chan string, 1) // need to buffer as we are using only one goroutine
+    stopChan := make(chan struct{})
+    sharedInfomer := informers.NewSharedInformerFactory(clientSet, 1*time.Hour)
+    ca := &cloudCIDRAllocator{
+        client:            clientSet,
+        nodeUpdateChannel: updateChan,
+        nodeLister:        sharedInfomer.Core().V1().Nodes().Lister(),
+        nodesSynced:       sharedInfomer.Core().V1().Nodes().Informer().HasSynced,
+        nodesInProcessing: map[string]*nodeProcessingInfo{},
+    }
+    go ca.worker(stopChan)
+    nodeName := "testNode"
+    ca.AllocateOrOccupyCIDR(&v1.Node{
+        ObjectMeta: metav1.ObjectMeta{
+            Name: nodeName,
+        },
+    })
+    for hasNodeInProcessing(ca, nodeName) {
+        // wait for node to finish processing (should terminate and not time out)
+    }
+}