From f61590c2211e576efd972c85c6f3106d66badddb Mon Sep 17 00:00:00 2001 From: Bowei Du Date: Mon, 27 Feb 2017 00:33:55 -0800 Subject: [PATCH] Adds support for PodCIDR allocation from the GCE cloud provider If CIDRAllocatorType is set to `CloudCIDRAllocator`, then allocation of CIDR allocation instead is done by the external cloud provider and the node controller is only responsible for reflecting the allocation into the node spec. - Splits off the rangeAllocator from the cidr_allocator.go file. - Adds cloudCIDRAllocator, which is used when the cloud provider allocates the CIDR ranges externally. (GCE support only) - Updates RBAC permission for node controller to include PATCH --- .../app/controllermanager.go | 1 + .../app/options/options.go | 5 +- pkg/apis/componentconfig/types.go | 4 +- .../providers/gce/gce_instances.go | 4 +- pkg/controller/node/BUILD | 103 +++---- pkg/controller/node/cidr_allocator.go | 259 ++--------------- pkg/controller/node/cloud_cidr_allocator.go | 143 ++++++++++ pkg/controller/node/nodecontroller.go | 249 ++++++++--------- pkg/controller/node/nodecontroller_test.go | 20 +- pkg/controller/node/range_allocator.go | 262 ++++++++++++++++++ .../rbac/bootstrappolicy/controller_policy.go | 2 +- .../testdata/controller-roles.yaml | 1 + 12 files changed, 614 insertions(+), 439 deletions(-) create mode 100644 pkg/controller/node/cloud_cidr_allocator.go create mode 100644 pkg/controller/node/range_allocator.go diff --git a/cmd/kube-controller-manager/app/controllermanager.go b/cmd/kube-controller-manager/app/controllermanager.go index 4390fa0574d..aeeaf29b822 100644 --- a/cmd/kube-controller-manager/app/controllermanager.go +++ b/cmd/kube-controller-manager/app/controllermanager.go @@ -477,6 +477,7 @@ func StartControllers(controllers map[string]InitFunc, s *options.CMServer, root serviceCIDR, int(s.NodeCIDRMaskSize), s.AllocateNodeCIDRs, + nodecontroller.CIDRAllocatorType(s.CIDRAllocatorType), s.EnableTaintManager, utilfeature.DefaultFeatureGate.Enabled(features.TaintBasedEvictions), ) diff --git a/cmd/kube-controller-manager/app/options/options.go b/cmd/kube-controller-manager/app/options/options.go index fe8affcbda0..27e2ffd492d 100644 --- a/cmd/kube-controller-manager/app/options/options.go +++ b/cmd/kube-controller-manager/app/options/options.go @@ -188,7 +188,10 @@ func (s *CMServer) AddFlags(fs *pflag.FlagSet, allControllers []string, disabled fs.StringVar(&s.ClusterCIDR, "cluster-cidr", s.ClusterCIDR, "CIDR Range for Pods in cluster.") fs.StringVar(&s.ServiceCIDR, "service-cluster-ip-range", s.ServiceCIDR, "CIDR Range for Services in cluster.") fs.Int32Var(&s.NodeCIDRMaskSize, "node-cidr-mask-size", s.NodeCIDRMaskSize, "Mask size for node cidr in cluster.") - fs.BoolVar(&s.AllocateNodeCIDRs, "allocate-node-cidrs", false, "Should CIDRs for Pods be allocated and set on the cloud provider.") + fs.BoolVar(&s.AllocateNodeCIDRs, "allocate-node-cidrs", false, + "Should CIDRs for Pods be allocated and set on the cloud provider.") + fs.StringVar(&s.CIDRAllocatorType, "cidr-allocator-type", "RangeAllocator", + "Type of CIDR allocator to use") fs.BoolVar(&s.ConfigureCloudRoutes, "configure-cloud-routes", true, "Should CIDRs allocated by allocate-node-cidrs be configured on the cloud provider.") fs.StringVar(&s.Master, "master", s.Master, "The address of the Kubernetes API server (overrides any value in kubeconfig)") fs.StringVar(&s.Kubeconfig, "kubeconfig", s.Kubeconfig, "Path to kubeconfig file with authorization and master location information.") diff --git 
a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go index 0ce875d66ec..aef599d0a8e 100644 --- a/pkg/apis/componentconfig/types.go +++ b/pkg/apis/componentconfig/types.go @@ -794,9 +794,11 @@ type KubeControllerManagerConfiguration struct { ServiceCIDR string // NodeCIDRMaskSize is the mask size for node cidr in cluster. NodeCIDRMaskSize int32 - // allocateNodeCIDRs enables CIDRs for Pods to be allocated and, if + // AllocateNodeCIDRs enables CIDRs for Pods to be allocated and, if // ConfigureCloudRoutes is true, to be set on the cloud provider. AllocateNodeCIDRs bool + // CIDRAllocatorType determines what kind of pod CIDR allocator will be used. + CIDRAllocatorType string // configureCloudRoutes enables CIDRs allocated with allocateNodeCIDRs // to be configured on the cloud provider. ConfigureCloudRoutes bool diff --git a/pkg/cloudprovider/providers/gce/gce_instances.go b/pkg/cloudprovider/providers/gce/gce_instances.go index c6c87199bcc..edb2db37c7f 100644 --- a/pkg/cloudprovider/providers/gce/gce_instances.go +++ b/pkg/cloudprovider/providers/gce/gce_instances.go @@ -217,10 +217,10 @@ func (gce *GCECloud) CurrentNodeName(hostname string) (types.NodeName, error) { return types.NodeName(hostname), nil } -// PodCIDRs returns a list of CIDR ranges that are assigned to the +// AliasRanges returns a list of CIDR ranges that are assigned to the // `node` for allocation to pods. Returns a list of the form // "/". -func (gce *GCECloud) PodCIDRs(nodeName types.NodeName) (cidrs []string, err error) { +func (gce *GCECloud) AliasRanges(nodeName types.NodeName) (cidrs []string, err error) { var instance *gceInstance instance, err = gce.getInstanceByName(mapNodeNameToInstanceName(nodeName)) if err != nil { diff --git a/pkg/controller/node/BUILD b/pkg/controller/node/BUILD index ba15fb95395..a35e4e1c528 100644 --- a/pkg/controller/node/BUILD +++ b/pkg/controller/node/BUILD @@ -8,56 +8,6 @@ load( "go_test", ) -go_library( - name = "go_default_library", - srcs = [ - "cidr_allocator.go", - "cidr_set.go", - "controller_utils.go", - "doc.go", - "metrics.go", - "nodecontroller.go", - "rate_limited_queue.go", - "taint_controller.go", - "timed_workers.go", - ], - tags = ["automanaged"], - deps = [ - "//pkg/api:go_default_library", - "//pkg/api/v1:go_default_library", - "//pkg/client/clientset_generated/clientset:go_default_library", - "//pkg/client/informers/informers_generated/externalversions/core/v1:go_default_library", - "//pkg/client/informers/informers_generated/externalversions/extensions/v1beta1:go_default_library", - "//pkg/client/listers/core/v1:go_default_library", - "//pkg/client/listers/extensions/v1beta1:go_default_library", - "//pkg/cloudprovider:go_default_library", - "//pkg/controller:go_default_library", - "//pkg/kubelet/util/format:go_default_library", - "//pkg/util/metrics:go_default_library", - "//pkg/util/node:go_default_library", - "//pkg/util/system:go_default_library", - "//pkg/util/version:go_default_library", - "//vendor:github.com/golang/glog", - "//vendor:github.com/prometheus/client_golang/prometheus", - "//vendor:k8s.io/apimachinery/pkg/api/equality", - "//vendor:k8s.io/apimachinery/pkg/api/errors", - "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", - "//vendor:k8s.io/apimachinery/pkg/fields", - "//vendor:k8s.io/apimachinery/pkg/labels", - "//vendor:k8s.io/apimachinery/pkg/types", - "//vendor:k8s.io/apimachinery/pkg/util/errors", - "//vendor:k8s.io/apimachinery/pkg/util/runtime", - "//vendor:k8s.io/apimachinery/pkg/util/sets", - 
"//vendor:k8s.io/apimachinery/pkg/util/wait", - "//vendor:k8s.io/client-go/kubernetes/typed/core/v1", - "//vendor:k8s.io/client-go/pkg/api/v1", - "//vendor:k8s.io/client-go/tools/cache", - "//vendor:k8s.io/client-go/tools/record", - "//vendor:k8s.io/client-go/util/flowcontrol", - "//vendor:k8s.io/client-go/util/workqueue", - ], -) - go_test( name = "go_default_test", srcs = [ @@ -96,6 +46,59 @@ go_test( ], ) +go_library( + name = "go_default_library", + srcs = [ + "cidr_allocator.go", + "cidr_set.go", + "cloud_cidr_allocator.go", + "controller_utils.go", + "doc.go", + "metrics.go", + "nodecontroller.go", + "range_allocator.go", + "rate_limited_queue.go", + "taint_controller.go", + "timed_workers.go", + ], + tags = ["automanaged"], + deps = [ + "//pkg/api:go_default_library", + "//pkg/api/v1:go_default_library", + "//pkg/client/clientset_generated/clientset:go_default_library", + "//pkg/client/informers/informers_generated/externalversions/core/v1:go_default_library", + "//pkg/client/informers/informers_generated/externalversions/extensions/v1beta1:go_default_library", + "//pkg/client/listers/core/v1:go_default_library", + "//pkg/client/listers/extensions/v1beta1:go_default_library", + "//pkg/cloudprovider:go_default_library", + "//pkg/cloudprovider/providers/gce:go_default_library", + "//pkg/controller:go_default_library", + "//pkg/kubelet/util/format:go_default_library", + "//pkg/util/metrics:go_default_library", + "//pkg/util/node:go_default_library", + "//pkg/util/system:go_default_library", + "//pkg/util/version:go_default_library", + "//vendor:github.com/golang/glog", + "//vendor:github.com/prometheus/client_golang/prometheus", + "//vendor:k8s.io/apimachinery/pkg/api/equality", + "//vendor:k8s.io/apimachinery/pkg/api/errors", + "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", + "//vendor:k8s.io/apimachinery/pkg/fields", + "//vendor:k8s.io/apimachinery/pkg/labels", + "//vendor:k8s.io/apimachinery/pkg/types", + "//vendor:k8s.io/apimachinery/pkg/util/errors", + "//vendor:k8s.io/apimachinery/pkg/util/runtime", + "//vendor:k8s.io/apimachinery/pkg/util/sets", + "//vendor:k8s.io/apimachinery/pkg/util/wait", + "//vendor:k8s.io/client-go/kubernetes/typed/core/v1", + "//vendor:k8s.io/client-go/pkg/api/v1", + "//vendor:k8s.io/client-go/tools/cache", + "//vendor:k8s.io/client-go/tools/record", + "//vendor:k8s.io/client-go/util/flowcontrol", + "//vendor:k8s.io/client-go/util/workqueue", + ], +) + filegroup( name = "package-srcs", srcs = glob(["**"]), diff --git a/pkg/controller/node/cidr_allocator.go b/pkg/controller/node/cidr_allocator.go index d2bc7f99860..9e67d0a734e 100644 --- a/pkg/controller/node/cidr_allocator.go +++ b/pkg/controller/node/cidr_allocator.go @@ -18,259 +18,34 @@ package node import ( "errors" - "fmt" "net" - "sync" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/apimachinery/pkg/util/wait" - v1core "k8s.io/client-go/kubernetes/typed/core/v1" - clientv1 "k8s.io/client-go/pkg/api/v1" - "k8s.io/client-go/tools/record" - "k8s.io/kubernetes/pkg/api" - "k8s.io/kubernetes/pkg/api/v1" - "k8s.io/kubernetes/pkg/client/clientset_generated/clientset" - - "github.com/golang/glog" + v1 "k8s.io/kubernetes/pkg/api/v1" ) -// TODO: figure out the good setting for those constants. -const ( - // controls how many NodeSpec updates NC can process concurrently. 
- cidrUpdateWorkers = 10 - cidrUpdateQueueSize = 5000 - // podCIDRUpdateRetry controls the number of retries of writing Node.Spec.PodCIDR update. - podCIDRUpdateRetry = 5 -) - -var errCIDRRangeNoCIDRsRemaining = errors.New("CIDR allocation failed; there are no remaining CIDRs left to allocate in the accepted range") +var errCIDRRangeNoCIDRsRemaining = errors.New( + "CIDR allocation failed; there are no remaining CIDRs left to allocate in the accepted range") type nodeAndCIDR struct { cidr *net.IPNet nodeName string } -// CIDRAllocator is an interface implemented by things that know how to allocate/occupy/recycle CIDR for nodes. +// CIDRAllocatorType is the type of the allocator to use. +type CIDRAllocatorType string + +const ( + RangeAllocatorType CIDRAllocatorType = "RangeAllocator" + CloudAllocatorType CIDRAllocatorType = "CloudAllocator" +) + +// CIDRAllocator is an interface implemented by things that know how to +// allocate/occupy/recycle CIDR for nodes. type CIDRAllocator interface { + // AllocateOrOccupyCIDR looks at the given node, assigns it a valid + // CIDR if it doesn't currently have one or mark the CIDR as used if + // the node already have one. AllocateOrOccupyCIDR(node *v1.Node) error + // ReleaseCIDR releases the CIDR of the removed node ReleaseCIDR(node *v1.Node) error } - -type rangeAllocator struct { - client clientset.Interface - cidrs *cidrSet - clusterCIDR *net.IPNet - maxCIDRs int - // Channel that is used to pass updating Nodes with assigned CIDRs to the background - // This increases a throughput of CIDR assignment by not blocking on long operations. - nodeCIDRUpdateChannel chan nodeAndCIDR - recorder record.EventRecorder - // Keep a set of nodes that are currectly being processed to avoid races in CIDR allocation - sync.Mutex - nodesInProcessing sets.String -} - -// NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDR for node -// Caller must ensure subNetMaskSize is not less than cluster CIDR mask size. -// Caller must always pass in a list of existing nodes so the new allocator -// can initialize its CIDR map. NodeList is only nil in testing. -func NewCIDRRangeAllocator(client clientset.Interface, clusterCIDR *net.IPNet, serviceCIDR *net.IPNet, subNetMaskSize int, nodeList *v1.NodeList) (CIDRAllocator, error) { - eventBroadcaster := record.NewBroadcaster() - recorder := eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "cidrAllocator"}) - eventBroadcaster.StartLogging(glog.Infof) - if client != nil { - glog.V(0).Infof("Sending events to api server.") - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(client.Core().RESTClient()).Events("")}) - } else { - glog.Fatalf("kubeClient is nil when starting NodeController") - } - - ra := &rangeAllocator{ - client: client, - cidrs: newCIDRSet(clusterCIDR, subNetMaskSize), - clusterCIDR: clusterCIDR, - nodeCIDRUpdateChannel: make(chan nodeAndCIDR, cidrUpdateQueueSize), - recorder: recorder, - nodesInProcessing: sets.NewString(), - } - - if serviceCIDR != nil { - ra.filterOutServiceRange(serviceCIDR) - } else { - glog.V(0).Info("No Service CIDR provided. Skipping filtering out service addresses.") - } - - if nodeList != nil { - for _, node := range nodeList.Items { - if node.Spec.PodCIDR == "" { - glog.Infof("Node %v has no CIDR, ignoring", node.Name) - continue - } else { - glog.Infof("Node %v has CIDR %s, occupying it in CIDR map", node.Name, node.Spec.PodCIDR) - } - if err := ra.occupyCIDR(&node); err != nil { - // This will happen if: - // 1. 
We find garbage in the podCIDR field. Retrying is useless. - // 2. CIDR out of range: This means a node CIDR has changed. - // This error will keep crashing controller-manager. - return nil, err - } - } - } - for i := 0; i < cidrUpdateWorkers; i++ { - go func(stopChan <-chan struct{}) { - for { - select { - case workItem, ok := <-ra.nodeCIDRUpdateChannel: - if !ok { - glog.Warning("NodeCIDRUpdateChannel read returned false.") - return - } - ra.updateCIDRAllocation(workItem) - case <-stopChan: - return - } - } - }(wait.NeverStop) - } - - return ra, nil -} - -func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool { - r.Lock() - defer r.Unlock() - if r.nodesInProcessing.Has(nodeName) { - return false - } - r.nodesInProcessing.Insert(nodeName) - return true -} - -func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) { - r.Lock() - defer r.Unlock() - r.nodesInProcessing.Delete(nodeName) -} - -func (r *rangeAllocator) occupyCIDR(node *v1.Node) error { - defer r.removeNodeFromProcessing(node.Name) - if node.Spec.PodCIDR == "" { - return nil - } - _, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR) - if err != nil { - return fmt.Errorf("failed to parse node %s, CIDR %s", node.Name, node.Spec.PodCIDR) - } - if err := r.cidrs.occupy(podCIDR); err != nil { - return fmt.Errorf("failed to mark cidr as occupied: %v", err) - } - return nil -} - -// AllocateOrOccupyCIDR looks at the given node, assigns it a valid CIDR -// if it doesn't currently have one or mark the CIDR as used if the node already have one. -// WARNING: If you're adding any return calls or defer any more work from this function -// you have to handle correctly nodesInProcessing. -func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { - if node == nil { - return nil - } - if !r.insertNodeToProcessing(node.Name) { - glog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name) - return nil - } - if node.Spec.PodCIDR != "" { - return r.occupyCIDR(node) - } - podCIDR, err := r.cidrs.allocateNext() - if err != nil { - r.removeNodeFromProcessing(node.Name) - recordNodeStatusChange(r.recorder, node, "CIDRNotAvailable") - return fmt.Errorf("failed to allocate cidr: %v", err) - } - - glog.V(10).Infof("Putting node %s with CIDR %s into the work queue", node.Name, podCIDR) - r.nodeCIDRUpdateChannel <- nodeAndCIDR{ - nodeName: node.Name, - cidr: podCIDR, - } - return nil -} - -// ReleaseCIDR releases the CIDR of the removed node -func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error { - if node == nil || node.Spec.PodCIDR == "" { - return nil - } - _, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR) - if err != nil { - return fmt.Errorf("Failed to parse CIDR %s on Node %v: %v", node.Spec.PodCIDR, node.Name, err) - } - - glog.V(4).Infof("release CIDR %s", node.Spec.PodCIDR) - if err = r.cidrs.release(podCIDR); err != nil { - return fmt.Errorf("Error when releasing CIDR %v: %v", node.Spec.PodCIDR, err) - } - return err -} - -// Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used, -// so that they won't be assignable. -func (r *rangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) { - // Checks if service CIDR has a nonempty intersection with cluster CIDR. It is the case if either - // clusterCIDR contains serviceCIDR with clusterCIDR's Mask applied (this means that clusterCIDR contains serviceCIDR) - // or vice versa (which means that serviceCIDR contains clusterCIDR). 
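The intersection test described in the comment above (and reused later in range_allocator.go) can be illustrated with a small, self-contained example. The CIDR values below are made up purely for demonstration; only the masking logic mirrors filterOutServiceRange.

```go
// Self-contained illustration (with made-up example ranges) of the
// intersection test used by filterOutServiceRange: the two CIDRs overlap
// iff either network contains the other's base address after masking.
package main

import (
	"fmt"
	"net"
)

func overlaps(a, b *net.IPNet) bool {
	return a.Contains(b.IP.Mask(a.Mask)) || b.Contains(a.IP.Mask(b.Mask))
}

func main() {
	_, clusterCIDR, _ := net.ParseCIDR("10.244.0.0/16")
	_, serviceCIDR, _ := net.ParseCIDR("10.244.240.0/20") // carved out of the cluster range
	_, unrelated, _ := net.ParseCIDR("192.168.0.0/24")

	fmt.Println(overlaps(clusterCIDR, serviceCIDR)) // true  -> service range must be occupied
	fmt.Println(overlaps(clusterCIDR, unrelated))   // false -> nothing to filter out
}
```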
- if !r.clusterCIDR.Contains(serviceCIDR.IP.Mask(r.clusterCIDR.Mask)) && !serviceCIDR.Contains(r.clusterCIDR.IP.Mask(serviceCIDR.Mask)) { - return - } - - if err := r.cidrs.occupy(serviceCIDR); err != nil { - glog.Errorf("Error filtering out service cidr %v: %v", serviceCIDR, err) - } -} - -// Assigns CIDR to Node and sends an update to the API server. -func (r *rangeAllocator) updateCIDRAllocation(data nodeAndCIDR) error { - var err error - var node *v1.Node - defer r.removeNodeFromProcessing(data.nodeName) - for rep := 0; rep < podCIDRUpdateRetry; rep++ { - // TODO: change it to using PATCH instead of full Node updates. - node, err = r.client.Core().Nodes().Get(data.nodeName, metav1.GetOptions{}) - if err != nil { - glog.Errorf("Failed while getting node %v to retry updating Node.Spec.PodCIDR: %v", data.nodeName, err) - continue - } - if node.Spec.PodCIDR != "" { - glog.Errorf("Node %v already has allocated CIDR %v. Releasing assigned one if different.", node.Name, node.Spec.PodCIDR) - if node.Spec.PodCIDR != data.cidr.String() { - if err := r.cidrs.release(data.cidr); err != nil { - glog.Errorf("Error when releasing CIDR %v", data.cidr.String()) - } - } - return nil - } - node.Spec.PodCIDR = data.cidr.String() - if _, err := r.client.Core().Nodes().Update(node); err != nil { - glog.Errorf("Failed while updating Node.Spec.PodCIDR (%d retries left): %v", podCIDRUpdateRetry-rep-1, err) - } else { - break - } - } - if err != nil { - recordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed") - // We accept the fact that we may leek CIDRs here. This is safer than releasing - // them in case when we don't know if request went through. - // NodeController restart will return all falsely allocated CIDRs to the pool. - if !apierrors.IsServerTimeout(err) { - glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", data.nodeName, err) - if releaseErr := r.cidrs.release(data.cidr); releaseErr != nil { - glog.Errorf("Error releasing allocated CIDR for node %v: %v", data.nodeName, releaseErr) - } - } - } - return err -} diff --git a/pkg/controller/node/cloud_cidr_allocator.go b/pkg/controller/node/cloud_cidr_allocator.go new file mode 100644 index 00000000000..a6a0d8d4b10 --- /dev/null +++ b/pkg/controller/node/cloud_cidr_allocator.go @@ -0,0 +1,143 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package node + +import ( + "fmt" + "sync" + + "github.com/golang/glog" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + clientv1 "k8s.io/client-go/pkg/api/v1" + "k8s.io/client-go/tools/record" + + "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/client/clientset_generated/clientset" + "k8s.io/kubernetes/pkg/cloudprovider" + "k8s.io/kubernetes/pkg/cloudprovider/providers/gce" + nodeutil "k8s.io/kubernetes/pkg/util/node" +) + +// cloudCIDRAllocator allocates node CIDRs according to IP address aliases +// assigned by the cloud provider. 
In this case, the allocation and +// deallocation is delegated to the external provider, and the controller +// merely takes the assignment and updates the node spec. +type cloudCIDRAllocator struct { + lock sync.Mutex + + client clientset.Interface + cloud *gce.GCECloud + + recorder record.EventRecorder +} + +var _ CIDRAllocator = (*cloudCIDRAllocator)(nil) + +func NewCloudCIDRAllocator( + client clientset.Interface, + cloud cloudprovider.Interface) (ca CIDRAllocator, err error) { + + gceCloud, ok := cloud.(*gce.GCECloud) + if !ok { + err = fmt.Errorf("cloudCIDRAllocator does not support %v provider", cloud.ProviderName()) + return + } + + ca = &cloudCIDRAllocator{ + client: client, + cloud: gceCloud, + recorder: record.NewBroadcaster().NewRecorder( + api.Scheme, + clientv1.EventSource{Component: "cidrAllocator"}), + } + + glog.V(0).Infof("Using cloud CIDR allocator (provider: %v)", cloud.ProviderName()) + + return +} + +func (ca *cloudCIDRAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { + glog.V(2).Infof("Updating PodCIDR for node %v", node.Name) + + cidrs, err := ca.cloud.AliasRanges(types.NodeName(node.Name)) + + if err != nil { + recordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable") + return fmt.Errorf("failed to allocate cidr: %v", err) + } + + if len(cidrs) == 0 { + recordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable") + glog.V(2).Infof("Node %v has no CIDRs", node.Name) + return fmt.Errorf("failed to allocate cidr (none exist)") + } + + node, err = ca.client.Core().Nodes().Get(node.Name, metav1.GetOptions{}) + if err != nil { + glog.Errorf("Could not get Node object from Kubernetes: %v", err) + return err + } + + podCIDR := cidrs[0] + + if node.Spec.PodCIDR != "" { + if node.Spec.PodCIDR == podCIDR { + glog.V(3).Infof("Node %v has PodCIDR %v", node.Name, podCIDR) + return nil + } + glog.Errorf("PodCIDR cannot be reassigned, node %v spec has %v, but cloud provider has assigned %v", + node.Name, node.Spec.PodCIDR, podCIDR) + // We fall through and set the CIDR despite this error. This + // implements the same logic as implemented in the + // rangeAllocator. 
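The flow above (look up the provider-assigned alias ranges, take the first one, and write it into the node spec only if no CIDR is recorded) can be sketched without the Kubernetes client machinery. The Node struct, cloudProvider interface, and fakeCloud value below are simplified, hypothetical stand-ins, not the real client-go or GCE types.

```go
// Simplified sketch of the cloud allocator flow: ask the provider for the
// node's alias ranges, take the first, and record it as the pod CIDR.
package main

import (
	"errors"
	"fmt"
)

type Node struct {
	Name    string
	PodCIDR string
}

// cloudProvider abstracts the AliasRanges lookup added to the GCE provider.
type cloudProvider interface {
	AliasRanges(nodeName string) ([]string, error)
}

func allocateFromCloud(cloud cloudProvider, node *Node) error {
	cidrs, err := cloud.AliasRanges(node.Name)
	if err != nil {
		return fmt.Errorf("failed to allocate cidr: %v", err)
	}
	if len(cidrs) == 0 {
		return errors.New("failed to allocate cidr (none exist)")
	}
	if node.PodCIDR != "" {
		if node.PodCIDR == cidrs[0] {
			return nil
		}
		// Mirrors the patch: a pre-existing, different CIDR is reported but
		// the cloud-assigned value still wins.
		fmt.Printf("node %s already has PodCIDR %s, overwriting with %s\n",
			node.Name, node.PodCIDR, cidrs[0])
	}
	node.PodCIDR = cidrs[0]
	return nil
}

// fakeCloud returns a fixed alias range for demonstration purposes.
type fakeCloud struct{}

func (fakeCloud) AliasRanges(string) ([]string, error) { return []string{"10.128.4.0/24"}, nil }

func main() {
	n := &Node{Name: "node-a"}
	if err := allocateFromCloud(fakeCloud{}, n); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(n.Name, "PodCIDR =", n.PodCIDR)
}
```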
+ // + // See https://github.com/kubernetes/kubernetes/pull/42147#discussion_r103357248 + } + + node.Spec.PodCIDR = cidrs[0] + if _, err := ca.client.Core().Nodes().Update(node); err == nil { + glog.V(2).Infof("Node %v PodCIDR set to %v", node.Name, podCIDR) + } else { + glog.Errorf("Could not update node %v PodCIDR to %v: %v", + node.Name, podCIDR, err) + return err + } + + err = nodeutil.SetNodeCondition(ca.client, types.NodeName(node.Name), v1.NodeCondition{ + Type: v1.NodeNetworkUnavailable, + Status: v1.ConditionFalse, + Reason: "RouteCreated", + Message: "NodeController create implicit route", + LastTransitionTime: metav1.Now(), + }) + if err != nil { + glog.Errorf("Error setting route status for node %v: %v", + node.Name, err) + } + + return err +} + +func (ca *cloudCIDRAllocator) ReleaseCIDR(node *v1.Node) error { + glog.V(2).Infof("Node %v PodCIDR (%v) will be released by external cloud provider (not managed by controller)", + node.Name, node.Spec.PodCIDR) + return nil +} diff --git a/pkg/controller/node/nodecontroller.go b/pkg/controller/node/nodecontroller.go index 0ca256b238d..b3b4f713ff9 100644 --- a/pkg/controller/node/nodecontroller.go +++ b/pkg/controller/node/nodecontroller.go @@ -109,11 +109,13 @@ type nodeStatusData struct { type NodeController struct { allocateNodeCIDRs bool - cloud cloudprovider.Interface - clusterCIDR *net.IPNet - serviceCIDR *net.IPNet - knownNodeSet map[string]*v1.Node - kubeClient clientset.Interface + allocatorType CIDRAllocatorType + + cloud cloudprovider.Interface + clusterCIDR *net.IPNet + serviceCIDR *net.IPNet + knownNodeSet map[string]*v1.Node + kubeClient clientset.Interface // Method for easy mocking in unittest. lookupIP func(host string) ([]net.IP, error) // Value used if sync_nodes_status=False. 
NodeController will not proactively @@ -162,9 +164,8 @@ type NodeController struct { podInformerSynced cache.InformerSynced - // allocate/recycle CIDRs for node if allocateNodeCIDRs == true cidrAllocator CIDRAllocator - // manages taints + taintManager *NoExecuteTaintManager forcefullyDeletePod func(*v1.Pod) error @@ -210,6 +211,7 @@ func NewNodeController( serviceCIDR *net.IPNet, nodeCIDRMaskSize int, allocateNodeCIDRs bool, + allocatorType CIDRAllocatorType, runTaintManager bool, useTaintBasedEvictions bool) (*NodeController, error) { eventBroadcaster := record.NewBroadcaster() @@ -254,6 +256,7 @@ func NewNodeController( clusterCIDR: clusterCIDR, serviceCIDR: serviceCIDR, allocateNodeCIDRs: allocateNodeCIDRs, + allocatorType: allocatorType, forcefullyDeletePod: func(p *v1.Pod) error { return forcefullyDeletePod(kubeClient, p) }, nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) }, evictionLimiterQPS: evictionLimiterQPS, @@ -309,7 +312,6 @@ func NewNodeController( }) nc.podInformerSynced = podInformer.Informer().HasSynced - nodeEventHandlerFuncs := cache.ResourceEventHandlerFuncs{} if nc.allocateNodeCIDRs { var nodeList *v1.NodeList var err error @@ -328,147 +330,32 @@ func NewNodeController( }); pollErr != nil { return nil, fmt.Errorf("Failed to list all nodes in %v, cannot proceed without updating CIDR map", apiserverStartupGracePeriod) } - nc.cidrAllocator, err = NewCIDRRangeAllocator(kubeClient, clusterCIDR, serviceCIDR, nodeCIDRMaskSize, nodeList) + + switch nc.allocatorType { + case RangeAllocatorType: + nc.cidrAllocator, err = NewCIDRRangeAllocator( + kubeClient, clusterCIDR, serviceCIDR, nodeCIDRMaskSize, nodeList) + case CloudAllocatorType: + nc.cidrAllocator, err = NewCloudCIDRAllocator(kubeClient, cloud) + default: + return nil, fmt.Errorf("Invalid CIDR allocator type: %v", nc.allocatorType) + } + if err != nil { return nil, err } - nodeEventHandlerFuncs = cache.ResourceEventHandlerFuncs{ - AddFunc: func(originalObj interface{}) { - obj, err := api.Scheme.DeepCopy(originalObj) - if err != nil { - utilruntime.HandleError(err) - return - } - node := obj.(*v1.Node) - - if err := nc.cidrAllocator.AllocateOrOccupyCIDR(node); err != nil { - utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err)) - } - if nc.taintManager != nil { - nc.taintManager.NodeUpdated(nil, node) - } - }, - UpdateFunc: func(oldNode, newNode interface{}) { - node := newNode.(*v1.Node) - prevNode := oldNode.(*v1.Node) - // If the PodCIDR is not empty we either: - // - already processed a Node that already had a CIDR after NC restarted - // (cidr is marked as used), - // - already processed a Node successfully and allocated a CIDR for it - // (cidr is marked as used), - // - already processed a Node but we did saw a "timeout" response and - // request eventually got through in this case we haven't released - // the allocated CIDR (cidr is still marked as used). - // There's a possible error here: - // - NC sees a new Node and assigns a CIDR X to it, - // - Update Node call fails with a timeout, - // - Node is updated by some other component, NC sees an update and - // assigns CIDR Y to the Node, - // - Both CIDR X and CIDR Y are marked as used in the local cache, - // even though Node sees only CIDR Y - // The problem here is that in in-memory cache we see CIDR X as marked, - // which prevents it from being assigned to any new node. The cluster - // state is correct. - // Restart of NC fixes the issue. 
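The allocator selection added to NewNodeController amounts to a small factory over the CIDRAllocator interface. The sketch below uses trivial stub allocators and a local Node type purely to show the shape of that switch; it is not the controller's actual wiring, and the stub CIDR values are invented.

```go
// Minimal sketch of mapping a configured CIDRAllocatorType to a concrete
// CIDRAllocator implementation, mirroring the switch in NewNodeController.
package main

import "fmt"

type Node struct{ Name, PodCIDR string }

// CIDRAllocator matches the interface from cidr_allocator.go.
type CIDRAllocator interface {
	AllocateOrOccupyCIDR(node *Node) error
	ReleaseCIDR(node *Node) error
}

type CIDRAllocatorType string

const (
	RangeAllocatorType CIDRAllocatorType = "RangeAllocator"
	CloudAllocatorType CIDRAllocatorType = "CloudAllocator"
)

type rangeAllocatorStub struct{}

func (rangeAllocatorStub) AllocateOrOccupyCIDR(n *Node) error { n.PodCIDR = "10.244.1.0/24"; return nil }
func (rangeAllocatorStub) ReleaseCIDR(*Node) error            { return nil }

type cloudAllocatorStub struct{}

func (cloudAllocatorStub) AllocateOrOccupyCIDR(n *Node) error { n.PodCIDR = "10.128.4.0/24"; return nil }
func (cloudAllocatorStub) ReleaseCIDR(*Node) error            { return nil }

func newCIDRAllocator(t CIDRAllocatorType) (CIDRAllocator, error) {
	switch t {
	case RangeAllocatorType:
		return rangeAllocatorStub{}, nil
	case CloudAllocatorType:
		return cloudAllocatorStub{}, nil
	default:
		return nil, fmt.Errorf("invalid CIDR allocator type: %v", t)
	}
}

func main() {
	alloc, err := newCIDRAllocator(CloudAllocatorType)
	if err != nil {
		panic(err)
	}
	n := &Node{Name: "node-a"}
	_ = alloc.AllocateOrOccupyCIDR(n)
	fmt.Println(n.PodCIDR)
}
```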
- if node.Spec.PodCIDR == "" { - nodeCopy, err := api.Scheme.Copy(node) - if err != nil { - utilruntime.HandleError(err) - return - } - - if err := nc.cidrAllocator.AllocateOrOccupyCIDR(nodeCopy.(*v1.Node)); err != nil { - utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err)) - } - } - if nc.taintManager != nil { - nc.taintManager.NodeUpdated(prevNode, node) - } - }, - DeleteFunc: func(originalObj interface{}) { - obj, err := api.Scheme.DeepCopy(originalObj) - if err != nil { - utilruntime.HandleError(err) - return - } - - node, isNode := obj.(*v1.Node) - // We can get DeletedFinalStateUnknown instead of *v1.Node here and we need to handle that correctly. #34692 - if !isNode { - deletedState, ok := obj.(cache.DeletedFinalStateUnknown) - if !ok { - glog.Errorf("Received unexpected object: %v", obj) - return - } - node, ok = deletedState.Obj.(*v1.Node) - if !ok { - glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj) - return - } - } - if nc.taintManager != nil { - nc.taintManager.NodeUpdated(node, nil) - } - if err := nc.cidrAllocator.ReleaseCIDR(node); err != nil { - glog.Errorf("Error releasing CIDR: %v", err) - } - }, - } - } else { - nodeEventHandlerFuncs = cache.ResourceEventHandlerFuncs{ - AddFunc: func(originalObj interface{}) { - obj, err := api.Scheme.DeepCopy(originalObj) - if err != nil { - utilruntime.HandleError(err) - return - } - node := obj.(*v1.Node) - if nc.taintManager != nil { - nc.taintManager.NodeUpdated(nil, node) - } - }, - UpdateFunc: func(oldNode, newNode interface{}) { - node := newNode.(*v1.Node) - prevNode := oldNode.(*v1.Node) - if nc.taintManager != nil { - nc.taintManager.NodeUpdated(prevNode, node) - - } - }, - DeleteFunc: func(originalObj interface{}) { - obj, err := api.Scheme.DeepCopy(originalObj) - if err != nil { - utilruntime.HandleError(err) - return - } - - node, isNode := obj.(*v1.Node) - // We can get DeletedFinalStateUnknown instead of *v1.Node here and we need to handle that correctly. 
#34692 - if !isNode { - deletedState, ok := obj.(cache.DeletedFinalStateUnknown) - if !ok { - glog.Errorf("Received unexpected object: %v", obj) - return - } - node, ok = deletedState.Obj.(*v1.Node) - if !ok { - glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj) - return - } - } - if nc.taintManager != nil { - nc.taintManager.NodeUpdated(node, nil) - } - }, - } + nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: nc.onNodeAdd, + UpdateFunc: nc.onNodeUpdate, + DeleteFunc: nc.onNodeDelete, + }) } if nc.runTaintManager { nc.taintManager = NewNoExecuteTaintManager(kubeClient) } - nodeInformer.Informer().AddEventHandler(nodeEventHandlerFuncs) nc.nodeLister = nodeInformer.Lister() nc.nodeInformerSynced = nodeInformer.Informer().HasSynced @@ -546,6 +433,90 @@ func (nc *NodeController) doTaintingPass() { } } +func (nc *NodeController) onNodeAdd(originalObj interface{}) { + obj, err := api.Scheme.DeepCopy(originalObj) + if err != nil { + utilruntime.HandleError(err) + return + } + node := obj.(*v1.Node) + + if err := nc.cidrAllocator.AllocateOrOccupyCIDR(node); err != nil { + utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err)) + } + if nc.taintManager != nil { + nc.taintManager.NodeUpdated(nil, node) + } +} + +func (nc *NodeController) onNodeUpdate(oldNode, newNode interface{}) { + node := newNode.(*v1.Node) + prevNode := oldNode.(*v1.Node) + // If the PodCIDR is not empty we either: + // - already processed a Node that already had a CIDR after NC restarted + // (cidr is marked as used), + // - already processed a Node successfully and allocated a CIDR for it + // (cidr is marked as used), + // - already processed a Node but we did saw a "timeout" response and + // request eventually got through in this case we haven't released + // the allocated CIDR (cidr is still marked as used). + // There's a possible error here: + // - NC sees a new Node and assigns a CIDR X to it, + // - Update Node call fails with a timeout, + // - Node is updated by some other component, NC sees an update and + // assigns CIDR Y to the Node, + // - Both CIDR X and CIDR Y are marked as used in the local cache, + // even though Node sees only CIDR Y + // The problem here is that in in-memory cache we see CIDR X as marked, + // which prevents it from being assigned to any new node. The cluster + // state is correct. + // Restart of NC fixes the issue. + if node.Spec.PodCIDR == "" { + nodeCopy, err := api.Scheme.Copy(node) + if err != nil { + utilruntime.HandleError(err) + return + } + + if err := nc.cidrAllocator.AllocateOrOccupyCIDR(nodeCopy.(*v1.Node)); err != nil { + utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err)) + } + } + if nc.taintManager != nil { + nc.taintManager.NodeUpdated(prevNode, node) + } +} + +func (nc *NodeController) onNodeDelete(originalObj interface{}) { + obj, err := api.Scheme.DeepCopy(originalObj) + if err != nil { + utilruntime.HandleError(err) + return + } + + node, isNode := obj.(*v1.Node) + // We can get DeletedFinalStateUnknown instead of *v1.Node here and + // we need to handle that correctly. 
#34692 + if !isNode { + deletedState, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + glog.Errorf("Received unexpected object: %v", obj) + return + } + node, ok = deletedState.Obj.(*v1.Node) + if !ok { + glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj) + return + } + } + if nc.taintManager != nil { + nc.taintManager.NodeUpdated(node, nil) + } + if err := nc.cidrAllocator.ReleaseCIDR(node); err != nil { + glog.Errorf("Error releasing CIDR: %v", err) + } +} + // Run starts an asynchronous loop that monitors the status of cluster nodes. func (nc *NodeController) Run() { go func() { diff --git a/pkg/controller/node/nodecontroller_test.go b/pkg/controller/node/nodecontroller_test.go index f2596c8431b..b26c99d4550 100644 --- a/pkg/controller/node/nodecontroller_test.go +++ b/pkg/controller/node/nodecontroller_test.go @@ -101,6 +101,7 @@ func NewNodeControllerFromClient( serviceCIDR, nodeCIDRMaskSize, allocateNodeCIDRs, + RangeAllocatorType, useTaints, useTaints, ) @@ -549,9 +550,22 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) { } for _, item := range table { - nodeController, _ := NewNodeControllerFromClient(nil, item.fakeNodeHandler, - evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, testUnhealtyThreshold, testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, nil, 0, false, false) + nodeController, _ := NewNodeControllerFromClient( + nil, + item.fakeNodeHandler, + evictionTimeout, + testRateLimiterQPS, + testRateLimiterQPS, + testLargeClusterThreshold, + testUnhealtyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod, + nil, + nil, + 0, + false, + false) nodeController.now = func() metav1.Time { return fakeNow } nodeController.recorder = testutil.NewFakeRecorder() for _, ds := range item.daemonSets { diff --git a/pkg/controller/node/range_allocator.go b/pkg/controller/node/range_allocator.go new file mode 100644 index 00000000000..9d63e485bbd --- /dev/null +++ b/pkg/controller/node/range_allocator.go @@ -0,0 +1,262 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package node + +import ( + "fmt" + "net" + "sync" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" + v1core "k8s.io/client-go/kubernetes/typed/core/v1" + clientv1 "k8s.io/client-go/pkg/api/v1" + "k8s.io/client-go/tools/record" + "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/client/clientset_generated/clientset" + + "github.com/golang/glog" +) + +// TODO: figure out the good setting for those constants. +const ( + // controls how many NodeSpec updates NC can process concurrently. + cidrUpdateWorkers = 10 + cidrUpdateQueueSize = 5000 + // podCIDRUpdateRetry controls the number of retries of writing Node.Spec.PodCIDR update. 
+ podCIDRUpdateRetry = 5 +) + +type rangeAllocator struct { + client clientset.Interface + cidrs *cidrSet + clusterCIDR *net.IPNet + maxCIDRs int + // Channel that is used to pass updating Nodes with assigned CIDRs to the background + // This increases a throughput of CIDR assignment by not blocking on long operations. + nodeCIDRUpdateChannel chan nodeAndCIDR + recorder record.EventRecorder + // Keep a set of nodes that are currectly being processed to avoid races in CIDR allocation + sync.Mutex + nodesInProcessing sets.String +} + +// NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDR for node +// Caller must ensure subNetMaskSize is not less than cluster CIDR mask size. +// Caller must always pass in a list of existing nodes so the new allocator +// can initialize its CIDR map. NodeList is only nil in testing. +func NewCIDRRangeAllocator(client clientset.Interface, clusterCIDR *net.IPNet, serviceCIDR *net.IPNet, subNetMaskSize int, nodeList *v1.NodeList) (CIDRAllocator, error) { + eventBroadcaster := record.NewBroadcaster() + recorder := eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "cidrAllocator"}) + eventBroadcaster.StartLogging(glog.Infof) + if client != nil { + glog.V(0).Infof("Sending events to api server.") + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(client.Core().RESTClient()).Events("")}) + } else { + glog.Fatalf("kubeClient is nil when starting NodeController") + } + + ra := &rangeAllocator{ + client: client, + cidrs: newCIDRSet(clusterCIDR, subNetMaskSize), + clusterCIDR: clusterCIDR, + nodeCIDRUpdateChannel: make(chan nodeAndCIDR, cidrUpdateQueueSize), + recorder: recorder, + nodesInProcessing: sets.NewString(), + } + + if serviceCIDR != nil { + ra.filterOutServiceRange(serviceCIDR) + } else { + glog.V(0).Info("No Service CIDR provided. Skipping filtering out service addresses.") + } + + if nodeList != nil { + for _, node := range nodeList.Items { + if node.Spec.PodCIDR == "" { + glog.Infof("Node %v has no CIDR, ignoring", node.Name) + continue + } else { + glog.Infof("Node %v has CIDR %s, occupying it in CIDR map", + node.Name, node.Spec.PodCIDR) + } + if err := ra.occupyCIDR(&node); err != nil { + // This will happen if: + // 1. We find garbage in the podCIDR field. Retrying is useless. + // 2. CIDR out of range: This means a node CIDR has changed. + // This error will keep crashing controller-manager. 
+ return nil, err + } + } + } + for i := 0; i < cidrUpdateWorkers; i++ { + go func(stopChan <-chan struct{}) { + for { + select { + case workItem, ok := <-ra.nodeCIDRUpdateChannel: + if !ok { + glog.Warning("NodeCIDRUpdateChannel read returned false.") + return + } + ra.updateCIDRAllocation(workItem) + case <-stopChan: + return + } + } + }(wait.NeverStop) + } + + return ra, nil +} + +func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool { + r.Lock() + defer r.Unlock() + if r.nodesInProcessing.Has(nodeName) { + return false + } + r.nodesInProcessing.Insert(nodeName) + return true +} + +func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) { + r.Lock() + defer r.Unlock() + r.nodesInProcessing.Delete(nodeName) +} + +func (r *rangeAllocator) occupyCIDR(node *v1.Node) error { + defer r.removeNodeFromProcessing(node.Name) + if node.Spec.PodCIDR == "" { + return nil + } + _, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR) + if err != nil { + return fmt.Errorf("failed to parse node %s, CIDR %s", node.Name, node.Spec.PodCIDR) + } + if err := r.cidrs.occupy(podCIDR); err != nil { + return fmt.Errorf("failed to mark cidr as occupied: %v", err) + } + return nil +} + +// WARNING: If you're adding any return calls or defer any more work from this +// function you have to handle correctly nodesInProcessing. +func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { + if node == nil { + return nil + } + if !r.insertNodeToProcessing(node.Name) { + glog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name) + return nil + } + if node.Spec.PodCIDR != "" { + return r.occupyCIDR(node) + } + podCIDR, err := r.cidrs.allocateNext() + if err != nil { + r.removeNodeFromProcessing(node.Name) + recordNodeStatusChange(r.recorder, node, "CIDRNotAvailable") + return fmt.Errorf("failed to allocate cidr: %v", err) + } + + glog.V(10).Infof("Putting node %s with CIDR %s into the work queue", node.Name, podCIDR) + r.nodeCIDRUpdateChannel <- nodeAndCIDR{ + nodeName: node.Name, + cidr: podCIDR, + } + return nil +} + +func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error { + if node == nil || node.Spec.PodCIDR == "" { + return nil + } + _, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR) + if err != nil { + return fmt.Errorf("Failed to parse CIDR %s on Node %v: %v", node.Spec.PodCIDR, node.Name, err) + } + + glog.V(4).Infof("release CIDR %s", node.Spec.PodCIDR) + if err = r.cidrs.release(podCIDR); err != nil { + return fmt.Errorf("Error when releasing CIDR %v: %v", node.Spec.PodCIDR, err) + } + return err +} + +// Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used, +// so that they won't be assignable. +func (r *rangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) { + // Checks if service CIDR has a nonempty intersection with cluster + // CIDR. It is the case if either clusterCIDR contains serviceCIDR with + // clusterCIDR's Mask applied (this means that clusterCIDR contains + // serviceCIDR) or vice versa (which means that serviceCIDR contains + // clusterCIDR). + if !r.clusterCIDR.Contains(serviceCIDR.IP.Mask(r.clusterCIDR.Mask)) && !serviceCIDR.Contains(r.clusterCIDR.IP.Mask(serviceCIDR.Mask)) { + return + } + + if err := r.cidrs.occupy(serviceCIDR); err != nil { + glog.Errorf("Error filtering out service cidr %v: %v", serviceCIDR, err) + } +} + +// Assigns CIDR to Node and sends an update to the API server. 
+func (r *rangeAllocator) updateCIDRAllocation(data nodeAndCIDR) error { + var err error + var node *v1.Node + defer r.removeNodeFromProcessing(data.nodeName) + for rep := 0; rep < podCIDRUpdateRetry; rep++ { + // TODO: change it to using PATCH instead of full Node updates. + node, err = r.client.Core().Nodes().Get(data.nodeName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("Failed while getting node %v to retry updating Node.Spec.PodCIDR: %v", data.nodeName, err) + continue + } + if node.Spec.PodCIDR != "" { + glog.Errorf("Node %v already has allocated CIDR %v. Releasing assigned one if different.", node.Name, node.Spec.PodCIDR) + if node.Spec.PodCIDR != data.cidr.String() { + if err := r.cidrs.release(data.cidr); err != nil { + glog.Errorf("Error when releasing CIDR %v", data.cidr.String()) + } + } + return nil + } + node.Spec.PodCIDR = data.cidr.String() + if _, err := r.client.Core().Nodes().Update(node); err != nil { + glog.Errorf("Failed while updating Node.Spec.PodCIDR (%d retries left): %v", podCIDRUpdateRetry-rep-1, err) + } else { + break + } + } + if err != nil { + recordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed") + // We accept the fact that we may leek CIDRs here. This is safer than releasing + // them in case when we don't know if request went through. + // NodeController restart will return all falsely allocated CIDRs to the pool. + if !apierrors.IsServerTimeout(err) { + glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", data.nodeName, err) + if releaseErr := r.cidrs.release(data.cidr); releaseErr != nil { + glog.Errorf("Error releasing allocated CIDR for node %v: %v", data.nodeName, releaseErr) + } + } + } + return err +} diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go index f82ab0fe3c6..ee19297dd33 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go @@ -168,7 +168,7 @@ func init() { ObjectMeta: metav1.ObjectMeta{Name: saRolePrefix + "node-controller"}, Rules: []rbac.PolicyRule{ rbac.NewRule("get", "list", "update", "delete", "patch").Groups(legacyGroup).Resources("nodes").RuleOrDie(), - rbac.NewRule("update").Groups(legacyGroup).Resources("nodes/status").RuleOrDie(), + rbac.NewRule("patch", "update").Groups(legacyGroup).Resources("nodes/status").RuleOrDie(), // used for pod eviction rbac.NewRule("update").Groups(legacyGroup).Resources("pods/status").RuleOrDie(), rbac.NewRule("list", "delete").Groups(legacyGroup).Resources("pods").RuleOrDie(), diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/controller-roles.yaml b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/controller-roles.yaml index 72c5d81d843..e331d7d3f55 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/controller-roles.yaml +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/controller-roles.yaml @@ -561,6 +561,7 @@ items: resources: - nodes/status verbs: + - patch - update - apiGroups: - ""
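As a closing illustration, the sketch below shows, in isolation, the worker pattern rangeAllocator uses to decouple CIDR assignment from the slower Node.Spec.PodCIDR API updates: allocations are queued on a buffered channel and a fixed pool of goroutines drains it. The queue and worker counts mirror the constants in range_allocator.go; the print statement is a simplified stand-in for updateCIDRAllocation.

```go
// Self-contained sketch of rangeAllocator's worker pool: a buffered channel
// of pending (node, CIDR) assignments consumed by a fixed number of workers.
package main

import (
	"fmt"
	"sync"
)

const (
	cidrUpdateWorkers   = 10
	cidrUpdateQueueSize = 5000
)

type nodeAndCIDR struct {
	nodeName string
	cidr     string
}

func main() {
	updates := make(chan nodeAndCIDR, cidrUpdateQueueSize)
	var wg sync.WaitGroup

	for i := 0; i < cidrUpdateWorkers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range updates {
				// In the real allocator this is updateCIDRAllocation, which
				// retries the Node update against the API server.
				fmt.Printf("setting %s PodCIDR to %s\n", item.nodeName, item.cidr)
			}
		}()
	}

	for i := 0; i < 3; i++ {
		updates <- nodeAndCIDR{nodeName: fmt.Sprintf("node-%d", i), cidr: fmt.Sprintf("10.244.%d.0/24", i)}
	}
	close(updates)
	wg.Wait()
}
```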