From e2eb1b0dc48fce811125f2b28a55a8d5278ef36d Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Wed, 20 Jan 2016 09:20:29 -0500 Subject: [PATCH] 19848: Retry service IP repair on conflict Components can write services during startup, which results in the ip allocator map being updated. Since core controllers *must* succeed for the masters to start, we should retry a few times in order to pass. --- .../service/ipallocator/controller/repair.go | 15 ++++++++++++--- .../service/portallocator/controller/repair.go | 12 +++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pkg/registry/service/ipallocator/controller/repair.go b/pkg/registry/service/ipallocator/controller/repair.go index cc95a576238..2cf212e4c14 100644 --- a/pkg/registry/service/ipallocator/controller/repair.go +++ b/pkg/registry/service/ipallocator/controller/repair.go @@ -22,6 +22,8 @@ import ( "time" "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/errors" + client "k8s.io/kubernetes/pkg/client/unversioned" "k8s.io/kubernetes/pkg/registry/service" "k8s.io/kubernetes/pkg/registry/service/ipallocator" "k8s.io/kubernetes/pkg/util" @@ -72,6 +74,11 @@ func (c *Repair) RunUntil(ch chan struct{}) { // RunOnce verifies the state of the cluster IP allocations and returns an error if an unrecoverable problem occurs. func (c *Repair) RunOnce() error { + return client.RetryOnConflict(client.DefaultBackoff, c.runOnce) +} + +// runOnce verifies the state of the cluster IP allocations and returns an error if an unrecoverable problem occurs. +func (c *Repair) runOnce() error { // TODO: (per smarterclayton) if Get() or ListServices() is a weak consistency read, // or if they are executed against different leaders, // the ordering guarantee required to ensure no IP is allocated twice is violated. @@ -127,12 +134,14 @@ func (c *Repair) RunOnce() error { } } - err = r.Snapshot(latest) - if err != nil { - return fmt.Errorf("unable to persist the updated service IP allocations: %v", err) + if err := r.Snapshot(latest); err != nil { + return fmt.Errorf("unable to snapshot the updated service IP allocations: %v", err) } if err := c.alloc.CreateOrUpdate(latest); err != nil { + if errors.IsConflict(err) { + return err + } return fmt.Errorf("unable to persist the updated service IP allocations: %v", err) } return nil diff --git a/pkg/registry/service/portallocator/controller/repair.go b/pkg/registry/service/portallocator/controller/repair.go index 2f8d255d28f..d216f6b2941 100644 --- a/pkg/registry/service/portallocator/controller/repair.go +++ b/pkg/registry/service/portallocator/controller/repair.go @@ -21,6 +21,8 @@ import ( "time" "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/errors" + client "k8s.io/kubernetes/pkg/client/unversioned" "k8s.io/kubernetes/pkg/registry/service" "k8s.io/kubernetes/pkg/registry/service/portallocator" "k8s.io/kubernetes/pkg/util" @@ -57,6 +59,11 @@ func (c *Repair) RunUntil(ch chan struct{}) { // RunOnce verifies the state of the port allocations and returns an error if an unrecoverable problem occurs. func (c *Repair) RunOnce() error { + return client.RetryOnConflict(client.DefaultBackoff, c.runOnce) +} + +// runOnce verifies the state of the port allocations and returns an error if an unrecoverable problem occurs. +func (c *Repair) runOnce() error { // TODO: (per smarterclayton) if Get() or ListServices() is a weak consistency read, // or if they are executed against different leaders, // the ordering guarantee required to ensure no port is allocated twice is violated. @@ -116,10 +123,13 @@ func (c *Repair) RunOnce() error { err = r.Snapshot(latest) if err != nil { - return fmt.Errorf("unable to persist the updated port allocations: %v", err) + return fmt.Errorf("unable to snapshot the updated port allocations: %v", err) } if err := c.alloc.CreateOrUpdate(latest); err != nil { + if errors.IsConflict(err) { + return err + } return fmt.Errorf("unable to persist the updated port allocations: %v", err) } return nil