From 9ece24c33fafdd9b49260a82d95ab34e2cae3d9f Mon Sep 17 00:00:00 2001 From: Mike Danese Date: Tue, 4 Dec 2018 13:02:24 -0800 Subject: [PATCH 1/3] remove custom timeout in test that is never exercised and misc cleanup --- pkg/kubelet/cloudresource/BUILD | 1 + .../cloudresource/cloud_request_manager.go | 39 ++++++++++--------- .../cloud_request_manager_test.go | 26 ++----------- 3 files changed, 25 insertions(+), 41 deletions(-) diff --git a/pkg/kubelet/cloudresource/BUILD b/pkg/kubelet/cloudresource/BUILD index 1ab5c46dec2..9c539ff6741 100644 --- a/pkg/kubelet/cloudresource/BUILD +++ b/pkg/kubelet/cloudresource/BUILD @@ -21,6 +21,7 @@ go_test( deps = [ "//pkg/cloudprovider/providers/fake:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library", ], ) diff --git a/pkg/kubelet/cloudresource/cloud_request_manager.go b/pkg/kubelet/cloudresource/cloud_request_manager.go index dbc05f094df..d66d2a9534d 100644 --- a/pkg/kubelet/cloudresource/cloud_request_manager.go +++ b/pkg/kubelet/cloudresource/cloud_request_manager.go @@ -63,27 +63,27 @@ func NewSyncManager(cloud cloudprovider.Interface, nodeName types.NodeName, sync } } -func (manager *cloudResourceSyncManager) getNodeAddressSafe() ([]v1.NodeAddress, error) { - manager.nodeAddressesMux.Lock() - defer manager.nodeAddressesMux.Unlock() +func (m *cloudResourceSyncManager) getNodeAddressSafe() ([]v1.NodeAddress, error) { + m.nodeAddressesMux.Lock() + defer m.nodeAddressesMux.Unlock() - return manager.nodeAddresses, manager.nodeAddressesErr + return m.nodeAddresses, m.nodeAddressesErr } -func (manager *cloudResourceSyncManager) setNodeAddressSafe(nodeAddresses []v1.NodeAddress, err error) { - manager.nodeAddressesMux.Lock() - defer manager.nodeAddressesMux.Unlock() +func (m *cloudResourceSyncManager) setNodeAddressSafe(nodeAddresses []v1.NodeAddress, err error) { + m.nodeAddressesMux.Lock() + defer m.nodeAddressesMux.Unlock() - 
manager.nodeAddresses = nodeAddresses - manager.nodeAddressesErr = err + m.nodeAddresses = nodeAddresses + m.nodeAddressesErr = err } // NodeAddresses does not wait for cloud provider to return a node addresses. // It always returns node addresses or an error. -func (manager *cloudResourceSyncManager) NodeAddresses() ([]v1.NodeAddress, error) { +func (m *cloudResourceSyncManager) NodeAddresses() ([]v1.NodeAddress, error) { // wait until there is something for { - nodeAddresses, err := manager.getNodeAddressSafe() + nodeAddresses, err := m.getNodeAddressSafe() if len(nodeAddresses) == 0 && err == nil { klog.V(5).Infof("Waiting for %v for cloud provider to provide node addresses", nodeAddressesRetryPeriod) time.Sleep(nodeAddressesRetryPeriod) @@ -93,12 +93,12 @@ func (manager *cloudResourceSyncManager) NodeAddresses() ([]v1.NodeAddress, erro } } -func (manager *cloudResourceSyncManager) collectNodeAddresses(ctx context.Context, nodeName types.NodeName) { +func (m *cloudResourceSyncManager) collectNodeAddresses(ctx context.Context, nodeName types.NodeName) { klog.V(5).Infof("Requesting node addresses from cloud provider for node %q", nodeName) - instances, ok := manager.cloud.Instances() + instances, ok := m.cloud.Instances() if !ok { - manager.setNodeAddressSafe(nil, fmt.Errorf("failed to get instances from cloud provider")) + m.setNodeAddressSafe(nil, fmt.Errorf("failed to get instances from cloud provider")) return } @@ -109,16 +109,17 @@ func (manager *cloudResourceSyncManager) collectNodeAddresses(ctx context.Contex nodeAddresses, err := instances.NodeAddresses(ctx, nodeName) if err != nil { - manager.setNodeAddressSafe(nil, fmt.Errorf("failed to get node address from cloud provider: %v", err)) + m.setNodeAddressSafe(nil, fmt.Errorf("failed to get node address from cloud provider: %v", err)) klog.V(2).Infof("Node addresses from cloud provider for node %q not collected", nodeName) } else { - manager.setNodeAddressSafe(nodeAddresses, nil) + 
m.setNodeAddressSafe(nodeAddresses, nil) klog.V(5).Infof("Node addresses from cloud provider for node %q collected", nodeName) } } -func (manager *cloudResourceSyncManager) Run(stopCh <-chan struct{}) { +// Run starts the cloud resource sync manager's sync loop. +func (m *cloudResourceSyncManager) Run(stopCh <-chan struct{}) { wait.Until(func() { - manager.collectNodeAddresses(context.TODO(), manager.nodeName) - }, manager.syncPeriod, stopCh) + m.collectNodeAddresses(context.TODO(), m.nodeName) + }, m.syncPeriod, stopCh) } diff --git a/pkg/kubelet/cloudresource/cloud_request_manager_test.go b/pkg/kubelet/cloudresource/cloud_request_manager_test.go index a0777f5bf5c..4404e71de48 100644 --- a/pkg/kubelet/cloudresource/cloud_request_manager_test.go +++ b/pkg/kubelet/cloudresource/cloud_request_manager_test.go @@ -17,33 +17,15 @@ limitations under the License. package cloudresource import ( - "fmt" "reflect" "testing" "time" "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/diff" "k8s.io/kubernetes/pkg/cloudprovider/providers/fake" ) -func collectNodeAddresses(manager *cloudResourceSyncManager) ([]v1.NodeAddress, error) { - var nodeAddresses []v1.NodeAddress - var err error - - collected := make(chan struct{}, 1) - go func() { - nodeAddresses, err = manager.NodeAddresses() - close(collected) - }() - - select { - case <-collected: - return nodeAddresses, err - case <-time.Tick(2 * nodeAddressesRetryPeriod): - return nil, fmt.Errorf("Timeout after %v waiting for address to appear", 2*nodeAddressesRetryPeriod) - } -} - func createNodeInternalIPAddress(address string) []v1.NodeAddress { return []v1.NodeAddress{ { @@ -67,12 +49,12 @@ func TestNodeAddressesRequest(t *testing.T) { manager := NewSyncManager(cloud, "defaultNode", syncPeriod).(*cloudResourceSyncManager) go manager.Run(stopCh) - nodeAddresses, err := collectNodeAddresses(manager) + nodeAddresses, err := manager.NodeAddresses() if err != nil { t.Errorf("Unexpected err: %q\n", err) } if 
!reflect.DeepEqual(nodeAddresses, cloud.Addresses) {
-		t.Errorf("Unexpected list of node addresses %#v, expected %#v: %v", nodeAddresses, cloud.Addresses, err)
+		t.Errorf("Unexpected diff of node addresses: %v", diff.ObjectReflectDiff(nodeAddresses, cloud.Addresses))
 	}
 
 	// Change the IP address
@@ -80,7 +62,7 @@ func TestNodeAddressesRequest(t *testing.T) {
 
 	// Wait until the IP address changes
 	for i := 0; i < maxRetry; i++ {
-		nodeAddresses, err := collectNodeAddresses(manager)
+		nodeAddresses, err := manager.NodeAddresses()
 		t.Logf("nodeAddresses: %#v, err: %v", nodeAddresses, err)
 		if err != nil {
 			t.Errorf("Unexpected err: %q\n", err)

From 33fc5b354b1662740203daae3316443ec1dadb7e Mon Sep 17 00:00:00 2001
From: Mike Danese
Date: Tue, 4 Dec 2018 13:07:58 -0800
Subject: [PATCH 2/3] remove artificial sleeps that made tests pass

---
 .../cloudresource/cloud_request_manager.go | 55 ++++++++++---------
 .../cloud_request_manager_test.go | 10 ++--
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/pkg/kubelet/cloudresource/cloud_request_manager.go b/pkg/kubelet/cloudresource/cloud_request_manager.go
index d66d2a9534d..d6112b13078 100644
--- a/pkg/kubelet/cloudresource/cloud_request_manager.go
+++ b/pkg/kubelet/cloudresource/cloud_request_manager.go
@@ -30,8 +30,6 @@ import (
 	"k8s.io/klog"
 )
 
-var nodeAddressesRetryPeriod = 5 * time.Second
-
 // SyncManager is an interface for making requests to a cloud provider
 type SyncManager interface {
 	Run(stopCh <-chan struct{})
@@ -46,50 +44,53 @@ type cloudResourceSyncManager struct {
 	// Sync period
 	syncPeriod time.Duration
 
-	nodeAddressesMux sync.Mutex
-	nodeAddressesErr error
-	nodeAddresses    []v1.NodeAddress
+	nodeAddressesMonitor *sync.Cond
+	nodeAddressesErr     error
+	nodeAddresses        []v1.NodeAddress
 
 	nodeName types.NodeName
 }
 
-// NewSyncManager creates a manager responsible for collecting resources
-// from a cloud provider through requests that are sensitive to timeouts and hanging
+// NewSyncManager creates a manager
responsible for collecting resources from a +// cloud provider through requests that are sensitive to timeouts and hanging func NewSyncManager(cloud cloudprovider.Interface, nodeName types.NodeName, syncPeriod time.Duration) SyncManager { return &cloudResourceSyncManager{ cloud: cloud, syncPeriod: syncPeriod, nodeName: nodeName, + // nodeAddressesMonitor is a monitor that guards a result (nodeAddresses, + // nodeAddressesErr) of the sync loop under the condition that a result has + // been saved at least once. The semantics here are: + // + // * Readers of the result will wait on the monitor until the first result + // has been saved. + // * The sync loop (i.e. the only writer), will signal all waiters every + // time it updates the result. + nodeAddressesMonitor: sync.NewCond(&sync.Mutex{}), } } -func (m *cloudResourceSyncManager) getNodeAddressSafe() ([]v1.NodeAddress, error) { - m.nodeAddressesMux.Lock() - defer m.nodeAddressesMux.Unlock() +func (m *cloudResourceSyncManager) updateAddresses(addrs []v1.NodeAddress, err error) { + m.nodeAddressesMonitor.L.Lock() + defer m.nodeAddressesMonitor.L.Unlock() + defer m.nodeAddressesMonitor.Broadcast() - return m.nodeAddresses, m.nodeAddressesErr -} - -func (m *cloudResourceSyncManager) setNodeAddressSafe(nodeAddresses []v1.NodeAddress, err error) { - m.nodeAddressesMux.Lock() - defer m.nodeAddressesMux.Unlock() - - m.nodeAddresses = nodeAddresses + m.nodeAddresses = addrs m.nodeAddressesErr = err } // NodeAddresses does not wait for cloud provider to return a node addresses. // It always returns node addresses or an error. 
func (m *cloudResourceSyncManager) NodeAddresses() ([]v1.NodeAddress, error) { + m.nodeAddressesMonitor.L.Lock() + defer m.nodeAddressesMonitor.L.Unlock() // wait until there is something for { - nodeAddresses, err := m.getNodeAddressSafe() - if len(nodeAddresses) == 0 && err == nil { - klog.V(5).Infof("Waiting for %v for cloud provider to provide node addresses", nodeAddressesRetryPeriod) - time.Sleep(nodeAddressesRetryPeriod) - continue + if addrs, err := m.nodeAddresses, m.nodeAddressesErr; len(addrs) > 0 || err != nil { + return addrs, err } - return nodeAddresses, err + klog.V(5).Infof("Waiting for cloud provider to provide node addresses") + m.nodeAddressesMonitor.Wait() } } @@ -98,7 +99,7 @@ func (m *cloudResourceSyncManager) collectNodeAddresses(ctx context.Context, nod instances, ok := m.cloud.Instances() if !ok { - m.setNodeAddressSafe(nil, fmt.Errorf("failed to get instances from cloud provider")) + m.updateAddresses(nil, fmt.Errorf("failed to get instances from cloud provider")) return } @@ -109,10 +110,10 @@ func (m *cloudResourceSyncManager) collectNodeAddresses(ctx context.Context, nod nodeAddresses, err := instances.NodeAddresses(ctx, nodeName) if err != nil { - m.setNodeAddressSafe(nil, fmt.Errorf("failed to get node address from cloud provider: %v", err)) + m.updateAddresses(nil, fmt.Errorf("failed to get node address from cloud provider: %v", err)) klog.V(2).Infof("Node addresses from cloud provider for node %q not collected", nodeName) } else { - m.setNodeAddressSafe(nodeAddresses, nil) + m.updateAddresses(nodeAddresses, nil) klog.V(5).Infof("Node addresses from cloud provider for node %q collected", nodeName) } } diff --git a/pkg/kubelet/cloudresource/cloud_request_manager_test.go b/pkg/kubelet/cloudresource/cloud_request_manager_test.go index 4404e71de48..928fe58eefa 100644 --- a/pkg/kubelet/cloudresource/cloud_request_manager_test.go +++ b/pkg/kubelet/cloudresource/cloud_request_manager_test.go @@ -35,13 +35,12 @@ func 
createNodeInternalIPAddress(address string) []v1.NodeAddress { } } -func TestNodeAddressesRequest(t *testing.T) { - syncPeriod := 300 * time.Millisecond - maxRetry := 5 +func TestNodeAddressesDelay(t *testing.T) { + syncPeriod := 100 * time.Millisecond cloud := &fake.FakeCloud{ Addresses: createNodeInternalIPAddress("10.0.1.12"), // Set the request delay so the manager timeouts and collects the node addresses later - RequestDelay: 400 * time.Millisecond, + RequestDelay: 200 * time.Millisecond, } stopCh := make(chan struct{}) defer close(stopCh) @@ -61,6 +60,7 @@ func TestNodeAddressesRequest(t *testing.T) { cloud.SetNodeAddresses(createNodeInternalIPAddress("10.0.1.13")) // Wait until the IP address changes + maxRetry := 5 for i := 0; i < maxRetry; i++ { nodeAddresses, err := manager.NodeAddresses() t.Logf("nodeAddresses: %#v, err: %v", nodeAddresses, err) @@ -69,7 +69,7 @@ func TestNodeAddressesRequest(t *testing.T) { } // It is safe to read cloud.Addresses since no routine is changing the value at the same time if err == nil && nodeAddresses[0].Address != cloud.Addresses[0].Address { - time.Sleep(100 * time.Millisecond) + time.Sleep(syncPeriod) continue } if err != nil { From bf99565fbb7ff2454355881e82eeb71f1134f130 Mon Sep 17 00:00:00 2001 From: Mike Danese Date: Tue, 4 Dec 2018 13:26:55 -0800 Subject: [PATCH 3/3] fallback to previously collected addresses if a sync loop fails --- .../cloudresource/cloud_request_manager.go | 71 +++++++++++-------- .../cloud_request_manager_test.go | 52 ++++++++++++++ 2 files changed, 92 insertions(+), 31 deletions(-) diff --git a/pkg/kubelet/cloudresource/cloud_request_manager.go b/pkg/kubelet/cloudresource/cloud_request_manager.go index d6112b13078..879e131e375 100644 --- a/pkg/kubelet/cloudresource/cloud_request_manager.go +++ b/pkg/kubelet/cloudresource/cloud_request_manager.go @@ -70,17 +70,10 @@ func NewSyncManager(cloud cloudprovider.Interface, nodeName types.NodeName, sync } } -func (m *cloudResourceSyncManager) 
updateAddresses(addrs []v1.NodeAddress, err error) { - m.nodeAddressesMonitor.L.Lock() - defer m.nodeAddressesMonitor.L.Unlock() - defer m.nodeAddressesMonitor.Broadcast() - - m.nodeAddresses = addrs - m.nodeAddressesErr = err -} - -// NodeAddresses does not wait for cloud provider to return a node addresses. -// It always returns node addresses or an error. +// NodeAddresses waits for the first sync loop to run. If no successful syncs +// have run, it will return the most recent error. If node addresses have been +// synced successfully, it will return the list of node addresses from the most +// recent successful sync. func (m *cloudResourceSyncManager) NodeAddresses() ([]v1.NodeAddress, error) { m.nodeAddressesMonitor.L.Lock() defer m.nodeAddressesMonitor.L.Unlock() @@ -94,33 +87,49 @@ func (m *cloudResourceSyncManager) NodeAddresses() ([]v1.NodeAddress, error) { } } -func (m *cloudResourceSyncManager) collectNodeAddresses(ctx context.Context, nodeName types.NodeName) { - klog.V(5).Infof("Requesting node addresses from cloud provider for node %q", nodeName) - +// getNodeAddresses calls the cloud provider to get a current list of node addresses. +func (m *cloudResourceSyncManager) getNodeAddresses() ([]v1.NodeAddress, error) { + // TODO(roberthbailey): Can we do this without having credentials to talk to + // the cloud provider? + // TODO(justinsb): We can if CurrentNodeName() was actually CurrentNode() and + // returned an interface. + // TODO: If IP addresses couldn't be fetched from the cloud provider, should + // kubelet fallback on the other methods for getting the IP below? 
instances, ok := m.cloud.Instances() if !ok { - m.updateAddresses(nil, fmt.Errorf("failed to get instances from cloud provider")) + return nil, fmt.Errorf("failed to get instances from cloud provider") + } + return instances.NodeAddresses(context.TODO(), m.nodeName) +} + +func (m *cloudResourceSyncManager) syncNodeAddresses() { + klog.V(5).Infof("Requesting node addresses from cloud provider for node %q", m.nodeName) + + addrs, err := m.getNodeAddresses() + + m.nodeAddressesMonitor.L.Lock() + defer m.nodeAddressesMonitor.L.Unlock() + defer m.nodeAddressesMonitor.Broadcast() + + if err != nil { + klog.V(2).Infof("Node addresses from cloud provider for node %q not collected: %v", m.nodeName, err) + + if len(m.nodeAddresses) > 0 { + // in the event that a sync loop fails when a previous sync had + // succeeded, continue to use the old addresses. + return + } + + m.nodeAddressesErr = fmt.Errorf("failed to get node address from cloud provider: %v", err) return } - // TODO(roberthbailey): Can we do this without having credentials to talk - // to the cloud provider? - // TODO(justinsb): We can if CurrentNodeName() was actually CurrentNode() and returned an interface - // TODO: If IP addresses couldn't be fetched from the cloud provider, should kubelet fallback on the other methods for getting the IP below? - - nodeAddresses, err := instances.NodeAddresses(ctx, nodeName) - if err != nil { - m.updateAddresses(nil, fmt.Errorf("failed to get node address from cloud provider: %v", err)) - klog.V(2).Infof("Node addresses from cloud provider for node %q not collected", nodeName) - } else { - m.updateAddresses(nodeAddresses, nil) - klog.V(5).Infof("Node addresses from cloud provider for node %q collected", nodeName) - } + klog.V(5).Infof("Node addresses from cloud provider for node %q collected", m.nodeName) + m.nodeAddressesErr = nil + m.nodeAddresses = addrs } // Run starts the cloud resource sync manager's sync loop. 
func (m *cloudResourceSyncManager) Run(stopCh <-chan struct{}) {
-	wait.Until(func() {
-		m.collectNodeAddresses(context.TODO(), m.nodeName)
-	}, m.syncPeriod, stopCh)
+	wait.Until(m.syncNodeAddresses, m.syncPeriod, stopCh)
 }
diff --git a/pkg/kubelet/cloudresource/cloud_request_manager_test.go b/pkg/kubelet/cloudresource/cloud_request_manager_test.go
index 928fe58eefa..e6d829e8447 100644
--- a/pkg/kubelet/cloudresource/cloud_request_manager_test.go
+++ b/pkg/kubelet/cloudresource/cloud_request_manager_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package cloudresource
 
 import (
+	"errors"
 	"reflect"
 	"testing"
 	"time"
@@ -79,3 +80,54 @@ func TestNodeAddressesDelay(t *testing.T) {
 	}
 	t.Errorf("Timeout waiting for %q address to appear", cloud.Addresses[0].Address)
 }
+
+func TestNodeAddressesUsesLastSuccess(t *testing.T) {
+	cloud := &fake.FakeCloud{}
+	manager := NewSyncManager(cloud, "defaultNode", 0).(*cloudResourceSyncManager)
+
+	// These tests are stateful and order dependent.
+	tests := []struct {
+		name      string
+		addrs     []v1.NodeAddress
+		err       error
+		wantAddrs []v1.NodeAddress
+		wantErr   bool
+	}{
+		{
+			name:    "first sync loop encounters an error",
+			err:     errors.New("bad"),
+			wantErr: true,
+		},
+		{
+			name:      "subsequent sync loop succeeds",
+			addrs:     createNodeInternalIPAddress("10.0.1.12"),
+			wantAddrs: createNodeInternalIPAddress("10.0.1.12"),
+		},
+		{
+			name:      "subsequent sync loop encounters an error, last addresses returned",
+			err:       errors.New("bad"),
+			wantAddrs: createNodeInternalIPAddress("10.0.1.12"),
+		},
+		{
+			name:      "subsequent sync loop succeeds changing addresses",
+			addrs:     createNodeInternalIPAddress("10.0.1.13"),
+			wantAddrs: createNodeInternalIPAddress("10.0.1.13"),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			cloud.Addresses = test.addrs
+			cloud.Err = test.err
+
+			manager.syncNodeAddresses()
+			nodeAddresses, err := manager.NodeAddresses()
+			if (err != nil) != test.wantErr {
+				t.Errorf("unexpected err: %v", 
err) + } + if got, want := nodeAddresses, test.wantAddrs; !reflect.DeepEqual(got, want) { + t.Errorf("Unexpected diff of node addresses: %v", diff.ObjectReflectDiff(got, want)) + } + }) + } +}