From 1325c2f8bec73251fd5e4cb5ce61784b9f0244d6 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 7 Feb 2018 17:24:43 +0100 Subject: [PATCH 1/2] devicemanager testing: dynamically choose tmp dir Hard-coding the tests to use /tmp/device_plugin for sockets is problematic because it prevents running tests in parallel on the same machine (perhaps because there are multiple developers, perhaps because testing is done independently on different code checkouts). /tmp/device_plugin also was not removed after testing. This is probably not that relevant. But more importantly, this change also fixes https://github.com/kubernetes/kubernetes/issues/59488. "make test" failed in TestDevicePluginReRegistration because something removed /tmp/device_plugin/device-plugin.sock while something else tried to connect to it: 2018/02/07 14:34:39 Starting to serve on /tmp/device_plugin/device-plugin.sock [pid 29568] connect(14, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/server.sock"}, 33) = 0 [pid 29568] unlinkat(AT_FDCWD, "/tmp/device_plugin/server.sock", 0) = 0 [pid 29568] unlinkat(AT_FDCWD, "/tmp/device_plugin/device-plugin.sock", 0) = 0 [pid 29568] --- SIGPIPE {si_signo=SIGPIPE, si_code=SI_USER, si_pid=29568, si_uid=1000} --- [pid 29568] connect(6, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = -1 ENOENT (No such file or directory) E0207 14:34:39.961321 29568 endpoint.go:117] listAndWatch ended unexpectedly for device plugin mock with error rpc error: code = Unavailable desc = transport is closing strace: Process 29623 attached [pid 29574] connect(3, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = -1 ENOENT (No such file or directory) [pid 29623] connect(3, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = -1 ENOENT (No such file or directory) [pid 29574] connect(3, {sa_family=AF_UNIX, sun_path="/tmp/device_plugin/device-plugin.sock"}, 40) = -1 ENOENT (No such file or directory) E0207 14:34:49.961324 29568 endpoint.go:60] Can't create new endpoint with path /tmp/device_plugin/device-plugin.sock err failed to dial device plugin: context deadline exceeded E0207 14:34:49.961390 29568 manager.go:340] Failed to dial device plugin with request &RegisterRequest{Version:v1alpha2,Endpoint:device-plugin.sock,ResourceName:fake-domain/resource,}: failed to dial device plugin: context deadline exceeded panic: test timed out after 2m0s It's not entirely certain which code was to blame for this unlinkat() calls (perhaps some cleanup code from a previous test running in a goroutine?) but this no longer happened after switching to per-test socket directories. --- pkg/kubelet/cm/devicemanager/manager_test.go | 34 ++++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go index 28cb53e2d0f..b11649f0bf5 100644 --- a/pkg/kubelet/cm/devicemanager/manager_test.go +++ b/pkg/kubelet/cm/devicemanager/manager_test.go @@ -40,18 +40,32 @@ import ( ) const ( - socketName = "/tmp/device_plugin/server.sock" - pluginSocketName = "/tmp/device_plugin/device-plugin.sock" testResourceName = "fake-domain/resource" ) +func tmpSocketDir() (socketDir, socketName, pluginSocketName string, err error) { + socketDir, err = ioutil.TempDir("", "device_plugin") + if err != nil { + return + } + socketName = socketDir + "/server.sock" + pluginSocketName = socketDir + "/device-plugin.sock" + return +} + func TestNewManagerImpl(t *testing.T) { - _, err := newManagerImpl(socketName) + socketDir, socketName, _, err := tmpSocketDir() + require.NoError(t, err) + defer os.RemoveAll(socketDir) + _, err = newManagerImpl(socketName) require.NoError(t, err) } func TestNewManagerImplStart(t *testing.T) { - m, p := setup(t, []*pluginapi.Device{}, func(n string, a, u, r []pluginapi.Device) {}) + socketDir, socketName, pluginSocketName, err := tmpSocketDir() + require.NoError(t, err) + defer os.RemoveAll(socketDir) + m, p := setup(t, []*pluginapi.Device{}, func(n string, a, u, r []pluginapi.Device) {}, socketName, pluginSocketName) cleanup(t, m, p) } @@ -59,6 +73,9 @@ func TestNewManagerImplStart(t *testing.T) { // making sure that after registration, devices are correctly updated and if a re-registration // happens, we will NOT delete devices; and no orphaned devices left. func TestDevicePluginReRegistration(t *testing.T) { + socketDir, socketName, pluginSocketName, err := tmpSocketDir() + require.NoError(t, err) + defer os.RemoveAll(socketDir) devs := []*pluginapi.Device{ {ID: "Dev1", Health: pluginapi.Healthy}, {ID: "Dev2", Health: pluginapi.Healthy}, @@ -77,7 +94,7 @@ func TestDevicePluginReRegistration(t *testing.T) { } callbackChan <- callbackCount } - m, p1 := setup(t, devs, callback) + m, p1 := setup(t, devs, callback, socketName, pluginSocketName) atomic.StoreInt32(&expCallbackCount, 1) p1.Register(socketName, testResourceName) // Wait for the first callback to be issued. @@ -87,7 +104,7 @@ func TestDevicePluginReRegistration(t *testing.T) { require.Equal(t, 2, len(devices[testResourceName]), "Devices are not updated.") p2 := NewDevicePluginStub(devs, pluginSocketName+".new") - err := p2.Start() + err = p2.Start() require.NoError(t, err) atomic.StoreInt32(&expCallbackCount, 2) p2.Register(socketName, testResourceName) @@ -114,7 +131,7 @@ func TestDevicePluginReRegistration(t *testing.T) { close(callbackChan) } -func setup(t *testing.T, devs []*pluginapi.Device, callback monitorCallback) (Manager, *Stub) { +func setup(t *testing.T, devs []*pluginapi.Device, callback monitorCallback, socketName string, pluginSocketName string) (Manager, *Stub) { m, err := newManagerImpl(socketName) require.NoError(t, err) @@ -139,6 +156,9 @@ func cleanup(t *testing.T, m Manager, p *Stub) { } func TestUpdateCapacityAllocatable(t *testing.T) { + socketDir, socketName, _, err := tmpSocketDir() + require.NoError(t, err) + defer os.RemoveAll(socketDir) testManager, err := newManagerImpl(socketName) as := assert.New(t) as.NotNil(testManager) From 0d828e061b5600082ccbb6bee04f064d37bc695b Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 9 Feb 2018 14:01:32 +0100 Subject: [PATCH 2/2] devicemanager testing: time out sooner Each individual step should not take longer than a second. Suggest by Vikas Choudhary (https://github.com/kubernetes/kubernetes/pull/59489#discussion_r167205672). --- pkg/kubelet/cm/devicemanager/manager_test.go | 22 +++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go index b11649f0bf5..902e429f787 100644 --- a/pkg/kubelet/cm/devicemanager/manager_test.go +++ b/pkg/kubelet/cm/devicemanager/manager_test.go @@ -24,6 +24,7 @@ import ( "reflect" "sync/atomic" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -99,7 +100,12 @@ func TestDevicePluginReRegistration(t *testing.T) { p1.Register(socketName, testResourceName) // Wait for the first callback to be issued. - <-callbackChan + select { + case <-callbackChan: + break + case <-time.After(time.Second): + t.FailNow() + } devices := m.Devices() require.Equal(t, 2, len(devices[testResourceName]), "Devices are not updated.") @@ -109,7 +115,12 @@ func TestDevicePluginReRegistration(t *testing.T) { atomic.StoreInt32(&expCallbackCount, 2) p2.Register(socketName, testResourceName) // Wait for the second callback to be issued. - <-callbackChan + select { + case <-callbackChan: + break + case <-time.After(time.Second): + t.FailNow() + } devices2 := m.Devices() require.Equal(t, 2, len(devices2[testResourceName]), "Devices shouldn't change.") @@ -121,7 +132,12 @@ func TestDevicePluginReRegistration(t *testing.T) { atomic.StoreInt32(&expCallbackCount, 3) p3.Register(socketName, testResourceName) // Wait for the second callback to be issued. - <-callbackChan + select { + case <-callbackChan: + break + case <-time.After(time.Second): + t.FailNow() + } devices3 := m.Devices() require.Equal(t, 1, len(devices3[testResourceName]), "Devices of plugin previously registered should be removed.")