e2e storage: close all pending streams when proxy listener gets closed

After updating gRPC in node-driver-registrar from v1.40.0 to v1.47.0, the
behavior of gRPC changed such that it no longer detected the single-sided
closing of the stream as a loss of connection. This caused gRPC in the
e2e.test binary to get stuck, possibly in a Read or Write on the HTTP stream,
because those calls have neither a context nor a timeout.

Changing the connection handling so that all active connections are tracked
in the listener and closed when the listener gets closed fixes this problem.
Patrick Ohly 2022-09-05 20:59:51 +02:00
parent bcea98234f
commit 0bd9a1a29a

@@ -24,7 +24,6 @@ import (
     "net"
     "net/http"
     "sync"
-    "sync/atomic"
     "time"
 
     v1 "k8s.io/api/core/v1"
@@ -45,6 +44,15 @@ import (
 // and the remote kernel which then aren't needed.
 const maxConcurrentConnections = 10
 
+// This delay determines how quickly we notice when someone has
+// connected inside the cluster. With socat, we cannot make this too small
+// because otherwise we get many rejected connections. With the mock
+// driver as proxy that doesn't happen as long as we don't
+// ask for too many concurrent connections because the mock driver
+// keeps the listening port open at all times and the Linux
+// kernel automatically accepts our connection requests.
+const connectionPollInterval = 100 * time.Millisecond
+
 // Listen creates a listener which returns new connections whenever someone connects
 // to a socat or mock driver proxy instance running inside the given pod.
 //
@@ -85,53 +93,50 @@ func Listen(ctx context.Context, clientset kubernetes.Interface, restConfig *res
     prefix := fmt.Sprintf("port forwarding for %s", addr)
     ctx, cancel := context.WithCancel(ctx)
     l := &listener{
-        connections: make(chan *connection),
         ctx:    ctx,
         cancel: cancel,
         addr:   addr,
     }
 
-    var connectionsCreated, connectionsClosed int32
+    var connectionsCreated int
 
     runForwarding := func() {
         klog.V(2).Infof("%s: starting connection polling", prefix)
         defer klog.V(2).Infof("%s: connection polling ended", prefix)
 
-        // This delay determines how quickly we notice when someone has
-        // connected inside the cluster. With socat, we cannot make this too small
-        // because otherwise we get many rejected connections. With the mock
-        // driver as proxy that doesn't happen as long as we don't
-        // ask for too many concurrent connections because the mock driver
-        // keeps the listening port open at all times and the Linux
-        // kernel automatically accepts our connection requests.
-        tryConnect := time.NewTicker(100 * time.Millisecond)
+        tryConnect := time.NewTicker(connectionPollInterval)
         defer tryConnect.Stop()
         for {
             select {
             case <-ctx.Done():
                 return
             case <-tryConnect.C:
-                currentClosed := atomic.LoadInt32(&connectionsClosed)
-                openConnections := connectionsCreated - currentClosed
-                if openConnections >= maxConcurrentConnections {
-                    break
-                }
-                klog.V(5).Infof("%s: trying to create a new connection #%d, %d open", prefix, connectionsCreated, openConnections)
+                func() {
+                    l.mutex.Lock()
+                    defer l.mutex.Unlock()
+
+                    for i, c := range l.connections {
+                        if c == nil {
+                            klog.V(5).Infof("%s: trying to create a new connection #%d", prefix, connectionsCreated)
                             stream, err := dial(ctx, fmt.Sprintf("%s #%d", prefix, connectionsCreated), dialer, addr.Port)
                             if err != nil {
                                 klog.Errorf("%s: no connection: %v", prefix, err)
-                                break
+                                return
                             }
                             // Make the connection available to Accept below.
                             klog.V(5).Infof("%s: created a new connection #%d", prefix, connectionsCreated)
-                            l.connections <- &connection{
+                            c := &connection{
+                                l:       l,
                                 stream:  stream,
                                 addr:    addr,
                                 counter: connectionsCreated,
-                                closed:  &connectionsClosed,
                             }
+                            l.connections[i] = c
                             connectionsCreated++
+                            return
+                        }
+                    }
+                }()
             }
         }
     }
@@ -248,9 +253,11 @@ func (s *stream) Close() {
 type listener struct {
     addr   Addr
-    connections chan *connection
     ctx    context.Context
     cancel func()
+
+    mutex       sync.Mutex
+    connections [maxConcurrentConnections]*connection
 }
 
 var _ net.Listener = &listener{}
@@ -258,25 +265,53 @@ var _ net.Listener = &listener{}
 func (l *listener) Close() error {
     klog.V(5).Infof("forward listener for %s: closing", l.addr)
     l.cancel()
+
+    l.mutex.Lock()
+    defer l.mutex.Unlock()
+    for _, c := range l.connections {
+        if c != nil {
+            c.stream.Close()
+        }
+    }
     return nil
 }
 
 func (l *listener) Accept() (net.Conn, error) {
+    tryAccept := time.NewTicker(connectionPollInterval)
+    defer tryAccept.Stop()
+    for {
         select {
         case <-l.ctx.Done():
             return nil, errors.New("listening was stopped")
-        case c := <-l.connections:
+        case <-tryAccept.C:
+            conn := func() net.Conn {
+                l.mutex.Lock()
+                defer l.mutex.Unlock()
+
+                for _, c := range l.connections {
+                    if c != nil && !c.accepted {
                         klog.V(5).Infof("forward listener for %s: got a new connection #%d", l.addr, c.counter)
-            return c, nil
+                        c.accepted = true
+                        return c
+                    }
+                }
+                return nil
+            }()
+            if conn != nil {
+                return conn, nil
+            }
         }
+    }
 }
 
 type connection struct {
+    l       *listener
     stream  *stream
     addr    Addr
-    counter int32
-    closed  *int32
+    counter int
     mutex   sync.Mutex
+    accepted, closed bool
 }
 
 var _ net.Conn = &connection{}
@@ -320,13 +355,21 @@ func (c *connection) Write(b []byte) (int, error) {
 func (c *connection) Close() error {
     c.mutex.Lock()
     defer c.mutex.Unlock()
-    if c.closed != nil {
+    if !c.closed {
         // Do the logging and book-keeping only once. The function itself may be called more than once.
         klog.V(5).Infof("forward connection #%d for %s: closing our side", c.counter, c.addr)
-        atomic.AddInt32(c.closed, 1)
-        c.closed = nil
+
+        c.l.mutex.Lock()
+        defer c.l.mutex.Unlock()
+        for i, c2 := range c.l.connections {
+            if c2 == c {
+                c.l.connections[i] = nil
+                break
+            }
+        }
     }
     c.stream.Close()
     return nil
 }