diff --git a/pkg/master/master.go b/pkg/master/master.go index 24fec4ed937..2439411a614 100644 --- a/pkg/master/master.go +++ b/pkg/master/master.go @@ -807,7 +807,9 @@ func (m *Master) replaceTunnels(user, keyfile string, newAddrs []string) error { if err != nil { return err } - tunnels.Open() + if err := tunnels.Open(); err != nil { + return err + } if m.tunnels != nil { m.tunnels.Close() } @@ -844,31 +846,24 @@ func (m *Master) refreshTunnels(user, keyfile string) error { func (m *Master) setupSecureProxy(user, keyfile string) { // Sync loop for tunnels // TODO: switch this to watch. - go func() { - for { - if err := m.loadTunnels(user, keyfile); err != nil { - glog.Errorf("Failed to load SSH Tunnels: %v", err) - } - var sleep time.Duration - if len(m.tunnels) == 0 { - sleep = time.Second - } else { - // tunnels could lag behind current set of nodes - sleep = 10 * time.Second - } - time.Sleep(sleep) + go util.Until(func() { + if err := m.loadTunnels(user, keyfile); err != nil { + glog.Errorf("Failed to load SSH Tunnels: %v", err) } - }() + if len(m.tunnels) != 0 { + // Sleep for 10 seconds if we have some tunnels. + // TODO (cjcullen): tunnels can lag behind actually existing nodes. + time.Sleep(9 * time.Second) + } + }, 1*time.Second, util.NeverStop) // Refresh loop for tunnels // TODO: could make this more controller-ish - go func() { - for { - time.Sleep(5 * time.Minute) - if err := m.refreshTunnels(user, keyfile); err != nil { - glog.Errorf("Failed to refresh SSH Tunnels: %v", err) - } + go util.Until(func() { + time.Sleep(5 * time.Minute) + if err := m.refreshTunnels(user, keyfile); err != nil { + glog.Errorf("Failed to refresh SSH Tunnels: %v", err) } - }() + }, 0*time.Second, util.NeverStop) } func (m *Master) generateSSHKey(user, keyfile string) error { diff --git a/pkg/util/ssh.go b/pkg/util/ssh.go index 706190985ba..d4eed8a3795 100644 --- a/pkg/util/ssh.go +++ b/pkg/util/ssh.go @@ -32,9 +32,30 @@ import ( "time" "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/crypto/ssh" ) +var ( + tunnelOpenCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "ssh_tunnel_open_count", + Help: "Counter of ssh tunnel total open attempts", + }, + ) + tunnelOpenFailCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "ssh_tunnel_open_fail_count", + Help: "Counter of ssh tunnel failed open attempts", + }, + ) +) + +func init() { + prometheus.MustRegister(tunnelOpenCounter) + prometheus.MustRegister(tunnelOpenFailCounter) +} + // TODO: Unit tests for this code, we can spin up a test SSH server with instructions here: // https://godoc.org/golang.org/x/crypto/ssh#ServerConn type SSHTunnel struct { @@ -83,7 +104,9 @@ func makeSSHTunnel(user string, signer ssh.Signer, host string) (*SSHTunnel, err func (s *SSHTunnel) Open() error { var err error s.client, err = ssh.Dial("tcp", net.JoinHostPort(s.Host, s.SSHPort), s.Config) + tunnelOpenCounter.Inc() if err != nil { + tunnelOpenFailCounter.Inc() return err } return nil @@ -97,6 +120,9 @@ func (s *SSHTunnel) Dial(network, address string) (net.Conn, error) { } func (s *SSHTunnel) tunnel(conn net.Conn, remoteHost, remotePort string) error { + if s.client == nil { + return errors.New("tunnel is not opened.") + } tunnel, err := s.client.Dial("tcp", net.JoinHostPort(remoteHost, remotePort)) if err != nil { return err @@ -107,6 +133,9 @@ func (s *SSHTunnel) tunnel(conn net.Conn, remoteHost, remotePort string) error { } func (s *SSHTunnel) Close() error { + if s.client == nil { + return errors.New("Cannot close tunnel. Tunnel was not opened.") + } if err := s.client.Close(); err != nil { return err } @@ -196,9 +225,14 @@ func MakeSSHTunnels(user, keyfile string, addresses []string) (SSHTunnelList, er func (l SSHTunnelList) Open() error { for ix := range l { if err := l[ix].Tunnel.Open(); err != nil { - return err + // Remove a failed Open from the list. + glog.Errorf("Failed to open tunnel %v: %v", l[ix], err) + l = append(l[:ix], l[ix+1:]...) } } + if len(l) == 0 { + return errors.New("Failed to open any tunnels.") + } return nil } @@ -209,6 +243,7 @@ func (l SSHTunnelList) Close() { for ix := range l { entry := l[ix] go func() { + defer HandleCrash() time.Sleep(1 * time.Minute) if err := entry.Tunnel.Close(); err != nil { glog.Errorf("Failed to close tunnel %v: %v", entry, err)