mirror of
https://github.com/woodpecker-ci/woodpecker.git
synced 2025-09-16 05:40:14 +00:00
Gracefully shutdown agent (#3895)
This commit is contained in:
@@ -23,12 +23,12 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/tevino/abool/v2"
|
||||
"github.com/urfave/cli/v2"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/codes"
|
||||
grpc_credentials "google.golang.org/grpc/credentials"
|
||||
@@ -47,7 +47,43 @@ import (
|
||||
"go.woodpecker-ci.org/woodpecker/v2/version"
|
||||
)
|
||||
|
||||
const (
|
||||
reportHealthInterval = time.Second * 10
|
||||
authInterceptorRefreshInterval = time.Minute * 30
|
||||
)
|
||||
|
||||
const (
|
||||
shutdownTimeout = time.Second * 5
|
||||
)
|
||||
|
||||
var (
|
||||
stopAgentFunc context.CancelCauseFunc = func(error) {}
|
||||
shutdownCancelFunc context.CancelFunc = func() {}
|
||||
shutdownCtx = context.Background()
|
||||
)
|
||||
|
||||
func run(c *cli.Context, backends []types.Backend) error {
|
||||
ctx := utils.WithContextSigtermCallback(c.Context, func() {
|
||||
log.Info().Msg("termination signal is received, shutting down agent")
|
||||
})
|
||||
|
||||
agentCtx, ctxCancel := context.WithCancelCause(ctx)
|
||||
stopAgentFunc = func(err error) {
|
||||
msg := "shutdown of whole agent"
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg(msg)
|
||||
} else {
|
||||
log.Info().Msg(msg)
|
||||
}
|
||||
stopAgentFunc = func(error) {}
|
||||
shutdownCtx, shutdownCancelFunc = context.WithTimeout(shutdownCtx, shutdownTimeout)
|
||||
ctxCancel(err)
|
||||
}
|
||||
defer stopAgentFunc(nil)
|
||||
defer shutdownCancelFunc()
|
||||
|
||||
serviceWaitingGroup := errgroup.Group{}
|
||||
|
||||
agentConfigPath := c.String("agent-config")
|
||||
hostname := c.String("hostname")
|
||||
if len(hostname) == 0 {
|
||||
@@ -58,11 +94,23 @@ func run(c *cli.Context, backends []types.Backend) error {
|
||||
counter.Running = 0
|
||||
|
||||
if c.Bool("healthcheck") {
|
||||
go func() {
|
||||
if err := http.ListenAndServe(c.String("healthcheck-addr"), nil); err != nil {
|
||||
log.Error().Err(err).Msgf("cannot listen on address %s", c.String("healthcheck-addr"))
|
||||
}
|
||||
}()
|
||||
serviceWaitingGroup.Go(
|
||||
func() error {
|
||||
server := &http.Server{Addr: c.String("healthcheck-addr")}
|
||||
go func() {
|
||||
<-agentCtx.Done()
|
||||
log.Info().Msg("shutdown healthcheck server ...")
|
||||
if err := server.Shutdown(shutdownCtx); err != nil { //nolint:contextcheck
|
||||
log.Error().Err(err).Msg("shutdown healthcheck server failed")
|
||||
} else {
|
||||
log.Info().Msg("healthcheck server stopped")
|
||||
}
|
||||
}()
|
||||
if err := server.ListenAndServe(); err != nil {
|
||||
log.Error().Err(err).Msgf("cannot listen on address %s", c.String("healthcheck-addr"))
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
var transport grpc.DialOption
|
||||
@@ -88,8 +136,10 @@ func run(c *cli.Context, backends []types.Backend) error {
|
||||
agentConfig := readAgentConfig(agentConfigPath)
|
||||
|
||||
agentToken := c.String("grpc-token")
|
||||
grpcClientCtx, grpcClientCtxCancel := context.WithCancelCause(context.Background())
|
||||
defer grpcClientCtxCancel(nil)
|
||||
authClient := agent_rpc.NewAuthGrpcClient(authConn, agentToken, agentConfig.AgentID)
|
||||
authInterceptor, err := agent_rpc.NewAuthInterceptor(authClient, 30*time.Minute) //nolint:mnd
|
||||
authInterceptor, err := agent_rpc.NewAuthInterceptor(grpcClientCtx, authClient, authInterceptorRefreshInterval)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -110,30 +160,12 @@ func run(c *cli.Context, backends []types.Backend) error {
|
||||
defer conn.Close()
|
||||
|
||||
client := agent_rpc.NewGrpcClient(conn)
|
||||
agentConfigPersisted := atomic.Bool{}
|
||||
|
||||
sigterm := abool.New()
|
||||
ctx := metadata.NewOutgoingContext(
|
||||
context.Background(),
|
||||
metadata.Pairs("hostname", hostname),
|
||||
)
|
||||
|
||||
agentConfigPersisted := abool.New()
|
||||
ctx = utils.WithContextSigtermCallback(ctx, func() {
|
||||
log.Info().Msg("termination signal is received, shutting down")
|
||||
sigterm.Set()
|
||||
|
||||
// Remove stateless agents from server
|
||||
if agentConfigPersisted.IsNotSet() {
|
||||
log.Debug().Msg("unregistering agent from server")
|
||||
err := client.UnregisterAgent(ctx)
|
||||
if err != nil {
|
||||
log.Err(err).Msg("failed to unregister agent from server")
|
||||
}
|
||||
}
|
||||
})
|
||||
grpcCtx := metadata.NewOutgoingContext(grpcClientCtx, metadata.Pairs("hostname", hostname))
|
||||
|
||||
// check if grpc server version is compatible with agent
|
||||
grpcServerVersion, err := client.Version(ctx)
|
||||
grpcServerVersion, err := client.Version(grpcCtx)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("could not get grpc server version")
|
||||
return err
|
||||
@@ -147,12 +179,8 @@ func run(c *cli.Context, backends []types.Backend) error {
|
||||
return err
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
parallel := c.Int("max-workflows")
|
||||
wg.Add(parallel)
|
||||
|
||||
// new engine
|
||||
backendCtx := context.WithValue(ctx, types.CliContext, c)
|
||||
backendCtx := context.WithValue(agentCtx, types.CliContext, c)
|
||||
backendName := c.String("backend-engine")
|
||||
backendEngine, err := backend.FindBackend(backendCtx, backends, backendName)
|
||||
if err != nil {
|
||||
@@ -172,14 +200,34 @@ func run(c *cli.Context, backends []types.Backend) error {
|
||||
}
|
||||
log.Debug().Msgf("loaded %s backend engine", backendEngine.Name())
|
||||
|
||||
agentConfig.AgentID, err = client.RegisterAgent(ctx, engInfo.Platform, backendEngine.Name(), version.String(), parallel)
|
||||
maxWorkflows := c.Int("max-workflows")
|
||||
agentConfig.AgentID, err = client.RegisterAgent(grpcCtx, engInfo.Platform, backendEngine.Name(), version.String(), maxWorkflows)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
serviceWaitingGroup.Go(func() error {
|
||||
// we close grpc client context once unregister was handled
|
||||
defer grpcClientCtxCancel(nil)
|
||||
// we wait till agent context is done
|
||||
<-agentCtx.Done()
|
||||
// Remove stateless agents from server
|
||||
if !agentConfigPersisted.Load() {
|
||||
log.Debug().Msg("unregistering agent from server ...")
|
||||
// we want to run it explicit run when context got canceled so run it in background
|
||||
err := client.UnregisterAgent(grpcClientCtx)
|
||||
if err != nil {
|
||||
log.Err(err).Msg("failed to unregister agent from server")
|
||||
} else {
|
||||
log.Info().Msg("agent unregistered from server")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if agentConfigPath != "" {
|
||||
if err := writeAgentConfig(agentConfig, agentConfigPath); err == nil {
|
||||
agentConfigPersisted.Set()
|
||||
agentConfigPersisted.Store(true)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -200,66 +248,62 @@ func run(c *cli.Context, backends []types.Backend) error {
|
||||
|
||||
log.Debug().Msgf("agent registered with ID %d", agentConfig.AgentID)
|
||||
|
||||
go func() {
|
||||
serviceWaitingGroup.Go(func() error {
|
||||
for {
|
||||
if sigterm.IsSet() {
|
||||
log.Debug().Msg("terminating health reporting")
|
||||
return
|
||||
}
|
||||
|
||||
err := client.ReportHealth(ctx)
|
||||
err := client.ReportHealth(grpcCtx)
|
||||
if err != nil {
|
||||
log.Err(err).Msg("failed to report health")
|
||||
}
|
||||
|
||||
<-time.After(time.Second * 10)
|
||||
select {
|
||||
case <-agentCtx.Done():
|
||||
log.Debug().Msg("terminating health reporting")
|
||||
return nil
|
||||
case <-time.After(reportHealthInterval):
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
|
||||
for i := 0; i < parallel; i++ {
|
||||
for i := 0; i < maxWorkflows; i++ {
|
||||
i := i
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
r := agent.NewRunner(client, filter, hostname, counter, &backendEngine)
|
||||
serviceWaitingGroup.Go(func() error {
|
||||
runner := agent.NewRunner(client, filter, hostname, counter, &backendEngine)
|
||||
log.Debug().Msgf("created new runner %d", i)
|
||||
|
||||
for {
|
||||
if sigterm.IsSet() {
|
||||
log.Debug().Msgf("terminating runner %d", i)
|
||||
return
|
||||
if agentCtx.Err() != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Debug().Msg("polling new steps")
|
||||
if err := r.Run(ctx); err != nil {
|
||||
log.Error().Err(err).Msg("pipeline done with error")
|
||||
return
|
||||
if err := runner.Run(agentCtx, shutdownCtx); err != nil {
|
||||
log.Error().Err(err).Msg("runner done with error")
|
||||
return err
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
}
|
||||
|
||||
log.Info().Msgf(
|
||||
"starting Woodpecker agent with version '%s' and backend '%s' using platform '%s' running up to %d pipelines in parallel",
|
||||
version.String(), backendEngine.Name(), engInfo.Platform, parallel)
|
||||
version.String(), backendEngine.Name(), engInfo.Platform, maxWorkflows)
|
||||
|
||||
wg.Wait()
|
||||
return nil
|
||||
return serviceWaitingGroup.Wait()
|
||||
}
|
||||
|
||||
func runWithRetry(backendEngines []types.Backend) func(context *cli.Context) error {
|
||||
return func(context *cli.Context) error {
|
||||
if err := logger.SetupGlobalLogger(context, true); err != nil {
|
||||
func runWithRetry(backendEngines []types.Backend) func(c *cli.Context) error {
|
||||
return func(c *cli.Context) error {
|
||||
if err := logger.SetupGlobalLogger(c, true); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
initHealth()
|
||||
|
||||
retryCount := context.Int("connect-retry-count")
|
||||
retryDelay := context.Duration("connect-retry-delay")
|
||||
retryCount := c.Int("connect-retry-count")
|
||||
retryDelay := c.Duration("connect-retry-delay")
|
||||
var err error
|
||||
for i := 0; i < retryCount; i++ {
|
||||
if err = run(context, backendEngines); status.Code(err) == codes.Unavailable {
|
||||
if err = run(c, backendEngines); status.Code(err) == codes.Unavailable {
|
||||
log.Warn().Err(err).Msg(fmt.Sprintf("cannot connect to server, retrying in %v", retryDelay))
|
||||
time.Sleep(retryDelay)
|
||||
} else {
|
||||
|
Reference in New Issue
Block a user