diff --git a/Dockerfile.agent b/Dockerfile.agent index acf533bcc..96198ccab 100644 --- a/Dockerfile.agent +++ b/Dockerfile.agent @@ -1,9 +1,11 @@ # docker build --rm -f Dockerfile.agent -t drone/agent . +EXPOSE 3000 + FROM centurylink/ca-certs ENV GODEBUG=netdns=go ADD release/drone-agent /bin/ -ENTRYPOINT ["/bin/drone-agent"] - HEALTHCHECK CMD ["/bin/drone-agent", "ping"] + +ENTRYPOINT ["/bin/drone-agent"] diff --git a/cmd/drone-agent/agent.go b/cmd/drone-agent/agent.go index c8849290c..c4ce0a7ab 100644 --- a/cmd/drone-agent/agent.go +++ b/cmd/drone-agent/agent.go @@ -56,6 +56,9 @@ func loop(c *cli.Context) error { ) } + counter.Polling = c.Int("max-procs") + counter.Running = 0 + if c.BoolT("healthcheck") { go http.ListenAndServe(":3000", nil) } @@ -138,9 +141,22 @@ func run(ctx context.Context, client rpc.Peer, filter rpc.Filter) error { return nil } + timeout := time.Hour + if minutes := work.Timeout; minutes != 0 { + timeout = time.Duration(minutes) * time.Minute + } + + counter.Add( + work.ID, + timeout, + extractRepositoryName(work.Config), // hack + extractBuildNumber(work.Config), // hack + ) + defer counter.Done(work.ID) + logger := log.With(). - Str("repo", extractRepositoryName(work.Config)). - Str("build", extractBuildNumber(work.Config)). + Str("repo", extractRepositoryName(work.Config)). // hack + Str("build", extractBuildNumber(work.Config)). // hack Str("id", work.ID). Logger() @@ -157,11 +173,6 @@ func run(ctx context.Context, client rpc.Peer, filter rpc.Filter) error { return err } - timeout := time.Hour - if minutes := work.Timeout; minutes != 0 { - timeout = time.Duration(minutes) * time.Minute - } - ctx, cancel := context.WithTimeout(ctxmeta, timeout) defer cancel() diff --git a/cmd/drone-agent/health.go b/cmd/drone-agent/health.go index 8d0dd5ce3..99c68fca1 100644 --- a/cmd/drone-agent/health.go +++ b/cmd/drone-agent/health.go @@ -3,7 +3,10 @@ package main import ( "encoding/json" "fmt" + "io" "net/http" + "sync" + "time" "github.com/drone/drone/version" "github.com/urfave/cli" @@ -14,12 +17,17 @@ import ( // https://github.com/mozilla-services/Dockerflow func init() { - http.HandleFunc("/__heartbeat__", handleHeartbeat) - http.HandleFunc("/__version__", handleVersion) + http.HandleFunc("/varz", handleStats) + http.HandleFunc("/healthz", handleHeartbeat) + http.HandleFunc("/version", handleVersion) } func handleHeartbeat(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(200) + if counter.Healthy() { + w.WriteHeader(200) + } else { + w.WriteHeader(500) + } } func handleVersion(w http.ResponseWriter, r *http.Request) { @@ -31,15 +39,87 @@ func handleVersion(w http.ResponseWriter, r *http.Request) { }) } +func handleStats(w http.ResponseWriter, r *http.Request) { + if counter.Healthy() { + w.WriteHeader(200) + } else { + w.WriteHeader(500) + } + w.Header().Add("Content-Type", "text/json") + counter.writeTo(w) +} + type versionResp struct { Version string `json:"version"` Source string `json:"source"` } +// default statistics counter +var counter = &state{ + Metadata: map[string]info{}, +} + +type state struct { + sync.Mutex `json:"-"` + Polling int `json:"polling_count"` + Running int `json:"running_count"` + Metadata map[string]info `json:"running"` +} + +type info struct { + ID string `json:"id"` + Repo string `json:"repository"` + Build string `json:"build_number"` + Started time.Time `json:"build_started"` + Timeout time.Duration `json:"build_timeout"` +} + +func (s *state) Add(id string, timeout time.Duration, repo, build string) { + s.Lock() + s.Polling-- + s.Running++ + s.Metadata[id] = info{ + ID: id, + Repo: repo, + Build: build, + Timeout: timeout, + Started: time.Now().UTC(), + } + s.Unlock() +} + +func (s *state) Done(id string) { + s.Lock() + s.Polling++ + s.Running-- + delete(s.Metadata, id) + s.Unlock() +} + +func (s *state) Healthy() bool { + s.Lock() + defer s.Unlock() + now := time.Now() + buf := time.Hour // 1 hour buffer + for _, item := range s.Metadata { + if now.After(item.Started.Add(item.Timeout).Add(buf)) { + return false + } + } + return true +} + +func (s *state) writeTo(w io.Writer) (int, error) { + s.Lock() + out, _ := json.Marshal(s) + s.Unlock() + return w.Write(out) +} + // handles pinging the endpoint and returns an error if the // agent is in an unhealthy state. func pinger(c *cli.Context) error { - resp, err := http.Get("http://localhost:3000/__heartbeat__") + resp, err := http.Get("http://localhost:3000/healthz") if err != nil { return err } diff --git a/cmd/drone-agent/health_test.go b/cmd/drone-agent/health_test.go new file mode 100644 index 000000000..17505c867 --- /dev/null +++ b/cmd/drone-agent/health_test.go @@ -0,0 +1,45 @@ +package main + +import ( + "testing" + "time" +) + +func TestHealthy(t *testing.T) { + s := state{} + s.Metadata = map[string]info{} + + s.Add("1", time.Hour, "octocat/hello-world", "42") + + if got, want := s.Metadata["1"].ID, "1"; got != want { + t.Errorf("got ID %s, want %s", got, want) + } + if got, want := s.Metadata["1"].Timeout, time.Hour; got != want { + t.Errorf("got duration %v, want %v", got, want) + } + if got, want := s.Metadata["1"].Repo, "octocat/hello-world"; got != want { + t.Errorf("got repository name %s, want %s", got, want) + } + + s.Metadata["1"] = info{ + Timeout: time.Hour, + Started: time.Now().UTC(), + } + if s.Healthy() == false { + t.Error("want healthy status when timeout not exceeded, got false") + } + + s.Metadata["1"] = info{ + Started: time.Now().UTC().Add(-(time.Minute * 30)), + } + if s.Healthy() == false { + t.Error("want healthy status when timeout+buffer not exceeded, got false") + } + + s.Metadata["1"] = info{ + Started: time.Now().UTC().Add(-(time.Hour + time.Minute)), + } + if s.Healthy() == true { + t.Error("want unhealthy status when timeout+buffer not exceeded, got true") + } +}