diff --git a/test/e2e_node/services/internal_services.go b/test/e2e_node/services/internal_services.go new file mode 100644 index 00000000000..b1edf9e4784 --- /dev/null +++ b/test/e2e_node/services/internal_services.go @@ -0,0 +1,148 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package services + +import ( + "io/ioutil" + "os" + "os/signal" + "syscall" + + "github.com/golang/glog" +) + +// e2eService manages e2e services in current process. +type e2eServices struct { + rmDirs []string + // statically linked e2e services + etcdServer *EtcdServer + apiServer *APIServer + nsController *NamespaceController +} + +func newE2EServices() *e2eServices { + return &e2eServices{} +} + +// terminationSignals are signals that cause the program to exit in the +// supported platforms (linux, darwin, windows). +var terminationSignals = []os.Signal{syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT} + +// run starts all e2e services and wait for the termination signal. Once receives the +// termination signal, it will stop the e2e services gracefully. +func (es *e2eServices) run() error { + defer es.stop() + if err := es.start(); err != nil { + return err + } + // Wait until receiving a termination signal. + sig := make(chan os.Signal, 1) + signal.Notify(sig, terminationSignals...) + <-sig + return nil +} + +// start starts the tests embedded services or returns an error. +func (es *e2eServices) start() error { + glog.Info("Starting e2e services...") + err := es.startEtcd() + if err != nil { + return err + } + err = es.startApiServer() + if err != nil { + return err + } + err = es.startNamespaceController() + if err != nil { + return nil + } + glog.Info("E2E services started.") + return nil +} + +// stop stops the embedded e2e services. +func (es *e2eServices) stop() { + glog.Info("Stopping e2e services...") + // TODO(random-liu): Use a loop to stop all services after introducing + // service interface. + glog.Info("Stopping namespace controller") + if es.nsController != nil { + if err := es.nsController.Stop(); err != nil { + glog.Errorf("Failed to stop %q: %v", es.nsController.Name(), err) + } + } + + glog.Info("Stopping API server") + if es.apiServer != nil { + if err := es.apiServer.Stop(); err != nil { + glog.Errorf("Failed to stop %q: %v", es.apiServer.Name(), err) + } + } + + glog.Info("Stopping etcd") + if es.etcdServer != nil { + if err := es.etcdServer.Stop(); err != nil { + glog.Errorf("Failed to stop %q: %v", es.etcdServer.Name(), err) + } + } + + for _, d := range es.rmDirs { + glog.Info("Deleting directory %v", d) + err := os.RemoveAll(d) + if err != nil { + glog.Errorf("Failed to delete directory %s.\n%v", d, err) + } + } + + glog.Info("E2E services stopped.") +} + +// startEtcd starts the embedded etcd instance or returns an error. +func (es *e2eServices) startEtcd() error { + glog.Info("Starting etcd") + dataDir, err := ioutil.TempDir("", "node-e2e") + if err != nil { + return err + } + // Mark the dataDir as directories to remove. + es.rmDirs = append(es.rmDirs, dataDir) + es.etcdServer = NewEtcd(dataDir) + return es.etcdServer.Start() +} + +// startApiServer starts the embedded API server or returns an error. +func (es *e2eServices) startApiServer() error { + glog.Info("Starting API server") + es.apiServer = NewAPIServer() + return es.apiServer.Start() +} + +// startNamespaceController starts the embedded namespace controller or returns an error. +func (es *e2eServices) startNamespaceController() error { + glog.Info("Starting namespace controller") + es.nsController = NewNamespaceController() + return es.nsController.Start() +} + +// getServicesHealthCheckURLs returns the health check urls for the internal services. +func getServicesHealthCheckURLs() []string { + return []string{ + getEtcdHealthCheckURL(), + getAPIServerHealthCheckURL(), + } +} diff --git a/test/e2e_node/services/server.go b/test/e2e_node/services/server.go new file mode 100644 index 00000000000..1ed3ddb9ea2 --- /dev/null +++ b/test/e2e_node/services/server.go @@ -0,0 +1,366 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package services + +import ( + "flag" + "fmt" + "net/http" + "os" + "os/exec" + "path" + "reflect" + "strconv" + "strings" + "syscall" + "time" + + "github.com/golang/glog" + + "k8s.io/kubernetes/test/e2e/framework" +) + +var serverStartTimeout = flag.Duration("server-start-timeout", time.Second*120, "Time to wait for each server to become healthy.") + +// A server manages a separate server process started and killed with +// commands. +type server struct { + // name is the name of the server, it is only used for logging. + name string + // startCommand is the command used to start the server + startCommand *exec.Cmd + // killCommand is the command used to stop the server. It is not required. If it + // is not specified, `sudo kill` will be used to stop the server. + killCommand *exec.Cmd + // restartCommand is the command used to restart the server. If provided, it will be used + // instead of startCommand when restarting the server. + restartCommand *exec.Cmd + // healthCheckUrls is the urls used to check whether the server is ready. + healthCheckUrls []string + // outFilename is the name of the log file. The stdout and stderr of the server + // will be redirected to this file. + outFilename string + // restartOnExit determines whether a restart loop is launched with the server + restartOnExit bool + // Writing to this channel, if it is not nil, stops the restart loop. + // When tearing down a server, you should check for this channel and write to it if it exists. + stopRestartingCh chan<- bool + // Read from this to confirm that the restart loop has stopped. + ackStopRestartingCh <-chan bool +} + +// newServer returns a new server with the given name, commands, health check +// URLs, etc. +func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, restartOnExit bool) *server { + return &server{ + name: name, + startCommand: start, + killCommand: kill, + restartCommand: restart, + healthCheckUrls: urls, + outFilename: outputFileName, + restartOnExit: restartOnExit, + } +} + +// commandToString format command to string. +func commandToString(c *exec.Cmd) string { + if c == nil { + return "" + } + return strings.Join(append([]string{c.Path}, c.Args[1:]...), " ") +} + +func (s *server) String() string { + return fmt.Sprintf("server %q start-command: `%s`, kill-command: `%s`, restart-command: `%s`, health-check: %v, output-file: %q", s.name, + commandToString(s.startCommand), commandToString(s.killCommand), commandToString(s.restartCommand), s.healthCheckUrls, s.outFilename) +} + +// readinessCheck checks whether services are ready via the supplied health +// check URLs. Once there is an error in errCh, the function will stop waiting +// and return the error. +// TODO(random-liu): Move this to util +func readinessCheck(name string, urls []string, errCh <-chan error) error { + glog.Infof("Running readiness check for service %q", name) + endTime := time.Now().Add(*serverStartTimeout) + blockCh := make(chan error) + defer close(blockCh) + for endTime.After(time.Now()) { + select { + // We *always* want to run the health check if there is no error on the channel. + // With systemd, reads from errCh report nil because cmd.Run() waits + // on systemd-run, rather than the service process. systemd-run quickly + // exits with status 0, causing the channel to be closed with no error. In + // this case, you want to wait for the health check to complete, rather + // than returning from readinessCheck as soon as the channel is closed. + case err, ok := <-errCh: + if ok { // The channel is not closed, this is a real error + if err != nil { // If there is an error, return it + return err + } + // If not, keep checking readiness. + } else { // The channel is closed, this is only a zero value. + // Replace the errCh with blockCh to avoid busy loop, + // and keep checking readiness. + errCh = blockCh + } + case <-time.After(time.Second): + ready := true + for _, url := range urls { + resp, err := http.Head(url) + if err != nil || resp.StatusCode != http.StatusOK { + ready = false + break + } + } + if ready { + return nil + } + } + } + return fmt.Errorf("e2e service %q readiness check timeout %v", name, *serverStartTimeout) +} + +// start starts the server by running its commands, monitors it with a health +// check, and ensures that it is restarted if applicable. +// +// Note: restartOnExit == true requires len(s.healthCheckUrls) > 0 to work properly. +func (s *server) start() error { + glog.Infof("Starting server %q with command %q", s.name, commandToString(s.startCommand)) + errCh := make(chan error) + + // Set up restart channels if the server is configured for restart on exit. + var stopRestartingCh, ackStopRestartingCh chan bool + if s.restartOnExit { + if len(s.healthCheckUrls) == 0 { + return fmt.Errorf("Tried to start %s which has s.restartOnExit == true, but no health check urls provided.", s) + } + + stopRestartingCh = make(chan bool) + ackStopRestartingCh = make(chan bool) + + s.stopRestartingCh = stopRestartingCh + s.ackStopRestartingCh = ackStopRestartingCh + } + + // This goroutine actually runs the start command for the server. + go func() { + defer close(errCh) + + // Create the output filename + outPath := path.Join(framework.TestContext.ReportDir, s.outFilename) + outfile, err := os.Create(outPath) + if err != nil { + errCh <- fmt.Errorf("failed to create file %q for `%s` %v.", outPath, s, err) + return + } else { + glog.Infof("Output file for server %q: %v", s.name, outfile.Name()) + } + defer outfile.Close() + defer outfile.Sync() + + // Set the command to write the output file + s.startCommand.Stdout = outfile + s.startCommand.Stderr = outfile + + // Death of this test process should kill the server as well. + attrs := &syscall.SysProcAttr{} + // Hack to set linux-only field without build tags. + deathSigField := reflect.ValueOf(attrs).Elem().FieldByName("Pdeathsig") + if deathSigField.IsValid() { + deathSigField.Set(reflect.ValueOf(syscall.SIGTERM)) + } else { + errCh <- fmt.Errorf("failed to set Pdeathsig field (non-linux build)") + return + } + s.startCommand.SysProcAttr = attrs + + // Start the command + err = s.startCommand.Start() + if err != nil { + errCh <- fmt.Errorf("failed to run %s: %v", s, err) + return + } + if !s.restartOnExit { + glog.Infof("Waiting for server %q start command to complete", s.name) + // If we aren't planning on restarting, ok to Wait() here to release resources. + // Otherwise, we Wait() in the restart loop. + err = s.startCommand.Wait() + if err != nil { + errCh <- fmt.Errorf("failed to run start command for server %q: %v", s.name, err) + return + } + } else { + usedStartCmd := true + for { + glog.Infof("Running health check for service %q", s.name) + // Wait for an initial health check to pass, so that we are sure the server started. + err := readinessCheck(s.name, s.healthCheckUrls, nil) + if err != nil { + if usedStartCmd { + glog.Infof("Waiting for server %q start command to complete after initial health check failed", s.name) + s.startCommand.Wait() // Release resources if necessary. + } + // This should not happen, immediately stop the e2eService process. + glog.Fatalf("Restart loop readinessCheck failed for %s", s) + } else { + glog.Infof("Initial health check passed for service %q", s.name) + } + + // Initial health check passed, wait until a health check fails again. + stillAlive: + for { + select { + case <-stopRestartingCh: + ackStopRestartingCh <- true + return + case <-time.After(time.Second): + for _, url := range s.healthCheckUrls { + resp, err := http.Head(url) + if err != nil || resp.StatusCode != http.StatusOK { + break stillAlive + } + } + } + } + + if usedStartCmd { + s.startCommand.Wait() // Release resources from last cmd + usedStartCmd = false + } + if s.restartCommand != nil { + // Always make a fresh copy of restartCommand before + // running, we may have to restart multiple times + s.restartCommand = &exec.Cmd{ + Path: s.restartCommand.Path, + Args: s.restartCommand.Args, + Env: s.restartCommand.Env, + Dir: s.restartCommand.Dir, + Stdin: s.restartCommand.Stdin, + Stdout: s.restartCommand.Stdout, + Stderr: s.restartCommand.Stderr, + ExtraFiles: s.restartCommand.ExtraFiles, + SysProcAttr: s.restartCommand.SysProcAttr, + } + // Run and wait for exit. This command is assumed to have + // short duration, e.g. systemctl restart + glog.Infof("Restarting server %q with restart command", s.name) + err = s.restartCommand.Run() + if err != nil { + // This should not happen, immediately stop the e2eService process. + glog.Fatalf("Restarting server %s with restartCommand failed. Error: %v.", s, err) + } + } else { + s.startCommand = &exec.Cmd{ + Path: s.startCommand.Path, + Args: s.startCommand.Args, + Env: s.startCommand.Env, + Dir: s.startCommand.Dir, + Stdin: s.startCommand.Stdin, + Stdout: s.startCommand.Stdout, + Stderr: s.startCommand.Stderr, + ExtraFiles: s.startCommand.ExtraFiles, + SysProcAttr: s.startCommand.SysProcAttr, + } + glog.Infof("Restarting server %q with start command", s.name) + err = s.startCommand.Start() + usedStartCmd = true + if err != nil { + // This should not happen, immediately stop the e2eService process. + glog.Fatalf("Restarting server %s with startCommand failed. Error: %v.", s, err) + } + } + } + } + }() + + return readinessCheck(s.name, s.healthCheckUrls, errCh) +} + +// kill runs the server's kill command. +func (s *server) kill() error { + glog.Infof("Kill server %q", s.name) + name := s.name + cmd := s.startCommand + + // If s has a restart loop, turn it off. + if s.restartOnExit { + s.stopRestartingCh <- true + <-s.ackStopRestartingCh + } + + if s.killCommand != nil { + return s.killCommand.Run() + } + + if cmd == nil { + return fmt.Errorf("could not kill %q because both `killCommand` and `startCommand` are nil", name) + } + + if cmd.Process == nil { + glog.V(2).Infof("%q not running", name) + return nil + } + pid := cmd.Process.Pid + if pid <= 1 { + return fmt.Errorf("invalid PID %d for %q", pid, name) + } + + // Attempt to shut down the process in a friendly manner before forcing it. + waitChan := make(chan error) + go func() { + _, err := cmd.Process.Wait() + waitChan <- err + close(waitChan) + }() + + const timeout = 10 * time.Second + for _, signal := range []string{"-TERM", "-KILL"} { + glog.V(2).Infof("Killing process %d (%s) with %s", pid, name, signal) + cmd := exec.Command("sudo", "kill", signal, strconv.Itoa(pid)) + + // Run the 'kill' command in a separate process group so sudo doesn't ignore it + attrs := &syscall.SysProcAttr{} + // Hack to set unix-only field without build tags. + setpgidField := reflect.ValueOf(attrs).Elem().FieldByName("Setpgid") + if setpgidField.IsValid() { + setpgidField.Set(reflect.ValueOf(true)) + } else { + return fmt.Errorf("Failed to set Setpgid field (non-unix build)") + } + cmd.SysProcAttr = attrs + + _, err := cmd.Output() + if err != nil { + glog.Errorf("Error signaling process %d (%s) with %s: %v", pid, name, signal, err) + continue + } + + select { + case err := <-waitChan: + if err != nil { + return fmt.Errorf("error stopping %q: %v", name, err) + } + // Success! + return nil + case <-time.After(timeout): + // Continue. + } + } + + return fmt.Errorf("unable to stop %q", name) +} diff --git a/test/e2e_node/services/services.go b/test/e2e_node/services/services.go index 6cb1dfbbd29..80f97f1f3bd 100644 --- a/test/e2e_node/services/services.go +++ b/test/e2e_node/services/services.go @@ -17,21 +17,14 @@ limitations under the License. package services import ( - "flag" "fmt" "io/ioutil" "math/rand" - "net/http" "os" "os/exec" - "os/signal" "path" "path/filepath" - "reflect" - "strconv" "strings" - "syscall" - "time" "github.com/golang/glog" "github.com/kardianos/osext" @@ -41,132 +34,12 @@ import ( "k8s.io/kubernetes/test/e2e_node/build" ) -// TODO(random-liu): Move this file to a separate package. -var serverStartTimeout = flag.Duration("server-start-timeout", time.Second*120, "Time to wait for each server to become healthy.") - // E2EServices starts and stops e2e services in a separate process. The test // uses it to start and stop all e2e services. type E2EServices struct { services *server -} - -// NewE2EServices returns a new E2EServices instance. -func NewE2EServices() *E2EServices { - return &E2EServices{} -} - -// services.log is the combined log of all services -const servicesLogFile = "services.log" - -// Start starts the e2e services in another process by calling back into the -// test binary. Returns when all e2e services are ready or an error. -// -// We want to statically link e2e services into the test binary, but we don't -// want their glog output to pollute the test result. So we run the binary in -// run- services-mode to start e2e services in another process. -func (e *E2EServices) Start() error { - var err error - // Create the manifest path for kubelet. - // TODO(random-liu): Remove related logic when we move kubelet starting logic out of the test. - framework.TestContext.ManifestPath, err = ioutil.TempDir("", "node-e2e-pod") - if err != nil { - return fmt.Errorf("failed to create static pod manifest directory: %v", err) - } - testBin, err := osext.Executable() - if err != nil { - return fmt.Errorf("can't get current binary: %v", err) - } - // TODO(random-liu): Add sudo after we statically link apiserver and etcd, because apiserver needs - // sudo. We can't add sudo now, because etcd may not be in PATH of root. - startCmd := exec.Command(testBin, - // TODO(mtaufen): Flags e.g. that target the TestContext need to be manually forwarded to the - // test binary when we start it up in run-services mode. This is not ideal. - // Very unintuitive because it prevents any falgs NOT manually forwarded here - // from being set via TEST_ARGS when running tests from the command line. - "--run-services-mode", - "--server-start-timeout", serverStartTimeout.String(), - "--report-dir", framework.TestContext.ReportDir, - // TODO(random-liu): Remove the following flags after we move kubelet starting logic - // out of the test. - "--node-name", framework.TestContext.NodeName, - "--disable-kubenet="+strconv.FormatBool(framework.TestContext.DisableKubenet), - // TODO: enable when flag is introduced in 1.5 - // "--cgroups-per-qos="+strconv.FormatBool(framework.TestContext.CgroupsPerQOS), - "--manifest-path", framework.TestContext.ManifestPath, - "--eviction-hard", framework.TestContext.EvictionHard, - "--feature-gates", framework.TestContext.FeatureGates, - "--runtime-integration-type", framework.TestContext.RuntimeIntegrationType, - "--logtostderr", - "--vmodule=*=4", - ) - e.services = newServer("services", startCmd, nil, nil, getHealthCheckURLs(), servicesLogFile, false) - return e.services.start() -} - -// Stop stops the e2e services. -func (e *E2EServices) Stop() error { - defer func() { - // Cleanup the manifest path for kubelet. - manifestPath := framework.TestContext.ManifestPath - if manifestPath != "" { - err := os.RemoveAll(manifestPath) - if err != nil { - glog.Errorf("Failed to delete static pod manifest directory %s: %v", manifestPath, err) - } - } - }() - if e.services == nil { - glog.Errorf("can't stop e2e services, because `services` is nil") - } - return e.services.kill() -} - -// RunE2EServices actually start the e2e services. This function is used to -// start e2e services in current process. This is only used in run-services-mode. -func RunE2EServices() { - // Populate global DefaultFeatureGate with value from TestContext.FeatureGates. - // This way, statically-linked components see the same feature gate config as the test context. - utilconfig.DefaultFeatureGate.Set(framework.TestContext.FeatureGates) - e := newE2EService() - if err := e.run(); err != nil { - glog.Fatalf("Failed to run e2e services: %v", err) - } -} - -// Ports of different e2e services. -const ( - kubeletPort = "10250" - kubeletReadOnlyPort = "10255" -) - -// Health check urls of different e2e services. -var ( - kubeletHealthCheckURL = getEndpoint(kubeletReadOnlyPort) + "/healthz" -) - -// getEndpoint generates endpoint url from service port. -func getEndpoint(port string) string { - return "http://127.0.0.1:" + port -} - -func getHealthCheckURLs() []string { - return []string{ - getEtcdHealthCheckURL(), - getAPIServerHealthCheckURL(), - kubeletHealthCheckURL, - } -} - -// e2eService manages e2e services in current process. -type e2eService struct { - services []*server - rmDirs []string + kubelet *server logFiles map[string]logFileData - - // statically linked e2e services - etcdServer *EtcdServer - apiServer *APIServer - nsController *NamespaceController } // logFileData holds data about logfiles to fetch with a journalctl command or @@ -176,185 +49,123 @@ type logFileData struct { journalctlCommand []string } +// NewE2EServices returns a new E2EServices instance. +func NewE2EServices() *E2EServices { + return &E2EServices{ + // Special log files that need to be collected for additional debugging. + logFiles: map[string]logFileData{ + "kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}}, + "docker.log": {[]string{"/var/log/docker.log", "/var/log/upstart/docker.log"}, []string{"-u", "docker"}}, + "cloud-init.log": {[]string{"/var/log/cloud-init.log"}, []string{"-u", "cloud*"}}, + }, + } +} + +// Start starts the e2e services in another process by calling back into the +// test binary. Returns when all e2e services are ready or an error. +// +// We want to statically link e2e services into the test binary, but we don't +// want their glog output to pollute the test result. So we run the binary in +// run-services-mode to start e2e services in another process. +// The function starts 2 processes: +// * internal e2e services: services which statically linked in the test binary - apiserver, etcd and +// namespace controller. +// * kubelet: kubelet binary is outside. (We plan to move main kubelet start logic out when we have +// standard kubelet launcher) +func (e *E2EServices) Start() error { + var err error + // Start kubelet + // Create the manifest path for kubelet. + // TODO(random-liu): Remove related logic when we move kubelet starting logic out of the test. + framework.TestContext.ManifestPath, err = ioutil.TempDir("", "node-e2e-pod") + if err != nil { + return fmt.Errorf("failed to create static pod manifest directory: %v", err) + } + e.kubelet, err = e.startKubelet() + if err != nil { + return fmt.Errorf("failed to start kubelet: %v", err) + } + e.services, err = e.startInternalServices() + return err +} + +// Stop stops the e2e services. +func (e *E2EServices) Stop() { + defer func() { + // Collect log files. + e.getLogFiles() + // Cleanup the manifest path for kubelet. + manifestPath := framework.TestContext.ManifestPath + if manifestPath != "" { + err := os.RemoveAll(manifestPath) + if err != nil { + glog.Errorf("Failed to delete static pod manifest directory %s: %v", manifestPath, err) + } + } + }() + if e.services != nil { + if err := e.services.kill(); err != nil { + glog.Errorf("Failed to stop services: %v", err) + } + } + if e.kubelet != nil { + if err := e.kubelet.kill(); err != nil { + glog.Errorf("Failed to stop kubelet: %v", err) + } + } +} + +// RunE2EServices actually start the e2e services. This function is used to +// start e2e services in current process. This is only used in run-services-mode. +func RunE2EServices() { + // Populate global DefaultFeatureGate with value from TestContext.FeatureGates. + // This way, statically-linked components see the same feature gate config as the test context. + utilconfig.DefaultFeatureGate.Set(framework.TestContext.FeatureGates) + e := newE2EServices() + if err := e.run(); err != nil { + glog.Fatalf("Failed to run e2e services: %v", err) + } +} + const ( - // This is consistent with the level used in a cluster e2e test. + // services.log is the combined log of all services + servicesLogFile = "services.log" + // LOG_VERBOSITY_LEVEL is consistent with the level used in a cluster e2e test. LOG_VERBOSITY_LEVEL = "4" ) -func newE2EService() *e2eService { - // Special log files that need to be collected for additional debugging. - var logFiles = map[string]logFileData{ - "kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}}, - "docker.log": {[]string{"/var/log/docker.log", "/var/log/upstart/docker.log"}, []string{"-u", "docker"}}, - "cloud-init.log": {[]string{"/var/log/cloud-init.log"}, []string{"-u", "cloud*"}}, - } - - return &e2eService{logFiles: logFiles} -} - -// terminationSignals are signals that cause the program to exit in the -// supported platforms (linux, darwin, windows). -var terminationSignals = []os.Signal{syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT} - -// run starts all e2e services and wait for the termination signal. Once receives the -// termination signal, it will stop the e2e services gracefully. -func (es *e2eService) run() error { - defer es.stop() - if err := es.start(); err != nil { - return err - } - // Wait until receiving a termination signal. - sig := make(chan os.Signal, 1) - signal.Notify(sig, terminationSignals...) - <-sig - return nil -} - -// start starts the tests embedded services or returns an error. -func (es *e2eService) start() error { - glog.Info("Starting e2e services...") - err := es.startEtcd() +// startInternalServices starts the internal services in a separate process. +func (e *E2EServices) startInternalServices() (*server, error) { + testBin, err := osext.Executable() if err != nil { - return err + return nil, fmt.Errorf("can't get current binary: %v", err) } - - err = es.startApiServer() - if err != nil { - return err - } - - s, err := es.startKubelet() - if err != nil { - return err - } - es.services = append(es.services, s) - - err = es.startNamespaceController() - if err != nil { - return nil - } - glog.Info("E2E services started.") - return nil + startCmd := exec.Command("sudo", testBin, + // TODO(mtaufen): Flags e.g. that target the TestContext need to be manually forwarded to the + // test binary when we start it up in run-services mode. This is not ideal. + // Very unintuitive because it prevents any flags NOT manually forwarded here + // from being set via TEST_ARGS when running tests from the command line. + "--run-services-mode", + "--server-start-timeout", serverStartTimeout.String(), + "--feature-gates", framework.TestContext.FeatureGates, + "--logtostderr", + "--vmodule=*="+LOG_VERBOSITY_LEVEL, + ) + server := newServer("services", startCmd, nil, nil, getServicesHealthCheckURLs(), servicesLogFile, false) + return server, server.start() } -// getLogFiles gets logs of interest either via journalctl or by creating sym -// links. Since we scp files from the remote directory, symlinks will be -// treated as normal files and file contents will be copied over. -func (es *e2eService) getLogFiles() { - // Nothing to do if report dir is not specified. - if framework.TestContext.ReportDir == "" { - return - } - glog.Info("Fetching log files...") - journaldFound := isJournaldAvailable() - for targetFileName, logFileData := range es.logFiles { - targetLink := path.Join(framework.TestContext.ReportDir, targetFileName) - if journaldFound { - // Skip log files that do not have an equivalent in journald-based machines. - if len(logFileData.journalctlCommand) == 0 { - continue - } - glog.Infof("Get log file %q with journalctl command %v.", targetFileName, logFileData.journalctlCommand) - out, err := exec.Command("sudo", append([]string{"journalctl"}, logFileData.journalctlCommand...)...).CombinedOutput() - if err != nil { - glog.Errorf("failed to get %q from journald: %v, %v", targetFileName, string(out), err) - } else { - if err = ioutil.WriteFile(targetLink, out, 0644); err != nil { - glog.Errorf("failed to write logs to %q: %v", targetLink, err) - } - } - continue - } - for _, file := range logFileData.files { - if _, err := os.Stat(file); err != nil { - // Expected file not found on this distro. - continue - } - if err := copyLogFile(file, targetLink); err != nil { - glog.Error(err) - } else { - break - } - } - } -} - -// stop stops the embedded e2e services. -func (es *e2eService) stop() { - glog.Info("Stopping e2e services...") - es.getLogFiles() - - // TODO(random-liu): Use a loop to stop all services after introducing - // service interface. - glog.Info("Stopping namespace controller") - if es.nsController != nil { - if err := es.nsController.Stop(); err != nil { - glog.Errorf("Failed to stop %q: %v", es.nsController.Name(), err) - } - } - - glog.Info("Stopping API server") - if es.apiServer != nil { - if err := es.apiServer.Stop(); err != nil { - glog.Errorf("Failed to stop %q: %v", es.apiServer.Name(), err) - } - } - - for _, s := range es.services { - glog.Info("Stopping service %q", s.name) - if err := s.kill(); err != nil { - glog.Errorf("Failed to stop %v: %v", s.name, err) - } - } - - glog.Info("Stopping etcd") - if es.etcdServer != nil { - if err := es.etcdServer.Stop(); err != nil { - glog.Errorf("Failed to stop %q: %v", es.etcdServer.Name(), err) - } - } - - for _, d := range es.rmDirs { - glog.Info("Deleting directory %v", d) - err := os.RemoveAll(d) - if err != nil { - glog.Errorf("Failed to delete directory %s.\n%v", d, err) - } - } - - glog.Info("E2E services stopped.") -} - -// startEtcd starts the embedded etcd instance or returns an error. -func (es *e2eService) startEtcd() error { - glog.Info("Starting etcd") - dataDir, err := ioutil.TempDir("", "node-e2e") - if err != nil { - return err - } - // Mark the dataDir as directories to remove. - es.rmDirs = append(es.rmDirs, dataDir) - es.etcdServer = NewEtcd(dataDir) - return es.etcdServer.Start() -} - -// startApiServer starts the embedded API server or returns an error. -func (es *e2eService) startApiServer() error { - glog.Info("Starting API server") - es.apiServer = NewAPIServer() - return es.apiServer.Start() -} - -// startNamespaceController starts the embedded namespace controller or returns an error. -func (es *e2eService) startNamespaceController() error { - glog.Info("Starting namespace controller") - es.nsController = NewNamespaceController() - return es.nsController.Start() -} +const ( + // Ports of different e2e services. + kubeletPort = "10250" + kubeletReadOnlyPort = "10255" + // Health check url of kubelet + kubeletHealthCheckURL = "http://127.0.0.1:" + kubeletReadOnlyPort + "/healthz" +) // startKubelet starts the Kubelet in a separate process or returns an error // if the Kubelet fails to start. -func (es *e2eService) startKubelet() (*server, error) { +func (e *E2EServices) startKubelet() (*server, error) { glog.Info("Starting kubelet") var killCommand, restartCommand *exec.Cmd cmdArgs := []string{} @@ -367,7 +178,7 @@ func (es *e2eService) startKubelet() (*server, error) { cmdArgs = append(cmdArgs, systemdRun, "--unit="+unitName, "--remain-after-exit", build.GetKubeletServerBin()) killCommand = exec.Command("sudo", "systemctl", "kill", unitName) restartCommand = exec.Command("sudo", "systemctl", "restart", unitName) - es.logFiles["kubelet.log"] = logFileData{ + e.logFiles["kubelet.log"] = logFileData{ journalctlCommand: []string{"-u", unitName}, } framework.TestContext.EvictionHard = adjustConfigForSystemd(framework.TestContext.EvictionHard) @@ -395,6 +206,7 @@ func (es *e2eService) startKubelet() (*server, error) { "--eviction-hard", framework.TestContext.EvictionHard, "--eviction-pressure-transition-period", "30s", "--feature-gates", framework.TestContext.FeatureGates, + "--runtime-integration-type", framework.TestContext.RuntimeIntegrationType, "--v", LOG_VERBOSITY_LEVEL, "--logtostderr", ) if framework.TestContext.RuntimeIntegrationType != "" { @@ -430,294 +242,59 @@ func (es *e2eService) startKubelet() (*server, error) { return server, server.start() } -// A server manages a separate server process started and killed with -// commands. -type server struct { - // name is the name of the server, it is only used for logging. - name string - // startCommand is the command used to start the server - startCommand *exec.Cmd - // killCommand is the command used to stop the server. It is not required. If it - // is not specified, `sudo kill` will be used to stop the server. - killCommand *exec.Cmd - // restartCommand is the command used to restart the server. If provided, it will be used - // instead of startCommand when restarting the server. - restartCommand *exec.Cmd - // healthCheckUrls is the urls used to check whether the server is ready. - healthCheckUrls []string - // outFilename is the name of the log file. The stdout and stderr of the server - // will be redirected to this file. - outFilename string - // restartOnExit determines whether a restart loop is launched with the server - restartOnExit bool - // Writing to this channel, if it is not nil, stops the restart loop. - // When tearing down a server, you should check for this channel and write to it if it exists. - stopRestartingCh chan<- bool - // Read from this to confirm that the restart loop has stopped. - ackStopRestartingCh <-chan bool -} - -// newServer returns a new server with the given name, commands, health check -// URLs, etc. -func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, restartOnExit bool) *server { - return &server{ - name: name, - startCommand: start, - killCommand: kill, - restartCommand: restart, - healthCheckUrls: urls, - outFilename: outputFileName, - restartOnExit: restartOnExit, - } -} - -// commandToString format command to string. -func commandToString(c *exec.Cmd) string { - if c == nil { - return "" - } - return strings.Join(append([]string{c.Path}, c.Args[1:]...), " ") -} - -func (s *server) String() string { - return fmt.Sprintf("server %q start-command: `%s`, kill-command: `%s`, restart-command: `%s`, health-check: %v, output-file: %q", s.name, - commandToString(s.startCommand), commandToString(s.killCommand), commandToString(s.restartCommand), s.healthCheckUrls, s.outFilename) -} - -// start starts the server by running its commands, monitors it with a health -// check, and ensures that it is restarted if applicable. -// -// Note: restartOnExit == true requires len(s.healthCheckUrls) > 0 to work properly. -func (s *server) start() error { - glog.Infof("Starting server %q with command %q", s.name, commandToString(s.startCommand)) - errCh := make(chan error) - - // Set up restart channels if the server is configured for restart on exit. - var stopRestartingCh, ackStopRestartingCh chan bool - if s.restartOnExit { - if len(s.healthCheckUrls) == 0 { - return fmt.Errorf("Tried to start %s which has s.restartOnExit == true, but no health check urls provided.", s) - } - - stopRestartingCh = make(chan bool) - ackStopRestartingCh = make(chan bool) - - s.stopRestartingCh = stopRestartingCh - s.ackStopRestartingCh = ackStopRestartingCh - } - - // This goroutine actually runs the start command for the server. - go func() { - defer close(errCh) - - // Create the output filename - outPath := path.Join(framework.TestContext.ReportDir, s.outFilename) - outfile, err := os.Create(outPath) - if err != nil { - errCh <- fmt.Errorf("failed to create file %q for `%s` %v.", outPath, s, err) - return - } else { - glog.Infof("Output file for server %q: %v", s.name, outfile.Name()) - } - defer outfile.Close() - defer outfile.Sync() - - // Set the command to write the output file - s.startCommand.Stdout = outfile - s.startCommand.Stderr = outfile - - // Death of this test process should kill the server as well. - attrs := &syscall.SysProcAttr{} - // Hack to set linux-only field without build tags. - deathSigField := reflect.ValueOf(attrs).Elem().FieldByName("Pdeathsig") - if deathSigField.IsValid() { - deathSigField.Set(reflect.ValueOf(syscall.SIGTERM)) - } else { - errCh <- fmt.Errorf("failed to set Pdeathsig field (non-linux build)") - return - } - s.startCommand.SysProcAttr = attrs - - // Start the command - err = s.startCommand.Start() - if err != nil { - errCh <- fmt.Errorf("failed to run %s: %v", s, err) - return - } - if !s.restartOnExit { - glog.Infof("Waiting for server %q start command to complete", s.name) - // If we aren't planning on restarting, ok to Wait() here to release resources. - // Otherwise, we Wait() in the restart loop. - err = s.startCommand.Wait() - if err != nil { - errCh <- fmt.Errorf("failed to run start command for server %q: %v", s.name, err) - return - } - } else { - usedStartCmd := true - for { - glog.Infof("Running health check for service %q", s.name) - // Wait for an initial health check to pass, so that we are sure the server started. - err := readinessCheck(s.name, s.healthCheckUrls, nil) - if err != nil { - if usedStartCmd { - glog.Infof("Waiting for server %q start command to complete after initial health check failed", s.name) - s.startCommand.Wait() // Release resources if necessary. - } - // This should not happen, immediately stop the e2eService process. - glog.Fatalf("restart loop readinessCheck failed for %s", s) - } else { - glog.Infof("Initial health check passed for service %q", s.name) - } - - // Initial health check passed, wait until a health check fails again. - stillAlive: - for { - select { - case <-stopRestartingCh: - ackStopRestartingCh <- true - return - case <-time.After(time.Second): - for _, url := range s.healthCheckUrls { - resp, err := http.Head(url) - if err != nil || resp.StatusCode != http.StatusOK { - break stillAlive - } - } - } - } - - if usedStartCmd { - s.startCommand.Wait() // Release resources from last cmd - usedStartCmd = false - } - if s.restartCommand != nil { - // Always make a fresh copy of restartCommand before - // running, we may have to restart multiple times - s.restartCommand = &exec.Cmd{ - Path: s.restartCommand.Path, - Args: s.restartCommand.Args, - Env: s.restartCommand.Env, - Dir: s.restartCommand.Dir, - Stdin: s.restartCommand.Stdin, - Stdout: s.restartCommand.Stdout, - Stderr: s.restartCommand.Stderr, - ExtraFiles: s.restartCommand.ExtraFiles, - SysProcAttr: s.restartCommand.SysProcAttr, - } - // Run and wait for exit. This command is assumed to have - // short duration, e.g. systemctl restart - glog.Infof("Restarting server %q with restart command", s.name) - err = s.restartCommand.Run() - if err != nil { - // This should not happen, immediately stop the e2eService process. - glog.Fatalf("restarting server %s with restartCommand failed. Error: %v.", s, err) - } - } else { - s.startCommand = &exec.Cmd{ - Path: s.startCommand.Path, - Args: s.startCommand.Args, - Env: s.startCommand.Env, - Dir: s.startCommand.Dir, - Stdin: s.startCommand.Stdin, - Stdout: s.startCommand.Stdout, - Stderr: s.startCommand.Stderr, - ExtraFiles: s.startCommand.ExtraFiles, - SysProcAttr: s.startCommand.SysProcAttr, - } - glog.Infof("Restarting server %q with start command", s.name) - err = s.startCommand.Start() - usedStartCmd = true - if err != nil { - // This should not happen, immediately stop the e2eService process. - glog.Fatalf("Restarting %s with startCommand failed. Error: %v.", s, err) - } - } - } - } - }() - - return readinessCheck(s.name, s.healthCheckUrls, errCh) -} - -// kill runs the server's kill command. -func (s *server) kill() error { - glog.Infof("Kill server %q", s.name) - name := s.name - cmd := s.startCommand - - // If s has a restart loop, turn it off. - if s.restartOnExit { - s.stopRestartingCh <- true - <-s.ackStopRestartingCh - } - - if s.killCommand != nil { - return s.killCommand.Run() - } - - if cmd == nil { - return fmt.Errorf("could not kill %q because both `killCommand` and `startCommand` are nil", name) - } - - if cmd.Process == nil { - glog.V(2).Infof("%q not running", name) - return nil - } - pid := cmd.Process.Pid - if pid <= 1 { - return fmt.Errorf("invalid PID %d for %q", pid, name) - } - - // Attempt to shut down the process in a friendly manner before forcing it. - waitChan := make(chan error) - go func() { - _, err := cmd.Process.Wait() - waitChan <- err - close(waitChan) - }() - - const timeout = 10 * time.Second - for _, signal := range []string{"-TERM", "-KILL"} { - glog.V(2).Infof("Killing process %d (%s) with %s", pid, name, signal) - cmd := exec.Command("sudo", "kill", signal, strconv.Itoa(pid)) - - // Run the 'kill' command in a separate process group so sudo doesn't ignore it - attrs := &syscall.SysProcAttr{} - // Hack to set unix-only field without build tags. - setpgidField := reflect.ValueOf(attrs).Elem().FieldByName("Setpgid") - if setpgidField.IsValid() { - setpgidField.Set(reflect.ValueOf(true)) - } else { - return fmt.Errorf("Failed to set Setpgid field (non-unix build)") - } - cmd.SysProcAttr = attrs - - _, err := cmd.Output() - if err != nil { - glog.Errorf("Error signaling process %d (%s) with %s: %v", pid, name, signal, err) - continue - } - - select { - case err := <-waitChan: - if err != nil { - return fmt.Errorf("error stopping %q: %v", name, err) - } - // Success! - return nil - case <-time.After(timeout): - // Continue. - } - } - - return fmt.Errorf("unable to stop %q", name) -} - func adjustConfigForSystemd(config string) string { return strings.Replace(config, "%", "%%", -1) } +// getLogFiles gets logs of interest either via journalctl or by creating sym +// links. Since we scp files from the remote directory, symlinks will be +// treated as normal files and file contents will be copied over. +func (e *E2EServices) getLogFiles() { + // Nothing to do if report dir is not specified. + if framework.TestContext.ReportDir == "" { + return + } + glog.Info("Fetching log files...") + journaldFound := isJournaldAvailable() + for targetFileName, logFileData := range e.logFiles { + targetLink := path.Join(framework.TestContext.ReportDir, targetFileName) + if journaldFound { + // Skip log files that do not have an equivalent in journald-based machines. + if len(logFileData.journalctlCommand) == 0 { + continue + } + glog.Infof("Get log file %q with journalctl command %v.", targetFileName, logFileData.journalctlCommand) + out, err := exec.Command("sudo", append([]string{"journalctl"}, logFileData.journalctlCommand...)...).CombinedOutput() + if err != nil { + glog.Errorf("failed to get %q from journald: %v, %v", targetFileName, string(out), err) + } else { + if err = ioutil.WriteFile(targetLink, out, 0644); err != nil { + glog.Errorf("failed to write logs to %q: %v", targetLink, err) + } + } + continue + } + for _, file := range logFileData.files { + if _, err := os.Stat(file); err != nil { + // Expected file not found on this distro. + continue + } + if err := copyLogFile(file, targetLink); err != nil { + glog.Error(err) + } else { + break + } + } + } +} + +// isJournaldAvailable returns whether the system executing the tests uses +// journald. +func isJournaldAvailable() bool { + _, err := exec.LookPath("journalctl") + return err == nil +} + func copyLogFile(src, target string) error { // If not a journald based distro, then just symlink files. if out, err := exec.Command("sudo", "cp", src, target).CombinedOutput(); err != nil { @@ -728,55 +305,3 @@ func copyLogFile(src, target string) error { } return nil } - -// isJournaldAvailable returns whether the system executing the tests uses -// journald. -func isJournaldAvailable() bool { - _, err := exec.LookPath("journalctl") - return err == nil -} - -// readinessCheck checks whether services are ready via the supplied health -// check URLs. Once there is an error in errCh, the function will stop waiting -// and return the error. -// TODO(random-liu): Move this to util -func readinessCheck(name string, urls []string, errCh <-chan error) error { - glog.Infof("Running readiness check for service %q", name) - endTime := time.Now().Add(*serverStartTimeout) - blockCh := make(chan error) - defer close(blockCh) - for endTime.After(time.Now()) { - select { - // We *always* want to run the health check if there is no error on the channel. - // With systemd, reads from errCh report nil because cmd.Run() waits - // on systemd-run, rather than the service process. systemd-run quickly - // exits with status 0, causing the channel to be closed with no error. In - // this case, you want to wait for the health check to complete, rather - // than returning from readinessCheck as soon as the channel is closed. - case err, ok := <-errCh: - if ok { // The channel is not closed, this is a real error - if err != nil { // If there is an error, return it - return err - } - // If not, keep checking readiness. - } else { // The channel is closed, this is only a zero value. - // Replace the errCh with blockCh to avoid busy loop, - // and keep checking readiness. - errCh = blockCh - } - case <-time.After(time.Second): - ready := true - for _, url := range urls { - resp, err := http.Head(url) - if err != nil || resp.StatusCode != http.StatusOK { - ready = false - break - } - } - if ready { - return nil - } - } - } - return fmt.Errorf("e2e service %q readiness check timeout %v", name, *serverStartTimeout) -}